[
  {
    "path": ".aws/task_definition.json",
    "content": "{\n    \"taskDefinitionArn\": \"arn:aws:ecs:eu-west-1:253053805092:task-definition/megaparse-task:2\",\n    \"containerDefinitions\": [\n        {\n            \"name\": \"megaparse\",\n            \"image\": \"quay.io/unstructured-io/unstructured-api:latest\",\n            \"cpu\": 0,\n            \"portMappings\": [\n                {\n                    \"containerPort\": 8000,\n                    \"hostPort\": 8000,\n                    \"protocol\": \"tcp\"\n                }\n            ],\n            \"essential\": true,\n            \"environment\": [\n                {\n                    \"name\": \"UNSTRUCTURED_HI_RES_MODEL_NAME\",\n                    \"value\": \"detectron2_onnx\"\n                },\n                {\n                    \"name\": \"UNSTRUCTURED_PARALLEL_MODE_ENABLED\",\n                    \"value\": \"false\"\n                }\n            ],\n            \"mountPoints\": [],\n            \"volumesFrom\": [],\n            \"logConfiguration\": {\n                \"logDriver\": \"awslogs\",\n                \"options\": {\n                    \"awslogs-group\": \"/ecs/megaparse\",\n                    \"awslogs-region\": \"eu-west-1\",\n                    \"awslogs-stream-prefix\": \"ecs\"\n                }\n            },\n            \"systemControls\": []\n        }\n    ],\n    \"family\": \"megaparse-task\",\n    \"executionRoleArn\": \"arn:aws:iam::253053805092:role/megaparse-ecsTaskExecutionRole\",\n    \"networkMode\": \"awsvpc\",\n    \"revision\": 2,\n    \"volumes\": [],\n    \"status\": \"ACTIVE\",\n    \"requiresAttributes\": [\n        {\n            \"name\": \"com.amazonaws.ecs.capability.logging-driver.awslogs\"\n        },\n        {\n            \"name\": \"ecs.capability.execution-role-awslogs\"\n        },\n        {\n            \"name\": \"com.amazonaws.ecs.capability.docker-remote-api.1.19\"\n        },\n        {\n            \"name\": 
\"com.amazonaws.ecs.capability.docker-remote-api.1.18\"\n        },\n        {\n            \"name\": \"ecs.capability.task-eni\"\n        }\n    ],\n    \"placementConstraints\": [],\n    \"compatibilities\": [\n        \"EC2\",\n        \"FARGATE\"\n    ],\n    \"requiresCompatibilities\": [\n        \"FARGATE\"\n    ],\n    \"cpu\": \"2048\",\n    \"memory\": \"8192\",\n    \"tags\": []\n}"
  },
  {
    "path": ".flake8",
    "content": "[flake8]\n; Minimal configuration for Flake8 to work with Black.\nmax-line-length = 100\nignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100\n"
  },
  {
    "path": ".gitattributes",
    "content": "*.ipynb linguist-vendored\n*.html linguist-vendored"
  },
  {
    "path": ".github/workflows/CI.yml",
    "content": "name: Run tests\n\non:\n  pull_request:\n  workflow_dispatch:\n\nenv:\n  NATS_TOKEN: test\n\njobs:\n  test:\n    name: Run tests on Python ${{ matrix.python-version }}\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        python-version: [\"3.11\", \"3.12\"]\n    steps:\n      - name: 👀 Checkout code\n        uses: actions/checkout@v2\n        with:\n          submodules: true\n\n      - name: Setup apt cache\n        uses: actions/cache@v2\n        with:\n          path: /var/cache/apt/archives\n          key: ${{ runner.os }}-apt-${{ hashFiles('/etc/apt/sources.list') }}\n\n      - name: 😭 Install system dependencies\n        run: |\n          sudo apt-get update && sudo apt-get install -y \\\n            netcat-traditional \\\n            unzip \\\n            libgeos-dev \\\n            libcurl4-openssl-dev \\\n            libssl-dev \\\n            binutils \\\n            curl \\\n            git \\\n            autoconf \\\n            automake \\\n            build-essential \\\n            libtool \\\n            gcc \\\n            libmagic-dev \\\n            poppler-utils \\\n            tesseract-ocr \\\n            libreoffice \\\n            libpq-dev \\\n            pandoc\n\n      - name: 🔽 Install the latest version of rye\n        uses: eifinger/setup-rye@v4\n        with:\n          enable-cache: true\n\n      - name: 📌 Pin Python version\n        run: rye pin ${{ matrix.python-version }}\n\n      - name: 🔽 Download and Install NATS Server\n        run: |\n          curl -L https://github.com/nats-io/nats-server/releases/download/v2.10.22/nats-server-v2.10.22-linux-amd64.zip -o nats-server.zip\n          unzip nats-server.zip -d nats-server && sudo cp nats-server/nats-server-v2.10.22-linux-amd64/nats-server /usr/bin\n\n      - name: 🛠️ Set up NATS arguments\n        run: |\n          nohup nats-server \\\n            --addr 0.0.0.0 \\\n            --port 4222 \\\n            --auth \"$NATS_TOKEN\" > nats.log 2>&1 
&\n\n      - name: 🔍 Verify NATS Server is Running\n        run: |\n          sleep 1 # Give the server some time to start\n          if nc -zv localhost 4222; then\n            echo \"✅ NATS Server is running on port 4222.\"\n          else\n            echo \"❌ Failed to start NATS Server.\"\n            cat nats.log\n            exit 1\n          fi\n\n      - name: 🔨 Sync dependencies\n        run: |\n          UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock\n\n      - name: 🚀 Run tests\n        run: |\n          rye test -p megaparse-sdk\n"
  },
  {
    "path": ".github/workflows/build-and-deploy.yml",
    "content": "name: Build Docker image and push ECR\n\non:\n  push:\n    tags:\n      - \"v*\"\n    branches: [main]\n\nenv:\n  AWS_REGION: eu-west-1\n  ECR_REPOSITORY: quivrhq/megaparse\n  ECS_CLUSTER: megaparse\n  ECS_TASK_DEFINITION: .aws/task_definition.json\n  CONTAINER_NAME: megaparse\n\npermissions:\n  contents: read\n\njobs:\n  deploy:\n    name: build docker\n    runs-on: ubuntu-latest\n    environment: production\n    outputs:\n      imageoutput: ${{ steps.build-image.outputs.imageoutput }}\n\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v3\n\n      - name: Configure AWS credentials\n        uses: aws-actions/configure-aws-credentials@v4\n        with:\n          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n          aws-region: us-east-1\n\n      - name: Login to Amazon ECR\n        id: login-ecr\n        uses: aws-actions/amazon-ecr-login@v1\n        with:\n          registry-type: public\n\n      - name: Build, tag, and push image to Amazon ECR\n        id: build-image\n        env:\n          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}\n          IMAGE_TAG: ${{ github.sha }}\n        run: |\n          # Build a docker container and push it to ECR\n          docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .\n          docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG\n\n          # Tag the image as 'latest' and push\n          docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest\n          docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest\n\n          echo \"imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG\" >> $GITHUB_OUTPUT\n"
  },
  {
    "path": ".github/workflows/build-gpu.yml",
    "content": "name: Build docker GPU and push ECR\n\non:\n  push:\n    tags:\n      - \"v*\"\n    branches: [main]\n\nenv:\n  AWS_REGION: eu-west-1\n  ECR_REPOSITORY: quivrhq/megaparse-gpu\n  ECS_CLUSTER: megaparse\n  ECS_TASK_DEFINITION: .aws/task_definition.json\n  CONTAINER_NAME: megaparse\n\npermissions:\n  contents: read\n\njobs:\n  deploy:\n    name: Build docker-gpu\n    runs-on:\n      group: big-boy-gpu\n    environment: production\n    outputs:\n      imageoutput: ${{ steps.build-image.outputs.imageoutput }}\n\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v3\n\n      - name: Configure AWS credentials\n        uses: aws-actions/configure-aws-credentials@v4\n        with:\n          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}\n          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}\n          aws-region: us-east-1\n\n      - name: Login to Amazon ECR\n        id: login-ecr\n        uses: aws-actions/amazon-ecr-login@v1\n        with:\n          registry-type: public\n\n      - name: Build, tag, and push image to Amazon ECR\n        id: build-image\n        env:\n          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}\n          IMAGE_TAG: ${{ github.sha }}\n        run: |\n          # Build a docker container and push it to ECR\n          docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f Dockerfile.gpu .\n          docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG\n\n          # Tag the image as 'latest' and push\n          docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest\n          docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest\n\n          echo \"imageoutput=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG\" >> $GITHUB_OUTPUT\n"
  },
  {
    "path": ".github/workflows/release-please.yml",
    "content": "on:\n  push:\n    branches:\n      - main\n\npermissions:\n  contents: write\n  pull-requests: write\n\nname: release-please\n\njobs:\n  release-please:\n    runs-on: ubuntu-latest\n    outputs:\n      release_created: ${{ steps.release.outputs['libs/megaparse--release_created'] }}\n      release_created_sdk: ${{ steps.release.outputs['libs/megaparse_sdk--release_created'] }}\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v3\n        with:\n          fetch-depth: 0 # Fetch all history for tags and releases\n\n      - name: Setup Python\n        uses: actions/setup-python@v4\n        with:\n          python-version: \"3.11\"\n\n      - name: Run release-please\n        id: release\n        uses: google-github-actions/release-please-action@v4\n        with:\n          token: ${{ secrets.RELEASE_PLEASE_TOKEN }}\n\n  deploy-megaparse:\n    if: needs.release-please.outputs.release_created == 'true'\n    needs: release-please\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n      - name: Install Rye\n        uses: eifinger/setup-rye@v2\n        with:\n          enable-cache: true\n      - name: Rye Sync\n        run: rye sync --no-lock\n      - name: Rye Build\n        run: cd libs/megaparse && rye build\n      - name: Rye Publish\n        run: cd libs/megaparse && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes\n\n  deploy-sdk:\n    if: needs.release-please.outputs.release_created_sdk == 'true'\n    needs: release-please\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n      - name: Install Rye\n        uses: eifinger/setup-rye@v2\n        with:\n          enable-cache: true\n      - name: Rye Sync\n        run: cd libs/megaparse_sdk && rye sync --no-lock\n      - name: Rye Build\n        run: cd libs/megaparse_sdk && rye build\n      - name: Rye Publish\n        run: cd libs/megaparse_sdk && rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes\n"
  },
  {
    "path": ".github/workflows/test-build-docker.yml",
    "content": "on:\n  pull_request:\n    branches:\n      - main\n\nname: Test build docker\njobs:\n  build-docker:\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        dockerfile: [Dockerfile, Dockerfile.gpu]\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v3\n\n      - name: Set up QEMU\n        uses: docker/setup-qemu-action@v3\n        with:\n          platforms: all\n\n      - name: Set up Docker Buildx\n        uses: docker/setup-buildx-action@v3\n\n      - name: Build Docker image with caching\n        uses: docker/build-push-action@v4\n        with:\n          context: .\n          file: ${{ matrix.dockerfile }}\n          push: false\n          tags: quivrhq/megaparse:${{ matrix.dockerfile }}\n          cache-from: type=gha\n          cache-to: type=gha,mode=max\n"
  },
  {
    "path": ".gitignore",
    "content": "/output\n/input\n.env\n__pycache__/\ndist/**\nmegaparse.egg-info/\n*.pyc\nbuild/*\nENV\nvenv\n*/evaluations/*\n*/cdp/*\n*.pkl\n\n!megaparse/tests/output_tests/MegaFake_report.md\n*.DS_Store\n.tool-versions\nmegaparse/sdk/examples/only_pdfs/*\n\n**/profile/\n**/prof/\n.ropeproject/\nbenchmark/hi_res/*\nbenchmark/auto/*\n\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.6.0\n    hooks:\n      - id: check-added-large-files\n        args: [\"--maxkb=5000\"]\n      - id: check-toml\n      - id: check-yaml\n      - id: end-of-file-fixer\n      - id: trailing-whitespace\n      - id: check-merge-conflict\n      - id: detect-private-key\n      - id: check-case-conflict\n  - repo: https://github.com/pre-commit/pre-commit\n    rev: v3.6.2\n    hooks:\n      - id: validate_manifest\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    # Ruff version.\n    rev: v0.5.1\n    hooks:\n      # Run the linter.\n      - id: ruff\n        args: [--fix]\n        additional_dependencies: []\n      # Run the formatter.\n      - id: ruff-format\n        additional_dependencies: []\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.10.1\n    hooks:\n      - id: mypy\n        name: mypy\n        additional_dependencies: [\"types-aiofiles\"]\n"
  },
  {
    "path": ".python-version",
    "content": "3.11.9\n"
  },
  {
    "path": ".release-please-manifest.json",
    "content": "{\n  \"libs/megaparse\": \"0.0.55\",\n  \"libs/megaparse_sdk\": \"0.1.12\"\n}\n"
  },
  {
    "path": ".vscode/extensions.json",
    "content": "{\n  \"recommendations\": [\n    \"dbaeumer.vscode-eslint\",\n    \"charliermarsh.ruff\",\n    \"knisterpeter.vscode-github\",\n    \"github.vscode-pull-request-github\",\n    \"ms-python.python\",\n    \"ms-python.vscode-pylance\",\n    \"ms-python.debugpy\"\n  ]\n}\n"
  },
  {
    "path": ".vscode/launch.json",
    "content": "{\n    \"version\": \"0.2.0\",\n    \"configurations\": [\n        {\n            \"name\": \"Python: Remote Attach\",\n            \"type\": \"python\",\n            \"request\": \"attach\",\n            \"connect\": {\n                \"host\": \"localhost\",\n                \"port\": 5678\n            },\n            \"pathMappings\": [\n                {\n                    \"localRoot\": \"${workspaceFolder}/backend\",\n                    \"remoteRoot\": \".\"\n                }\n            ],\n            \"justMyCode\": true\n        },\n        {\n            \"name\": \"Python: Debug Test Script\",\n            \"type\": \"python\",\n            \"request\": \"launch\",\n            \"program\": \"${workspaceFolder}/backend/test_process_file_and_notify.py\",\n            \"console\": \"integratedTerminal\",\n            \"justMyCode\": false\n        },\n        {\n            \"name\": \"Python: Debug\",\n            \"type\": \"debugpy\",\n            \"request\": \"launch\",\n            \"program\": \"${file}\",\n            \"console\": \"integratedTerminal\",\n            \"justMyCode\": false,\n            \"env\": {\n                \"PYTHONPATH\": \"${workspaceFolder}/backend:${env:PYTHONPATH}\"\n            },\n            \"envFile\": \"${workspaceFolder}/.env\"\n        }\n    ]\n}\n"
  },
  {
    "path": ".vscode/settings.json",
    "content": "{\n  \"editor.formatOnSave\": true,\n  \"editor.formatOnSaveMode\": \"file\",\n  \"files.exclude\": {\n    \"**/__pycache__\": true,\n    \"**/.benchmarks/\": true,\n    \"**/.cache/\": true,\n    \"**/.pytest_cache/\": true,\n    \"**/.next/\": true,\n    \"**/build/\": true,\n    \"**/.docusaurus/\": true,\n    \"**/node_modules/\": true\n  },\n  \"[python]\": {\n    \"editor.defaultFormatter\": \"charliermarsh.ruff\",\n    \"editor.formatOnSave\": true,\n    \"editor.codeActionsOnSave\": {\n      \"source.organizeImports\": \"explicit\",\n      \"source.fixAll\": \"explicit\"\n    }\n  },\n  \"python.testing.unittestEnabled\": false,\n  \"python.testing.pytestEnabled\": true,\n  \"python.testing.autoTestDiscoverOnSaveEnabled\": true,\n  \"python.analysis.autoImportCompletions\": true,\n  \"python.analysis.typeCheckingMode\": \"basic\",\n  \"python.analysis.diagnosticSeverityOverrides\": {\n    \"reportMissingImports\": \"error\",\n    \"reportUnusedImport\": \"warning\",\n    \"reportGeneralTypeIssues\": \"warning\"\n  },\n  \"makefile.configureOnOpen\": false\n}\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Changelog\n\n## [0.0.46](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.45...megaparse-v0.0.46) (2024-11-21)\n\n\n### Features\n\n* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))\n\n## [0.0.45](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.44...megaparse-v0.0.45) (2024-11-19)\n\n\n### Bug Fixes\n\n* small fixes from backlogs ([#128](https://github.com/QuivrHQ/MegaParse/issues/128)) ([954554c](https://github.com/QuivrHQ/MegaParse/commit/954554c5abaa7b0513e9ff3f6bbaff393d36cf03))\n\n## [0.0.44](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.43...megaparse-v0.0.44) (2024-11-18)\n\n\n### Bug Fixes\n\n* fixing the wrong passing of arguments to the parse_file endpoint ([#123](https://github.com/QuivrHQ/MegaParse/issues/123)) ([9105672](https://github.com/QuivrHQ/MegaParse/commit/9105672abc0942f26785e494053112d486e8d2d9))\n\n## [0.0.43](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.42...megaparse-v0.0.43) (2024-11-14)\n\n\n### Features\n\n* increase the robustness of megaparse ([#121](https://github.com/QuivrHQ/MegaParse/issues/121)) ([d21d8bb](https://github.com/QuivrHQ/MegaParse/commit/d21d8bb77bd8e687b1a951db6b81653e4e47a8bb))\n\n\n### Bug Fixes\n\n* uvicorn version ([#127](https://github.com/QuivrHQ/MegaParse/issues/127)) ([ceaba3d](https://github.com/QuivrHQ/MegaParse/commit/ceaba3df2951be27e6a4835e5784917a62867896))\n* version requirements ([#126](https://github.com/QuivrHQ/MegaParse/issues/126)) ([a10d502](https://github.com/QuivrHQ/MegaParse/commit/a10d502f1b3576690cebe33b656d2480a24defe3))\n\n## [0.0.42](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.41...megaparse-v0.0.42) (2024-11-08)\n\n\n### Features\n\n* **sdk:** new version ([e377cd6](https://github.com/QuivrHQ/MegaParse/commit/e377cd6df98b3ea9265788a4d907b43bde796196))\n\n## 
[0.0.41](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.40...megaparse-v0.0.41) (2024-11-08)\n\n\n### Bug Fixes\n\n* add megaparse url env variable ([#118](https://github.com/QuivrHQ/MegaParse/issues/118)) ([132c2eb](https://github.com/QuivrHQ/MegaParse/commit/132c2ebd13177fd116c4e710a4b1c864a9fa04bb))\n\n## [0.0.40](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.39...megaparse-v0.0.40) (2024-11-08)\n\n\n### Bug Fixes\n\n* sdk version ([#116](https://github.com/QuivrHQ/MegaParse/issues/116)) ([8bfeb4a](https://github.com/QuivrHQ/MegaParse/commit/8bfeb4a52326a5f645d3ed20e113153dc19bf012))\n\n## [0.0.39](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.38...megaparse-v0.0.39) (2024-11-08)\n\n\n### Bug Fixes\n\n* add_logs ([#114](https://github.com/QuivrHQ/MegaParse/issues/114)) ([63c9236](https://github.com/QuivrHQ/MegaParse/commit/63c9236590016ee4c210174e746e96ff2b654480))\n\n## [0.0.38](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.37...megaparse-v0.0.38) (2024-11-07)\n\n\n### Bug Fixes\n\n* env roots, imports root ([#112](https://github.com/QuivrHQ/MegaParse/issues/112)) ([a04230d](https://github.com/QuivrHQ/MegaParse/commit/a04230dc2de9e0bb0bde39ab66b2208f80743922))\n\n## [0.0.37](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.36...megaparse-v0.0.37) (2024-11-07)\n\n\n### Features\n\n* bump megaparse-sdk version to 0.1.1 ([ed3fdfb](https://github.com/QuivrHQ/MegaParse/commit/ed3fdfb10498c95d4f9a510df3a2913e0dfc3c23))\n\n## [0.0.36](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.35...megaparse-v0.0.36) (2024-11-07)\n\n\n### Features\n\n* **readme:** update ([9d571b7](https://github.com/QuivrHQ/MegaParse/commit/9d571b7c71db610e7a0b08045ad98994ecf71baa))\n\n## [0.0.35](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.34...megaparse-v0.0.35) (2024-11-07)\n\n\n### Bug Fixes\n\n* unnecessary dep and readme ([#107](https://github.com/QuivrHQ/MegaParse/issues/107)) 
([b80aaa3](https://github.com/QuivrHQ/MegaParse/commit/b80aaa3a894b2bd2c7d7f518919c41af5c99219f))\n\n## [0.0.34](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.33...megaparse-v0.0.34) (2024-11-07)\n\n\n### Features\n\n* megaparse-sdk-cherry ([#105](https://github.com/QuivrHQ/MegaParse/issues/105)) ([ad44aa3](https://github.com/QuivrHQ/MegaParse/commit/ad44aa34999596e156c78f91adab97bce7ceeb0e))\n\n## [0.0.33](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.32...megaparse-v0.0.33) (2024-11-01)\n\n\n### Bug Fixes\n\n* readme ([#99](https://github.com/QuivrHQ/MegaParse/issues/99)) ([b3b80a3](https://github.com/QuivrHQ/MegaParse/commit/b3b80a3a599bbd4bec8ed79bb9ef44c8c7c92789))\n\n## [0.0.32](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.31...megaparse-v0.0.32) (2024-11-01)\n\n\n### Features\n\n* **api:** megaparse under api ([#93](https://github.com/QuivrHQ/MegaParse/issues/93)) ([2edf44b](https://github.com/QuivrHQ/MegaParse/commit/2edf44bd8c09ac7127db74206e463ebe29c68998))\n\n\n### Bug Fixes\n\n* api call error & tests ([#98](https://github.com/QuivrHQ/MegaParse/issues/98)) ([6bf1ce8](https://github.com/QuivrHQ/MegaParse/commit/6bf1ce8c6ed0e4f1e81577973a0fc71f61b10776))\n\n## [0.0.31](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.30...megaparse-v0.0.31) (2024-08-20)\n\n\n### Features\n\n* **pytorch:** cpu only removed ([#88](https://github.com/QuivrHQ/MegaParse/issues/88)) ([6b2fcfa](https://github.com/QuivrHQ/MegaParse/commit/6b2fcfa4413b8a72d398aab57f277dd28ab69c2f))\n\n## [0.0.30](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.29...megaparse-v0.0.30) (2024-08-20)\n\n\n### Features\n\n* **pytorch:** cpu only optional ([#86](https://github.com/QuivrHQ/MegaParse/issues/86)) ([e5d8806](https://github.com/QuivrHQ/MegaParse/commit/e5d8806ee6182de250352ce65ac6cd57c1093494))\n\n## [0.0.29](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.28...megaparse-v0.0.29) (2024-08-18)\n\n\n### Bug 
Fixes\n\n* **building:** version not working ([#83](https://github.com/QuivrHQ/MegaParse/issues/83)) ([c5e73f6](https://github.com/QuivrHQ/MegaParse/commit/c5e73f6c821424ef277ddd15ddb5b2df48ff7ab2))\n\n## [0.0.28](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.27...megaparse-v0.0.28) (2024-08-16)\n\n\n### Features\n\n* **rye:** added package manager ([#81](https://github.com/QuivrHQ/MegaParse/issues/81)) ([a3a50a3](https://github.com/QuivrHQ/MegaParse/commit/a3a50a3f27d3d9b4d6de4f3415472f8e52710656))\n\n## [0.0.27](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.26...megaparse-v0.0.27) (2024-08-16)\n\n\n### Features\n\n* **unstructured:** increased version ([#78](https://github.com/QuivrHQ/MegaParse/issues/78)) ([eb49cf5](https://github.com/QuivrHQ/MegaParse/commit/eb49cf5e79cd7a38c8212b315a4b64860c35a7b7))\n\n## [0.0.26](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.25...megaparse-v0.0.26) (2024-08-16)\n\n\n### Bug Fixes\n\n* **pycrypto:** being used by an old version of pdfplumber ([#76](https://github.com/QuivrHQ/MegaParse/issues/76)) ([d28f88c](https://github.com/QuivrHQ/MegaParse/commit/d28f88ceb2a722b15c84738f395b3ff4c818a365))\n\n## [0.0.25](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.24...megaparse-v0.0.25) (2024-08-16)\n\n\n### Features\n\n* **rye:** implemented ([#74](https://github.com/QuivrHQ/MegaParse/issues/74)) ([1e9ad8e](https://github.com/QuivrHQ/MegaParse/commit/1e9ad8e0000f28c709d915219fe62c0dbe7fa812))\n\n## [0.0.24](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.23...megaparse-v0.0.24) (2024-07-30)\n\n\n### Features\n\n* async load   ([#71](https://github.com/QuivrHQ/MegaParse/issues/71)) ([fbc3e1b](https://github.com/QuivrHQ/MegaParse/commit/fbc3e1b5f504eee9757e15592169ddad9b069f03))\n\n## [0.0.23](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.22...megaparse-v0.0.23) (2024-07-30)\n\n\n### Features\n\n* megaparse 0.0.22 
([071fd4d](https://github.com/QuivrHQ/MegaParse/commit/071fd4da2e8f0abb58fc66c3cdd87c4ee5cda4d6))\n\n## 0.0.20 (2024-07-10)\n\n## What's Changed\n* add: resolve multiple page problem on llama parse by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/61\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.19...v0.0.20\n\n## 0.0.19 (2024-06-28)\n\n## What's Changed\n* add: choose unstructured strategy by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/57\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.18...v0.0.19\n\n## 0.0.18 (2024-06-28)\n\n## What's Changed\n* fix: add __init__.py by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/54\n* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/56\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.17...v0.0.18\n\n## 0.0.17 (2024-06-27)\n\n## What's Changed\n* markdown by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/48\n* fix:Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/49\n* fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/50\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.16...v0.0.17\n\n## 0.0.16 (2024-06-27)\n\n## What's Changed\n* Fix: Update README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/47\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.15...v0.0.16\n\n## 0.0.15 (2024-06-26)\n\n## What's Changed\n* add: llm megaparser by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/42\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.14...v0.0.15\n\n## 0.0.14 (2024-06-24)\n\n## What's Changed\n* fix: remove nest asycio by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/40\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.13...v0.0.14\n\n## 0.0.13 (2024-06-24)\n\n## What's Changed\n* fix: use aload_data 
by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/38\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.12...v0.0.13\n\n## 0.0.12 (2024-06-18)\n\n## What's Changed\n* fix:delete markdownify dependency by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/33\n* fix: fake fix README.md by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/34\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.11...v0.0.12\n\n## 0.0.11 (2024-06-17)\n\n## What's Changed\n* Fix OpenAI key error. Add docstrings. Polish code by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/24\n* Fix DOCX reader. Add input tests by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/25\n* add: xlsx convertor by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/29\n* add: convert_tab by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/31\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.10...v0.0.11\n\n## 0.0.10 (2024-06-04)\n\n## What's Changed\n* Change from LiteralString to Literal (typing) by @dSupertramp in https://github.com/QuivrHQ/MegaParse/pull/21\n* chore: Add Dockerfile and Makefile for project setup by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/23\n\n## New Contributors\n* @dSupertramp made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/21\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.9...v0.0.10\n\n## 0.0.9 (2024-06-04)\n\n## What's Changed\n* chore: Update README.md to include optional use of LlamaParse for improved results by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/19\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.8...v0.0.9\n\n## 0.0.8 (2024-06-04)\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.7...v0.0.8\n\n## 0.0.7 (2024-06-03)\n\n## What's Changed\n* feat: Update benchmark results in README.md by @StanGirard in 
https://github.com/QuivrHQ/MegaParse/pull/15\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.6...v0.0.7\n\n## 0.0.6 (2024-06-03)\n\n## What's Changed\n* add: gpt cleaner for header and footer by @chloedia in https://github.com/QuivrHQ/MegaParse/pull/13\n\n## New Contributors\n* @chloedia made their first contribution in https://github.com/QuivrHQ/MegaParse/pull/13\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.5...v0.0.6\n\n## 0.0.5 (2024-06-02)\n\n## What's Changed\n* feat: Add instructions for installing poppler and tesseract by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/10\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.4...v0.0.5\n\n## 0.0.4 (2024-06-02)\n\n## What's Changed\n* add: baseline evaluation by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/7\n* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/9\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.3...v0.0.4\n\n## 0.0.3 (2024-05-30)\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.2...v0.0.3\n\n## 0.0.2 (2024-05-30)\n\n## What's Changed\n* feat: Megaparse example and working by @StanGirard in https://github.com/QuivrHQ/MegaParse/pull/2\n\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2\n\n## 0.0.2 (2024-05-30)\n\n**Full Changelog**: https://github.com/QuivrHQ/MegaParse/compare/v0.0.1...v0.0.2\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM python:3.11.10-slim-bullseye\n\nWORKDIR /app\n\n# Install runtime dependencies\nRUN apt-get update && apt-get upgrade && apt-get install -y \\\n    libgeos-dev \\\n    libcurl4-openssl-dev \\\n    libssl-dev \\\n    binutils \\\n    curl \\\n    git \\\n    autoconf \\\n    automake \\\n    build-essential \\\n    libtool \\\n    python-dev \\\n    build-essential \\\n    wget \\\n    gcc \\\n    # Additional dependencies for document handling\n    libmagic-dev \\\n    poppler-utils \\\n    tesseract-ocr \\\n    libreoffice \\\n    libpq-dev \\\n    pandoc && \\\n    rm -rf /var/lib/apt/lists/* && apt-get clean\n\nCOPY requirements.lock  pyproject.toml README.md ./\nCOPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/\nCOPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/\n\nRUN pip install uv\nRUN uv pip install --no-cache --system -r requirements.lock\n\nRUN playwright install --with-deps\nRUN python3 - -m nltk.downloader all\n\nCOPY . .\n\nRUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk\n\nEXPOSE 8000\nCMD [\"uvicorn\", \"megaparse.api.app:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8000\"]\n"
  },
  {
    "path": "Dockerfile.gpu",
    "content": "FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu20.04\n\nWORKDIR /app\n\nENV UV_COMPILE_BYTECODE=1\nENV UV_NO_CACHE=1\nENV DEBIAN_FRONTEND=noninteractive\n\n# Install runtime dependencies\nRUN apt-get update && apt-get install -y software-properties-common && \\\n    add-apt-repository ppa:deadsnakes/ppa && \\\n    apt-get update && apt-get install -y \\\n    python3.11  \\\n    python3.11-dev \\\n    libgeos-dev \\\n    libcurl4-openssl-dev \\\n    libssl-dev \\\n    binutils \\\n    curl \\\n    git \\\n    autoconf \\\n    automake \\\n    libtool \\\n    python3-pip \\\n    build-essential \\\n    wget \\\n    gcc \\\n    # Additional dependencies for document handling\n    libmagic-dev \\\n    poppler-utils \\\n    tesseract-ocr \\\n    libreoffice \\\n    libpq-dev \\\n    pandoc && \\\n    rm -rf /var/lib/apt/lists/* && apt-get clean\n\nRUN  update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \\\n       update-alternatives --set python3 /usr/bin/python3.11\n\nCOPY requirements.lock  pyproject.toml README.md ./\nCOPY libs/megaparse/pyproject.toml libs/megaparse/README.md libs/megaparse/\nCOPY libs/megaparse_sdk/pyproject.toml libs/megaparse_sdk/README.md libs/megaparse_sdk/\n\nRUN curl -LsSf https://astral.sh/uv/install.sh | sh\nENV PATH=\"/root/.local/bin:$PATH\"\nRUN uv pip install --no-cache --system -r requirements.lock\n\nRUN playwright install --with-deps\nRUN python3 - -m nltk.downloader all\n\n# FIXME: causes runtime link issues with onnxruntime_pybind_state.cc:507 unstructured\n# RUN python3 -c \"from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()\" && \\\n# RUN python3 -c \"import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')\"\n\nCOPY . .\n\nRUN uv pip install --no-cache --system /app/libs/megaparse /app/libs/megaparse_sdk\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile",
    "content": ".DEFAULT_TARGET=help\n\n## help: Display list of commands\n.PHONY: help\nhelp:\n\t@echo \"Available commands:\"\n\t@sed -n 's|^##||p' $(MAKEFILE_LIST) | column -t ':' | sed -e 's|^| |'\n\n## dev: Start development environment\n.PHONY: dev\ndev:\n\tDOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up --build\n\n## dev-build: Build development environment without cache\n.PHONY: dev-build\ndev-build:\n\tDOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml build --no-cache\n\tDOCKER_BUILDKIT=1 docker compose -f docker-compose.dev.yml up\n\n## prod: Build and start production environment\n.PHONY: prod\nprod:\n\tdocker compose -f docker-compose.yml up --build\n"
  },
  {
    "path": "Pipfile",
    "content": "[[source]]\nurl = \"https://pypi.org/simple\"\nverify_ssl = true\nname = \"pypi\"\n\n[packages]\n\n[dev-packages]\n\n[requires]\npython_version = \"3.11\"\n"
  },
  {
    "path": "README.md",
    "content": "# MegaParse - Your Parser for every type of document\n\n<div align=\"center\">\n    <img src=\"https://raw.githubusercontent.com/QuivrHQ/MegaParse/main/logo.png\" alt=\"Quivr-logo\" width=\"30%\"  style=\"border-radius: 50%; padding-bottom: 20px\"/>\n</div>\n\nMegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, PowerPoint presentations, or Word documents, MegaParse has got you covered. The focus is on having no information loss during parsing.\n\n## Key Features 🎯\n\n- **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease.\n- **No Information Loss**: Focus on having no information loss during parsing.\n- **Fast and Efficient**: Designed with speed and efficiency at its core.\n- **Wide File Compatibility**: Supports Text, PDF, PowerPoint presentations, Excel, CSV, and Word documents.\n- **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use.\n\n## Support\n\n- Files: ✅ PDF ✅ PowerPoint ✅ Word\n- Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images\n\n### Example\n\nhttps://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3\n\n## Installation\n\nRequired Python version: >= 3.11\n\n```bash\npip install megaparse\n```\n\n## Usage\n\n1. Add your OpenAI or Anthropic API key to the .env file\n\n2. Install poppler on your computer (images and PDFs)\n\n3. Install tesseract on your computer (images and PDFs)\n\n4. If you have a Mac, you also need to install libmagic: ```brew install libmagic```\n\nUse MegaParse as it is:\n```python\nfrom megaparse import MegaParse\nfrom langchain_openai import ChatOpenAI\n\nmegaparse = MegaParse()\nresponse = megaparse.load(\"./test.pdf\")\nprint(response)\n```\n\n### Use MegaParse Vision\n\n```python\nimport os\n\nfrom langchain_openai import ChatOpenAI\nfrom megaparse.parser.megaparse_vision import MegaParseVision\n\nmodel = ChatOpenAI(model=\"gpt-4o\", api_key=os.getenv(\"OPENAI_API_KEY\"))  # type: ignore\nparser = MegaParseVision(model=model)\nresponse = parser.convert(\"./test.pdf\")\nprint(response)\n\n```\n**Note**: The models supported by MegaParse Vision are the multimodal ones, such as claude 3.5, claude 4, gpt-4o and gpt-4.\n\n## Use as an API\nThere is a Makefile for you; simply use:\n```make dev```\nat the root of the project and you are good to go.\n\nSee localhost:8000/docs for more info on the different endpoints!\n\n## BenchMark\n\n<!---BENCHMARK-->\n| Parser                        | similarity_ratio |\n| ----------------------------- | ---------------- |\n| megaparse_vision              | 0.87             |\n| unstructured_with_check_table | 0.77             |\n| unstructured                  | 0.59             |\n| llama_parser                  | 0.33             |\n<!---END_BENCHMARK-->\n\n_Higher is better_\n\nNote: Want to evaluate and compare your MegaParse module with ours? Please add your config in ```evaluations/script.py``` and then run ```python evaluations/script.py```. If it is better, open a PR; I mean, let's go higher together.\n\n## In Construction 🚧\n- Improve table checker\n- Create Checkers to add **modular postprocessing** ⚙️\n- Add Structured output, **let's get computers talking** 🤖\n\n\n\n## Star History\n\n[![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date)\n"
  },
  {
    "path": "benchmark/process_single_doc.py",
    "content": "import asyncio\nimport time\nfrom pathlib import Path\n\nimport numpy as np\nfrom megaparse import MegaParse\n\nN_TRY = 1\n\n\nasync def process_file(megaparse: MegaParse, file_path: str | Path):\n    try:\n        t0 = time.perf_counter()\n        _ = await megaparse.aload(\n            file_path=file_path,\n        )\n        total = time.perf_counter() - t0\n        return total\n    except Exception as e:\n        print(f\"Exception occured: {e}\")\n        return None\n\n\nasync def test_process_file(file: str | Path):\n    # parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)\n    megaparse = MegaParse()\n    task = []\n    for _ in range(N_TRY):\n        task.append(process_file(megaparse, file))\n    list_process_time = await asyncio.gather(*task)\n\n    n_errors = sum([t is None for t in list_process_time])\n    list_process_time = [t for t in list_process_time if t is not None]\n\n    np_list_process_time = np.array(list_process_time)\n    print(f\"All errors : {n_errors}\")\n    print(f\"Average time taken: {np_list_process_time.mean()}\")\n    print(f\"Median time taken: {np.median(list_process_time)}\")\n    print(f\"Standard deviation of time taken: {np.std(list_process_time)}\")\n    print(f\"Max time taken: {np.max(list_process_time)}\")\n    print(f\"Min time taken: {np.min(list_process_time)}\")\n\n\nif __name__ == \"__main__\":\n    folder_path = \"/Users/amine/data/quivr/parsing/scanned/machine.pdf\"\n    asyncio.run(test_process_file(folder_path))\n"
  },
  {
    "path": "benchmark/test_quality_sim.py",
    "content": "import os\nimport difflib\nfrom pathlib import Path\n\nauto_dir = Path(\"benchmark/auto\")\nhi_res_dir = Path(\"benchmark/hi_res\")\n\n\ndef jaccard_similarity(str1, str2):\n    if len(str1) == 0 and len(str2) == 0:\n        return 1\n    # Tokenize the strings into sets of words\n    words1 = set(str1.split())\n    words2 = set(str2.split())\n\n    # Find intersection and union of the word sets\n    intersection = words1.intersection(words2)\n    union = words1.union(words2)\n\n    # Compute Jaccard similarity\n    return len(intersection) / len(union) if len(union) != 0 else 0\n\n\ndef compare_files(file_name):\n    file_path_auto = auto_dir / f\"{file_name}.md\"\n    file_path_hi_res = hi_res_dir / f\"{file_name}.md\"\n\n    with open(file_path_auto, \"r\") as f:\n        auto_content = f.read()\n\n    with open(file_path_hi_res, \"r\") as f:\n        hi_res_content = f.read()\n\n    if len(auto_content) == 0 and len(hi_res_content) == 0:\n        return 1\n\n    similarity = difflib.SequenceMatcher(None, auto_content, hi_res_content).ratio()\n    # similarity = jaccard_similarity(auto_content, hi_res_content)\n\n    return similarity\n\n\ndef main():\n    files = os.listdir(hi_res_dir)\n    print(f\"Comparing {len(files)} files...\")\n    similarity_dict = {}\n    for file in files:\n        file_name = Path(file).stem\n        similarity = compare_files(file_name)\n        similarity_dict[file_name] = similarity\n\n    avg_similarity = sum(similarity_dict.values()) / len(similarity_dict)\n    print(f\"\\nAverage similarity: {avg_similarity}\\n\")\n\n    pass_rate = sum(\n        [similarity > 0.9 for similarity in similarity_dict.values()]\n    ) / len(similarity_dict)\n\n    print(f\"Pass rate: {pass_rate}\\n\")\n\n    print(\"Under 0.9 similarity documents:\")\n    print(\"-------------------------------\")\n    for file_name, similarity in similarity_dict.items():\n        if similarity < 0.9:\n            print(f\"{file_name}: 
{similarity}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "docker-compose.dev.yml",
    "content": "version: \"3.8\"\n\nservices:\n  megaparse:\n    build:\n      context: .\n      dockerfile: Dockerfile\n      cache_from:\n        - megaparse:latest\n      args:\n        - DEV_MODE=true\n    image: megaparse:latest\n    extra_hosts:\n      - \"host.docker.internal:host-gateway\"\n    container_name: megaparse\n    volumes:\n      - ./:/app/\n    command: >\n      /bin/bash -c \"python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000\"\n    restart: always\n    ports:\n      - 8000:8000\n"
  },
  {
    "path": "docker-compose.yml",
    "content": "version: \"3.8\"\n\nservices:\n  megaparse:\n    image: megaparse:latest\n    pull_policy: if_not_present\n    container_name: megaparse\n    extra_hosts:\n      - \"host.docker.internal:host-gateway\"\n    healthcheck:\n      test: [ \"CMD\", \"curl\", \"http://localhost:5050/healthz\" ]\n    command: >\n      /bin/bash -c \"python -m uvicorn megaparse.api.app:app --host 0.0.0.0 --log-level info --reload --port 8000 --loop uvloop\"\n    restart: always\n    ports:\n      - 8000:8000\n"
  },
  {
    "path": "docs/archive.txt",
    "content": "### (Optional) Use LlamaParse for Improved Results\n\n1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.\n\n2. Change the parser to LlamaParser\n\n```python\nfrom megaparse import MegaParse\nfrom langchain_openai import ChatOpenAI\nfrom megaparse.parser.llama_parser import LlamaParser\n\nparser = LlamaParser(api_key = os.getenv(\"LLAMA_CLOUD_API_KEY\"))\nmegaparse = MegaParse(parser)\nresponse = megaparse.load(\"./test.pdf\")\nprint(response)\nmegaparse.save(\"./test.md\") #saves the last processed doc in md format\n```"
  },
  {
    "path": "evaluations/script.py",
    "content": "import difflib\nimport os\n\nfrom langchain_openai import ChatOpenAI\nfrom megaparse.megaparse import MegaParse\nfrom megaparse.parser.llama import LlamaParser\nfrom megaparse.parser.megaparse_vision import MegaParseVision\nfrom megaparse.parser.unstructured_parser import UnstructuredParser\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\n\nif __name__ == \"__main__\":\n    print(\"---Launching evaluations script---\")\n    model = ChatOpenAI(model=\"gpt-4o\", api_key=str(os.getenv(\"OPENAI_API_KEY\")))  # type: ignore\n    parser_dict = {\n        \"unstructured\": UnstructuredParser(strategy=StrategyEnum.AUTO, model=None),\n        \"unstructured_with_check_table\": UnstructuredParser(\n            strategy=StrategyEnum.AUTO,\n            model=model,\n        ),\n        \"llama_parser\": LlamaParser(api_key=str(os.getenv(\"LLAMA_CLOUD_API_KEY\"))),\n        \"megaparse_vision\": MegaParseVision(model=model),\n    }\n\n    base_pdf_path = \"tests/data/MegaFake_report.pdf\"\n    base_md_path = \"tests/data/grt_example/MegaFake_report.md\"\n    with open(base_md_path, \"r\", encoding=\"utf-8\") as f:\n        base_md = f.read()\n\n    score_dict = {}\n\n    for method, parser in parser_dict.items():\n        print(f\"Method: {method}\")\n        megaparse = MegaParse()\n        result = megaparse.load(file_path=base_pdf_path)\n        score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio()\n        print(f\"Score for method {method}: {score_dict[method]}\")\n\n    # Sort the results\n    sorted_score = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)\n\n    # Generate a table with the results\n    benchmark_results = \"| Parser | similarity_ratio |\\n|---|---|\\n\"\n    for parser, score in sorted_score:\n        benchmark_results += f\"| {parser} | {score:.2f} |\\n\"\n\n    print(benchmark_results)\n\n    # Update README.md file\n    with open(\"README.md\", \"r\") as readme_file:\n        
readme_content = readme_file.read()\n\n    start_marker = \"<!---BENCHMARK-->\"\n    end_marker = \"<!---END_BENCHMARK-->\"\n    start_index = readme_content.find(start_marker) + len(start_marker)\n    end_index = readme_content.find(end_marker)\n\n    updated_readme_content = (\n        readme_content[:start_index]\n        + \"\\n\"\n        + benchmark_results\n        + readme_content[end_index:]\n    )\n\n    with open(\"README.md\", \"w\") as readme_file:\n        readme_file.write(updated_readme_content)\n"
  },
  {
    "path": "libs/megaparse/.python-version",
    "content": "3.11.9"
  },
  {
    "path": "libs/megaparse/CHANGELOG.md",
    "content": "# Changelog\n\n## [0.0.55](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.54...megaparse-v0.0.55) (2025-02-14)\n\n\n### Features\n\n* remove tensorrt ([#230](https://github.com/QuivrHQ/MegaParse/issues/230)) ([8b8abbc](https://github.com/QuivrHQ/MegaParse/commit/8b8abbc6a2a1b33d4e921d55d2519b773ec062c8))\n\n## [0.0.54](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.53...megaparse-v0.0.54) (2025-02-11)\n\n\n### Features\n\n* add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7))\n\n## [0.0.53](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.52...megaparse-v0.0.53) (2025-01-16)\n\n\n### Features\n\n* modular parser and formatter v0 ([#175](https://github.com/QuivrHQ/MegaParse/issues/175)) ([1f4dcf8](https://github.com/QuivrHQ/MegaParse/commit/1f4dcf88a5901c5a2682cb79284a0dbb08034cb2))\n* Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37))\n* type strategy output ([#216](https://github.com/QuivrHQ/MegaParse/issues/216)) ([deb8765](https://github.com/QuivrHQ/MegaParse/commit/deb8765a4df8917a4857f51a02025243192d5cf8))\n\n\n### Bug Fixes\n\n* Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38))\n* add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228))\n* logging error ([#218](https://github.com/QuivrHQ/MegaParse/issues/218)) ([a2170d7](https://github.com/QuivrHQ/MegaParse/commit/a2170d7c711a5d7a0531f03aa9576937ddd6576e))\n* megaparse.load & add tests 
([#202](https://github.com/QuivrHQ/MegaParse/issues/202)) ([13c2677](https://github.com/QuivrHQ/MegaParse/commit/13c2677bdadb4ba985a1abf9bafeb70548ab59f9))\n* Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3))\n* sync convert to parsers ([#186](https://github.com/QuivrHQ/MegaParse/issues/186)) ([fbb7d36](https://github.com/QuivrHQ/MegaParse/commit/fbb7d365fbaf710a687fdc6becacd6d301c09707))\n\n## [0.0.52](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.51...megaparse-v0.0.52) (2024-12-16)\n\n\n### Bug Fixes\n\n* hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473))\n\n## [0.0.51](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.50...megaparse-v0.0.51) (2024-12-16)\n\n\n### Features\n\n* updating langchain version ([#187](https://github.com/QuivrHQ/MegaParse/issues/187)) ([0f1f597](https://github.com/QuivrHQ/MegaParse/commit/0f1f5977df147e6b8c65d55445ccd86ef6f1a862))\n\n## [0.0.50](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.49...megaparse-v0.0.50) (2024-12-13)\n\n\n### Features\n\n* small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) ([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609))\n\n## [0.0.49](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.48...megaparse-v0.0.49) (2024-12-12)\n\n\n### Features\n\n* custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29))\n* faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036))\n\n## 
[0.0.48](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.47...megaparse-v0.0.48) (2024-12-03)\n\n\n### Features\n\n* Update imports and parsers in README.md ([#156](https://github.com/QuivrHQ/MegaParse/issues/156)) ([33e0303](https://github.com/QuivrHQ/MegaParse/commit/33e0303821691c4b1fc821e6b33b874bd332d430))\n\n## [0.0.47](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.46...megaparse-v0.0.47) (2024-11-21)\n\n\n### Features\n\n* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))\n* release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925))\n\n## [0.0.22](https://github.com/QuivrHQ/MegaParse/compare/megaparse-v0.0.21...megaparse-v0.0.22) (2024-07-24)\n\n\n### Features\n\n* Add instructions for installing poppler and tesseract ([#10](https://github.com/QuivrHQ/MegaParse/issues/10)) ([3399552](https://github.com/QuivrHQ/MegaParse/commit/3399552bc8be705f6d34306743388a96d099eebc))\n* Add MegaParse class to __init__.py ([84c0d64](https://github.com/QuivrHQ/MegaParse/commit/84c0d648ef1ddf048ec911210d89be155443dc72))\n* Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx ([#9](https://github.com/QuivrHQ/MegaParse/issues/9)) ([4934776](https://github.com/QuivrHQ/MegaParse/commit/493477672cef9fe22b0ab56ced1d5572104e1914))\n* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))\n* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))\n* Update benchmark results in README.md ([#15](https://github.com/QuivrHQ/MegaParse/issues/15)) 
([1dfcb4c](https://github.com/QuivrHQ/MegaParse/commit/1dfcb4ce19467f7fb8137e10e5f5fbf35e563df0))\n\n\n### Bug Fixes\n\n* add __init__.py ([a5b8de9](https://github.com/QuivrHQ/MegaParse/commit/a5b8de9e1e01ef681ac2ef59a6e111ae7bd6cf70))\n* change name ([6b36437](https://github.com/QuivrHQ/MegaParse/commit/6b36437787f048d36d69c3b06c2d59f7dc7a741f))\n* PR Comments ([a0ab0ba](https://github.com/QuivrHQ/MegaParse/commit/a0ab0baa5dd9aae644baef55348f1af28a6776a7))\n* remove nest asycio ([22195a2](https://github.com/QuivrHQ/MegaParse/commit/22195a27e9dc3583bf1fbde2a95e9fbecc8d96a4))\n* use aload_data ([e5c73fe](https://github.com/QuivrHQ/MegaParse/commit/e5c73fefcbf09bb12810adc6d4412f7742c42089))\n\n## [0.0.21](https://github.com/QuivrHQ/MegaParse/compare/v0.0.20...v0.0.21) (2024-07-24)\n\n\n### Features\n\n* base loader ([#65](https://github.com/QuivrHQ/MegaParse/issues/65)) ([eb8149f](https://github.com/QuivrHQ/MegaParse/commit/eb8149f05ec2793f59fd87109a1aba8095f6f1d0))\n* base loader class ([#64](https://github.com/QuivrHQ/MegaParse/issues/64)) ([801a026](https://github.com/QuivrHQ/MegaParse/commit/801a026e4b3411f8ac85171a6928e3d17c027648))\n"
  },
  {
    "path": "libs/megaparse/README.md",
    "content": "# MegaParse CORE\n\n- Core package of megaparse\n\n> **Note:** The test files in `tests/pdf/ocr` and `tests/pdf/native` come from SAFEDOCS (CC-MAIN-2021-31-PDF-UNTRUNCATED). You can find more information [here](https://digitalcorpora.org/corpora/file-corpora/cc-main-2021-31-pdf-untruncated/)."
  },
  {
    "path": "libs/megaparse/bench.md",
    "content": "------------\nUNSTRUCTURED(HI-RES):\n------------\n\nfolder: cdp\n         cdp_etiquette.pdf parsing took: 2.10s\nfolder: scanned-tables\n         POZIBILAN 2022.pdf parsing took: 78.72s\n         Banco Popilar Number 2.pdf parsing took: 94.44s\nfolder: native\n         00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 3.25s\n         0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 39.75s\n         0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 25.02s\nfolder: scanned\n         machine.pdf parsing took: 54.29s\n         medical.pdf parsing took: 76.11s\n         les_americains.pdf parsing took: 643.84s\n         agency.pdf parsing took: 114.19s\n         clark.pdf parsing took: 27.89s\n         tables_ocr.pdf parsing took: 81.21s\nfolder: rich\n         language_learning.pdf parsing took: 2.60s\n         dites nous tout....pdf parsing took: 1.62s\n\n------------\nUNSTRUCTURED(FAST):\n------------\nfolder: cdp\n         cdp_etiquette.pdf parsing took: 0.05s\nfolder: scanned-tables\n        POZIBILAN 2022.pdf:  can't parse\n        Banco Popilar Number 2.pdf:  can't parse\nfolder: native\n         00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.07s\n         0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 0.86s\n         0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 0.24s\nfolder: scanned\n        machine.pdf parsing took: 0.02s\n        medical.pdf parsing took: 0.04s\n        les_americains.pdf parsing took: 5.90s\n        agency.pdf:  can't parse\n        clark.pdf:  can't parse\n        tables_ocr.pdf:  can't parse\nfolder: rich\n        language_learning.pdf:  can't parse\n         dites nous tout....pdf parsing took: 0.02s\n\n------------\nMegaparse (\n        strategy = AUTO\n        Config = {\n                provider=COREML,\n                det_arch: str = \"fast_base\"\n                det_batch_size: int = 2\n                assume_straight_pages: bool = True\n                
preserve_aspect_ratio: bool = True\n                symmetric_pad: bool = True\n                load_in_8_bit: bool = False\n                reco_arch: str = \"crnn_vgg16_bn\"\n                rec_batch_size: int = 512\n        }\n)\n------------\nfolder: cdp\n        cdp_etiquette.pdf parsing took: 1.71s\nfolder: scanned-tables\n        POZIBILAN 2022.pdf parsing took: 17.76s\n        Banco Popilar Number 2.pdf parsing took: 19.25s\nfolder: native\n        00b03d60-fe45-4318-a511-18ee921b7bbb.pdf parsing took: 0.96s\n        0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf parsing took: 12.57s\n        0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf parsing took: 1.53s\nfolder: scanned\n        machine.pdf parsing took: 9.90s\n        medical.pdf parsing took: 13.09s\n        les_americains.pdf parsing took: 139.53s\n        agency.pdf parsing took: 10.73s\n        clark.pdf parsing took: 10.69s\n        tables_ocr.pdf parsing took: 15.58s\nfolder: rich\n        language_learning.pdf parsing took: 1.74s\n        dites nous tout....pdf parsing took: 0.64s\n----\n| Type            | PDF Name                          | Unstructured(HI-RES) | Unstructured(FAST)    | Megaparse( w/ doctr COREML)  |\n|------------------|-----------------------------------|---------------------|----------------------|--------------------|\n| **cdp**         | cdp_etiquette.pdf                 | 2.10s               | 0.05s (bad parsing)              | 1.71s             |\n| **scanned-tables** | POZIBILAN 2022.pdf             | 78.72s              | can't parse          | 17.76s            |\n| **scanned-tables** | Banco Popilar Number 2.pdf     | 94.44s              | can't parse          | 19.25s            |\n| **native**       | 00b03d60-fe45-4318-a511-18ee921b7bbb.pdf | 3.25s  | 0.07s               | 0.96s             |\n| **native**       | 0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf | 39.75s | 0.86s               | 12.57s            |\n| **native**       | 0adb1fd6-d009-4097-bcf6-b8f3af38d3f0.pdf | 
25.02s | 0.24s               | 1.53s             |\n| **scanned**      | machine.pdf                      | 54.29s              | 0.02s               | 9.90s             |\n| **scanned**      | medical.pdf                      | 76.11s              | 0.04s               | 13.09s            |\n| **scanned**      | les_americains.pdf               | 643.84s             | 5.90s               | 139.53s           |\n| **scanned**      | agency.pdf                       | 114.19s             | can't parse          | 10.73s            |\n| **scanned**      | clark.pdf                        | 27.89s              | can't parse          | 10.69s            |\n| **scanned**      | tables_ocr.pdf                   | 81.21s              | can't parse          | 15.58s            |\n| **rich**         | language_learning.pdf            | 2.60s               | can't parse          | 1.74s             |\n| **rich**         | dites nous tout....pdf           | 1.62s               | 0.02s               | 0.64s             |\n"
  },
  {
    "path": "libs/megaparse/examples/parse_file_fast.py",
    "content": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom unstructured.partition.auto import partition\n\n\n@dataclass\nclass File:\n    file_path: str\n    file_name: str\n    file_extension: str\n\n\ndef list_files_in_directory(directory_path: str) -> dict[str, list[File]]:\n    directory_dict = {}\n    for root, _, files in os.walk(directory_path):\n        folder_name = os.path.basename(root)\n        if len(folder_name) > 0:\n            file_list = []\n            for file_name in files:\n                file_path = os.path.join(root, file_name)\n                file_extension = os.path.splitext(file_name)[1]\n                file_list.append(\n                    File(\n                        file_path=file_path,\n                        file_name=file_name,\n                        file_extension=file_extension,\n                    )\n                )\n            directory_dict[folder_name] = file_list\n\n    return directory_dict\n\n\ndef main():\n    file_path = \"/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf\"\n    folder_path = \"/Users/amine/data/quivr/parsing/\"\n\n    list_files = list_files_in_directory(folder_path)\n\n    for folder_name, files in list_files.items():\n        print(f\"folder: {folder_name}\")\n        for file in files:\n            if file.file_extension == \".pdf\":\n                s = perf_counter()\n                elements = partition(\n                    filename=file.file_path,\n                    strategy=\"fast\",\n                )\n                if len(elements) == 0:\n                    print(f\"\\t{file.file_name}:  can't parse \")\n                    continue\n\n                e = perf_counter()\n                print(f\"\\t {file.file_name} parsing took: {e-s:.2f}s\")\n\n\nif __name__ == \"__main__\":\n    els = main()\n"
  },
  {
    "path": "libs/megaparse/examples/parse_file_mp.py",
    "content": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom megaparse import MegaParse\nfrom megaparse.configs.auto import DeviceEnum, MegaParseConfig\n\n\n@dataclass\nclass File:\n    file_path: str\n    file_name: str\n    file_extension: str\n\n\ndef list_files_in_directory(directory_path: str) -> dict[str, list[File]]:\n    directory_dict = {}\n    for root, _, files in os.walk(directory_path):\n        folder_name = os.path.basename(root)\n        if len(folder_name) > 0:\n            file_list = []\n            for file_name in files:\n                file_path = os.path.join(root, file_name)\n                file_extension = os.path.splitext(file_name)[1]\n                file_list.append(\n                    File(\n                        file_path=file_path,\n                        file_name=file_name,\n                        file_extension=file_extension,\n                    )\n                )\n            directory_dict[folder_name] = file_list\n\n    return directory_dict\n\n\ndef main():\n    folder_path = \"/Users/amine/data/quivr/parsing/\"\n\n    list_files = list_files_in_directory(folder_path)\n    config = MegaParseConfig(device=DeviceEnum.COREML)\n    mp = MegaParse(config=config)\n\n    for folder_name, files in list_files.items():\n        print(f\"folder: {folder_name}\")\n        for file in files:\n            if file.file_extension == \".pdf\":\n                s = perf_counter()\n                result = mp.load(file.file_path)\n                if len(result) == 0:\n                    print(f\"\\t{file.file_name}:  can't parse \")\n                    continue\n\n                e = perf_counter()\n                print(f\"\\t {file.file_name} parsing took: {e-s:.2f}s\")\n\n\nif __name__ == \"__main__\":\n    els = main()\n"
  },
  {
    "path": "libs/megaparse/examples/parse_file_unstructured.py",
    "content": "import os\nfrom dataclasses import dataclass\nfrom time import perf_counter\n\nfrom unstructured.partition.auto import partition\n\n\n@dataclass\nclass File:\n    file_path: str\n    file_name: str\n    file_extension: str\n\n\ndef list_files_in_directory(directory_path: str) -> dict[str, list[File]]:\n    directory_dict = {}\n    for root, _, files in os.walk(directory_path):\n        folder_name = os.path.basename(root)\n        if len(folder_name) > 0:\n            file_list = []\n            for file_name in files:\n                file_path = os.path.join(root, file_name)\n                file_extension = os.path.splitext(file_name)[1]\n                file_list.append(\n                    File(\n                        file_path=file_path,\n                        file_name=file_name,\n                        file_extension=file_extension,\n                    )\n                )\n            directory_dict[folder_name] = file_list\n\n    return directory_dict\n\n\ndef main():\n    file_path = \"/Users/amine/data/quivr/parsing/native/0b0ab5f4-b654-4846-bd9b-18b3c1075c52.pdf\"\n    folder_path = \"/Users/amine/data/quivr/parsing/\"\n\n    list_files = list_files_in_directory(folder_path)\n\n    for folder_name, files in list_files.items():\n        print(f\"folder: {folder_name}\")\n        for file in files:\n            if file.file_extension == \".pdf\":\n                s = perf_counter()\n                _ = partition(\n                    filename=file.file_path,\n                    strategy=\"hi_res\",\n                )\n                e = perf_counter()\n                print(f\"\\t {file.file_name} parsing took: {e-s:.2f}s\")\n\n\nif __name__ == \"__main__\":\n    els = main()\n"
  },
  {
    "path": "libs/megaparse/pyproject.toml",
    "content": "[project]\nname = \"megaparse\"\nversion = \"0.0.55\"\nauthors = [\n    { name = \"Stan Girard\", email = \"stan@quivr.app\" },\n    { name = \"Chloé Daems\", email = \"chloe@quivr.app\" },\n    { name = \"Amine Dirhoussi\", email = \"amine@quivr.app\" },\n    { name = \"Jacopo Chevallard\", email = \"jacopo@quivr.app\" },\n]\n\nreadme = \"README.md\"\nrequires-python = \">= 3.11\"\n\ndependencies = [\n    \"megaparse-sdk\",\n    \"pycryptodome>=3.21.0\",\n    \"pdfplumber>=0.11.0\",\n    \"backoff>=2.2.1\",\n    \"pypdf>=5.0.1\",\n    \"psutil>=6.1.0\",\n    \"numpy<=2.0.0\",\n    \"playwright>=1.47.0\",\n    \"langchain-anthropic>=0.1.23\",\n    \"python-magic>=0.4.27\",\n    \"unstructured[all-docs]==0.15.0\",\n    \"langchain>=0.3,<0.4\",\n    \"langchain-community>=0.3,<0.4\",\n    \"langchain-openai>=0.1.21\",\n    \"langchain-core>=0.3,<0.4\",\n    \"llama-parse>=0.4.0\",\n    \"pydantic-settings>=2.6.1\",\n    \"onnxruntime==1.20.0; platform_machine == 'x86_64'\",\n    \"onnxruntime-gpu==1.20.0; platform_machine == 'x86_64'\",\n    \"onnxtr[gpu-headless]>=0.6.0; platform_machine == 'x86_64'\",\n    \"onnxtr[cpu]>=0.6.0; platform_machine != 'x86_64'\",\n    \"pypdfium2>=4.30.0\",\n]\n\n[project.optional-dependencies]\napi = [\n    \"python-dotenv>=1.0.0\",\n    \"uvloop>=0.18.0\",\n    \"pydantic-settings>=2.6.1\",\n    \"uvicorn>=0.32.0\",\n    \"fastapi>=0.115.2\",\n    \"ratelimit>=2.2.1\",\n\n]\n\n\n[build-system]\nrequires = [\"hatchling==1.26.3\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.rye]\nmanaged = true\ndev-dependencies = []\nuniversal = true\n\n[tool.hatch.metadata]\nallow-direct-references = true\n\n[tool.hatch.build.targets.wheel]\npackages = [\"src/megaparse\", \"src/api\"]\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/__init__.py",
    "content": "from .megaparse import MegaParse\n\n__all__ = [\"MegaParse\"]\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/api/__init__.py",
    "content": ""
  },
  {
    "path": "libs/megaparse/src/megaparse/api/app.py",
    "content": "import io\nimport os\nimport tempfile\nfrom typing import Any, Optional\n\nimport httpx\nimport psutil\nimport uvicorn\nfrom fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile\nfrom langchain_anthropic import ChatAnthropic\nfrom langchain_community.document_loaders import PlaywrightURLLoader\nfrom langchain_openai import ChatOpenAI\nfrom llama_parse.utils import Language\nfrom megaparse_sdk.schema.document import Document\nfrom megaparse_sdk.schema.parser_config import (\n    ParserType,\n    StrategyEnum,\n)\nfrom megaparse_sdk.schema.supported_models import SupportedModel\n\nfrom megaparse import MegaParse\nfrom megaparse.api.exceptions.megaparse_exceptions import (\n    HTTPDownloadError,\n    HTTPFileNotFound,\n    HTTPModelNotSupported,\n    HTTPParsingException,\n    ParsingException,\n)\nfrom megaparse.parser.builder import ParserBuilder\n\napp = FastAPI()\n\nplaywright_loader = PlaywrightURLLoader(urls=[], remove_selectors=[\"header\", \"footer\"])\n\n\ndef parser_builder_dep():\n    return ParserBuilder()\n\n\ndef get_playwright_loader():\n    return playwright_loader\n\n\n@app.get(\"/healthz\")\ndef healthz():\n    return {\"status\": \"ok\"}\n\n\ndef _check_free_memory() -> bool:\n    \"\"\"Reject traffic when free memory is below minimum (default 2GB).\"\"\"\n    mem = psutil.virtual_memory()\n    memory_free_minimum = int(os.environ.get(\"MEMORY_FREE_MINIMUM_MB\", 2048))\n\n    if mem.available <= memory_free_minimum * 1024 * 1024:\n        return False\n    return True\n\n\n@app.post(\n    \"/v1/file\",\n)\nasync def parse_file(\n    file: UploadFile = File(...),\n    method: ParserType = Form(ParserType.UNSTRUCTURED),\n    strategy: StrategyEnum = Form(StrategyEnum.AUTO),\n    check_table: bool = Form(False),\n    language: Language = Form(Language.ENGLISH),\n    parsing_instruction: Optional[str] = Form(None),\n    model_name: Optional[SupportedModel] = Form(SupportedModel.GPT_4O),\n    
parser_builder=Depends(parser_builder_dep),\n) -> dict[str, str | Document]:\n    if not _check_free_memory():\n        raise HTTPException(\n            status_code=503, detail=\"Service unavailable due to low memory\"\n        )\n    model = None\n    if model_name and check_table:\n        if model_name.startswith(\"gpt\"):\n            model = ChatOpenAI(model=model_name, api_key=os.getenv(\"OPENAI_API_KEY\"))  # type: ignore\n        elif model_name.startswith(\"claude\"):\n            model = ChatAnthropic(\n                model_name=model_name,\n                api_key=os.getenv(\"ANTHROPIC_API_KEY\"),  # type: ignore\n                timeout=60,\n                stop=None,\n            )\n\n        else:\n            raise HTTPModelNotSupported()\n\n    # parser_config = ParseFileConfig( #FIXME\n    #     method=method,\n    #     strategy=strategy,\n    #     llm_model_name=SupportedModel(model_name) if model_name and check_table else None,\n    #     language=language,\n    #     parsing_instruction=parsing_instruction,\n    # )\n    try:\n        # parser = parser_builder.build(parser_config)\n        megaparse = MegaParse()\n        if not file.filename:\n            raise HTTPFileNotFound(\"No filename provided\")\n        _, extension = os.path.splitext(file.filename)\n        file_bytes = await file.read()\n        file_stream = io.BytesIO(file_bytes)\n        result = await megaparse.aload(file=file_stream, file_extension=extension)\n        return {\"message\": \"File parsed successfully\", \"result\": result}\n    except ParsingException as e:\n        print(e)\n        raise HTTPParsingException(file.filename)\n    except ValueError as e:\n        print(e)\n        raise HTTPException(status_code=400, detail=str(e))\n    except Exception as e:\n        print(e)\n        raise HTTPException(status_code=500, detail=str(e))\n\n\n@app.post(\n    \"/v1/url\",\n)\nasync def upload_url(\n    url: str, playwright_loader=Depends(get_playwright_loader)\n) 
-> dict[str, Any]:\n    playwright_loader.urls = [url]\n\n    if url.endswith(\".pdf\"):\n        ## Download the file\n\n        async with httpx.AsyncClient() as client:\n            response = await client.get(url)\n        if response.status_code != 200:\n            raise HTTPDownloadError(url)\n\n        with tempfile.NamedTemporaryFile(delete=False, suffix=\"pdf\") as temp_file:\n            temp_file.write(response.content)\n            try:\n                megaparse = MegaParse()\n                result = await megaparse.aload(temp_file.name)\n                return {\"message\": \"File parsed successfully\", \"result\": result}\n            except ParsingException:\n                raise HTTPParsingException(url)\n    else:\n        data = await playwright_loader.aload()\n        # Now turn the data into a string\n        extracted_content = \"\"\n        for page in data:\n            extracted_content += page.page_content\n        if not extracted_content:\n            raise HTTPDownloadError(\n                url,\n                message=\"Failed to extract content from the website. Valid URL example : https://www.quivr.com\",\n            )\n        return {\n            \"message\": \"Website content parsed successfully\",\n            \"result\": extracted_content,\n        }\n\n\nif __name__ == \"__main__\":\n    uvicorn.run(app, host=\"0.0.0.0\", port=8000)\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/api/exceptions/__init__.py",
    "content": ""
  },
  {
    "path": "libs/megaparse/src/megaparse/api/exceptions/megaparse_exceptions.py",
    "content": "from fastapi import HTTPException\n\n\nclass HTTPModelNotSupported(HTTPException):\n    def __init__(\n        self,\n        detail: str = \"The requested model is not supported yet.\",\n        headers: dict | None = None,\n    ):\n        super().__init__(status_code=501, detail=detail, headers=headers)\n\n\nclass HTTPFileNotFound(HTTPException):\n    def __init__(\n        self,\n        message=\"The UploadFile.filename does not exist and is needed for this operation\",\n    ):\n        super().__init__(status_code=404, detail=message)\n\n\nclass HTTPDownloadError(HTTPException):\n    def __init__(self, file_name, message=\"Failed to download the file\"):\n        message = f\"{file_name} : {message}\"\n        super().__init__(status_code=400, detail=message)\n\n\nclass HTTPParsingException(HTTPException):\n    def __init__(self, file_name, message=\"Failed to parse the file\"):\n        message = f\"{file_name} : {message}\"\n        super().__init__(status_code=500, detail=message)\n\n\nclass ParsingException(Exception):\n    \"\"\"Exception raised for errors in the parsing process.\"\"\"\n\n    def __init__(self, message=\"An error occurred during parsing\"):\n        self.message = message\n        super().__init__(self.message)\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/api/models/__init__.py",
    "content": ""
  },
  {
    "path": "libs/megaparse/src/megaparse/api/models/base.py",
    "content": "from enum import Enum\n\n\nclass MarkDownType(str, Enum):\n    \"\"\"Markdown type enumeration.\"\"\"\n\n    TITLE = \"Title\"\n    SUBTITLE = \"Subtitle\"\n    HEADER = \"Header\"\n    FOOTER = \"Footer\"\n    NARRATIVE_TEXT = \"NarrativeText\"\n    LIST_ITEM = \"ListItem\"\n    TABLE = \"Table\"\n    PAGE_BREAK = \"PageBreak\"\n    IMAGE = \"Image\"\n    FORMULA = \"Formula\"\n    FIGURE_CAPTION = \"FigureCaption\"\n    ADDRESS = \"Address\"\n    EMAIL_ADDRESS = \"EmailAddress\"\n    CODE_SNIPPET = \"CodeSnippet\"\n    PAGE_NUMBER = \"PageNumber\"\n    DEFAULT = \"Default\"\n    UNDEFINED = \"Undefined\"\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/configs/auto.py",
    "content": "from enum import Enum\n\nfrom pydantic import BaseModel\nfrom pydantic_settings import BaseSettings, SettingsConfigDict\n\n\nclass TextDetConfig(BaseModel):\n    det_arch: str = \"fast_base\"\n    batch_size: int = 2\n    assume_straight_pages: bool = True\n    preserve_aspect_ratio: bool = True\n    symmetric_pad: bool = True\n    load_in_8_bit: bool = False\n\n\nclass AutoStrategyConfig(BaseModel):\n    page_threshold: float = 0.6\n    document_threshold: float = 0.2\n\n\nclass TextRecoConfig(BaseModel):\n    reco_arch: str = \"crnn_vgg16_bn\"\n    batch_size: int = 512\n\n\nclass DeviceEnum(str, Enum):\n    CPU = \"cpu\"\n    CUDA = \"cuda\"\n    COREML = \"coreml\"\n\n\nclass DoctrConfig(BaseModel):\n    straighten_pages: bool = False\n    detect_orientation: bool = False\n    detect_language: bool = False\n    text_det_config: TextDetConfig = TextDetConfig()\n    text_reco_config: TextRecoConfig = TextRecoConfig()\n\n\nclass MegaParseConfig(BaseSettings):\n    \"\"\"\n    Configuration for Megaparse.\n    \"\"\"\n\n    model_config = SettingsConfigDict(\n        env_prefix=\"MEGAPARSE_\",\n        env_file=(\".env.local\", \".env\"),\n        env_nested_delimiter=\"__\",\n        extra=\"ignore\",\n        use_enum_values=True,\n    )\n    doctr_config: DoctrConfig = DoctrConfig()\n    auto_config: AutoStrategyConfig = AutoStrategyConfig()\n    device: DeviceEnum = DeviceEnum.CPU\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/examples/parse_file.py",
    "content": "from pathlib import Path\n\nfrom megaparse.megaparse import MegaParse\nfrom pydantic import BaseModel, Field\n\n\nclass MyCustomFormat(BaseModel):\n    title: str = Field(description=\"The title of the document.\")\n    problem: str = Field(description=\"The problem statement.\")\n    solution: str = Field(description=\"The solution statement.\")\n\n\ndef main():\n    # model = ChatOpenAI(name=\"gpt-4o\")\n    # formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)\n\n    megaparse = MegaParse()\n\n    file_path = Path(\"./tests/pdf/ocr/0168127.pdf\")\n    result = megaparse.load(file_path=file_path)\n    print(result)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/examples/parsing_process.py",
    "content": "from pathlib import Path\nfrom typing import IO, Any, List, Tuple\n\nimport numpy as np\nimport onnxruntime as rt\nimport pypdfium2 as pdfium\nfrom megaparse.configs.auto import (\n    AutoStrategyConfig,\n    DeviceEnum,\n    TextDetConfig,\n    TextRecoConfig,\n)\nfrom megaparse.models.page import Page, PageDimension\nfrom megaparse.parser.doctr_parser import DoctrParser\nfrom megaparse.parser.unstructured_parser import UnstructuredParser\nfrom megaparse_sdk.schema.document import BBOX, BlockLayout, BlockType, TextDetection\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\nfrom onnxtr.models import detection_predictor, recognition_predictor\nfrom onnxtr.models.builder import DocumentBuilder\nfrom onnxtr.models.engine import EngineConfig\nfrom onnxtr.utils.geometry import (\n    detach_scores,\n    extract_crops,\n    extract_rcrops,\n)\nfrom pypdfium2._helpers.page import PdfPage\n\n\ndef get_strategy_page(\n    pdfium_page: PdfPage, onnxtr_page: TextDetection, page_threshold: float = 0.6\n) -> StrategyEnum:\n    # assert (\n    #     p_width == onnxtr_page.dimensions[1]\n    #     and p_height == onnxtr_page.dimensions[0]\n    # ), \"Page dimensions do not match\"\n    text_coords = []\n    # Get all the images in the page\n    for obj in pdfium_page.get_objects():\n        if obj.type == 1:\n            text_coords.append(obj.get_pos())\n\n    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())\n\n    pdfium_canva = np.zeros((int(p_height), int(p_width)))\n\n    for coords in text_coords:\n        # (left,bottom,right, top)\n        # 0---l--------------R-> y\n        # |\n        # B   (x0,y0)\n        # |\n        # T                 (x1,y1)\n        # ^\n        # x\n        x0, y0, x1, y1 = (\n            p_height - coords[3],\n            coords[0],\n            p_height - coords[1],\n            coords[2],\n        )\n        x0 = max(0, 
min(p_height, int(x0)))\n        y0 = max(0, min(p_width, int(y0)))\n        x1 = max(0, min(p_height, int(x1)))\n        y1 = max(0, min(p_width, int(y1)))\n        pdfium_canva[x0:x1, y0:y1] = 1\n\n    onnxtr_canva = np.zeros((int(p_height), int(p_width)))\n    for block in onnxtr_page.bboxes:\n        x0, y0 = block.bbox[0]\n        x1, y1 = block.bbox[1]\n        x0 = max(0, min(int(x0 * p_width), int(p_width)))\n        y0 = max(0, min(int(y0 * p_height), int(p_height)))\n        x1 = max(0, min(int(x1 * p_width), int(p_width)))\n        y1 = max(0, min(int(y1 * p_height), int(p_height)))\n        onnxtr_canva[y0:y1, x0:x1] = 1\n\n    intersection = np.logical_and(pdfium_canva, onnxtr_canva)\n    union = np.logical_or(pdfium_canva, onnxtr_canva)\n    iou = np.sum(intersection) / np.sum(union)\n    if iou < page_threshold:\n        return StrategyEnum.HI_RES\n    return StrategyEnum.FAST\n\n\ndef validate_input(\n    file_path: Path | str | None = None,\n    file: IO[bytes] | None = None,\n    file_extension: str | FileExtension | None = None,\n) -> FileExtension:\n    if not (file_path or file):\n        raise ValueError(\"Either file_path or file should be provided\")\n\n    if file_path and file:\n        raise ValueError(\"Only one of file_path or file should be provided\")\n\n    if file_path and file is None:\n        if isinstance(file_path, str):\n            file_path = Path(file_path)\n        file_extension = file_path.suffix\n    elif file and file_path is None:\n        if not file_extension:\n            raise ValueError(\n                \"file_extension should be provided when given file argument\"\n            )\n        file.seek(0)\n    else:\n        raise ValueError(\"Either provider a file_path or file\")\n\n    if isinstance(file_extension, str):\n        try:\n            file_extension = FileExtension(file_extension)\n        except ValueError:\n            raise ValueError(f\"Unsupported file extension: {file_extension}\")\n    return 
file_extension\n\n\ndef _generate_crops(\n    pages: list[np.ndarray],\n    loc_preds: list[np.ndarray],\n    channels_last: bool,\n    assume_straight_pages: bool = False,\n    assume_horizontal: bool = False,\n) -> list[list[np.ndarray]]:\n    if assume_straight_pages:\n        crops = [\n            extract_crops(page, _boxes[:, :4], channels_last=channels_last)\n            for page, _boxes in zip(pages, loc_preds, strict=False)\n        ]\n    else:\n        crops = [\n            extract_rcrops(\n                page,\n                _boxes[:, :4],\n                channels_last=channels_last,\n                assume_horizontal=assume_horizontal,\n            )\n            for page, _boxes in zip(pages, loc_preds, strict=False)\n        ]\n    return crops\n\n\ndef _prepare_crops(\n    pages: list[np.ndarray],\n    loc_preds: list[np.ndarray],\n    channels_last: bool,\n    assume_straight_pages: bool = False,\n    assume_horizontal: bool = False,\n) -> tuple[list[list[np.ndarray]], list[np.ndarray]]:\n    crops = _generate_crops(\n        pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal\n    )\n\n    # Avoid sending zero-sized crops\n    is_kept = [\n        [all(s > 0 for s in crop.shape) for crop in page_crops] for page_crops in crops\n    ]\n    crops = [\n        [crop for crop, _kept in zip(page_crops, page_kept, strict=False) if _kept]\n        for page_crops, page_kept in zip(crops, is_kept, strict=False)\n    ]\n    loc_preds = [\n        _boxes[_kept] for _boxes, _kept in zip(loc_preds, is_kept, strict=False)\n    ]\n\n    return crops, loc_preds\n\n\ndef _process_predictions(\n    loc_preds: list[np.ndarray],\n    word_preds: list[tuple[str, float]],\n    crop_orientations: list[dict[str, Any]],\n) -> tuple[list[np.ndarray], list[list[tuple[str, float]]], list[list[dict[str, Any]]]]:\n    text_preds = []\n    crop_orientation_preds = []\n    if len(loc_preds) > 0:\n        # Text & crop orientation predictions at page 
level\n        _idx = 0\n        for page_boxes in loc_preds:\n            text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]])\n            crop_orientation_preds.append(\n                crop_orientations[_idx : _idx + page_boxes.shape[0]]\n            )\n            _idx += page_boxes.shape[0]\n\n    return loc_preds, text_preds, crop_orientation_preds\n\n\ndef main():\n    file_path = Path(\"./tests/pdf/sample_pdf.pdf\")\n    strategy = StrategyEnum.AUTO\n    device = DeviceEnum.COREML\n    ocr_parser = DoctrParser()\n    default_parser = UnstructuredParser(strategy=StrategyEnum.FAST)\n    file_extension = validate_input(file_path=file_path)\n    with open(file_path, \"rb\") as file:\n        pdfium_document = pdfium.PdfDocument(file)\n        rasterized_pages: list[np.ndarray] = [\n            np.array(page.render().to_pil(scale=2)) for page in pdfium_document\n        ]\n        ##-----------------------------------\n        ## GET PAGES\n        ##-----------------------------------\n        mp_pages = []\n        if strategy == StrategyEnum.FAST:\n            parsed_document = default_parser.convert(\n                file=file,\n                file_extension=file_extension,\n            )\n        else:\n            text_det_config = TextDetConfig()\n            general_options = rt.SessionOptions()\n            providers = get_providers(device=device)\n            engine_config = EngineConfig(\n                session_options=general_options,\n                providers=providers,\n            )\n            det_predictor = detection_predictor(\n                arch=text_det_config.det_arch,\n                assume_straight_pages=text_det_config.assume_straight_pages,\n                preserve_aspect_ratio=text_det_config.preserve_aspect_ratio,\n                symmetric_pad=text_det_config.symmetric_pad,\n                batch_size=text_det_config.batch_size,\n                load_in_8_bit=text_det_config.load_in_8_bit,\n                
engine_cfg=engine_config,\n            )\n            if any(page.ndim != 3 for page in rasterized_pages):\n                raise ValueError(\n                    \"incorrect input shape: all pages are expected to be multi-channel 2D images.\"\n                )\n\n            orientations = None\n            general_pages_orientations = None\n            # Localize text elements\n            loc_preds, out_maps = det_predictor(rasterized_pages, return_maps=True)\n            # FIXME: For simplicity we do not care about page orientation rn\n            # FIXME: similaly we don't care about straighten page\n\n            # Detach objectness scores from loc_preds\n            loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]\n\n            # FIXME: Do not care about hooks here\n            # # Apply hooks to loc_preds if any\n            # for hook in hooks:\n            #     loc_preds = hook(loc_preds)\n            all_pages_layouts = []\n            for page_index, (page, loc_pred, objectness_score) in enumerate(\n                zip(rasterized_pages, loc_preds, objectness_scores, strict=True)\n            ):\n                block_layouts = []\n                for bbox, score in zip(loc_pred, objectness_score, strict=True):\n                    block_layouts.append(\n                        BlockLayout(\n                            bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),\n                            objectness_score=score,\n                            block_type=BlockType.TEXT,\n                        )\n                    )\n                all_pages_layouts.append(\n                    TextDetection(\n                        bboxes=block_layouts,\n                        page_index=page_index,\n                        dimensions=page.shape[:2],\n                        orientation=general_pages_orientations[page_index]\n                        if general_pages_orientations is not None\n                        else 0,\n 
                   )\n                )\n            for pdfium_page, onnxtr_page, rasterized_page in zip(\n                pdfium_document, all_pages_layouts, rasterized_pages, strict=True\n            ):\n                strategy = get_strategy_page(pdfium_page, onnxtr_page)\n                mp_pages.append(\n                    Page(\n                        strategy=strategy,\n                        text_detections=onnxtr_page,\n                        rasterized=rasterized_page,\n                        page_size=PageDimension(\n                            width=pdfium_page.get_width(),\n                            height=pdfium_page.get_height(),\n                        ),\n                        page_index=onnxtr_page.page_index,\n                        pdfium_elements=pdfium_page,\n                    )\n                )\n\n            ##-----------------------------------\n            ## GET PARSER BASE ON CHOSE STRATEGY\n            ##-----------------------------------\n            if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:\n                parser = default_parser\n            elif strategy == StrategyEnum.HI_RES:\n                parser = ocr_parser\n            else:\n                if need_hi_res(mp_pages, AutoStrategyConfig()):\n                    parser = ocr_parser\n                else:\n                    parser = default_parser\n\n            ##-----------------------------------\n            ## PARSE FILE\n            ##-----------------------------------\n            if isinstance(parser, UnstructuredParser):\n                parsed_document = parser.convert(\n                    file=file,\n                    pages=mp_pages,\n                    file_extension=file_extension,\n                )\n            else:\n                origin_page_shapes: List[Tuple[int, int]] = [\n                    (page.shape[0], page.shape[1]) for page in rasterized_pages\n                ]\n\n                reco_config 
= TextRecoConfig()\n                reco_predictor = recognition_predictor(\n                    arch=reco_config.reco_arch,\n                    batch_size=reco_config.batch_size,\n                    load_in_8_bit=text_det_config.load_in_8_bit,\n                    engine_cfg=engine_config,\n                )\n\n                # Crop images\n                crops, loc_preds = _prepare_crops(\n                    rasterized_pages,\n                    loc_preds,  # type: ignore[arg-type]\n                    channels_last=True,\n                    assume_straight_pages=True,  # FIXME: To change\n                    assume_horizontal=True,  # FIXME: To change\n                )\n                # Rectify crop orientation and get crop orientation predictions\n                crop_orientations: Any = []\n\n                # Identify character sequences\n                word_preds = reco_predictor(\n                    [crop for page_crops in crops for crop in page_crops]\n                )\n                if not crop_orientations:\n                    crop_orientations = [\n                        {\"value\": 0, \"confidence\": None} for _ in word_preds\n                    ]\n\n                boxes, text_preds, crop_orientations = _process_predictions(\n                    loc_preds, word_preds, crop_orientations\n                )\n                doc_builder = DocumentBuilder()\n                parsed_document = doc_builder(\n                    rasterized_pages,\n                    boxes,\n                    objectness_scores,\n                    text_preds,\n                    origin_page_shapes,\n                    crop_orientations,\n                    orientations,\n                    None,\n                )\n\n        print(parsed_document)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/exceptions/base.py",
    "content": "class ParsingException(Exception):\n    \"\"\"Exception raised for errors in the parsing process.\"\"\"\n\n    def __init__(self, message=\"An error occurred during parsing\"):\n        self.message = message\n        super().__init__(self.message)\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/base.py",
    "content": "from abc import ABC\nfrom pathlib import Path\nfrom typing import Union\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom megaparse_sdk.schema.document import Document\n\n\nclass BaseFormatter(ABC):\n    \"\"\"\n    A class used to improve the layout of elements, particularly focusing on converting HTML tables to markdown tables.\n    Attributes\n    ----------\n    model : BaseChatModel\n        An instance of a chat model used to process and improve the layout of elements.\n    Methods\n    -------\n    improve_layout(elements: List[Element]) -> List[Element]\n        Processes a list of elements, converting HTML tables to markdown tables and improving the overall layout.\n    \"\"\"\n\n    def __init__(self, model: BaseChatModel | None = None):\n        self.model = model\n\n    def format(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Union[Document, str]:\n        raise NotImplementedError(\"Subclasses should implement this method\")\n\n    async def aformat(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Union[Document, str]:\n        raise NotImplementedError(\"Subclasses should implement this method\")\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py",
    "content": "from pathlib import Path\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom megaparse.formatter.base import BaseFormatter\nfrom megaparse_sdk.schema.document import Document\nfrom pydantic import BaseModel\n\n\nclass StructuredFormatter(BaseFormatter):\n    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):\n        super().__init__(model)\n        self.output_model = output_model\n\n    async def aformat(\n        self,\n        document: Document,\n        file_path: Path | str | None = None,\n    ) -> str:  # FIXME: Return a structured output of type BaseModel ?\n        raise NotImplementedError()\n\n    def format(\n        self,\n        document: Document,\n        file_path: Path | str | None = None,\n    ) -> str:  # FIXME: Return a structured output of type BaseModel ?\n        raise NotImplementedError()\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/structured_formatter/custom_structured_formatter.py",
    "content": "from pathlib import Path\n\nfrom megaparse.formatter.structured_formatter import StructuredFormatter\nfrom megaparse_sdk.schema.document import Document\nfrom pydantic import BaseModel\n\n\nclass CustomStructuredFormatter(StructuredFormatter):\n    def format(\n        self,\n        document: Document,\n        file_path: Path | str | None = None,\n    ) -> str:\n        \"\"\"\n        Structure the file using an AI language model.\n        Args:\n            text: The text to format.\n            file_path: The file path of the text.\n            model: The AI language model to use for formatting.\n        Returns:\n            The structured text.\n        \"\"\"\n        if not self.model:\n            raise ValueError(\"A Model is needed to use the CustomStructuredFormatter.\")\n        print(\"Formatting text using CustomStructuredFormatter...\")\n        text = str(document)\n        if len(text) < 0:\n            raise ValueError(\n                \"A non empty text is needed to format text using CustomStructuredFormatter.\"\n            )\n        if not self.output_model:\n            raise ValueError(\n                \"An output model is needed to structure text using CustomStructuredFormatter.\"\n            )\n\n        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore\n\n        formatted_text = structured_model.invoke(\n            f\"Parse the text in a structured format: {text}\"\n        )\n        assert isinstance(formatted_text, BaseModel), \"Model output is not a BaseModel.\"\n\n        return formatted_text.model_dump_json()\n\n    async def aformat(\n        self,\n        document: Document,\n        file_path: Path | str | None = None,\n    ) -> str:\n        \"\"\"\n        Asynchronously structure the file using an AI language model.\n        Args:\n            text: The text to format.\n            file_path: The file path of the text.\n            model: The AI language model to 
use for formatting.\n        Returns:\n            The structured text.\n        \"\"\"\n        if not self.model:\n            raise ValueError(\"A Model is needed to use the CustomStructuredFormatter.\")\n        print(\"Formatting text using CustomStructuredFormatter...\")\n        text = str(document)\n\n        if len(text) < 0:\n            raise ValueError(\n                \"A non empty text is needed to format text using CustomStructuredFormatter.\"\n            )\n        if not self.output_model:\n            raise ValueError(\n                \"An output model is needed to structure text using CustomStructuredFormatter.\"\n            )\n\n        structured_model = self.model.with_structured_output(self.output_model)  # type: ignore\n\n        formatted_text = await structured_model.ainvoke(\n            f\"Parse the text in a structured format: {text}\"\n        )\n        assert isinstance(formatted_text, BaseModel), \"Model output is not a BaseModel.\"\n\n        return formatted_text.model_dump_json()\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/table_formatter/__init__.py",
    "content": "from pathlib import Path\n\nfrom megaparse.formatter.base import BaseFormatter\nfrom megaparse_sdk.schema.document import Document\n\n\nclass TableFormatter(BaseFormatter):\n    def format(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        raise NotImplementedError(\"Subclasses should implement this method\")\n\n    async def aformat(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        raise NotImplementedError(\"Subclasses should implement this method\")\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/table_formatter/llm_table_formatter.py",
    "content": "import re\nimport warnings\nfrom pathlib import Path\nfrom typing import Optional\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom langchain_core.prompts import ChatPromptTemplate\nfrom megaparse.formatter.table_formatter import TableFormatter\nfrom megaparse_sdk.schema.document import Document, TableBlock\n\n\nclass SimpleMDTableFormatter(TableFormatter):\n    \"\"\"\n    A formatter that converts table elements into Markdown format using llms.\n    \"\"\"\n\n    TABLE_MARKER_START = \"[TABLE]\"\n    TABLE_MARKER_END = \"[/TABLE]\"\n    CODE_BLOCK_PATTERN = r\"^```.*$\\n?\"\n\n    def __init__(self, model: Optional[BaseChatModel] = None):\n        super().__init__(model)\n\n    async def aformat(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        warnings.warn(\n            \"The SimpleMDTableFormatter is a sync formatter, please use the sync format method\",\n            UserWarning,\n            stacklevel=2,\n        )\n        return self.format(document=document, file_path=file_path)\n\n    def format(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        \"\"\"\n        Formats table elements within a list of elements.\n        Args:\n            elements: A list of Element objects.\n        Returns:\n            A list of Element objects with formatted tables.\n        \"\"\"\n        if not self.model:\n            raise ValueError(\"A Model is needed to use the SimpleMDTableFormatter.\")\n        print(\"Formatting tables using SimpleMDTableFormatter...\")\n        table_stack = []\n        formatted_elements = []\n\n        for block in document.content:\n            if isinstance(block, TableBlock):\n                previous_table = table_stack[-1] if table_stack else \"\"\n                formatted_table = self.format_table(block, previous_table)\n                table_stack.append(formatted_table.text)\n           
     formatted_elements.append(formatted_table)\n            else:\n                formatted_elements.append(block)\n\n        document.content = formatted_elements\n        return document\n\n    def format_table(\n        self, table_element: TableBlock, previous_table: str\n    ) -> TableBlock:\n        \"\"\"\n        Formats a single table element into Markdown using an AI language model.\n        Args:\n            table_element: The table element to format.\n            previous_table: The previously formatted table text.\n        Returns:\n            The formatted table element.\n        \"\"\"\n        assert self.model is not None, \"Model is not set.\"\n\n        prompt = ChatPromptTemplate.from_messages(\n            [\n                (\n                    \"human\",\n                    (\n                        \"You are an expert in markdown tables. Transform the following parsed table into a \"\n                        \"markdown table. Provide just the table in pure markdown, nothing else.\\n\"\n                        \"<TEXT>\\n{text}\\n</TEXT>\\n\"\n                        \"<PREVIOUS_TABLE>\\n{previous_table}\\n</PREVIOUS_TABLE>\"\n                    ),\n                ),\n            ]\n        )\n\n        chain = prompt | self.model\n        result = chain.invoke(\n            {\n                \"text\": table_element.text,\n                \"previous_table\": previous_table,\n            }\n        )\n\n        content_str = str(result.content)\n        cleaned_content = re.sub(\n            self.CODE_BLOCK_PATTERN, \"\", content_str, flags=re.MULTILINE\n        )\n        markdown_table = (\n            f\"{self.TABLE_MARKER_START}\\n\"\n            f\"{cleaned_content}\\n\"\n            f\"{self.TABLE_MARKER_END}\\n\\n\"\n        )\n\n        table_element.text = markdown_table\n\n        return table_element\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/formatter/table_formatter/vision_table_formatter.py",
    "content": "import base64\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import List, Optional\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom langchain_core.messages import HumanMessage\nfrom megaparse.formatter.table_formatter import TableFormatter\nfrom megaparse_sdk.schema.document import Document, TableBlock\nfrom pdf2image import convert_from_path\nfrom PIL import Image\n\nTABLE_OCR_PROMPT = \"\"\"\nYou are tasked with transcribing the content of a table into markdown format. Your goal is to create a well-structured, readable markdown table that accurately represents the original content while adding appropriate formatting.\nAnswer uniquely with the parsed table. Do not include the fenced code blocks backticks.\n\"\"\"\n\n\nclass VisionMDTableFormatter(TableFormatter):\n    \"\"\"\n    A formatter that converts table elements into Markdown format using an AI language model.\n    \"\"\"\n\n    TABLE_MARKER_START = \"[TABLE]\"\n    TABLE_MARKER_END = \"[/TABLE]\"\n    CODE_BLOCK_PATTERN = r\"^```.*$\\n?\"\n\n    def __init__(self, model: Optional[BaseChatModel] = None):\n        super().__init__(model)\n\n    def _crop_table_image(self, table_element: TableBlock, file_path: str) -> str:\n        \"\"\"\n        Helper method to crop the table portion of the PDF page and convert it to a base64 string.\n        \"\"\"\n        assert table_element.bbox, \"Table element must have coordinates.\"\n        bbox = table_element.bbox\n        page_number = table_element.page_range[0]\n        assert page_number, \"Table element must have a page number.\"\n        assert bbox, \"Table element must have coordinates.\"\n\n        pages = convert_from_path(file_path)\n\n        # Calculate the box for cropping\n        box = (\n            bbox.top_left.x,\n            bbox.top_left.y,\n            bbox.bottom_right.x,\n            bbox.bottom_right.y,\n        )\n        table_image = pages[page_number - 1].crop(box)\n     
   # Convert the cropped image to base64\n        table_image64 = self.process_file([table_image])[0]\n        return table_image64\n\n    async def aformat(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        \"\"\"\n        Asynchronously formats table elements within a list of elements.\n        \"\"\"\n        if not self.model:\n            raise ValueError(\"A Model is needed to use the VisionMDTableFormatter.\")\n        print(\"Formatting tables using VisionMDTableFormatter (async)...\")\n        assert (\n            file_path\n        ), \"A file path is needed to format tables using VisionMDTableFormatter.\"\n        if not isinstance(file_path, str):\n            file_path = str(file_path)\n        formatted_elements = []\n        for block in document.content:\n            if isinstance(block, TableBlock):\n                formatted_table = await self.aformat_table(block, file_path)\n                formatted_elements.append(formatted_table)\n            else:\n                formatted_elements.append(block)\n\n        document.content = formatted_elements\n        return document\n\n    def format(\n        self, document: Document, file_path: Path | str | None = None\n    ) -> Document:\n        \"\"\"\n        Asynchronously formats table elements within a list of elements.\n        \"\"\"\n        if not self.model:\n            raise ValueError(\"A Model is needed to use the VisionMDTableFormatter.\")\n        print(\"Formatting tables using VisionMDTableFormatter (async)...\")\n        assert (\n            file_path\n        ), \"A file path is needed to format tables using VisionMDTableFormatter.\"\n        if not isinstance(file_path, str):\n            file_path = str(file_path)\n        formatted_elements = []\n        for block in document.content:\n            if isinstance(block, TableBlock):\n                formatted_table = self.format_table(block, file_path)\n                
formatted_elements.append(formatted_table)\n            else:\n                formatted_elements.append(block)\n\n        document.content = formatted_elements\n        return document\n\n    async def aformat_table(\n        self, table_element: TableBlock, file_path: str\n    ) -> TableBlock:\n        \"\"\"\n        Asynchronously formats a table element into Markdown format using a Vision Model.\n        \"\"\"\n        table_image64 = self._crop_table_image(table_element, file_path)\n        formatted_table = await self.avision_extract(table_image64)\n\n        markdown_table = (\n            f\"{self.TABLE_MARKER_START}\\n\"\n            f\"{formatted_table}\\n\"\n            f\"{self.TABLE_MARKER_END}\\n\\n\"\n        )\n        # Replace the element's text with the formatted table text\n        table_element.text = markdown_table\n        return table_element\n\n    def format_table(self, table_element: TableBlock, file_path: str) -> TableBlock:\n        \"\"\"\n        Asynchronously formats a table element into Markdown format using a Vision Model.\n        \"\"\"\n        table_image64 = self._crop_table_image(table_element, file_path)\n        formatted_table = self.vision_extract(table_image64)\n\n        markdown_table = (\n            f\"{self.TABLE_MARKER_START}\\n\"\n            f\"{formatted_table}\\n\"\n            f\"{self.TABLE_MARKER_END}\\n\\n\"\n        )\n        # Replace the element's text with the formatted table text\n        table_element.text = markdown_table\n        return table_element\n\n    def process_file(self, images: List[Image.Image], image_format=\"PNG\") -> List[str]:\n        \"\"\"\n        Convert a list of PIL images to base64 encoded images.\n        \"\"\"\n        try:\n            images_base64 = []\n            for image in images:\n                buffered = BytesIO()\n                image.save(buffered, format=image_format)\n                image_base64 = 
base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n                images_base64.append(image_base64)\n            return images_base64\n        except Exception as e:\n            raise ValueError(f\"Error processing PDF file: {str(e)}\")\n\n    async def avision_extract(self, table_image: str) -> str:\n        \"\"\"\n        Asynchronously send image data to the language model for processing.\n        \"\"\"\n        assert (\n            self.model\n        ), \"A model is needed to use the VisionMDTableFormatter (async).\"\n        image_prompt = {\n            \"type\": \"image_url\",\n            \"image_url\": {\"url\": f\"data:image/jpeg;base64,{table_image}\"},\n        }\n\n        message = HumanMessage(\n            content=[\n                {\"type\": \"text\", \"text\": TABLE_OCR_PROMPT},\n                image_prompt,\n            ],\n        )\n        response = await self.model.ainvoke([message])\n        return str(response.content)\n\n    def vision_extract(self, table_image: str) -> str:\n        \"\"\"\n        Synchronously send image data to the language model for processing.\n        \"\"\"\n        assert self.model, \"A model is needed to use the VisionMDTableFormatter (sync).\"\n        image_prompt = {\n            \"type\": \"image_url\",\n            \"image_url\": {\"url\": f\"data:image/jpeg;base64,{table_image}\"},\n        }\n\n        message = HumanMessage(\n            content=[\n                {\"type\": \"text\", \"text\": TABLE_OCR_PROMPT},\n                image_prompt,\n            ],\n        )\n        response = self.model.invoke([message])\n        return str(response.content)\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/layout_detection/layout_detector.py",
    "content": "import logging\nimport os\nimport pathlib\nimport uuid\nfrom typing import Any, List\n\nimport numpy as np\nimport onnxruntime as rt\nfrom megaparse.configs.auto import DeviceEnum\nfrom megaparse.layout_detection.output import LayoutDetectionOutput\nfrom megaparse.utils.onnx import get_providers\nfrom megaparse_sdk.schema.document import BBOX, Point2D\nfrom onnxtr.models.engine import EngineConfig\nfrom onnxtr.models.preprocessor import PreProcessor\nfrom PIL import Image, ImageDraw\nfrom PIL.Image import Image as PILImage\n\nlogger = logging.getLogger(\"megaparse\")\n\nLABEL_MAP = {\n    0: \"Caption\",\n    1: \"Footnote\",\n    2: \"Formula\",\n    3: \"List-item\",\n    4: \"Page-footer\",\n    5: \"Page-header\",\n    6: \"Picture\",\n    7: \"Section-header\",\n    8: \"Table\",\n    9: \"Text\",\n    10: \"Title\",\n}\n\ndefault_cfg: dict[str, dict[str, Any]] = {\n    \"yolov10s-doclaynet\": {\n        \"mean\": (0.5, 0.5, 0.5),\n        \"std\": (1.0, 1.0, 1.0),\n        \"url_8_bit\": None,\n        \"input_shape\": (1, 1024, 1024),\n        \"url\": pathlib.Path(__file__).parent.joinpath(\"models/yolov10s-doclaynet.onnx\"),\n    }\n}\n\n\nclass LayoutDetector:\n    def __init__(\n        self,\n        device: DeviceEnum = DeviceEnum.CPU,\n        threshold: float = 0.1,\n        preserve_aspect_ratio: bool = True,\n        model_name: str = \"yolov10s-doclaynet\",\n        load_in_8_bit: bool = False,\n    ):\n        model_config = default_cfg[model_name]\n        self.device = device\n        general_options = rt.SessionOptions()\n        providers = get_providers(self.device)\n        self.threshold = threshold\n        self.batch_size, self.required_width, self.required_height = model_config[\n            \"input_shape\"\n        ]\n        self.preserve_aspect_ratio = preserve_aspect_ratio\n\n        self.pre_processor = PreProcessor(\n            output_size=(self.required_width, self.required_height),\n            
batch_size=self.batch_size,\n            preserve_aspect_ratio=self.preserve_aspect_ratio,\n        )\n\n        engine_config = EngineConfig(\n            session_options=general_options,\n            providers=providers,\n        )\n        model_path = (\n            model_config.get(\"url_8_bit\") if load_in_8_bit else model_config.get(\"url\")\n        )\n        assert model_path, f\"Model path not found for {model_name}\"\n\n        self.model = rt.InferenceSession(model_path, engine_config=engine_config)\n\n    def __call__(\n        self, img_pages: list[PILImage], output_dir: str | None = None\n    ) -> List[List[LayoutDetectionOutput]]:\n        pages = [np.array(img) for img in img_pages]\n        # Dimension check\n        if any(page.ndim != 3 for page in pages):\n            raise ValueError(\n                \"incorrect input shape: all pages are expected to be multi-channel 2D images.\"\n            )\n        processed_batches = self.pre_processor(pages)\n        processed_batches = np.array(processed_batches)\n        processed_batches = processed_batches.squeeze(1)  # Horrendus\n        processed_batches = processed_batches.transpose(0, 3, 1, 2)\n\n        pred_batches = np.array(\n            [\n                self.model.run(None, {\"images\": np.expand_dims(batch, axis=0)})\n                for batch in processed_batches\n            ]\n        )\n        pred_batches = np.concatenate(pred_batches, axis=0)\n        pred_batches = pred_batches.squeeze(1)  # Horrendus\n\n        processed_preds = []\n        for page, pred in zip(pages, pred_batches, strict=True):\n            img_h, img_w = page.shape[:2]\n            bboxes = self.extract_bboxes_from_page(pred, img_h, img_w)\n            processed_preds.append(bboxes)\n\n        if output_dir:\n            self._save_layout(pages=pages, preds=processed_preds, output_dir=output_dir)\n\n        return processed_preds\n\n    def extract_bboxes_from_page(\n        self, preds: np.ndarray, img_h: 
int, img_w: int\n    ) -> List[LayoutDetectionOutput]:\n        results = []\n\n        assert preds.shape == (300, 6)\n\n        scale_h = img_h / self.required_height\n        scale_w = img_w / self.required_width\n\n        for det in preds:\n            # Rescale the bounding box coordinates to the original dimensions\n            x1, y1, x2, y2, score, cls_idx = det\n            if score < self.threshold:\n                continue\n\n            x1 *= scale_w\n            x2 *= scale_w\n            y1 *= scale_h\n            y2 *= scale_h\n\n            if self.preserve_aspect_ratio:\n                ratio = img_h / img_w\n                x1 = x1 * (ratio if ratio > 1 else 1)\n                x2 = x2 * (ratio if ratio > 1 else 1)\n                y1 = y1 / (ratio if ratio < 1 else 1)\n                y2 = y2 / (ratio if ratio < 1 else 1)\n\n            x1 = max(0, min(x1, img_w))\n            x2 = max(0, min(x2, img_w))\n            y1 = max(0, min(y1, img_h))\n            y2 = max(0, min(y2, img_h))\n\n            bbox_id = uuid.uuid4()\n\n            results.append(\n                LayoutDetectionOutput(\n                    bbox_id=bbox_id,\n                    bbox=BBOX(\n                        top_left=Point2D(x=x1 / img_w, y=y1 / img_h),\n                        bottom_right=Point2D(x=x2 / img_w, y=y2 / img_h),\n                    ),\n                    prob=det[4],\n                    label=int(det[5]),\n                )\n            )\n\n        result = self.topK(results)  # or topK\n        return result\n\n    def nms(\n        self,\n        raw_bboxes: List[LayoutDetectionOutput],\n        iou_threshold: float = 0.9,  # FIXME: thresh Configurable in constructor\n    ) -> List[LayoutDetectionOutput]:\n        \"\"\"\n        Non-Maximum Suppression (NMS) algorithm.\n\n        Args:\n            raw_bboxes (list): List of LayoutBBox objects.\n            iou_threshold (float): IoU threshold for suppression.\n\n        Returns:\n            
None: The input list `raw_bboxes` is modified in-place.\n        \"\"\"\n        raw_bboxes.sort(key=lambda x: x.prob, reverse=True)\n\n        current_index = 0\n        for index in range(len(raw_bboxes)):\n            drop = False\n            for prev_index in range(current_index):\n                iou = raw_bboxes[index].bbox.iou(raw_bboxes[prev_index].bbox)\n                if iou > iou_threshold:\n                    drop = True\n                    break\n            if not drop:\n                raw_bboxes[current_index], raw_bboxes[index] = (\n                    raw_bboxes[index],\n                    raw_bboxes[current_index],\n                )\n                current_index += 1\n\n        return raw_bboxes[:current_index]\n\n    def topK(\n        self, detectResult: List[LayoutDetectionOutput], topK: int = 50\n    ) -> List[LayoutDetectionOutput]:\n        if len(detectResult) <= topK:\n            return detectResult\n        else:\n            predBoxs = []\n            sort_detectboxs = sorted(detectResult, key=lambda x: x.prob, reverse=True)\n            for i in range(topK):\n                predBoxs.append(sort_detectboxs[i])\n            return predBoxs\n\n    def _save_layout(\n        self,\n        pages: list[np.ndarray],\n        preds: list[list[LayoutDetectionOutput]],\n        output_dir: str,\n    ):\n        os.makedirs(output_dir, exist_ok=True)\n        for i, (page, layout) in enumerate(zip(pages, preds, strict=True)):\n            image = Image.fromarray(page)\n            draw = ImageDraw.Draw(image)\n            img_w, img_h = image.size\n\n            for detection in layout:\n                x_min, y_min, x_max, y_max = detection.bbox.to_numpy()\n                bbox = x_min * img_w, y_min * img_h, x_max * img_w, y_max * img_h\n                confidence = detection.prob\n                category = detection.label\n                label = LABEL_MAP.get(category, \"Unknown\")\n\n                draw.rectangle(bbox, 
outline=\"red\", width=2)\n                # assert bbox[2] <= image.width\n                # assert bbox[3] <= image.height\n                draw.text(\n                    (bbox[0], bbox[1]),\n                    f\"{label} ({confidence:.2f})\",\n                    fill=\"red\",\n                )\n\n            image.save(os.path.join(output_dir, f\"page_{i}.png\"))\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/layout_detection/output.py",
    "content": "from uuid import UUID\n\nfrom megaparse_sdk.schema.document import BBOX\nfrom pydantic import BaseModel\n\n\nclass LayoutDetectionOutput(BaseModel):\n    bbox_id: UUID\n    bbox: BBOX\n    prob: float\n    label: int\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/megaparse.py",
    "content": "import logging\nimport warnings\nfrom pathlib import Path\nfrom typing import IO, BinaryIO, List\n\nimport pypdfium2 as pdfium\nfrom megaparse_sdk.schema import document\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\n\nfrom megaparse.configs.auto import MegaParseConfig\nfrom megaparse.exceptions.base import ParsingException\nfrom megaparse.formatter.base import BaseFormatter\nfrom megaparse.layout_detection.layout_detector import LayoutDetector\nfrom megaparse.models.page import Page, PageDimension\nfrom megaparse.parser.doctr_parser import DoctrParser\nfrom megaparse.parser.unstructured_parser import UnstructuredParser\nfrom megaparse.utils.strategy import (\n    determine_global_strategy,\n    get_page_strategy,\n)\n\nlogger = logging.getLogger(\"megaparse\")\n\n\nclass MegaParse:\n    def __init__(\n        self,\n        formatters: List[BaseFormatter] | None = None,\n        config: MegaParseConfig = MegaParseConfig(),\n        unstructured_strategy: StrategyEnum = StrategyEnum.AUTO,\n    ) -> None:\n        self.config = config\n        self.formatters = formatters\n        self.doctr_parser = DoctrParser(\n            text_det_config=self.config.doctr_config.text_det_config,\n            text_reco_config=self.config.doctr_config.text_reco_config,\n            device=self.config.device,\n            straighten_pages=self.config.doctr_config.straighten_pages,\n            detect_orientation=self.config.doctr_config.detect_orientation,\n            detect_language=self.config.doctr_config.detect_language,\n        )\n        self.unstructured_parser = UnstructuredParser()\n\n        self.layout_model = LayoutDetector()\n        self.unstructured_parser = UnstructuredParser(unstructured_strategy)\n\n    def validate_input(\n        self,\n        file_path: Path | str | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: str | FileExtension | None 
= None,\n    ) -> FileExtension:\n        if not (file_path or file):\n            raise ValueError(\"Either file_path or file should be provided\")\n\n        if file_path and file:\n            raise ValueError(\"Only one of file_path or file should be provided\")\n\n        if file_path and file is None:\n            if isinstance(file_path, str):\n                file_path = Path(file_path)\n            file_extension = file_path.suffix\n        elif file and file_path is None:\n            if not file_extension:\n                raise ValueError(\n                    \"file_extension should be provided when given file argument\"\n                )\n            file.seek(0)\n        else:\n            raise ValueError(\"Either provider a file_path or file\")\n\n        if isinstance(file_extension, str):\n            try:\n                file_extension = FileExtension(file_extension)\n            except ValueError:\n                raise ValueError(f\"Unsupported file extension: {file_extension}\")\n        return file_extension\n\n    def extract_page_strategies(\n        self, pdfium_document: pdfium.PdfDocument, rast_scale: int = 2\n    ) -> List[Page]:\n        pages = []\n        for i, pdfium_page in enumerate(pdfium_document):\n            rasterized_page = pdfium_page.render(scale=rast_scale)\n            assert (\n                abs(pdfium_page.get_width() * rast_scale - rasterized_page.width) <= 1\n            ), (\n                f\"Widths do not match within a margin of 1: \"\n                f\"{pdfium_page.get_width() * rast_scale} != {rasterized_page.width}\"\n            )\n            pages.append(\n                Page(\n                    strategy=StrategyEnum.AUTO,\n                    text_detections=None,\n                    rasterized=rasterized_page.to_pil(),\n                    page_size=PageDimension(\n                        width=pdfium_page.get_width() * rast_scale,\n                        height=pdfium_page.get_height() * 
rast_scale,\n                    ),\n                    page_index=i,\n                    pdfium_elements=pdfium_page,\n                )\n            )\n            pages.append(\n                Page(\n                    strategy=StrategyEnum.AUTO,\n                    text_detections=None,\n                    rasterized=rasterized_page.to_pil(),\n                    page_size=PageDimension(\n                        width=pdfium_page.get_width() * rast_scale,\n                        height=pdfium_page.get_height() * rast_scale,\n                    ),\n                    page_index=i,\n                    pdfium_elements=pdfium_page,\n                )\n            )\n\n        # ----\n        # Get text detection for each page -> PAGE\n\n        pages = self.doctr_parser.get_text_detections(pages)\n\n        # ---\n\n        # Get strategy per page -> PAGE\n        for page in pages:\n            page.strategy = get_page_strategy(\n                page.pdfium_elements,\n                page.text_detections,\n                threshold=self.config.auto_config.page_threshold,\n            )\n        return pages\n\n        pages = self.doctr_parser.get_text_detections(pages)\n\n        for page in pages:\n            page.strategy = get_page_strategy(\n                page.pdfium_elements,\n                page.text_detections,\n                threshold=self.config.auto_config.page_threshold,\n            )\n        return pages\n\n    def load(\n        self,\n        file_path: Path | str | None = None,\n        file: BinaryIO | None = None,\n        file_extension: str | FileExtension = \"\",\n        strategy: StrategyEnum = StrategyEnum.AUTO,\n    ) -> str:\n        file_extension = self.validate_input(\n            file=file, file_path=file_path, file_extension=file_extension\n        )\n        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:\n            self.unstructured_parser.strategy = strategy\n            return str(\n   
             self.unstructured_parser.convert(\n                    file_path=file_path, file=file, file_extension=file_extension\n                )\n            )\n        else:\n            opened_file = None\n            try:\n                if file_path:\n                    opened_file = open(file_path, \"rb\")\n                    file = opened_file\n\n                assert file is not None, \"No File provided\"\n\n                pdfium_document = pdfium.PdfDocument(file)\n\n                # Rasterize pages and extract text recognition\n                pages = self.extract_page_strategies(pdfium_document)\n                strategy = determine_global_strategy(\n                    pages, self.config.auto_config.document_threshold\n                )\n\n                # Extract layout model\n                assert all(p.rasterized for p in pages)\n                layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore\n\n                if strategy == StrategyEnum.HI_RES:\n                    logger.debug(\"Using doctr for text recognition\")\n                    parsed_document = self.doctr_parser.get_text_recognition(\n                        pages, layout_result\n                    )\n\n                else:\n                    logger.debug(\"Using Unstructured Parser\")\n                    self.unstructured_parser.strategy = StrategyEnum.FAST\n                    parsed_document = self.unstructured_parser.convert(\n                        file=file, file_extension=file_extension\n                    )\n\n                # additional attributes\n                parsed_document.file_name = str(file_path) if file_path else None\n                parsed_document.metadata = pdfium_document.get_metadata_dict()\n\n                # Format -> TODO: should be generic\n                if self.formatters:\n                    for formatter in self.formatters:\n                        if isinstance(parsed_document, str):\n                 
           warnings.warn(\n                                f\"The last step returned a string, the {formatter.__class__} and following will not be applied\",\n                                stacklevel=2,\n                            )\n                            break\n                        parsed_document = formatter.format(parsed_document)\n\n                if not isinstance(parsed_document, str):\n                    return str(parsed_document)\n                return parsed_document\n            except Exception as e:\n                logger.exception(f\"Error occured while parsing {file}: {e}\")\n                raise ParsingException(\n                    f\"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}\"\n                )\n            finally:\n                if opened_file:\n                    opened_file.close()\n\n    async def aload(\n        self,\n        file_path: Path | str | None = None,\n        file: BinaryIO | None = None,\n        file_extension: str | FileExtension = \"\",\n        strategy: StrategyEnum = StrategyEnum.AUTO,\n    ) -> str | document.Document:\n        file_extension = self.validate_input(\n            file=file, file_path=file_path, file_extension=file_extension\n        )\n        if file_extension != FileExtension.PDF or strategy == StrategyEnum.FAST:\n            self.unstructured_parser.strategy = strategy\n            parsed_document = await self.unstructured_parser.aconvert(\n                file_path=file_path, file=file, file_extension=file_extension\n            )\n            return str(parsed_document)\n        else:\n            opened_file = None\n            try:\n                if file_path:\n                    opened_file = open(file_path, \"rb\")\n                    file = opened_file\n\n                assert file is not None, \"No File provided\"\n                pdfium_document = pdfium.PdfDocument(file)\n                # Determine strategy\n                
pages = self.extract_page_strategies(pdfium_document)\n                strategy = determine_global_strategy(\n                    pages, self.config.auto_config.document_threshold\n                )\n\n                # Run layout model\n                assert all(p.rasterized for p in pages)\n                layout_result = self.layout_model([p.rasterized for p in pages])  # type: ignore\n\n                if strategy == StrategyEnum.HI_RES:\n                    logger.info(\"Using Doctr for text recognition\")\n                    parsed_document = self.doctr_parser.get_text_recognition(\n                        pages, layout_result\n                    )\n\n                else:\n                    logger.info(\"Switching to Unstructured Parser\")\n                    self.unstructured_parser.strategy = StrategyEnum.FAST\n                    parsed_document = await self.unstructured_parser.aconvert(\n                        file=file, file_extension=file_extension\n                    )\n\n                parsed_document.file_name = str(file_path) if file_path else None\n                parsed_document.metadata = pdfium_document.get_metadata_dict()\n\n                if self.formatters:\n                    for formatter in self.formatters:\n                        if isinstance(parsed_document, str):\n                            warnings.warn(\n                                f\"The last step returned a string, the {formatter.__class__} and following will not be applied\",\n                                stacklevel=2,\n                            )\n                            break\n                        parsed_document = await formatter.aformat(parsed_document)\n\n                return parsed_document\n            except Exception as e:\n                raise ParsingException(\n                    f\"Error while parsing file {file_path or file}, file_extension: {file_extension}: {e}\"\n                )\n            finally:\n                if 
opened_file:\n                    opened_file.close()\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/models/page.py",
    "content": "from typing import List\n\nfrom megaparse_sdk.schema.document import TextDetection\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\nfrom PIL.Image import Image as PILImage\nfrom pydantic import BaseModel, ConfigDict\nfrom pypdfium2._helpers.page import PdfPage\n\n\nclass PageDimension(BaseModel):\n    \"\"\"\n    A class to represent a page dimension\n    \"\"\"\n\n    width: float\n    height: float\n\n\nclass Page(BaseModel):\n    \"\"\"\n    A class to represent a page\n    \"\"\"\n\n    strategy: StrategyEnum\n    text_detections: TextDetection | None = None\n    rasterized: PILImage | None = None\n    page_size: PageDimension\n    page_index: int\n    pdfium_elements: PdfPage\n\n    model_config = ConfigDict(arbitrary_types_allowed=True)\n\n\nclass GatewayDocument(BaseModel):\n    \"\"\"\n    A class to represent a Gateway MegaParse Document, which is a container of pages.\n    \"\"\"\n\n    file_name: str\n    pages: List[Page]\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/__init__.py",
    "content": "from .base import BaseParser\n\n__all__ = [\"BaseParser\"]\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/base.py",
    "content": "from abc import ABC, abstractmethod\nfrom pathlib import Path\nfrom typing import IO\n\nfrom megaparse_sdk.schema.document import Document\nfrom megaparse_sdk.schema.extensions import FileExtension\n\n\nclass BaseParser(ABC):\n    \"\"\"Mother Class for all the parsers [Unstructured, LlamaParse, MegaParseVision]\"\"\"\n\n    supported_extensions = []\n\n    def check_supported_extension(\n        self, file_extension: FileExtension | None, file_path: str | Path | None = None\n    ):\n        if not file_extension and not file_path:\n            raise ValueError(\n                f\"Either file_path or file_extension must be provided for {self.__class__.__name__}\"\n            )\n        if file_path and not file_extension:\n            file_path = Path(file_path) if isinstance(file_path, str) else file_path\n            file_extension = FileExtension(file_path.suffix)\n        if file_extension and file_extension not in self.supported_extensions:\n            raise ValueError(\n                f\"Unsupported file extension {file_extension.value} for {self.__class__.__name__}\"\n            )\n\n    @abstractmethod\n    async def aconvert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        **kwargs,\n    ) -> Document:\n        \"\"\"\n        Convert the given file to a specific format.\n\n        Args:\n            file_path (str | Path): The path to the file to be converted.\n            **kwargs: Additional keyword arguments for the conversion process.\n\n        Returns:\n            Document: The result of the conversion process.\n\n        Raises:\n            NotImplementedError: If the method is not implemented by a subclass.\n        \"\"\"\n        raise NotImplementedError(\"Subclasses should implement this method\")\n\n    @abstractmethod\n    def convert(\n        self,\n        file_path: str | Path | None = None,\n        file: 
IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        **kwargs,\n    ) -> Document:\n        \"\"\"\n        Convert the given file to a specific format.\n\n        Args:\n            file_path (str | Path): The path to the file to be converted.\n            **kwargs: Additional keyword arguments for the conversion process.\n\n        Returns:\n            Document: The result of the conversion process.\n\n        Raises:\n            NotImplementedError: If the method is not implemented by a subclass.\n        \"\"\"\n        raise NotImplementedError(\"Subclasses should implement this method\")\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/builder.py",
    "content": "from megaparse_sdk.schema.parser_config import ParseFileConfig\n\nfrom megaparse.parser.base import BaseParser\nfrom megaparse.parser.llama import LlamaParser\nfrom megaparse.parser.megaparse_vision import MegaParseVision\nfrom megaparse.parser.unstructured_parser import UnstructuredParser\n\nparser_dict: dict[str, type] = {\n    \"unstructured\": UnstructuredParser,\n    \"llama_parser\": LlamaParser,\n    \"megaparse_vision\": MegaParseVision,\n}\n\n\nclass ParserBuilder:\n    def build(self, config: ParseFileConfig) -> BaseParser:\n        \"\"\"\n        Build a parser based on the given configuration.\n\n        Args:\n            config (ParseFileConfig): The configuration to be used for building the parser.\n\n        Returns:\n            BaseParser: The built parser.\n\n        Raises:\n            KeyError: If config.method is not a key of parser_dict.\n        \"\"\"\n        return parser_dict[config.method](**config.model_dump())\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/doctr_parser.py",
    "content": "import logging\nimport uuid\nfrom typing import Any, Dict, List, Tuple, Type\nfrom uuid import UUID\n\nimport numpy as np\nimport onnxruntime as rt\nfrom megaparse_sdk.schema.document import (\n    BBOX,\n    Block,\n    BlockLayout,\n    BlockType,\n    CaptionBlock,\n    FooterBlock,\n    HeaderBlock,\n    ImageBlock,\n    ListElementBlock,\n    Point2D,\n    SubTitleBlock,\n    TableBlock,\n    TextBlock,\n    TextDetection,\n    TitleBlock,\n    UndefinedBlock,\n)\nfrom megaparse_sdk.schema.document import Document as MPDocument\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom onnxtr.io import Document\nfrom onnxtr.models import detection_predictor, recognition_predictor\nfrom onnxtr.models._utils import get_language\nfrom onnxtr.models.engine import EngineConfig\nfrom onnxtr.models.predictor.base import _OCRPredictor\nfrom onnxtr.utils.geometry import detach_scores\nfrom onnxtr.utils.repr import NestedObject\n\nfrom megaparse.configs.auto import DeviceEnum, TextDetConfig, TextRecoConfig\nfrom megaparse.layout_detection.output import LayoutDetectionOutput\nfrom megaparse.models.page import Page\nfrom megaparse.utils.onnx import get_providers\n\nlogger = logging.getLogger(\"megaparse\")\n\nblock_cls_map: Dict[int, Type[Block]] = {\n    0: CaptionBlock,\n    1: TextBlock,\n    2: TextBlock,\n    3: ListElementBlock,\n    4: FooterBlock,\n    5: HeaderBlock,\n    6: ImageBlock,\n    7: SubTitleBlock,\n    8: TableBlock,\n    9: TextBlock,\n    10: TitleBlock,\n}\n\n\nclass DoctrParser(NestedObject, _OCRPredictor):\n    supported_extensions = [FileExtension.PDF]\n\n    def __init__(\n        self,\n        text_det_config: TextDetConfig = TextDetConfig(),\n        text_reco_config: TextRecoConfig = TextRecoConfig(),\n        device: DeviceEnum = DeviceEnum.CPU,\n        straighten_pages: bool = False,\n        detect_orientation: bool = False,\n        detect_language: bool = False,\n        **kwargs,\n    ):\n        self.device = 
device\n        general_options = rt.SessionOptions()\n        providers = get_providers(self.device)\n        engine_config = EngineConfig(\n            session_options=general_options,\n            providers=providers,\n        )\n\n        _OCRPredictor.__init__(\n            self,\n            text_det_config.assume_straight_pages,\n            straighten_pages,\n            text_det_config.preserve_aspect_ratio,\n            text_det_config.symmetric_pad,\n            detect_orientation,\n            clf_engine_cfg=engine_config,\n            **kwargs,\n        )\n\n        self.det_predictor = detection_predictor(\n            arch=text_det_config.det_arch,\n            assume_straight_pages=text_det_config.assume_straight_pages,\n            preserve_aspect_ratio=text_det_config.preserve_aspect_ratio,\n            symmetric_pad=text_det_config.symmetric_pad,\n            batch_size=text_det_config.batch_size,\n            load_in_8_bit=text_det_config.load_in_8_bit,\n            engine_cfg=engine_config,\n        )\n\n        self.reco_predictor = recognition_predictor(\n            arch=text_reco_config.reco_arch,\n            batch_size=text_reco_config.batch_size,\n            load_in_8_bit=text_det_config.load_in_8_bit,\n            engine_cfg=engine_config,\n        )\n\n        self.detect_orientation = detect_orientation\n        self.detect_language = detect_language\n\n    def get_text_detections(self, pages: list[Page], **kwargs) -> List[Page]:\n        rasterized_pages = [np.array(page.rasterized) for page in pages]\n        # Dimension check\n        if any(page.ndim != 3 for page in rasterized_pages):\n            raise ValueError(\n                \"incorrect input shape: all pages are expected to be multi-channel 2D images.\"\n            )\n\n        origin_page_shapes = [page.shape[:2] for page in rasterized_pages]\n\n        # Localize text elements\n        loc_preds, out_maps = self.det_predictor(\n            rasterized_pages, 
return_maps=True, **kwargs\n        )\n\n        # Detect document rotation and rotate pages\n        seg_maps = [\n            np.where(\n                out_map > self.det_predictor.model.postprocessor.bin_thresh,\n                255,\n                0,\n            ).astype(np.uint8)\n            for out_map in out_maps\n        ]\n        if self.detect_orientation:\n            general_pages_orientations, origin_pages_orientations = (\n                self._get_orientations(rasterized_pages, seg_maps)\n            )\n            orientations = [\n                {\"value\": orientation_page, \"confidence\": None}\n                for orientation_page in origin_pages_orientations\n            ]\n        else:\n            orientations = None\n            general_pages_orientations = None\n            origin_pages_orientations = None\n        if self.straighten_pages:\n            rasterized_pages = self._straighten_pages(\n                rasterized_pages,\n                seg_maps,\n                general_pages_orientations,\n                origin_pages_orientations,\n            )\n            # update page shapes after straightening\n            origin_page_shapes = [page.shape[:2] for page in rasterized_pages]\n\n            # forward again to get predictions on straight pagess\n            loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]\n\n        # Detach objectness scores from loc_preds\n        loc_preds, objectness_scores = detach_scores(loc_preds)  # type: ignore[arg-type]\n\n        # Apply hooks to loc_preds if any\n        for hook in self.hooks:\n            loc_preds = hook(loc_preds)\n\n        for page_index, (rast_page, loc_pred, objectness_score, page) in enumerate(\n            zip(rasterized_pages, loc_preds, objectness_scores, pages, strict=True)\n        ):\n            block_layouts = []\n            for bbox, score in zip(loc_pred, objectness_score, strict=True):\n                block_layouts.append(\n   
                 BlockLayout(\n                        bbox=BBOX(bbox[:2].tolist(), bbox[2:].tolist()),\n                        objectness_score=score,\n                        block_type=BlockType.TEXT,\n                    )\n                )\n            page.text_detections = TextDetection(\n                bboxes=block_layouts,\n                page_index=page_index,\n                dimensions=rast_page.shape[:2],\n                orientation=orientations[page_index] if orientations is not None else 0,\n                origin_page_shape=origin_page_shapes[page_index],\n            )\n\n        return pages\n\n    def get_text_recognition(\n        self, pages: List[Page], layout: List[List[LayoutDetectionOutput]], **kwargs\n    ) -> MPDocument:\n        assert any(\n            page.text_detections is not None for page in pages\n        ), \"Text detections should be computed before running text recognition\"\n\n        rasterized_pages = []\n        loc_preds = []\n        objectness_scores = []\n        orientations = []\n        origin_page_shapes = []\n        for page in pages:\n            page_loc_pred = page.text_detections.get_loc_preds()  # type: ignore\n            if page_loc_pred.shape[0] == 0:\n                page_loc_pred = np.zeros((0, 4))\n            rasterized_pages.append(np.array(page.rasterized))\n            loc_preds.append(page_loc_pred)  # type: ignore\n            objectness_scores.append(page.text_detections.get_objectness_scores())  # type: ignore\n            orientations.append(page.text_detections.get_orientations())  # type: ignore\n            origin_page_shapes.append(page.text_detections.get_origin_page_shapes())  # type: ignore\n        # Crop images\n        crops, loc_preds = self._prepare_crops(\n            rasterized_pages,\n            loc_preds,  # type: ignore[arg-type]\n            channels_last=True,\n            assume_straight_pages=self.assume_straight_pages,\n            
assume_horizontal=self._page_orientation_disabled,\n        )\n        # Rectify crop orientation and get crop orientation predictions\n        crop_orientations: Any = []\n        if not self.assume_straight_pages:\n            crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)\n            crop_orientations = [\n                {\"value\": orientation[0], \"confidence\": orientation[1]}\n                for orientation in _crop_orientations\n            ]\n\n        # Identify character sequences\n        word_preds = self.reco_predictor(\n            [crop for page_crops in crops for crop in page_crops], **kwargs\n        )\n        if not crop_orientations:\n            crop_orientations = [{\"value\": 0, \"confidence\": None} for _ in word_preds]\n\n        boxes, text_preds, crop_orientations = self._process_predictions(\n            loc_preds, word_preds, crop_orientations\n        )\n\n        if self.detect_language:\n            languages = [\n                get_language(\" \".join([item[0] for item in text_pred]))\n                for text_pred in text_preds\n            ]\n            languages_dict = [\n                {\"value\": lang[0], \"confidence\": lang[1]} for lang in languages\n            ]\n        else:\n            languages_dict = None\n\n        # FIXME : Not good return type we want :(\n        out = self.doc_builder(\n            rasterized_pages,\n            boxes,\n            objectness_scores,\n            text_preds,\n            origin_page_shapes,\n            crop_orientations,\n            orientations,\n            languages_dict,\n        )\n        return self.__to_elements_list(out, layout)\n\n    def _get_block_cls(\n        self,\n        coordinates: tuple[float, float, float, float],\n        layout: List[LayoutDetectionOutput],\n        threshold: float = 0.6,\n    ) -> Tuple[UUID | None, Type[Block]]:\n        for det in layout:\n            x1, y1, x2, y2 = coordinates\n            X1, 
Y1, X2, Y2 = det.bbox.to_numpy()\n\n            assert x1 <= x2 and y1 <= y2, \"bbox1 coordinates are invalid\"\n            assert X1 <= X2 and Y1 <= Y2, \"bbox2 coordinates are invalid\"\n\n            union_x1 = max(x1, X1)\n            union_y1 = max(y1, Y1)\n            union_x2 = min(x2, X2)\n            union_y2 = min(y2, Y2)\n\n            union_width = max(0, union_x2 - union_x1)\n            union_height = max(0, union_y2 - union_y1)\n            union_area = union_width * union_height\n\n            detection_area = max(0, x2 - x1) * max(0, y2 - y1)\n\n            if union_area / detection_area > threshold:\n                # breakpoint()\n                return (det.bbox_id, block_cls_map[det.label])\n\n        return (uuid.uuid4(), UndefinedBlock)\n\n    def __to_elements_list(\n        self, doctr_document: Document, layouts: List[List[LayoutDetectionOutput]]\n    ) -> MPDocument:\n        results = []\n\n        for page_number, (page, layout) in enumerate(\n            zip(doctr_document.pages, layouts, strict=True)\n        ):\n            result = {}\n            for block in page.blocks:\n                if len(block.lines) and len(block.artefacts) > 0:\n                    raise ValueError(\n                        \"Block should not contain both lines and artefacts\"\n                    )\n                for line in block.lines:\n                    line_coordinates = [word.geometry for word in line.words]\n                    x0 = min(word[0][0] for word in line_coordinates)\n                    y0 = min(word[0][1] for word in line_coordinates)\n                    x1 = max(word[1][0] for word in line_coordinates)\n                    y1 = max(word[1][1] for word in line_coordinates)\n\n                    block_id, block_cls = self._get_block_cls(\n                        coordinates=(x0, y0, x1, y1), layout=layout\n                    )\n                    if block_id in result:\n                        bbx0, bby0, bbx1, bby1 = 
result[block_id].bbox.to_numpy()\n                        result[block_id].text += \"\\n\" + line.render()\n                        result[block_id].bbox = BBOX(\n                            top_left=Point2D(x=min(x0, bbx0), y=min(y0, bby0)),\n                            bottom_right=Point2D(x=max(x1, bbx1), y=max(y1, bby1)),\n                        )\n\n                    elif issubclass(block_cls, TextBlock):\n                        result[block_id] = block_cls(\n                            text=line.render(),\n                            bbox=BBOX(\n                                top_left=Point2D(x=x0, y=y0),\n                                bottom_right=Point2D(x=x1, y=y1),\n                            ),\n                            metadata={},\n                            page_range=(page_number, page_number),\n                        )\n                # We add the Image Blocks to the MPDocument with the right order\n                for det in layout:\n                    if det.label in [6, 8]:\n                        x0, y0, x1, y1 = det.bbox.to_numpy()\n                        block_cls = block_cls_map[det.label]\n                        result[uuid.uuid4()] = block_cls(\n                            bbox=BBOX(\n                                top_left=Point2D(x=x0, y=y0),\n                                bottom_right=Point2D(x=x1, y=y1),\n                            ),\n                            metadata={},\n                            page_range=(page_number, page_number),\n                        )\n            sorted_page_blocks = sorted(\n                result.values(), key=lambda block: block.bbox.top_left.y\n            )\n\n            results += sorted_page_blocks\n        return MPDocument(\n            metadata={},\n            content=results,\n            detection_origin=\"doctr\",\n        )\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/entity.py",
    "content": "from enum import Enum\nfrom typing import List, Optional\n\n\nclass TagEnum(str, Enum):\n    \"\"\"Possible tags for the elements in the file\"\"\"\n\n    TABLE = \"TABLE\"\n    TOC = \"TOC\"\n    HEADER = \"HEADER\"\n    IMAGE = \"IMAGE\"\n\n\nclass SupportedModel(Enum):\n    GPT_4O = (\"gpt-4o\", None)\n    GPT_4O_TURBO = (\"gpt-4o-turbo\", None)\n    CLAUDE_3_5_SONNET = (\"claude-3-5-sonnet\", [\"latest\", \"20241022\"])\n    CLAUDE_3_OPUS = (\"claude-3-opus\", [\"latest\", \"20240229\"])\n\n    def __init__(self, model_name: str, supported_releases: Optional[List[str]]):\n        self.model_name = model_name\n        self.supported_releases = supported_releases\n\n    @classmethod\n    def is_supported(cls, model_name: str) -> bool:\n        # Attempt to match model_name by checking if it starts with a known model name\n        for model in cls:\n            if model_name.startswith(model.model_name):\n                # Extract the release version if available\n                release = model_name[len(model.model_name) :].lstrip(\"-\") or None\n                # Check if the model supports this release\n                if model.supported_releases is None:\n                    return True\n                return release in model.supported_releases if release else False\n        return False\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/llama.py",
    "content": "from pathlib import Path\nfrom typing import IO, List\n\nfrom llama_index.core.schema import Document as LlamaDocument\nfrom llama_parse import LlamaParse as _LlamaParse\nfrom llama_parse.utils import Language, ResultType\nfrom megaparse_sdk.schema.document import BBOX, Point2D, TextBlock\nfrom megaparse_sdk.schema.document import Document as MPDocument\nfrom megaparse_sdk.schema.extensions import FileExtension\n\nfrom megaparse.parser import BaseParser\n\n\nclass LlamaParser(BaseParser):\n    supported_extensions = [FileExtension.PDF]\n\n    def __init__(\n        self,\n        api_key: str,\n        verbose=True,\n        language: Language = Language.FRENCH,\n        parsing_instruction: str | None = None,\n        **kwargs,\n    ) -> None:\n        self.api_key = api_key\n        self.verbose = verbose\n        self.language = language\n        if parsing_instruction:\n            self.parsing_instruction = parsing_instruction\n        else:\n            self.parsing_instruction = \"\"\"Do not take into account the page breaks (no --- between pages),\n            do not repeat the header and the footer so the tables are merged if needed. 
Keep the same format for similar tables.\"\"\"\n\n    async def aconvert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: None | FileExtension = None,\n        **kwargs,\n    ) -> MPDocument:\n        if not file_path:\n            raise ValueError(\"File_path should be provided to run LlamaParser\")\n        self.check_supported_extension(file_extension, file_path)\n\n        llama_parser = _LlamaParse(\n            api_key=self.api_key,\n            result_type=ResultType.MD,\n            gpt4o_mode=True,\n            verbose=self.verbose,\n            language=self.language,\n            parsing_instruction=self.parsing_instruction,\n        )\n\n        documents: List[LlamaDocument] = await llama_parser.aload_data(str(file_path))\n\n        return self.__to_elements_list__(documents)\n\n    def convert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: None | FileExtension = None,\n        **kwargs,\n    ) -> MPDocument:\n        if not file_path:\n            raise ValueError(\"File_path should be provided to run LlamaParser\")\n        self.check_supported_extension(file_extension, file_path)\n\n        llama_parser = _LlamaParse(\n            api_key=self.api_key,\n            result_type=ResultType.JSON,\n            gpt4o_mode=True,\n            verbose=self.verbose,\n            language=self.language,\n            parsing_instruction=self.parsing_instruction,\n        )\n\n        documents: List[LlamaDocument] = llama_parser.load_data(str(file_path))\n\n        return self.__to_elements_list__(documents)\n\n    def __to_elements_list__(self, llama_doc: List[LlamaDocument]) -> MPDocument:\n        list_blocks = []\n        for i, page in enumerate(llama_doc):\n            list_blocks.append(\n                TextBlock(\n                    text=page.text,\n                    metadata={},\n             
       page_range=(i, i + 1),\n                    bbox=BBOX(\n                        top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)\n                    ),\n                )\n            )\n        return MPDocument(\n            metadata={},\n            detection_origin=\"llamaparse\",\n            content=list_blocks,\n        )\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/megaparse_vision.py",
    "content": "import asyncio\nimport base64\nimport re\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import IO, List\n\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom langchain_core.messages import HumanMessage\nfrom megaparse_sdk.schema.document import BBOX, Block, Point2D, TextBlock\nfrom megaparse_sdk.schema.document import Document as MPDocument\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom pdf2image import convert_from_path\n\nfrom megaparse.parser import BaseParser\nfrom megaparse.parser.entity import SupportedModel, TagEnum\n\n# BASE_OCR_PROMPT = \"\"\"\n# Transcribe the content of this file into markdown. Be mindful of the formatting.\n# Add formatting if you think it is not clear.\n# Do not include page breaks and merge content of tables if it is continued in the next page.\n# Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]'\n# Return only the parsed content.\n# \"\"\"\n\nBASE_OCR_PROMPT = \"\"\"\nYou are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags.\n\n\nFollow these instructions to complete the task:\n\n1. Carefully read through the entire file content.\n\n2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure.\n\n3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure.\n\n4. For tables, headers, and table of contents, add the following tags:\n   - Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. 
Merge content of tables if it is continued in the next page.\n   - Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file.\n   - Table of contents: Enclose in [TOC] and [/TOC] tags\n\n5. When transcribing tables:\n   - If a table continues across multiple pages, merge the content into a single, cohesive table.\n   - Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure.\n\n6. Do not include page breaks in your transcription.\n\n7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.).\n\n8. Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed.\n\n9. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents.\n\"\"\"\n\n\nclass MegaParseVision(BaseParser):\n    supported_extensions = [FileExtension.PDF]\n\n    def __init__(self, model: BaseChatModel, **kwargs):\n        if hasattr(model, \"model_name\"):\n            if not SupportedModel.is_supported(model.model_name):\n                raise ValueError(\n                    f\"Invalid model name, MegaParse vision only supports models that have vision capabilities. 
\"\n                    f\"{model.model_name} is not supported.\"\n                )\n        self.model = model\n\n        self.parsed_chunks: list[str] | None = None\n\n    def process_file(self, file_path: str, image_format: str = \"PNG\") -> List[str]:\n        \"\"\"\n        Process a PDF file and convert its pages to base64 encoded images.\n\n        :param file_path: Path to the PDF file\n        :param image_format: Format to save the images (default: PNG)\n        :return: List of base64 encoded images\n        \"\"\"\n        try:\n            images = convert_from_path(file_path)\n            images_base64 = []\n            for image in images:\n                buffered = BytesIO()\n                image.save(buffered, format=image_format)\n                image_base64 = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n                images_base64.append(image_base64)\n            return images_base64\n        except Exception as e:\n            raise ValueError(f\"Error processing PDF file: {str(e)}\")\n\n    def get_element(self, tag: TagEnum, chunk: str):\n        pattern = rf\"\\[{tag.value}\\]([\\s\\S]*?)\\[/{tag.value}\\]\"\n        all_elmts = re.findall(pattern, chunk)\n        if not all_elmts:\n            print(f\"No {tag.value} found in the chunk\")\n            return []\n        return [elmt.strip() for elmt in all_elmts]\n\n    async def asend_to_mlm(self, images_data: List[str]) -> str:\n        \"\"\"\n        Send images to the language model for processing.\n\n        :param images_data: List of base64 encoded images\n        :return: Processed content as a string\n        \"\"\"\n        images_prompt = [\n            {\n                \"type\": \"image_url\",\n                \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"},\n            }\n            for image_data in images_data\n        ]\n        message = HumanMessage(\n            content=[\n                {\"type\": \"text\", \"text\": 
BASE_OCR_PROMPT},\n                *images_prompt,\n            ],\n        )\n        response = await self.model.ainvoke([message])\n        return str(response.content)\n\n    def send_to_mlm(self, images_data: List[str]) -> str:\n        \"\"\"\n        Send images to the language model for processing.\n\n        :param images_data: List of base64 encoded images\n        :return: Processed content as a string\n        \"\"\"\n        images_prompt = [\n            {\n                \"type\": \"image_url\",\n                \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"},\n            }\n            for image_data in images_data\n        ]\n        message = HumanMessage(\n            content=[\n                {\"type\": \"text\", \"text\": BASE_OCR_PROMPT},\n                *images_prompt,\n            ],\n        )\n        response = self.model.invoke([message])\n        return str(response.content)\n\n    async def aconvert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        batch_size: int = 3,\n        **kwargs,\n    ) -> MPDocument:\n        \"\"\"\n        Parse a PDF file and process its content using the language model.\n\n        :param file_path: Path to the PDF file\n        :param batch_size: Number of pages to process concurrently\n        :return: MPDocument holding the cleaned content as a single TextBlock\n        \"\"\"\n        if not file_path:\n            raise ValueError(\"File_path should be provided to run MegaParseVision\")\n\n        if isinstance(file_path, Path):\n            file_path = str(file_path)\n\n        self.check_supported_extension(file_extension, file_path)\n\n        pdf_base64 = self.process_file(file_path)\n        n_pages = len(pdf_base64)\n        tasks = [\n            self.asend_to_mlm(pdf_base64[i : i + batch_size])\n            for i in range(0, len(pdf_base64), batch_size)\n        ]\n        
self.parsed_chunks = await asyncio.gather(*tasks)\n        responses = self.get_cleaned_content(\"\\n\".join(self.parsed_chunks))\n        return self.__to_elements_list__(responses, n_pages=n_pages)\n\n    def convert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        batch_size: int = 3,\n        **kwargs,\n    ) -> MPDocument:\n        \"\"\"\n        Parse a PDF file and process its content using the language model.\n\n        :param file_path: Path to the PDF file\n        :param batch_size: Number of pages to process at a time\n        :return: MPDocument holding the cleaned content as a single TextBlock\n        \"\"\"\n        if not file_path:\n            raise ValueError(\"File_path should be provided to run MegaParseVision\")\n\n        if isinstance(file_path, Path):\n            file_path = str(file_path)\n\n        self.check_supported_extension(file_extension, file_path)\n\n        pdf_base64 = self.process_file(file_path)\n        n_pages = len(pdf_base64)\n        chunks = [\n            pdf_base64[i : i + batch_size]\n            for i in range(0, len(pdf_base64), batch_size)\n        ]\n        self.parsed_chunks = []\n        for chunk in chunks:\n            response = self.send_to_mlm(chunk)\n            self.parsed_chunks.append(response)\n        responses = self.get_cleaned_content(\"\\n\".join(self.parsed_chunks))\n        return self.__to_elements_list__(responses, n_pages)\n\n    def get_cleaned_content(self, parsed_file: str) -> str:\n        \"\"\"\n        Get cleaned parsed file without any tags defined in TagEnum.\n\n        This method removes all tags from TagEnum from the parsed file, formats the content,\n        and handles the HEADER tag specially by keeping only the first occurrence.\n\n        Args:\n            parsed_file (str): The parsed file content with tags.\n\n        Returns:\n            str: The cleaned content without TagEnum 
tags.\n\n        \"\"\"\n        tag_pattern = \"|\".join(map(re.escape, TagEnum.__members__.values()))\n        tag_regex = rf\"\\[({tag_pattern})\\](.*?)\\[/\\1\\]\"\n        # handle the HEADER tag specially\n        header_pattern = rf\"\\[{TagEnum.HEADER.value}\\](.*?)\\[/{TagEnum.HEADER.value}\\]\"\n        headers = re.findall(header_pattern, parsed_file, re.DOTALL)\n        if headers:\n            first_header = headers[0].strip()\n            # Remove all HEADER tags and their content\n            parsed_file = re.sub(header_pattern, \"\", parsed_file, flags=re.DOTALL)\n            # Add the first header back at the beginning\n            parsed_file = f\"{first_header}\\n{parsed_file}\"\n\n        # Remove all other tags\n        def remove_tag(match):\n            return match.group(2)\n\n        cleaned_content = re.sub(tag_regex, remove_tag, parsed_file, flags=re.DOTALL)\n\n        cleaned_content = re.sub(r\"^```.*$\\n?\", \"\", cleaned_content, flags=re.MULTILINE)\n        cleaned_content = re.sub(r\"\\n\\s*\\n\", \"\\n\\n\", cleaned_content)\n        cleaned_content = cleaned_content.replace(\"|\\n\\n|\", \"|\\n|\")\n        cleaned_content = cleaned_content.strip()\n\n        return cleaned_content\n\n    def __to_elements_list__(self, mpv_doc: str, n_pages: int) -> MPDocument:\n        list_blocks: List[Block] = [\n            TextBlock(\n                text=mpv_doc,\n                metadata={},\n                page_range=(0, n_pages - 1),\n                bbox=BBOX(top_left=Point2D(x=0, y=0), bottom_right=Point2D(x=1, y=1)),\n            )\n        ]\n        return MPDocument(\n            metadata={},\n            detection_origin=\"megaparse_vision\",\n            content=list_blocks,\n        )\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/parser/unstructured_parser.py",
    "content": "import warnings\nfrom pathlib import Path\nfrom typing import IO, Dict, List\n\nfrom dotenv import load_dotenv\nfrom langchain_core.language_models.chat_models import BaseChatModel\nfrom megaparse_sdk.schema.document import (\n    BBOX,\n    Block,\n    FooterBlock,\n    HeaderBlock,\n    ImageBlock,\n    Point2D,\n    SubTitleBlock,\n    TableBlock,\n    TextBlock,\n    TitleBlock,\n)\nfrom megaparse_sdk.schema.document import (\n    Document as MPDocument,\n)\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\nfrom unstructured.documents.elements import Element\nfrom unstructured.partition.auto import partition\n\nfrom megaparse.parser import BaseParser\n\nload_dotenv()\n\n\nclass UnstructuredParser(BaseParser):\n    supported_extensions = [\n        FileExtension.PDF,\n        FileExtension.DOCX,\n        FileExtension.TXT,\n        FileExtension.OTF,\n        FileExtension.EPUB,\n        FileExtension.HTML,\n        FileExtension.XML,\n        FileExtension.CSV,\n        FileExtension.XLSX,\n        FileExtension.XLS,\n        FileExtension.PPTX,\n        FileExtension.MD,\n        FileExtension.MARKDOWN,\n    ]\n\n    def __init__(\n        self, strategy=StrategyEnum.AUTO, model: BaseChatModel | None = None, **kwargs\n    ):\n        self.strategy = strategy\n        self.model = model\n\n    def convert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        **kwargs,\n    ) -> MPDocument:\n        self.check_supported_extension(file_extension, file_path)\n        # Partition the PDF\n        elements = partition(\n            filename=str(file_path) if file_path else None,\n            file=file,\n            strategy=self.strategy,\n            content_type=file_extension.mimetype if file_extension else None,\n        )\n\n        return 
self.__to_mp_document(elements)\n\n    async def aconvert(\n        self,\n        file_path: str | Path | None = None,\n        file: IO[bytes] | None = None,\n        file_extension: FileExtension | None = None,\n        **kwargs,\n    ) -> MPDocument:\n        self.check_supported_extension(file_extension, file_path)\n        warnings.warn(\n            \"The UnstructuredParser is a sync parser, please use the sync convert method\",\n            UserWarning,\n            stacklevel=2,\n        )\n        return self.convert(file_path, file, file_extension, **kwargs)\n\n    def __to_mp_document(self, elements: List[Element]) -> MPDocument:\n        text_blocks = []\n        for element in elements:\n            block = self.__convert_element_to_block(element)\n            if block:\n                text_blocks.append(block)\n        return MPDocument(\n            content=text_blocks, metadata={}, detection_origin=\"unstructured\"\n        )\n\n    def __convert_element_to_block(self, element: Element) -> Block | None:\n        element_type = element.category\n        text = element.text\n        metadata = element.metadata\n        category_depth = metadata.category_depth\n\n        # Element type-specific markdown content\n        markdown_types: Dict[str, Block] = {\n            \"Title\": TitleBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and 
metadata.coordinates.points\n                else None,\n            ),\n            \"Subtitle\": SubTitleBlock(\n                text=text,\n                depth=category_depth if category_depth else 0,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Header\": HeaderBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Footer\": FooterBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    
top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"NarrativeText\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"ListItem\": TextBlock(  # FIXME: @chloedia, list item need to be handled differently in ListBlock\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n  
              )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Table\": TableBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Image\": ImageBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Formula\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n      
                  x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"FigureCaption\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"Address\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n           
     else None,\n            ),\n            \"EmailAddress\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"CodeSnippet\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n            \"UncategorizedText\": TextBlock(\n                text=text,\n                metadata={},\n                page_range=(metadata.page_number, metadata.page_number)\n                if metadata.page_number\n                else None,\n                bbox=BBOX(\n                    top_left=Point2D(\n                        x=metadata.coordinates.points[0][0],\n                        
y=metadata.coordinates.points[0][1],\n                    ),\n                    bottom_right=Point2D(\n                        x=metadata.coordinates.points[3][0],\n                        y=metadata.coordinates.points[3][1],\n                    ),\n                )\n                if metadata.coordinates and metadata.coordinates.points\n                else None,\n            ),\n        }\n        return markdown_types.get(element_type, None)\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/predictor/layout_predictor.py",
    "content": "from PIL import Image\nfrom unstructured_inference.inference.layout import PageLayout\nfrom unstructured_inference.models.base import get_model\nfrom unstructured_inference.visualize import draw_bbox\n\n\ndef extract_layout(\n    page_number: int, page_image: Image.Image, model_name: str = \"yolox\"\n) -> PageLayout:\n    layout_model = get_model(model_name)\n    parsed_page = PageLayout.from_image(\n        image=page_image,\n        number=page_number,\n        detection_model=layout_model,\n        element_extraction_model=None,\n        fixed_layout=None,\n    )\n\n    colors = [\"red\" for _ in parsed_page.elements]\n    for el, color in zip(parsed_page.elements, colors, strict=True):\n        page_image = draw_bbox(page_image, el, color=color, details=False)\n\n    page_image.show()\n\n    return parsed_page\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/utils/extract_metadata.py",
    "content": "from typing import Any, Dict\n\nimport pypdfium2 as pdfium\n\n\ndef get_doc_metdata(pdfium_document: pdfium.PdfDocument) -> Dict[str, Any]:\n    pass\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/utils/onnx.py",
    "content": "import logging\nfrom typing import List\n\nimport onnxruntime as rt\nfrom megaparse.configs.auto import DeviceEnum\n\nlogger = logging.getLogger(\"megaparse\")\n\n\ndef get_providers(device: DeviceEnum) -> List[str]:\n    prov = rt.get_available_providers()\n    logger.info(\"Available providers: %s\", prov)\n    if device == DeviceEnum.CUDA:\n        if \"CUDAExecutionProvider\" not in prov:\n            raise ValueError(\n                \"onnxruntime can't find CUDAExecutionProvider in list of available providers\"\n            )\n        return [\"CUDAExecutionProvider\"]\n    elif device == DeviceEnum.COREML:\n        if \"CoreMLExecutionProvider\" not in prov:\n            raise ValueError(\n                \"onnxruntime can't find CoreMLExecutionProvider in list of available providers\"\n            )\n        return [\"CoreMLExecutionProvider\"]\n    elif device == DeviceEnum.CPU:\n        return [\"CPUExecutionProvider\"]\n    else:\n        raise ValueError(\"device not in (CUDA,CoreML,CPU)\")\n"
  },
  {
    "path": "libs/megaparse/src/megaparse/utils/strategy.py",
    "content": "from typing import List\n\nimport numpy as np\nfrom megaparse.models.page import Page\nfrom megaparse_sdk.schema.document import TextDetection\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\nfrom pypdfium2._helpers.page import PdfPage\n\n\ndef get_page_strategy(\n    pdfium_page: PdfPage, onnxtr_page: TextDetection | None, threshold: float\n) -> StrategyEnum:\n    if onnxtr_page is None:\n        return StrategyEnum.FAST\n    text_coords = []\n    # Get all the text objects in the page\n    for obj in pdfium_page.get_objects():\n        if obj.type == 1:  # type: ignore\n            text_coords.append(obj.get_pos())\n\n    p_width, p_height = int(pdfium_page.get_width()), int(pdfium_page.get_height())\n\n    pdfium_canva = np.zeros((int(p_height), int(p_width)))\n\n    for coords in text_coords:\n        # (left,bottom,right, top)\n        # 0---l--------------R-> y\n        # |\n        # B   (x0,y0)\n        # |\n        # T                 (x1,y1)\n        # ^\n        # x\n        x0, y0, x1, y1 = (\n            p_height - coords[3],\n            coords[0],\n            p_height - coords[1],\n            coords[2],\n        )\n        x0 = max(0, min(p_height, int(x0)))\n        y0 = max(0, min(p_width, int(y0)))\n        x1 = max(0, min(p_height, int(x1)))\n        y1 = max(0, min(p_width, int(y1)))\n        pdfium_canva[x0:x1, y0:y1] = 1\n\n    onnxtr_canva = np.zeros((int(p_height), int(p_width)))\n    for block in onnxtr_page.bboxes:\n        x0, y0 = block.bbox[0]\n        x1, y1 = block.bbox[1]\n        x0 = max(0, min(int(x0 * p_width), int(p_width)))\n        y0 = max(0, min(int(y0 * p_height), int(p_height)))\n        x1 = max(0, min(int(x1 * p_width), int(p_width)))\n        y1 = max(0, min(int(y1 * p_height), int(p_height)))\n        onnxtr_canva[y0:y1, x0:x1] = 1\n\n    intersection = np.logical_and(pdfium_canva, onnxtr_canva)\n    union = np.logical_or(pdfium_canva, onnxtr_canva)\n    sum_intersection = 
np.sum(intersection)\n    sum_union = np.sum(union)\n    iou = sum_intersection / sum_union if sum_union != 0 else 0\n    if iou < threshold:\n        return StrategyEnum.HI_RES\n    return StrategyEnum.FAST\n\n\ndef determine_global_strategy(pages: List[Page], threshold: float) -> StrategyEnum:\n    count = sum(1 for page in pages if page.strategy == StrategyEnum.HI_RES)\n    if count / len(pages) > threshold:\n        return StrategyEnum.HI_RES\n    return StrategyEnum.FAST\n"
  },
  {
    "path": "libs/megaparse/tests/__init__.py",
    "content": ""
  },
  {
    "path": "libs/megaparse/tests/certs/client-cert.pem",
    "content": "-----BEGIN CERTIFICATE-----\nMIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw\ngZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p\nbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw\nPgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh\nbWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow\nZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD\nVQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv\ndXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5\nKDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH\nqmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN\ngLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8\nghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT\nWWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG\nAQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1\n2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp\ndnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ\n6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV\nHdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn\nAmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p\nvxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW\n0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9\nze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr\ndrdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7\n/E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw=\n-----END CERTIFICATE-----\n"
  },
  {
    "path": "libs/megaparse/tests/certs/client-key.pem",
    "content": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5\nKDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH\nqmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN\ngLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8\nghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT\nWWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU\nQYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj\nrGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj\nBkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k\n0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo\n8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy\ndJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0\nxbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW\nOgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB\nPx56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18\nvK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY\nnWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ\neereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M\nf1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG\nqLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh\nzPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq\n8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP\nHllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz\n4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI\n1OaXIqrCA/V43NydDezh0ylQ\n-----END PRIVATE KEY-----\n"
  },
  {
    "path": "libs/megaparse/tests/conftest.py",
    "content": "from pathlib import Path\nfrom typing import IO\n\nimport pytest_asyncio\nfrom httpx import ASGITransport, AsyncClient\nfrom langchain_community.document_loaders import PlaywrightURLLoader\nfrom langchain_core.documents import Document\nfrom megaparse.api.app import app, get_playwright_loader, parser_builder_dep\nfrom megaparse.parser.base import BaseParser\nfrom megaparse_sdk.schema.document import Document as MPDocument\nfrom megaparse_sdk.schema.document import TextBlock\nfrom megaparse_sdk.schema.extensions import FileExtension\n\n\nclass FakeParserBuilder:\n    def build(self, *args, **kwargs) -> BaseParser:\n        \"\"\"\n        Build a fake parser based on the given configuration.\n\n        Returns:\n            BaseParser: The built fake parser.\n\n        Raises:\n            ValueError: If the configuration is invalid.\n        \"\"\"\n\n        class FakeParser(BaseParser):\n            def convert(\n                self,\n                file_path: str | Path | None = None,\n                file: IO[bytes] | None = None,\n                file_extension: None | FileExtension = None,\n                **kwargs,\n            ) -> MPDocument:\n                print(\"Fake parser is converting the file\")\n                return MPDocument(\n                    file_name=\"Fake file\",\n                    content=[TextBlock(text=\"Fake conversion result\", metadata={})],\n                    metadata={},\n                    detection_origin=\"fakeparser\",\n                )\n\n            async def aconvert(\n                self,\n                file_path: str | Path | None = None,\n                file: IO[bytes] | None = None,\n                file_extension: None | FileExtension = None,\n                **kwargs,\n            ) -> MPDocument:\n                print(\"Fake parser is converting the file\")\n                return MPDocument(\n                    file_name=\"Fake file\",\n                    
content=[TextBlock(text=\"Fake conversion result\", metadata={})],\n                    metadata={},\n                    detection_origin=\"fakeparser\",\n                )\n\n        return FakeParser()\n\n\n@pytest_asyncio.fixture(scope=\"function\")\nasync def test_client():\n    print(\"Setting up test_client fixture\")\n\n    def fake_parser_builder():\n        return FakeParserBuilder()\n\n    def fake_playwright_loader():\n        class FakePlaywrightLoader(PlaywrightURLLoader):\n            async def aload(self):\n                return [Document(page_content=\"Fake website content\")]\n\n        return FakePlaywrightLoader(urls=[], remove_selectors=[\"header\", \"footer\"])\n\n    app.dependency_overrides[parser_builder_dep] = fake_parser_builder\n    app.dependency_overrides[get_playwright_loader] = fake_playwright_loader\n    async with AsyncClient(\n        transport=ASGITransport(app=app),  # type: ignore\n        base_url=\"http://test\",\n    ) as ac:\n        yield ac\n    app.dependency_overrides = {}\n"
  },
  {
    "path": "libs/megaparse/tests/data/grt_example/MegaFake_report.md",
    "content": "| My Mega fake report | #1756394 | 31/05/2024 |\n|---------------------|----------|------------|\n\n## Why Mega Parse might be the best ?\n\n### Introduction\n\nMega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.\n\n### Features of Mega Parse\n\nMega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.\n\n**Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.\n\n**High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.\n\n**Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.\n\nAccuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. 
This is particularly important for documents that contain complex formatting and embedded elements.\n\nCustomizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases.\n\nBatch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time.\n\nError Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.\n\n# Benefits of Mega Parse\n\nThe implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.\n\n**Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.\n\n**Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.\n\n**Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but\nalso highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.\n\nImproved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. 
By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.\n\nCost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs.\n\nScalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.\n\n# Comparative Performance\n\nThe following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.\n\n| Metric              | Mega Parse       | Parser A       | Parser B       | Parser C       | Parser D       |\n|---------------------|------------------|----------------|----------------|----------------|----------------|\n| Supported Formats   | PDF, DOCX, PPTX  | PDF, DOCX      | DOCX, PPTX     | PDF, PPTX      | PDF, DOCX, XLSX|\n| Conversion Speed (pages/min)   | 120              | 90             | 100            | 85             | 95             |\n| **Accuracy Rate (%)**    | 98         | 95         | 93         | 90         | 92         |\n| **Output Format**        | Markdown   | HTML       | Markdown   | Plain Text | HTML       |\n| **Error Rate (%)**       | 1          | 3          | 4          | 5          | 3          |\n| **Ease of Use**          | High       | Medium     | High       | Medium     | Medium     |\n| **Integration Capability** | Excellent | Good       | Good       | Fair       | Good       |\n| **Batch Processing**     | Yes        | No         | Yes        | No         | Yes        |\n| **Custom Parsing Rules** | Yes        | Limited    | Yes   
     | No         | Limited    |\n| **Multilingual Support** | Yes        | Yes        | No         | Yes        | Yes        |\n| **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes |\n| **Price (per user/month)** | $30       | $25        | $20        | $15        | $18        |\n| **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |\n| **Free Trial Available** | Yes        | Yes        | No         | Yes        | No         |\n| **Cloud Integration**    | Yes        | No         | Yes        | Yes        | No         |\n| **Security Features**    | Advanced   | Basic      | Advanced   | Basic      | Intermediate |\n| **User Community Size**        | Large                          | Medium                         | Medium                         | Small                          | Medium                         |\n| **Monthly Updates**            | Yes                            | Yes                            | No                             | Yes                            | No                             |\n| **Mobile App Availability**    | Yes                            | No                             | Yes                            | No                             | Yes                            |\n| **Platform Compatibility**     | Windows, Mac, Linux            | Windows, Mac                   | Windows                        | Mac, Linux                     | Windows, Linux                 |\n| **Data Privacy Compliance**    | High                           | Medium                         | High                           | Low                            | Medium                         |\n| **AI-Driven Enhancements**     | Yes                            | No                             | Yes                            | No                             | Yes                            |\n| **File Size Limit (per document)** | 1GB                            | 500MB                          | 750MB                         
 | 200MB                          | 500MB                          |\n| **User Training Resources**    | Extensive                      | Moderate                       | Extensive                      | Limited                        | Moderate                       |\n| **API Access**                 | Yes                            | No                             | Yes                            | No                             | Yes                            |\n| **Customizable Output Templates** | Yes                            | Limited                        | Yes                            | No                             | Yes                            |\n| **Collaboration Features**     | Yes                            | No                             | Yes                            | No                             | Limited                        |\n| **Document Version Control**   | Yes                            | No                             | Yes                            | No                             | Yes                            |\n| **Import/Export Options**      | Extensive                      | Moderate                       | Extensive                      | Limited                        | Moderate                       |\n| Feedback Mechanism | Yes | No | Yes | No | Yes |\n\n*Note: All data presented in this table is fictional and for illustrative purposes only.*\n\n## Conclusion\n\nMega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. 
Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence."
  },
  {
    "path": "libs/megaparse/tests/pdf/test_detect_ocr.py",
    "content": "import os\n\nimport pypdfium2\nimport pytest\nfrom megaparse.megaparse import MegaParse\nfrom megaparse.utils.strategy import determine_global_strategy\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\n\nocr_pdfs = os.listdir(\"./tests/pdf/ocr\")\nnative_pdfs = os.listdir(\"./tests/pdf/native\")\n\nmegaparse = MegaParse()\n\n\n@pytest.mark.parametrize(\"hi_res_pdf\", ocr_pdfs)\ndef test_hi_res_strategy(hi_res_pdf):\n    if hi_res_pdf == \"0168004.pdf\":\n        pytest.skip(\"Skip 0168004.pdf as it is flaky currently\")\n\n    pdf_doc = pypdfium2.PdfDocument(f\"./tests/pdf/ocr/{hi_res_pdf}\")\n    pages = megaparse.extract_page_strategies(pdf_doc)\n    assert (\n        determine_global_strategy(\n            pages, megaparse.config.auto_config.document_threshold\n        )\n        == StrategyEnum.HI_RES\n    )\n\n\n@pytest.mark.parametrize(\"native_pdf\", native_pdfs)\ndef test_fast_strategy(native_pdf):\n    if native_pdf == \"0168029.pdf\":\n        pytest.skip(\"Skip 0168029.pdf as it is too long to process\")\n\n    pdf_doc = pypdfium2.PdfDocument(f\"./tests/pdf/native/{native_pdf}\")\n    pages = megaparse.extract_page_strategies(pdf_doc)\n\n    assert (\n        determine_global_strategy(\n            pages, megaparse.config.auto_config.document_threshold\n        )\n        == StrategyEnum.FAST\n    )\n"
  },
  {
    "path": "libs/megaparse/tests/pdf/test_pdf_processing.py",
    "content": "from pathlib import Path\n\nimport pypdfium2\nimport pytest\nfrom megaparse.configs.auto import (\n    DeviceEnum,\n    MegaParseConfig,\n)\nfrom megaparse.megaparse import MegaParse\nfrom megaparse.utils.strategy import determine_global_strategy\nfrom megaparse_sdk.schema.extensions import FileExtension\nfrom megaparse_sdk.schema.parser_config import StrategyEnum\n\n\n@pytest.fixture\ndef native_pdf() -> Path:\n    p = Path(\"./tests/pdf/sample_native.pdf\")\n    return p\n\n\n@pytest.fixture\ndef scanned_pdf() -> Path:\n    p = Path(\"./tests/pdf/sample_pdf.pdf\")\n    return p\n\n\n# def test_get_default_processors_megaparse():\n#     megaparse = MegaParse()\n#     assert type(megaparse.parser) is UnstructuredParser\n\n\n@pytest.mark.asyncio\n@pytest.mark.parametrize(\"pdf_name\", [\"scanned_pdf\", \"native_pdf\"])\nasync def test_async_megaparse_pdf_processor_file_path(pdf_name, request):\n    pdf = request.getfixturevalue(pdf_name)\n    processor = MegaParse(config=MegaParseConfig(device=DeviceEnum.COREML))\n    result = await processor.aload(file_path=pdf)\n    assert len(str(result)) > 0\n\n\n@pytest.mark.parametrize(\"pdf_name\", [\"scanned_pdf\", \"native_pdf\"])\ndef test_sync_megaparse_pdf_processor_file_path(pdf_name, request):\n    pdf = request.getfixturevalue(pdf_name)\n    processor = MegaParse()\n    result = processor.load(file_path=pdf)\n    assert len(result) > 0\n\n\n@pytest.mark.asyncio\n@pytest.mark.parametrize(\"pdf_name\", [\"scanned_pdf\", \"native_pdf\"])\nasync def test_megaparse_pdf_processor_file(pdf_name, request):\n    pdf = request.getfixturevalue(pdf_name)\n    processor = MegaParse()\n    with open(pdf, \"rb\") as f:\n        result = await processor.aload(file=f, file_extension=FileExtension.PDF)\n        assert len(str(result)) > 0\n\n\ndef test_strategy_native(native_pdf):\n    processor = MegaParse()\n    pdf_doc = pypdfium2.PdfDocument(native_pdf)\n\n    pages = processor.extract_page_strategies(pdf_doc)\n\n   
 assert (\n        determine_global_strategy(\n            pages, processor.config.auto_config.document_threshold\n        )\n        == StrategyEnum.FAST\n    )\n    pdf_doc.close()\n\n\ndef test_strategy_scanned(scanned_pdf):\n    processor = MegaParse()\n    pdf_doc = pypdfium2.PdfDocument(scanned_pdf)\n    pages = processor.extract_page_strategies(pdf_doc)\n    assert (\n        determine_global_strategy(\n            pages, processor.config.auto_config.document_threshold\n        )\n        == StrategyEnum.HI_RES\n    )\n    pdf_doc.close()\n"
  },
  {
    "path": "libs/megaparse/tests/pdf/test_pdfium_parser.py",
    "content": "from pathlib import Path\n\nimport pypdfium2 as pdfium\n\n\ndef test_pdfium():\n    # scanned pdf\n    p = Path(\"./tests/pdf/mlbook.pdf\")\n    document = pdfium.PdfDocument(p)\n\n    objs = []\n    for page in document:\n        for obj in page.get_objects():\n            objs.append(obj)\n\n    document.close()\n"
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample.csv",
    "content": "Name,Description\nMegaParse,\"MegaParse is the best parser, even with accents like é, è, and ñ.\"\nOtherParse,\"OtherParse is a decent parser, but it struggles with accents.\"\nRandomParse,\"RandomParse is another parser, but it often fails with special characters.\""
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample.markdown",
    "content": "# The Difficulty of Parsing Files\n\nParsing files can be a challenging task due to several factors:\n\n## 1. File Format Variability\nDifferent file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.\n\n## 2. Inconsistent Data\nFiles often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.\n\n## 3. Large File Sizes\nParsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.\n\n## 4. Encoding Issues\nFiles may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.\n\n## 5. Nested Structures\nSome file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.\n\n## Conclusion\nDespite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.\n"
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample.md",
    "content": "# The Difficulty of Parsing Files\n\nParsing files can be a challenging task due to several factors:\n\n## 1. File Format Variability\nDifferent file formats (e.g., JSON, XML, CSV) require different parsing techniques. Each format has its own structure and rules, making it necessary to handle each one uniquely.\n\n## 2. Inconsistent Data\nFiles often contain inconsistent or malformed data. Handling these inconsistencies requires robust error-checking and validation mechanisms.\n\n## 3. Large File Sizes\nParsing large files can be resource-intensive and time-consuming. Efficient algorithms and memory management techniques are essential to handle large datasets.\n\n## 4. Encoding Issues\nFiles may use different character encodings (e.g., UTF-8, ASCII). Properly detecting and handling these encodings is crucial to avoid data corruption.\n\n## 5. Nested Structures\nSome file formats, like JSON and XML, can have deeply nested structures. Parsing these nested structures requires recursive algorithms and careful handling of hierarchical data.\n\n## Conclusion\nDespite these challenges, effective file parsing is essential for data processing and analysis. By understanding and addressing these difficulties, developers can create robust parsers that handle a wide variety of file formats and data inconsistencies.\n"
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample.txt",
    "content": "Lorem ipsum \n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \n\nVestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.\nMaecenas non lorem quis tellus placerat varius. \nNulla facilisi. \nAenean congue fringilla justo ut aliquam. \nMauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. \nMorbi viverra semper lorem nec molestie. \nMaecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\nhttps://github.com/QuivrHQ/MegaParse"
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample.xml",
    "content": "<?xml version=\"1.0\"?>\n<customers>\n   <customer id=\"55000\">\n      <name>Charter Group</name>\n      <address>\n         <street>100 Main</street>\n         <city>Framingham</city>\n         <state>MA</state>\n         <zip>01701</zip>\n      </address>\n      <address>\n         <street>720 Prospect</street>\n         <city>Framingham</city>\n         <state>MA</state>\n         <zip>01701</zip>\n      </address>\n      <address>\n         <street>120 Ridge</street>\n         <state>MA</state>\n         <zip>01760</zip>\n      </address>\n   </customer>\n</customers>"
  },
  {
    "path": "libs/megaparse/tests/supported_docs/sample_complexe.html",
    "content": "\n<!-- saved from url=(0065)https://demo.borland.com/testsite/stadyn_largepagewithimages.html -->\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\">\n<!--<meta HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=ISO-8859-1\">  -->\n\n<title>Large HTML page with images</title>\n\n<!-- The following link block and style block present methods how to load external CSS documents.\n     The link tag forces an immediate download of the CSS document whereas the import clause in\n     the style block has the document downloaded some random time later.  -->\n\n<link href=\"./sample_complexe_files/sections.css\" rel=\"stylesheet\" type=\"text/css\">\n\n\n<!--\n<style type=\"text/css\">\n@import url(htmlparser/sections.css);\n</style>\n-->\n\n</head>\n\n<body data-new-gr-c-s-check-loaded=\"14.1204.0\" data-gr-ext-installed=\"\">\n<div align=\"center\"><center>\n<h1 class=\"fore\">When to load CSS</h1>\n\n<table border=\"0\" cellspacing=\"5\" width=\"100%\" bgcolor=\"#F2F2FF\">\n  <tbody><tr>\n    <td><p align=\"left\"><font color=\"#000080\" size=\"5\" face=\"Tahoma\"><strong>Large HTML page\n    with Images</strong></font></p></td>\n  </tr>\n</tbody></table>\n</center></div>\n\n<hr>\n\n<p><strong><small><font face=\"Verdana\">This page shall test if the recorder generates\nscripts with requests in correct order. 
It includes images in sequential order: stadyn_image1.gif\nthrough stadyn_image10.gif</font></small></strong></p>\n\n<hr>\n\n<p><img src=\"./sample_complexe_files/stadyn_image1.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image1.gif (6512 bytes)\"><br>\nstadyn_image1</p>\n\n<p><img src=\"./sample_complexe_files/stadyn_image2.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image2.gif (5983 bytes)\"><br>\nstadyn_image2</p>\n\n<p align=\"center\"><b><font size=\"6\">Open Financial Exchange<br>\nSpecification 1.0</font></b> </p>\n\n<p align=\"center\"><b><font size=\"2\">February 14, 1997<br>\n</font></b></p>\n\n<p align=\"center\"><b><font size=\"2\"> 1997 CheckFree Corp., Intuit Inc., Microsoft Corp.\nAll rights reserved<br>\n<br>\n<br>\n</font></b></p>\n\n<p align=\"center\"><i><b><font size=\"5\">Chapters 1 - 10<br>\n<br>\n<br>\n<br>\n<br>\n</font></b></i></p>\n\n<h1><img src=\"./sample_complexe_files/stadyn_image3.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image3.gif (6537 bytes)\"><br>\n<font size=\"5\">stadyn_image3</font></h1>\n\n<h1><img src=\"./sample_complexe_files/stadyn_image4.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image4.gif (6028 bytes)\"><br>\nstadyn_image4</h1>\n\n<p><img src=\"./sample_complexe_files/stadyn_image5.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image5.gif (4068 bytes)\"><br>\nstadyn_image5</p>\n\n<p>&nbsp;</p>\n\n<p>&nbsp;</p>\n\n<h1><a name=\"_Toc371408167\"><font size=\"5\">Contents</font></a></h1>\n\n<p>1. 
Overview 51.1 Introduction 51.1.1 Design Principles 51.2 Open Financial Exchange at\na Glance 71.2.1 Data Transport 71.2.2 Request and Response Model 81.3 Conventions 92.\nStructure 102.1 HTTP Headers 102.2 Open Financial Exchange Headers 112.2.1 The Meaning of\nVersion Numbers 122.3 SGML Details 122.3.1 Compliance 122.3.2 Special Characters 122.4\nOpen Financial Exchange SGML Structure 132.4.1 Overview 132.4.2 Top Level 132.4.3 Messages\n132.4.4 Message Sets and Version Control 142.4.5 Transactions 152.5 The Signon Message Set\n162.5.1 Signon &lt;SONRQ&gt; &lt;SONRS&gt; 162.5.2 PIN Change &lt;PINCHRQ&gt;\n&lt;PINCHRS&gt; 192.5.3 Examples 202.6 External Data Support 202.7 Extensions to Open\nFinancial Exchange 213. Common Aggregates, Elements, and Data Types 223.1 Common\nAggregates 223.1.1 Identifying Financial Institutions and Accounts 223.1.2 Balance Records\n&lt;BAL&gt; 223.1.3 Error Reporting &lt;STATUS&gt; 233.2 Common Elements 243.2.1 Financial\nInstitution Transaction ID &lt;FITID&gt; 243.2.2 Server-Assigned ID &lt;SRVRTID&gt;\n243.2.3 Client-Assigned Transaction UID &lt;TRNUID&gt; 253.2.4 Token &lt;TOKEN&gt; 253.2.5\nTransaction Amount &lt;TRNAMT&gt; 253.2.6 Memo &lt;MEMO&gt; 253.2.7 Date Start and Date\nEnd &lt;DTSTART&gt; &lt;DTEND&gt; 263.3 Common data types 263.3.1 Dates and Times 263.3.2\nAmounts, Prices, and Quantities 283.3.3 Language 283.3.4 Basic data types 284. Security\n294.1 Security Solutions 294.1.1 Determining Security Levels &lt;OFXSEC&gt;\n&lt;TRANSPSEC&gt; 294.2 Channel-Level Security 304.2.1 Security Requirements 304.2.2 Using\nSSL 3.0 in Open Financial Exchange 304.3 Application-Level Security 314.3.1 Requirements\nfor Application-Layer Security 314.3.2 Using Application-level Encryption in Open\nFinancial Exchange 325. International Support 335.1 Language and Encoding 335.2 Currency\n&lt;CURDEF&gt; &lt;CURRENCY&gt; &lt;ORIGCURRENCY&gt; 335.3 Country-Specific Tag Values\n346. 
Data Synchronization 356.1 Overview 356.2 Background 356.3 Data Synchronization\nApproach 366.4 Data Synchronization Specifics 376.5 Conflict Detection and Resolution\n396.6 Synchronization vs. Refresh 406.7 Typical Server Architecture for Synchronization\n416.8 Typical Client Processing of Synchronization Results 436.9 Simultaneous Connections\n446.10 Synchronization Alternatives 446.10.1 Lite Synchronization 446.10.2 Relating\nSynchronization and Error Recovery 456.11 Examples 467. FI Profile 487.1 Overview 487.1.1\nMessage Sets 487.1.2 Version Control 497.1.3 Batching and Routing 497.2 Profile Request\n507.3 Profile Response 517.3.1 Message Set 527.3.2 Signon Realms 537.3.3 Status Codes\n537.4 Profile Message Set Profile Information 548. Activation &amp; Account Information\n558.1 Overview 558.2 Approaches to User Sign-Up with Open Financial Exchange 558.3 Users\nand Accounts 568.4 Enrollment and Password Acquisition &lt;ENROLLRQ&gt; &lt;ENROLLRS&gt;\n568.4.1 User IDs 578.4.2 Enrollment Request 578.4.3 Enrollment Response 598.4.4 Enrollment\nStatus Codes 598.4.5 Examples 608.5 Account Information 608.5.1 Request &lt;ACCTINFORQ&gt;\n618.5.2 Response &lt;ACCTINFORS&gt; 618.5.3 Account Information Aggregate &lt;ACCTINFO&gt;\n628.5.4 Status Codes 628.5.5 Examples 638.6 Service Activation 638.6.1 Activation Request\nand Response 648.6.2 Service Activation Synchronization 668.6.3 Examples 668.7 Name and\nAddress Changes &lt;CHGUSERINFORQ&gt; &lt;CHGUSERINFORS&gt; 678.7.1 &lt;CHGUSERINFORQ&gt;\n678.7.2 &lt;CHGUSERINFORS&gt; 688.7.3 Status Codes 688.8 Signup Message Set Profile\nInformation 699. Customer to FI Communication 709.1 The E-Mail Message Set 709.2 E-Mail\nMessages 709.2.1 Regular vs. 
Specialized E-Mail 719.2.2 Basic &lt;MAIL&gt; Aggregate\n719.2.3 E-Mail &lt;MAILRQ&gt; &lt;MAILRS&gt; 719.2.4 E-Mail Synchronization\n&lt;MAILSYNCRQ&gt; &lt;MAILSYNCRS&gt; 729.2.5 Example 739.3 Get HTML Page 749.3.1 MIME Get\nRequest and Response &lt;GETMIMERQ&gt; &lt;GETMIMERS&gt; 749.3.2 Example 759.4 E-Mail\nMessage Set Profile Information 7610. Recurring Transactions 7710.1 Creating a Recurring\nModel 7710.2 Recurring Instructions &lt;RECURRINST&gt; 7710.2.1 Values for &lt;FREQ&gt;\n7810.2.2 Examples 7910.3 Retrieving Transactions Generated by a Recurring Model 8010.4\nModifying and Canceling Individual Transactions 8010.5 Modifying and Canceling Recurring\nModels 8010.5.1 Examples 81\n\n</p><ol>\n  <li><a name=\"_Toc380493239\"><font size=\"6\" face=\"Arial\">Overview</font></a> </li>\n  <li><a name=\"_Toc380493240\"><font size=\"5\" face=\"Arial\">Introduction</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange is a broad-based framework for exchanging\nfinancial data and instructions between customers and their financial institutions. It\nallows institutions to connect directly to their customers without requiring an\nintermediary. <br>\n<br>\n</font></p>\n\n<p><font size=\"2\">Open Financial Exchange is an open specification that anyone can\nimplement: any financial institution, transaction processor, software developer or other\nparty. It uses widely accepted open standards for data formatting (such as SGML),\nconnectivity (such as TCP/IP and HTTP), and security (such as SSL).</font> </p>\n\n<p><font size=\"2\">Open Financial Exchange defines the request and response messages used\nby each financial service as well as the common framework and infrastructure to support\nthe communication of those messages. 
This specification does not describe any specific\nproduct implementation.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493241\"><font size=\"4\" face=\"Arial\">Design Principles</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The following principles were used in designing Open Financial Exchange:</font>\n</p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><font size=\"2\" face=\"Arial\"><b>Broad</b> <b>Range of Financial Activities</b></font><font size=\"2\"> -\nOpen Financial Exchange provides support for a <i><b>broad</b></i> range of financial\nactivities. Open Financial Exchange 1.0 specifies the following services:</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Bank statement download</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Credit card statement download</font>\n</p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Funds transfers including\nrecurring transfers</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Consumer payments, including\nrecurring payments</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Business payments, including\nrecurring payments</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Brokerage and mutual fund\nstatement download, including transaction history, current holdings and balances</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><font size=\"2\" face=\"Arial\"><b>Broad</b> <b>Range of Financial Institutions</b></font><font size=\"2\"> -\nOpen Financial Exchange supports communication with a <i><b>broad</b></i> range of\nfinancial institutions (FIs), including:</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Banks</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Brokerage houses</font> </p>\n\n<p><font size=\"1\" 
face=\"Wingdings\">n</font><font size=\"2\"> Merchants</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Processors</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Financial advisors</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">n</font><font size=\"2\"> Government agencies</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><font size=\"2\" face=\"Arial\"><b>Broad</b> <b>Range of Front-End applications</b></font><font size=\"2\"> -\nOpen Financial Exchange supports a <i><b>broad </b></i>range of front-end applications\ncovering all types of financial activities running on all types of platforms, including\nWeb-based applications.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Extensible</font></b><font size=\"2\"> - Open Financial Exchange has been\ndesigned to allow the easy addition of new services. Future versions will include support\nfor many new services.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Open</font></b><font size=\"2\"> - This specification is publicly available.\nYou can build client and server applications using the Open Financial Exchange protocols\nindependent of any specific technology, product, or company.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Multiple Client Support</font></b><font size=\"2\"> - Open Financial Exchange\nallows a user to use multiple client applications to access the same data at a financial\ninstitution. With the popularity of the World Wide Web, customers are increasingly more\nlikely to use multiple applications-either desktop-based or Web-based-to perform financial\nactivities. 
For example, a customer can track personal finances at home with a desktop\napplication and occasionally pay bills while at work with a Web-based application. The use\nof data synchronization to support multiple clients is a key innovation in Open Financial\nExchange.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Robust</font></b><font size=\"2\"> - Open Financial Exchange will be used for\nexecuting important financial transactions and for communicating important financial\ninformation. Assuring users that transactions are executed and information is correct is\ncrucial. Open Financial Exchange provides robust protocols for error recovery.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Secure</font></b><font size=\"2\"> - Open Financial Exchange provides a\nframework for building secure online financial services. In Open Financial Exchange,\nsecurity encompasses authentication of the parties involved, as well as secrecy and\nintegrity of the information being exchanged.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Batch &amp; Interactive</font></b><font size=\"2\"> - The design of request and\nresponse messages in Open Financial Exchange is for use in either batch or interactive\nstyle of communication. Open Financial Exchange provides for applying a single\nauthentication context to multiple requests in order to reduce the overhead of user\nauthentication.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">International</font></b><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Support</font></b><font size=\"2\"> - Open Financial Exchange is designed to supply financial services throughout\nthe world. 
It supports multiple currencies, country-specific extensions, and different\nforms of encoding such as UNICODE.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Platform Independent</font></b><font size=\"2\"> -Open Financial Exchange can\nbe implemented on a wide variety of front-end client devices, including those running\nWindows 3.1, Windows 95, Windows NT, Macintosh, or UNIX. It also supports a wide variety\nof Web-based environments, including those using HTML, Java, JavaScript, or ActiveX.\nSimilarly on the back-end, Open Financial Exchange can be implemented on a wide variety of\nserver systems, including those running UNIX, Windows NT, or OS/2.</font> </p>\n\n<p><font size=\"1\" face=\"Wingdings\">l</font><font size=\"2\"> </font><b><font size=\"2\" face=\"Arial\">Transport Independent</font></b><font size=\"2\"> - Open Financial Exchange is\nindependent of the data communication protocol used to transport the messages between the\nclient and server computers. Open Financial Exchange 1.0 will use HTTP.</font>\n\n</p><ol>\n  <li><font size=\"5\" face=\"Arial\"><a name=\"_Toc380493242\">Open Financial Exchange </a>at a\n    Glance</font> </li>\n</ol>\n\n<p><font size=\"2\">The design of Open Financial Exchange is as a client and server system.\nAn end-user uses a client application to communicate with a server at a financial\ninstitution. The form of communication is requests from the client to the server and\nresponses from the server back to the client.</font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses the Internet Protocol (IP) suite to provide\nthe communication channel between a client and a server. 
IP protocols are the foundation\nof the public Internet and a private network can also use them.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493243\"><font size=\"4\" face=\"Arial\">Data Transport</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Clients use the HyperText Transfer Protocol (HTTP) to communicate with an\nOpen Financial Exchange server. The same HTTP protocol is used throughout the World Wide Web.\nIn principle, a financial institution can use any off-the-shelf web server to implement\nits support for Open Financial Exchange.</font> </p>\n\n<p><font size=\"2\">To communicate by means of Open Financial Exchange over the Internet,\nthe client must establish an Internet connection. This connection can be a dial-up\nPoint-to-Point Protocol (PPP) connection to an Internet Service Provider (ISP) or a\nconnection over a local area network that has a gateway to the Internet.</font> </p>\n\n<p><font size=\"2\">Clients use the HTTP POST command to send a request to the previously\nacquired Uniform Resource Locator (URL) for the desired financial institution. The URL\npresumably identifies a Common Gateway Interface (CGI) or other process on an FI server\nthat can accept Open Financial Exchange requests and produce a response.</font> </p>\n\n<p><font size=\"2\">The POST identifies the data as being of type application/x-ofx. Use\napplication/x-ofx as the return type as well. Fill in other fields per the HTTP 1.0 spec.\nHere is a typical request:</font> </p>\n\n<pre><font size=\"1\">POST http://www.fi.com/ofx.cgi HTTP/1.0\nUser-Agent:MyApp 5.0\nContent-Type: application/x-ofx\nContent-Length: 1032\n\nOFXHEADER:100\n</font><font size=\"2\">DATA:OFXSGML\nVERSION:100\nSECURITY:1\nENCODING:USASCII\n\n&lt;OFX&gt;\n... Open Financial Exchange requests ...\n&lt;/OFX&gt;</font>\n</pre>\n\n<p><font size=\"2\">A blank line defines the separation between the HTTP headers and the\nstart of the actual Open Financial Exchange data. 
A blank line also separates the Open\nFinancial Exchange headers and the actual response. (See Chapter 2 for more information.)</font>\n</p>\n\n<p><font size=\"2\">The structure of a response is similar to the request, with the first\nline containing the standard HTTP result, as shown next. The content length is given in\nbytes.</font> </p>\n\n<pre><font size=\"1\">HTTP/1.0 200 OK\nContent-Type: application/x-ofx\nContent-Length: 8732\n\nOFXHEADER:100\n</font><font size=\"2\">DATA:OFXSGML\nVERSION:100\nSECURITY:1\nENCODING:USASCII\n\n&lt;OFX&gt;\n... Open Financial Exchange responses ...\n&lt;/OFX&gt;</font>\n</pre>\n\n<ol>\n  <li><font size=\"4\" face=\"Arial\"><a name=\"_Toc380493244\">Request and Response</a> Model</font>\n  </li>\n</ol>\n\n<p><font size=\"2\">The basis for Open Financial Exchange is the request and response model.\nOne or more requests can be batched in a single file. This file typically includes a\nsignon request and one or more service-specific requests. An FI server will process all of\nthe requests and return a single response file. This batch model lends itself to Internet\ntransport as well as other off-line transports. Both requests and responses are plain text\nfiles, formatted using a grammar based on Standard Generalized Markup Language (SGML).\nOpen Financial Exchange is syntactically similar to HyperText Markup Language (HTML),\nfeaturing tags to identify and delimit the data. The use of a tagged data format allows\nOpen Financial Exchange to evolve over time while continuing to support older clients and\nservers.</font> </p>\n\n<p><font size=\"2\">Here is a simplified example of an Open Financial Exchange request file.\n(This example does not show the Open Financial Exchange headers and the indentation is\nonly for readability.) 
For complete details, see the more complete examples throughout\nthis specification.</font> </p>\n\n<p>&lt;OFX&gt; &lt;!-- Begin request data --&gt; &lt;SIGNONMSGSRQV1&gt; &lt;SONRQ&gt;\n&lt;!-- Begin signon --&gt; &lt;DTCLIENT&gt;19961029101000 &lt;!-- Oct. 29, 1996, 10:10:00\nam --&gt; &lt;USERID&gt;123-45-6789 &lt;!-- User ID (that is, SSN) --&gt;\n&lt;USERPASS&gt;MyPassword &lt;!-- Password (SSL encrypts whole) --&gt;\n&lt;LANGUAGE&gt;ENG &lt;!-- Language used for text --&gt; &lt;FI&gt; &lt;!-- ID of\nreceiving institution --&gt; &lt;ORG&gt;NCH &lt;!-- Name of ID owner --&gt;\n&lt;FID&gt;1001 &lt;!-- Actual ID --&gt; &lt;/FI&gt; &lt;APPID&gt;MyApp &lt;APPVER&gt;0500\n&lt;/SONRQ&gt; &lt;!-- End of signon --&gt; &lt;/SIGNONMSGSRQV1&gt; &lt;BANKMSGSRQV1&gt;\n&lt;STMTTRNRQ&gt; &lt;!-- First request in file --&gt; &lt;TRNUID&gt;1001 &lt;STMTRQ&gt;\n&lt;!-- Begin statement request --&gt; &lt;BANKACCTFROM&gt; &lt;!-- Identify the account\n--&gt; &lt;BANKID&gt;121099999 &lt;!-- Routing transit or other FI ID --&gt;\n&lt;ACCTID&gt;999988 &lt;!-- Account number --&gt; &lt;ACCTTYPE&gt;CHECKING &lt;!--\nAccount type --&gt; &lt;/BANKACCTFROM&gt; &lt;!-- End of account ID --&gt; &lt;INCTRAN&gt;\n&lt;!-- Begin include transaction --&gt; &lt;INCLUDE&gt;Y &lt;!-- Include transactions\n--&gt; &lt;/INCTRAN&gt; &lt;!-- End of include transaction --&gt; &lt;/STMTRQ&gt; &lt;!--\nEnd of statement request --&gt; &lt;/STMTTRNRQ&gt; &lt;!-- End of first request --&gt;\n&lt;/BANKMSGSRQV1&gt;&lt;/OFX&gt; &lt;!-- End of request data --&gt;<font size=\"2\">The\nresponse format follows a similar structure. Although a response such as a statement\nresponse contains all of the details of each transaction, each element is identified using\ntags.</font> </p>\n\n<p><font size=\"2\">The key rule of Open Financial Exchange syntax is that each tag is\neither an element or an aggregate. Data follows its element tag. 
An aggregate tag begins a\ncompound tag sequence, which must end with a matching tag; for example, &lt;AGGREGATE&gt;\n... &lt;/AGGREGATE&gt;. </font></p>\n\n<p><font size=\"2\">The actual file Open Financial Exchange sends is without any extra white\nspace between tags.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493245\"><font size=\"5\" face=\"Arial\">Conventions</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The conventions used in the detailed descriptions include: </font>\n\n</p><ul>\n  <li><font size=\"2\">Required tags are in <b>bold</b>. Regular face indicates tags that are\n    optional. Required means that a client will always include a tag in a request, and a\n    server must always include a tag in a response.</font> </li>\n  <li><font size=\"2\"><i>Italic </i>shows a required or optional aggregate from a set of\n    possible aggregates. </font></li>\n  <li><font size=\"2\">Required tags occur once unless noted as one or more in the description,\n    in which case the specification allows multiple occurrences. </font></li>\n  <li><font size=\"2\">Optional tags occur once if present unless noted as zero or more in the\n    description, in which case the specification allows multiple occurrences.</font> </li>\n  <li><font size=\"2\">Allowable specific values are listed, where applicable.</font> </li>\n  <li><font size=\"2\">A-<i>n</i> or N-<i>n</i>, specify those values that take general\n    alphanumeric or pure numeric type values, where <i>n</i> indicates the maximum size. </font></li>\n  <li><font size=\"2\">References to certain common value types, such as a dollar amount, are by\n    name. 
Chapter 3 lists value types that can be referenced by name.</font> </li>\n</ul>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;REQUIREDTAG&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Required tag (1 or more)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;REQUIREDTAG2&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Required tag that occurs only once </font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt;OPTIONALTAG&gt; </font></td>\n    <td width=\"336\"><font size=\"2\">Optional tag; this particular one can occur multiple times\n    (0 or more) </font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt;SPECIFIC&gt;</font></td>\n    <td width=\"336\"><font size=\"2\">Values are A, B, and C</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt;ALPHAVALUE&gt;</font></td>\n    <td width=\"336\"><font size=\"2\">Takes an alphanumeric value up to 32 characters,<i> A-32</i></font>\n    </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493246\"><font size=\"6\" face=\"Arial\">Structure</font></a> </li>\n</ol>\n\n<p><font size=\"2\">This chapter describes the basic structure of an Open Financial Exchange\nrequest and response. Structure includes headers, basic syntax, and the Signon request and\nresponse. This chapter also describes how Open Financial Exchange encodes external data,\nsuch as bit maps.</font> </p>\n\n<p><font size=\"2\">Open Financial Exchange data consists of some headers plus one or more\nOpen Financial Exchange data blocks. Each block consists of a signon message and zero or\nmore additional messages. 
When sent over the internet using HTTP, standard HTTP and\nmulti-part MIME headers and formats surround the Open Financial Exchange data. A simple\nfile that contained only Open Financial Exchange data would have the following form:</font>\n</p>\n\n<pre><font size=\"1\">HTTP headers\nMIME type application/x-ofx\nOpen Financial Exchange headers\nOpen Financial Exchange SGML block 1</font>\n</pre>\n\n<p><font size=\"2\">A more complex file that contained multiple Open Financial Exchange data\nblocks and additional Open Financial Exchange data would have this form:</font> </p>\n\n<pre><font size=\"1\">HTTP headers\nMIME type multipart/x-mixed-replace; boundary =--boundary-\n---boundary---\nMIME type application/x-ofx\n</font><font size=\"2\">\tOpen Financial Exchange headers\n\tOpen Financial Exchange SGML block 1\n\tOpen Financial Exchange SGML block 2\n---boundary---\n\tMIME type image/jpeg\n\t\tFI logo</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493247\"><font size=\"5\" face=\"Arial\">HTTP Headers</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Data delivered by way of HTTP places the standard HTTP result code on\nthe first line. HTTP defines a number of status codes. Servers can return any standard\nHTTP result. 
However, FIs should expect clients to collapse these codes into the following\nthree cases:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"84\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"96\"><i><font size=\"2\">Meaning</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"312\"><i><font size=\"2\">Action</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"84\"><font size=\"1\">200</font></td>\n    <td width=\"96\"><font size=\"2\">OK</font> </td>\n    <td width=\"312\"><font size=\"2\">The request was processed and a valid Open Financial\n    Exchange result is returned.</font> </td>\n  </tr>\n  <tr>\n    <td width=\"84\"><font size=\"1\">400s</font></td>\n    <td width=\"96\"><font size=\"2\">Bad request</font> </td>\n    <td width=\"312\"><font size=\"2\">The request was invalid and was not processed. Clients will\n    report an internal error to the user.</font> </td>\n  </tr>\n  <tr>\n    <td width=\"84\"><font size=\"1\">500s</font></td>\n    <td width=\"96\"><font size=\"2\">Server error</font> </td>\n    <td width=\"312\"><font size=\"2\">The server is unavailable. Clients should advise the user\n    to retry shortly.</font> </td>\n  </tr>\n</tbody></table>\n\n<p><i><b>NOTE:</b> Open Financial Exchange returns a code 400 only if it cannot parse the\nfile. 
Open Financial Exchange handles content errors, such as a wrong PIN or an invalid\naccount, by returning a valid Open Financial Exchange response along with code 200.</i> </p>\n\n<p>Open Financial Exchange requires the following HTTP standard headers: </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"84\"><i><font size=\"1\">Header</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"96\"><i><font size=\"2\">Value</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"312\"><i><font size=\"2\">Explanation</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"84\"><font size=\"1\">Content-type</font></td>\n    <td width=\"96\"><font size=\"2\">application/x-ofx</font> </td>\n    <td width=\"312\"><font size=\"2\">The MIME type for Open Financial Exchange</font> </td>\n  </tr>\n  <tr>\n    <td width=\"84\"><font size=\"1\">Content-length</font></td>\n    <td width=\"96\"><font size=\"2\">length</font> </td>\n    <td width=\"312\"><font size=\"2\">Length of the data after removing HTTP headers</font> </td>\n  </tr>\n</tbody></table>\n\n<pre>\n</pre>\n\n<p><font size=\"2\">When responding with multi-part MIME, the main type will be\nmultipart/x-mixed-replace; <br>\none of the parts will use application/x-ofx.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493248\"><font size=\"5\" face=\"Arial\">Open Financial Exchange Headers</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">The intent of Open Financial Exchange is for use with a variety of\ntransports and to provide sufficient version control capabilities for future expansion. To\nsupport this goal, the contents of an Open Financial Exchange file consist of a simple set\nof headers followed by contents defined by that header. \"File format\" means the\nentire content after removal of any transport headers. 
For the HTTP transport described in\nthis document, this means the content without the HTTP and MIME headers.</font> </p>\n\n<p><font size=\"2\">The Open Financial Exchange headers are in a simple <i>tag:value</i>\nsyntax and terminated by a blank line. Open Financial Exchange always sends headers\nunencrypted, even if there is application-level encryption in use for the remaining\ncontents. The first entry will always be OFXHEADER with a version number. This entry\nhelps identify the contents as an Open Financial Exchange file, and provides the version of\nthe Open Financial Exchange headers that follow (not of the content itself). For example:</font>\n</p>\n\n<pre><font size=\"1\">OFXHEADER:100</font>\n</pre>\n\n<p><font size=\"2\">This document defines version 1.0 of the headers to contain at least the\nfollowing additional tags:</font> </p>\n\n<pre><font size=\"1\">DATA:OFXSGML\nVERSION:100\nSECURITY:\nENCODING:\n</font><font size=\"2\">CHARSET:\nCOMPRESSION:\nOLDFILEUID:\nNEWFILEUID:</font>\n</pre>\n\n<p><font size=\"2\">The data tag identifies the contents as being in OFX SGML form. VERSION\nidentifies the version type as OFXSGML data. In the case of OFXSGML, it translates to the\nversion of the Document Type Definition (DTD) that it uses for parsing. The ENCODING and\nCHARSET tags define the interpretation of the character data. See Chapter 5,\n\"International Support\" for more information on these tags. Chapter 4 describes\nthe security tag. A future version of this specification will define compression.</font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses OLDFILEUID and NEWFILEUID to support error\nrecovery. They are not present when clients are not requesting error recovery. (See\nChapter 6, \"Data Synchronization\")</font> </p>\n\n<p><font size=\"2\">A blank line follows the last tag. 
Then (for type OFXSGML), the\nSGML-readable data begins with the &lt;OFX&gt; tag.</font> </p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> Here, VERSION provides the overall version of the DTD.\nThe &lt;OFX&gt; block describes the specific message set versions used, shown later in\nthis chapter.</i></font>\n\n</p><ol>\n  <li><a name=\"_Toc380493249\"><font size=\"4\" face=\"Arial\">The Meaning of Version Numbers</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">The OFXHEADER value should only change its major number if an existing\nclient is unable to process the new header. This can occur because of a complete syntax\nchange in a header, or a significant change in the semantics of an existing tag-not the\nentire response. You can add new tags as long as clients can function without\nunderstanding them.</font> </p>\n\n<p><font size=\"2\">You should add new values for a data tag only when you introduce an\nentirely new syntax. In the case of OFXSGML, a new syntax would have to be non-SGML\ncompliant to warrant a new data value. It is possible that there will be more than one\nsyntax in use at the same time to meet different needs.</font> </p>\n\n<p><font size=\"2\">The intent of the header version tag is to identify syntactic changes.\nIn the case of OFXSGML, this corresponds to the DTD. Purely for identification purposes,\neach change will increment the minor number of the version tag. If you introduce an\nincompatible change so that an older DTD can not parse the file, the major number will\nchange. See the general discussion of message sets and version control, later in this\nchapter.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493250\"><font size=\"5\" face=\"Arial\">SGML Details</font></a> </li>\n  <li><a name=\"_Toc380493251\"><font size=\"4\" face=\"Arial\">Compliance</font></a> </li>\n</ol>\n\n<p><font size=\"2\">SGML is the basis for Open Financial Exchange. There is a DTD that\nformally defines the SGML wire format. 
However, Open Financial Exchange is not completely\nSGML-<i>compliant</i> because the specification allows unrecognized tags to be present. It\nrequires clients and servers to skip over the unrecognized material. That is, if\n&lt;XYZ&gt;qqq&lt;/XYZ&gt; appears and a client or server cannot recognize &lt;XYZ&gt;,\nit should ignore that tag and its enclosed data. A fully-compliant SGML parser\nwould not <i>validate</i> an Open Financial Exchange document if it contained any tags\nthat the DTD does not define.</font> </p>\n\n<p><font size=\"2\">Although SGML is the basis for the specification, and the specification\nis largely compliant with SGML, do not assume Open Financial Exchange supports any SGML\nfeatures not documented in this specification. The intent is to allow parsing to be as\nsimple as possible, while retaining compatibility with the SGML world.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493252\"><font size=\"4\" face=\"Arial\">Special Characters</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The following characters are special to SGML. 
Use the given alternative\nsequence to represent them:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Character</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Escape sequence</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt; (less than)</font></td>\n    <td width=\"336\"><font size=\"2\">&amp;lt;</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&gt; (greater than)</font></td>\n    <td width=\"336\"><font size=\"2\">&amp;gt;</font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&amp; (ampersand)</font></td>\n    <td width=\"336\"><font size=\"2\">&amp;amp;</font> </td>\n  </tr>\n</tbody></table>\n\n<p>For example, the string \"AT&amp;amp;T\" encodes \"AT&amp;T\". </p>\n\n<p>A special case applies in specific tags that can accept HTML-formatted strings, such as\ne-mail records. These accept SGML marked section syntax to hide the HTML from the Open\nFinancial Exchange parser. You must prefix such strings with \"&lt;![ CDATA [\" and\nsuffix them with \"]]&gt;\". Within these bounds, treat the above characters literally\nwithout an escape. 
See Chapter 9 for an example.\n\n</p><ol>\n  <li><a name=\"_Toc380493253\"><font size=\"5\" face=\"Arial\">Open Financial Exchange SGML\n    Structure</font></a> </li>\n  <li><a name=\"_Toc380493254\"><font size=\"4\" face=\"Arial\">Overview</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange hierarchically organizes request and response\nblocks:</font> </p>\n\n<p><font size=\"2\">Top Level &lt;OFX&gt;<br>\nMessage Set and Version &lt;<i>XXX</i>MSGSVn&gt;<br>\nSynchronization Wrappers &lt;YYYSYNCRQ&gt;, &lt;YYYSYNCRS&gt; <br>\nTransaction Wrappers &lt;YYYTRNRQ&gt;, &lt;YYYTRNRS&gt;<br>\nSpecific requests and responses</font> </p>\n\n<p><font size=\"2\">The following sections describe each of these levels.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493255\"><font size=\"4\" face=\"Arial\">Top Level</font></a> </li>\n</ol>\n\n<p><font size=\"2\">An Open Financial Exchange request or response has the following\ntop-level form:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt;<b>OFX</b>&gt;</font></td>\n    <td width=\"336\"><font size=\"2\">Opening tag</font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">... Open Financial Exchange requests or responses ...</font>\n    </td>\n    <td width=\"336\"><font size=\"2\">0 or more transaction requests and responses inside\n    appropriate message set aggregates</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><font size=\"1\">&lt;/<b>OFX</b>&gt;</font></td>\n    <td width=\"336\"><font size=\"2\">Closing tag for the Open Financial Exchange record</font> </td>\n  </tr>\n</tbody></table>\n\n<p>This chapter specifies the order of requests and responses. </p>\n\n<p>A single file can contain multiple &lt;OFX&gt; ... &lt;/OFX&gt; blocks. 
A typical use\nof multiple blocks is to request in a single file information associated with different\nusers.\n\n</p><ol>\n  <li><a name=\"_Toc380493256\"><font size=\"4\" face=\"Arial\">Messages</font></a> </li>\n</ol>\n\n<p><font size=\"2\">A message is the unit of work in Open Financial Exchange. It refers to a\nrequest and response pair, and the status codes associated with that response. For\nexample, the message to download a bank statement consists of the request &lt;STMTRQ&gt;\nand the response &lt;STMTRS&gt;. In addition, with the exception of the signon message,\neach message includes a <i>transaction wrapper</i>. These aggregates add a transaction\nunique ID &lt;TRNUID&gt;, and for responses, a &lt;STATUS&gt; aggregate, to the basic\nrequest and response. </font></p>\n\n<p><font size=\"2\">For messages subject to synchronization (see Chapter 6), a third layer\nof aggregates is also part of a message definition: a synchronization request and\nresponse. These add a token and, in some cases, other information to the transactions. </font></p>\n\n<p><font size=\"2\">Open Financial Exchange uses the following naming where the <i>XXX</i>\nmessage includes:</font>\n\n</p><ul>\n  <li><font size=\"2\">Basic request &lt;<i>XXX</i><b>RQ</b>&gt; and response &lt;<i>XXX</i><b>RS</b>&gt;</font>\n  </li>\n  <li><font size=\"2\">Transaction wrapper &lt;<i>XXX</i><b>TRNRQ</b>&gt; and &lt;<i>XXX</i><b>TRNRS</b>&gt;</font>\n  </li>\n  <li><font size=\"2\">If needed, synchronization wrapper &lt;<i>XXX</i><b>SYNCRQ</b>&gt; and\n    &lt;<i>XXX</i><b>SYNCRS</b>&gt;</font> </li>\n</ul>\n\n<p><font size=\"2\">In a few cases, a small number of related basic requests and responses\nshare a transaction and synchronization wrapper. 
The term message will still apply to each\nrequest and response; only the naming scheme will not hold in those cases.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493257\"><font size=\"4\" face=\"Arial\">Message Sets and Version Control</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Message sets are collections of messages. Generally they form all or\npart of what a user would consider a <i>service</i>, something for which they might have\nsigned up, such as \"banking.\" Message sets are the basis of version control,\nrouting, and security. They are also the basis for the required ordering in Open Financial\nExchange files.</font> </p>\n\n<p><font size=\"2\">Within an Open Financial Exchange block, Open Financial Exchange\norganizes messages by message set. A message set can appear at most once within an Open\nFinancial Exchange block. All messages from a message set must be from the same version of\nthat message set.</font> </p>\n\n<p><font size=\"2\">For each message set of <i>XXX</i> and version <i>n</i>, there exists an\naggregate named &lt;<i>XXX</i>MSGSV<i>n</i>&gt;. (Compare with &lt;<i>XXX</i>MSGSETV<i>n</i>&gt;\nin Chapter 7.) All of the messages from that message set must be inside the appropriate\nmessage set aggregate. 
In the following example, the Open Financial Exchange block\ncontains a signon request inside the signon message set, and two statement requests and a\ntransfer request inside the bank message set.</font> </p>\n\n<pre><font size=\"1\">&lt;OFX&gt;\n\t&lt;SIGNONMSGSRQV1&gt;\t&lt;!-- Signon message set --&gt;\n\t\t&lt;SONRQ&gt;\t\t\t\t&lt;!-- Signon message --&gt;\n\t\t...\n\t\t&lt;/SONRQ&gt;\n\t&lt;/SIGNONMSGSRQV1&gt;\n\n\t&lt;BANKMSGSRQV1&gt;\t\t&lt;!-- Banking message set --&gt;\n\t\t&lt;STMTTRNRQ&gt;\t\t&lt;!-- Statement request --&gt;\n\t\t...\n\t\t&lt;/STMTTRNRQ&gt;\n\t\t&lt;STMTTRNRQ&gt;\t\t&lt;!-- Another stmt request --&gt;\n\t\t...\n\t\t&lt;/STMTTRNRQ&gt;\n\t\t&lt;INTRATRNRQ&gt;\t\t&lt;!-- Intra-bank transfer request --&gt;\n\t\t...\n\t\t&lt;/INTRATRNRQ&gt;\n\t&lt;/BANKMSGSRQV1&gt;\n&lt;/OFX&gt;</font>\n</pre>\n\n<p><font size=\"2\">Message sets, if used at all, must appear in the following order:</font>\n\n</p><ul>\n  <li><font size=\"2\">Signon</font> </li>\n  <li><font size=\"2\">Signup</font> </li>\n  <li><font size=\"2\">Banking</font> </li>\n  <li><font size=\"2\">Credit card statements</font> </li>\n  <li><font size=\"2\">Investment statements</font> </li>\n  <li><font size=\"2\">Interbank funds transfers</font> </li>\n  <li><font size=\"2\">Wire funds transfers</font> </li>\n  <li><font size=\"2\">Payments</font> </li>\n  <li><font size=\"2\">General e-mail</font> </li>\n  <li><font size=\"2\">Investment security list</font> </li>\n  <li><font size=\"2\">FI Profile</font> </li>\n</ul>\n\n<p><font size=\"2\">The definition of each message set can further prescribe an order of its\nmessages within that message set.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493258\"><font size=\"4\" face=\"Arial\">Transactions</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Other than the signon message, each request is made as a transaction.\nTransactions contain a client-assigned globally unique ID, optional client-supplied\npass-back data, and then the record for the 
specific request. A transaction similarly\nwraps each response. The response transaction returns the client ID sent in the request,\nalong with a status message, the pass-back data if present, and the specific response\nrecord. This technique allows a client to track responses against requests.</font> </p>\n\n<p><font size=\"2\">The &lt;STATUS&gt; aggregate, defined in Chapter 3, provides feedback on\nthe processing of the request. If the &lt;SEVERITY&gt; of the status is ERROR, the server\nprovides no specific response record. Otherwise, the response will be complete even though\nsome warning might have occurred.</font> </p>\n\n<p><font size=\"2\">Clients can send additional information in &lt;CLTCOOKIE&gt; that\nservers will return in the response. This allows clients that do not maintain state, and\nthus do not save TRNUIDs, to cause some additional descriptive information to be present\nin the response. For example, a client might identify a request as relating to a user or a\nspouse.</font> </p>\n\n<p><font size=\"2\">In some countries some transactions require a customer-supplied\nauthorization number for each transaction. In those countries, the &lt;TAN&gt; element\nprovides the means to pass this information to servers. 
As Open Financial Exchange is\nimplemented in each country, the specification will define the specific requirements for\nthe use of &lt;TAN&gt; in each country.</font> </p>\n\n<p><font size=\"2\">A typical request is as follows:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;<b><i>XXX</i>TRNRQ&gt;</b></font></b> </td>\n    <td width=\"366\"><font size=\"2\">Transaction-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;TRNUID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Client-assigned globally unique ID for this transaction <i>trnuid</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;CLTCOOKIE&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Data to be echoed in the transaction response <i>A-32</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;TAN&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Transaction authorization number; used in some countries\n    with some types of transactions. 
Country-specific documentation will define messages that\n    require a TAN, <i>A-80</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">specific request</font></td>\n    <td width=\"366\"><font size=\"2\">Aggregate for the specific request</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/<b><i>XXX</i>TRNRQ&gt;</b></font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>A typical response is as follows:<br>\n</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;<b><i>XXX</i>TRNRS&gt;</b></font></b> </td>\n    <td width=\"366\"><font size=\"2\">Transaction-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;TRNUID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Client-assigned globally unique ID for this transaction, <i>trnuid</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;CLTCOOKIE&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Client-provided data, <b>REQUIRED</b> if provided in\n    request, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;STATUS&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Status aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/STATUS&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">response record</font></td>\n    <td width=\"366\"><font size=\"2\">Aggregate for the specific response</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/<b><i>XXX</i>TRNRS&gt;</b></font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a 
name=\"_Toc380493259\"><font size=\"5\" face=\"Arial\">The Signon Message Set</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The Signon message set includes the signon message and the PIN change\nmessage, and must appear in that order. The &lt;SIGNONMSGSRQV1&gt; and\n&lt;SIGNONMSGSRSV1&gt; aggregates wrap the message.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493260\"><font size=\"4\" face=\"Arial\">Signon &lt;SONRQ&gt; &lt;SONRS&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">The signon record identifies and authenticates a user to an FI. It also\nincludes information about the application making the request, because some services might\nbe appropriate only for certain clients. Every Open Financial Exchange request contains\nexactly one &lt;SONRQ&gt;. Every response must contain exactly one &lt;SONRS&gt; record. </font></p>\n\n<p><font size=\"2\">Use of Open Financial Exchange presumes that FIs authenticate each\ncustomer and then give the customer access to one or more accounts or services. If\npasswords are specific to individual services or accounts, a separate Open Financial\nExchange request will be made for each distinct user ID or password required. This will\nnot necessarily be in a manner visible to the user. Note that some situations, such as\njoint accounts or business accounts, will have multiple user IDs and multiple passwords\nthat can access the same account.</font> </p>\n\n<p><font size=\"2\">FIs assign user IDs for the customer. It can be the customer's social\nsecurity number, but the client will not make any assumptions about the syntax of the ID,\nadd check-digits, or do similar processing.</font> </p>\n\n<p><font size=\"2\">To improve server efficiency in handling a series of Open Financial\nExchange request files sent over a short period of time, clients can request that a server\nreturn a &lt;USERKEY&gt; in the signon response. 
If the server provides a user key, clients\nwill send the &lt;USERKEY&gt; instead of the user ID and password in subsequent\nsessions, until the &lt;USERKEY&gt; expires. This allows servers to authenticate\nsubsequent requests more quickly.</font> </p>\n\n<p><font size=\"2\">The client returns &lt;SESSCOOKIE&gt; if the server sent one in a previous\n&lt;SONRS&gt;. Servers can use this value to track client usage but cannot assume that all\nrequests come from a single client, nor can they deny service if they did not expect the\nreturned cookie. Use of a backup file, for example, would lead to an unexpected\n&lt;SESSCOOKIE&gt; value that should nevertheless not stop a user from connecting.</font> </p>\n\n<p><font size=\"2\">Servers can request that a consumer change his or her password by\nreturning status code 15000. Servers should keep in mind that only one status code can be\nreturned. If the current signon response status should be 15500 (invalid ID or password),\nthe request to change password will need to wait until an otherwise successful signon is\nachieved.</font>\n\n</p><ol>\n  <li><font size=\"2\" face=\"Arial\">Record Request &lt;SONRQ&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><font size=\"2\">Tag</font></td>\n    <td bgcolor=\"#000000\" width=\"366\"><font size=\"2\">Description</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;SONRQ&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Record-request aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;<b>DTCLIENT</b>&gt;</font> </td>\n    <td width=\"366\"><font size=\"2\">Date and time of the request from the client computer, <i>datetime</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USERID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">User identification string. 
Use &lt;USERID&gt; &amp;\n    &lt;USERPASS&gt;, or &lt;USERKEY&gt;, but not both; <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USERPASS&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">User password on server - either &lt;USERID&gt; &amp;\n    &lt;USERPASS&gt; are used, or &lt;USERKEY&gt;, but not both;<i> A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;USERKEY&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Login using previously authenticated context - use\n    &lt;USERID&gt; &amp; &lt;USERPASS&gt;, or &lt;USERKEY&gt;, but not both; <i>A-64</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;GENUSERKEY&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Request server to return a USERKEY for future use, <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;LANGUAGE&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Requested language for text responses, <i>language</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;SESSCOOKIE&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Session cookie, value received in previous &lt;SONRS&gt;,\n    not sent if first login or if none sent by FI <i>A-1000</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;FI&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Financial-Institution-identification aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;/FI&gt;</font></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;APPID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">ID of client application, <i>A-5</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;APPVER&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Version of client application, 
<i>N-4</i> (6.00 encoded as\n    0600)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/SONRQ&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Response &lt;SONRS&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;SONRS&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Record-response aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;STATUS&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Status aggregate, see list of possible code values</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DTSERVER&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Date and time of the server response, <i>datetime</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;USERKEY&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Use this user key instead of USERID and USERPASS for\n    subsequent requests. 
TSKEYEXPIRE can limit lifetime</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;TSKEYEXPIRE&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Date and time that USERKEY expires</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\"><b>&lt;LANGUAGE</b>&gt;</font> </td>\n    <td width=\"366\"><font size=\"2\">Language used in text responses, <i>language</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DTPROFUP&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Date and time of last update to profile information for any\n    service supported by this FI (see Chapter 7), <i>datetime</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DTACCTUP&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Date and time of last update to account information (see\n    Chapter 8), <i>datetime</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;FI&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Financial-Institution-identification aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;/FI&gt;</font></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;SESSCOOKIE&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Session cookie that the client should return on the next\n    &lt;SONRQ&gt; <br>\n    <i>A-1000</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/SONRS&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>List of status code values for the &lt;CODE&gt; element of &lt;STATUS&gt;: <br>\n</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"60\"><i><font size=\"1\">Value</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"438\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">0</font></td>\n    
<td width=\"438\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">2000</font></td>\n    <td width=\"438\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">15000</font></td>\n    <td width=\"438\"><font size=\"2\">Must change PIN (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">15500</font></td>\n    <td width=\"438\"><font size=\"2\">Signon (for example, user ID or password) invalid (ERROR)</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">15501</font></td>\n    <td width=\"438\"><font size=\"2\">Customer account already in use (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">15502</font></td>\n    <td width=\"438\"><font size=\"2\">PIN Lockout (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Financial Institution ID &lt;FI&gt;</font> </li>\n</ol>\n\n<p><font size=\"2\">Some service providers support multiple FIs, and assign each FI an ID.\nThe signon allows clients to pass this information along, so that providers will know to\nwhich FI the user is actually doing a signon.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;FI&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">FI-record aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;ORG&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Organization defining this FI name space, <i>A-32</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;FID&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Financial Institution ID (unique within &lt;ORG&gt;), 
<i>A-32</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/FI&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493261\"><font size=\"4\" face=\"Arial\">PIN Change &lt;PINCHRQ&gt;\n    &lt;PINCHRS&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The signon sends a request to change a customer password as a separate\nrequest. The transaction request &lt;PINCHTRNRQ&gt; aggregate contains &lt;PINCHRQ&gt;.\nResponses are also inside transaction responses &lt;PINCHTRNRS&gt;. </font></p>\n\n<p><font size=\"2\">Password changes pose a special problem for error recovery. If the\nclient does not receive a response, it does not know whether the password change was\nsuccessful or not. Open Financial Exchange recommends that servers accept either the old\npassword or the new password on the connection following the one containing a password\nchange. The password used becomes the new password.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;PINCHRQ&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">PIN-change-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USERID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">User identification string. 
Often a social security number,\n    but if so, does not include any check digits, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;NEWUSERPASS&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">New user password, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/PINCHRQ&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;PINCHRS&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">PIN-change-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USERID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">User identification string. Often a social security number,\n    but if so, does not include any check digits, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;DTCHANGED&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Date and time the password was changed, <i>datetime</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/PINCHRS&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Status Code Values for the &lt;CODE&gt; Element of\n    &lt;STATUS&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"60\"><i><font size=\"1\">Value</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"438\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">0</font></td>\n    <td width=\"438\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font 
size=\"1\">2000</font></td>\n    <td width=\"438\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"60\"><font size=\"1\">15503</font></td>\n    <td width=\"438\"><font size=\"2\">Could not change PIN (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493262\"><font size=\"4\" face=\"Arial\">Examples</font></a> </li>\n</ol>\n\n<p><b><font size=\"2\">User requests a password change: </font></b></p>\n\n<pre><font size=\"2\">&lt;PINCHTRNRQ&gt;\n\t&lt;TRNUID&gt;888\n\t&lt;PINCHRQ&gt;\n\t\t&lt;USERID&gt;123456789\n\t\t&lt;NEWUSERPASS&gt;5321\n\t&lt;/PINCHRQ&gt;\n&lt;/PINCHTRNRQ&gt;</font>\n</pre>\n\n<p><b><font size=\"2\">The server responds with: </font></b></p>\n\n<pre><font size=\"1\">&lt;PINCHTRNRS&gt;\n\t&lt;TRNUID&gt;888\n\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t&lt;/STATUS&gt;\n\t&lt;PINCHRS&gt;\n\t\t&lt;USERID&gt;123456789\n\t&lt;/PINCHRS&gt;\n&lt;/PINCHTRNRS&gt;</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493263\"><font size=\"5\" face=\"Arial\">External Data Support</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Some data, such as binary data, cannot be easily sent directly within\nSGML. For these situations, the specification will define a tag that contains a reference\nto some external data. The way that clients pick up the external data depends on the\ntransport used. 
For the HTTP-based transport described in this document, servers can send\nthe data in one of two ways:</font>\n\n</p><ul>\n  <li><font size=\"2\">Send the same response, using multi-part MIME types to separate the\n    response into basic <br>\n    Open Financial Exchange and one or more external data files</font> </li>\n  <li><font size=\"2\">Client can make a separate HTTP get against the supplied URL, if it\n    really needs the data </font></li>\n</ul>\n\n<p><font size=\"2\">For example, to retrieve a logo, a &lt;GETMIMERS&gt; might answer a\n&lt;GETMIMERQ&gt; as follows: </font></p>\n\n<pre><font size=\"1\">&lt;GETMIMERS&gt;\n\t&lt;URL&gt;https://www.fi.com/xxx/yyy/zzz.html\n&lt;/GETMIMERS&gt;</font>\n</pre>\n\n<p><font size=\"2\">If the file sent includes the same response using multi-part MIME,\nclients will assume it has the local file, zzz.jpg.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493264\"><font size=\"5\" face=\"Arial\">Extensions to Open Financial\n    Exchange</font></a> </li>\n</ol>\n\n<p><font size=\"2\">An organization that provides a customized client and server that\ncommunicate by means of <br>\nOpen Financial Exchange might wish to add new requests and responses or even specific\nelements to existing requests and responses. To ensure that each organization can extend\nthe specification without the risk of conflict, Open Financial Exchange defines a style of\ntag naming that lets each organization have its own name space.</font> </p>\n\n<p><font size=\"2\">Organizations can register a specific tag name prefix. 
(The specific\nprocedure or organization to manage this registration will be detailed at a later time.)\nIf an organization registers \"ABC,\" then they can safely add new tags named\n&lt;ABC.SOMETHING&gt; without</font>\n\n</p><ul>\n  <li><font size=\"2\">Colliding with another party wishing to extend the specification</font> </li>\n  <li><font size=\"2\">Confusing a client or server that does not support the extension</font> </li>\n</ul>\n\n<p><font size=\"2\">The extensions are not considered proprietary. An organization is free\nto publish their extensions and encourage client and server implementers to support them.</font>\n</p>\n\n<p><font size=\"2\">All tag names that do not contain a period (.) are reserved for use in\nfuture versions of the core <br>\nOpen Financial Exchange specification.<br>\n</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493265\"><font size=\"6\" face=\"Arial\">Common Aggregates, Elements, and\n    Data Types</font></a> </li>\n  <li><a name=\"_Toc380493266\"><font size=\"5\" face=\"Arial\">Common Aggregates</font></a> </li>\n</ol>\n\n<p><font size=\"2\">This section describes aggregates used in more than one service of Open\nFinancial Exchange (for example, investments and payments).</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493267\"><font size=\"4\" face=\"Arial\">Identifying Financial Institutions\n    and Accounts</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange does not provide a universal space for\nidentifying financial institutions, accounts, or types of accounts. The way to identify an\nFI and an account at that FI depends on the service. 
For information about\nservice-specific ID aggregates, see Chapters 11, 12, and 13 on banking, payments, and\ninvestments.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493268\"><font size=\"4\" face=\"Arial\">Balance Records &lt;BAL&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Several responses allow FIs to send an arbitrary set of balance\ninformation as part of a response, for example a bank statement download. FIs might want\nto send information on outstanding balances, payment dates, interest rates, and so forth.\nBalances can report the date the given balance reflects in &lt;DTASOF&gt;.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;BAL&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Balance-response aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;NAME&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Balance name, <i>A-20</i></font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DESC&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Balance description, <i>A-80</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;BALTYPE&gt;</font></b> </td>\n    <td width=\"300\"><font size=\"2\">Balance type. 
<br>\n    DOLLAR = dollar (value formatted DDDDcc)<br>\n    PERCENT = percentage (value formatted XXXX.YYYY)<br>\n    NUMBER = number (value formatted as is)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;VALUE&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Balance value.<br>\n    Interpretation depends on &lt;BALTYPE&gt; field, <i>N-20</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\" face=\"Arial\">&lt;CURRENCY&gt;</font> </td>\n    <td width=\"300\"><font size=\"2\">If dollar formatting, can optionally include currency</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\" face=\"Arial\">&lt;DTASOF&gt;</font> </td>\n    <td width=\"300\"><font size=\"2\">Effective date of the given balance, <i>datetime</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/BAL&gt;</font></b></td>\n    <td width=\"300\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493269\"><font size=\"4\" face=\"Arial\">Error Reporting &lt;STATUS&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">To provide as much feedback as possible to clients and their users, Open\nFinancial Exchange defines a &lt;STATUS&gt; aggregate. The most important element is the\ncode that identifies the error. Each response defines the codes it uses. Codes 0 through\n2999 have common meanings in all Open Financial Exchange transactions. 
Codes from 3000 and\nup have meanings specific to each transaction.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;STATUS&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Error-reporting aggregate.</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;CODE&gt;</font></b></td>\n    <td width=\"300\"><font size=\"2\">Error code, <i>N-6</i></font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;SEVERITY&gt;</font></b> </td>\n    <td width=\"300\"><font size=\"2\">Severity of the error: <br>\n    INFO = Informational only<br>\n    WARN = Some problem with the request occurred but valid response still present<br>\n    ERROR = A problem severe enough that response could not be made</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\" face=\"Arial\">&lt;MESSAGE&gt;</font> </td>\n    <td width=\"300\"><font size=\"2\">A textual explanation from the FI. Note that clients will\n    generally have messages of their own for each error ID. Use this tag only to provide more\n    details or for the General errors.</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/STATUS&gt;</font></b></td>\n    <td width=\"300\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>&nbsp;</p>\n\n<p><img src=\"./sample_complexe_files/stadyn_image6.gif\" width=\"70\" height=\"70\" alt=\"stadyn_image6.gif (4356 bytes)\"><br>\nstadyn_image6</p>\n\n<p><img src=\"./sample_complexe_files/stadyn_image7.gif\" width=\"223\" height=\"72\" alt=\"stadyn_image7.gif (1702 bytes)\"><br>\nstadyn_image7</p>\n\n<p>For general errors, the server can respond with one of the following &lt;CODE&gt;\nvalues. However, not all codes are possible in a specific context. 
</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">0</font></td>\n    <td width=\"366\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">2000</font></td>\n    <td width=\"366\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">2021</font></td>\n    <td width=\"366\"><font size=\"2\">Unsupported version (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<p><i><b>NOTE:</b> Clients will generally have error messages based on &lt;CODE&gt;.\nTherefore, do not use &lt;MESSAGE&gt; to replace that text. Use &lt;MESSAGE&gt; only to\nexplain an error not well described by one of the defined CODEs, or to provide some\nadditional information.</i>\n\n</p><ol>\n  <li><a name=\"_Toc380493270\"><font size=\"5\" face=\"Arial\">Common Elements</font></a> </li>\n</ol>\n\n<p><font size=\"2\">This section defines elements used in several services of Open Financial\nExchange. The format of the value is either alphanumeric (A-<i>n</i>)<i> </i>or numeric\n(N-<i>n</i>) with a maximum length <i>n</i>; or as a named type. Section 3.3 describes the\nnamed types.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493271\"><font size=\"4\" face=\"Arial\">Financial Institution Transaction ID\n    &lt;FITID&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\"><b>Format: </b><i>A-255</i></font> </p>\n\n<p><font size=\"2\">An FI assigns an &lt;FITID&gt; to uniquely identify a transaction. Its\nprimary purpose is to allow a client to detect duplicate responses. 
Open Financial\nExchange intends &lt;FITID&gt; for use in statement download applications, where every\ntransaction requires a unique ID; not just those that are client-originated or\nserver-originated.</font> </p>\n\n<p><font size=\"2\">FITIDs must be unique within the scope of the requested transactions\n(that is, within an account) but need not be sequential or even increasing. Clients should\nbe aware that FITIDs are not unique across FIs. If a client performs the same type of\nrequest within the same scope at two different FIs, clients will need to use FI + account\n+ &lt;FITID&gt; as a unique key in a client database.</font> </p>\n\n<p><font size=\"2\"><b>Usage:</b> Bank statement download, investment statement download</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493272\"><font size=\"4\" face=\"Arial\">Server-Assigned ID &lt;SRVRTID&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>A-10</i></font> </p>\n\n<p><font size=\"2\">A &lt;SRVRTID&gt; is a server-assigned ID. It should remain constant\nthroughout the lifetime of the object on the server. The client will consider the SRVRTID\nas its \"receipt\" or confirmation and will use this ID in any subsequent requests\nto change, delete, or inquire about this object. </font></p>\n\n<p><font size=\"2\">Where the context allows, it is possible for a server to use the same <i>value\n</i>for a given server object for both &lt;SRVRTID&gt; and &lt;FITID&gt;, but the client\nwill not know this. SRVRTIDs need be unique only within the scope of the requests and\nresponses they apply to, such as an account number. 
Like &lt;FITID&gt;, a &lt;SRVRTID&gt;\nis not unique across FIs and clients might need to use FI + &lt;SRVRTID&gt; if a client\nrequires a unique key.</font> </p>\n\n<p><font size=\"2\"><b>Usage:</b> Payments, Banking</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493273\"><font size=\"4\" face=\"Arial\">Client-Assigned Transaction UID\n    &lt;TRNUID&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>A-36</i></font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses TRNUIDs to identify transactions,\nspecifically &lt;<i>XXX</i>TRNRQ&gt;. Clients expect a server to return the same\n&lt;TRNUID&gt; in the corresponding response and can use this to match up requests and\nresponses. Servers can use TRNUIDs to reject duplicate requests. Because multiple clients\nmight be generating requests to the same server, transaction IDs need to be unique across\nclients. Thus, &lt;TRNUID&gt; must be a globally unique ID.</font> </p>\n\n<p><font size=\"2\">The Open Software Foundation Distributed Computing Environment standards\nspecify a 36-character hexadecimal encoding of a 128-bit number and an algorithm to\ngenerate it. Clients are free to use their own algorithm, to use smaller TRNUIDs, or to\nrelax the uniqueness requirements if in their particular application it makes sense.\nHowever, it is <b>RECOMMENDED</b> that clients allow for the full 36 characters in\nresponses to work better with other clients.</font> </p>\n\n<p><font size=\"2\"><b>Usage:</b> All services</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493274\"><font size=\"4\" face=\"Arial\">Token &lt;TOKEN&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>A-10</i></font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses <b>&lt;</b>TOKEN&gt; as part of data\nsynchronization requests to identify the point in history that the client has already\nreceived data, and in responses to identify the server's current end of history. 
See\nChapter 6, \"Data Synchronization,\" for more information.</font> </p>\n\n<p><font size=\"2\">&lt;TOKEN&gt; is unique within an FI and the scope of the\nsynchronization request. For example, if the synchronization request includes an account\nID, the &lt;TOKEN&gt; needs be unique only within an account. Servers are free to use a\n&lt;TOKEN&gt; that is unique across the entire FI. Clients must save separate\n&lt;TOKEN&gt;s for each account, FI, and type of synchronization request.</font> </p>\n\n<p><font size=\"2\"><b>Usage:</b> All synchronization requests and responses</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493275\"><font size=\"4\" face=\"Arial\">Transaction Amount &lt;TRNAMT&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>Amount</i></font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses &lt;TRNAMT&gt; in any request or response\nthat reports the total amount of an individual transaction.</font> </p>\n\n<p><font size=\"2\"><b>Usage:</b> Bank statement download, investment statement download,\npayments</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493276\"><font size=\"4\" face=\"Arial\">Memo &lt;MEMO&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>A-255</i></font> </p>\n\n<p><font size=\"2\">A &lt;MEMO&gt; provides additional information about a transaction.</font>\n</p>\n\n<p><font size=\"2\"><b>Usage:</b> Bank statement download, investment statement download,\npayments, transfers</font>\n\n</p><ol>\n  <li><font size=\"4\" face=\"Arial\"><a name=\"_Toc380493277\">Date Start and Date End\n    &lt;DTSTART&gt;</a> &lt;DTEND&gt;</font> </li>\n</ol>\n\n<p><font size=\"2\"><b>Format:</b> <i>Datetime</i></font> </p>\n\n<p><font size=\"2\">Open Financial Exchange uses these tags in requests to provide guidance\nto the FI about the range of response that is desired. 
It also uses them in responses to\nlet clients know what the FI was actually able to produce.</font> </p>\n\n<p><font size=\"2\">In requests, the following rules apply:</font>\n\n</p><ul>\n  <li><font size=\"2\">If &lt;DTSTART&gt; is absent, the client is requesting all available\n    history (up to the &lt;DTEND&gt;, if specified). Otherwise, it indicates the <i>inclusive</i>\n    date and time in history where the client expects servers to start sending information.</font>\n  </li>\n  <li><font size=\"2\">If &lt;DTEND&gt; is absent, the client is requesting all available\n    history (starting from &lt;DTSTART&gt;, if specified). Otherwise, it indicates the <i>exclusive</i>\n    date and time in history where the client expects servers to stop sending information.</font>\n  </li>\n</ul>\n\n<p><font size=\"2\">In responses, the following rules apply:</font>\n\n</p><ul>\n  <li><font size=\"2\">&lt;DTSTART&gt; is the date and time where the server began <i>looking</i>\n    for information, not necessarily the date of the earliest returned information. If the\n    response &lt;DTSTART&gt; is later than the requested &lt;DTSTART&gt;, clients can infer\n    that the user has not signed on frequently enough to ensure that the client has retrieved\n    all information. If the user has been calling frequently enough, &lt;DTSTART&gt; in the\n    response will match &lt;DTSTART&gt; in the request.</font> </li>\n  <li><font size=\"2\">&lt;DTEND&gt; is the date and time that, if used by the client as the\n    next requested &lt;DTSTART&gt;, it would pick up exactly where the current response left\n    off. It is the <i>exclusive</i> date and time in history where the server stopped <i>looking</i>\n    for information, based on the request &lt;DTEND&gt; rules. </font></li>\n</ul>\n\n<p><font size=\"2\">In all cases, servers are <b>REQUIRED</b> to use a \"system add\ndatetime\" as the basis for deciding which details match the requested date range. 
For\nexample, if an FI posts a transaction dated Jan 3 to a user's account on Jan 5, and a\nclient connects on Jan 4 and again on Jan 6, the server is <b>REQUIRED</b> to return that\nJan 3 dated transaction to the client when it calls on Jan 6. </font></p>\n\n<p><font size=\"2\"><b>Usage:</b> Bank statement download, investment statement download</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493278\"><font size=\"5\" face=\"Arial\">Common data types</font></a> </li>\n  <li><a name=\"_Toc380493279\"><font size=\"4\" face=\"Arial\">Dates and Times</font></a> </li>\n  <li><font size=\"2\" face=\"Arial\">Basic Format</font> </li>\n</ol>\n\n<p><font size=\"2\">There is one format for representing dates, times, and time zones. The\ncomplete form is:</font> </p>\n\n<p><font size=\"2\">YYYYMMDDHHMMSS.XXX[<i>gmt offset</i>:<i>tz name</i>]</font> </p>\n\n<p><font size=\"2\">For example, \"19961005132200.124[-5:EST]\" represents October\n5, 1996, at 1:22 and 124 milliseconds p.m., in Eastern Standard Time. This is the same as\n6:22 p.m. Greenwich Mean Time (GMT).</font> </p>\n\n<p><font size=\"2\">Tags specified as type <i>date </i>and generally starting with the\nletters \"DT\" will accept a fully formatted date-time-timezone as specified\nabove. They will also accept values with fields omitted from the right. They assume the\nfollowing defaults if a field is missing:</font>\n\n</p><ul>\n  <li><font size=\"2\">YYYYMMDD: 12:00 midnight (the start of the day), GMT</font> </li>\n  <li><font size=\"2\">YYYYMMDDHHMMSS: GMT</font> </li>\n  <li><font size=\"2\">YYYYMMDDHHMMSS.XXX: GMT</font> </li>\n  <li><font size=\"2\">YYYYMMDDHHMMSS.XXX[-0500:EST]: Fully qualified</font> </li>\n</ul>\n\n<p><font size=\"2\">Open Financial Exchange identifies elements that require a time as\nhaving type <i>timestamp </i>and their tag name will start with the prefix TS. 
The\ntimezone and milliseconds are still optional, and will default to GMT.</font> </p>\n\n<p><font size=\"2\">Take care when specifying an ending date without a time. If the last\ntransaction returned for a bank statement download was Jan 5 1996 10:46 a.m. and if the\n&lt;DTEND&gt; was given as just Jan 5, the transactions on Jan 5 would be resent. If\nresults are only available daily, then just using dates and not times will work correctly.\n</font></p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> Open Financial Exchange does not require servers or\nclients to use the full precision specified. However, they are <i><b>REQUIRED</b> to\naccept any of these forms without complaint.</i></i></font> </p>\n\n<p><font size=\"2\">Some services extend the general notion of a <i>date</i> by adding\nspecial values, such as \"TODAY.\" These special values are called \"smart\ndates.\" Specific requests indicate when to use these extra values, and list the tag\nas having a special data type.</font>\n\n</p><ol>\n  <li><font size=\"2\" face=\"Arial\">Time Zone Issues</font> </li>\n</ol>\n\n<p><font size=\"2\">Several issues arise when a customer and the FI are not in the same time\nzone, or when a customer moves a computer into new time zones. In addition, it is\ngenerally unsafe to assume that computer users have correctly set their time or timezone.</font>\n</p>\n\n<p><font size=\"2\">Although most transactions are not sensitive to the exact time, they\noften are sensitive to the date. In some cases, time zone errors lead to actions occurring\non a different date than intended by the customer. For this reason, servers should always\nuse a complete local time plus GMT offset in any datetime values in a response. If a\ncustomer's request is for 5 p.m. EST, and a server in Europe responds with 1 a.m. 
MET the\nnext day, a smart client can choose to warn the customer about the date shift.</font> </p>\n\n<p><font size=\"2\">Clients that maintain local state, especially of long-lived server\nobjects, should be careful how they store datetime values. If a customer initiates a\nrepeating transaction for 5 p.m. EST, then moves to a new time zone, the customer might\nhave intended that the transaction remain 5 p.m. in the new local time, requiring a change\nrequest to be sent to the server. If, however, they intended it to remain fixed in server\ntime, this would require a change in the local time stored in the client.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493280\"><font size=\"4\" face=\"Arial\">Amounts, Prices, and Quantities</font></a>\n  </li>\n  <li><font size=\"2\" face=\"Arial\">Positive and Negative Signs</font> </li>\n</ol>\n\n<p><font size=\"2\">Unless otherwise noted in the specification, Open Financial Exchange\nalways signs amounts and quantities from the perspective of the customer. Some typically\nnegative amounts:</font>\n\n</p><ul>\n  <li><font size=\"2\">Investment buy amount, investment sell quantity</font> </li>\n  <li><font size=\"2\">Bank statement debit amounts, checks, fees</font> </li>\n  <li><font size=\"2\">Credit card purchases</font> </li>\n  <li><font size=\"2\">Margin balance (unless the FI owes the client money)</font> </li>\n</ul>\n\n<p><font size=\"2\">Some typically positive amounts:</font>\n\n</p><ul>\n  <li><font size=\"2\">Investment sell amount, investment buy quantity</font> </li>\n  <li><font size=\"2\">Bank statement credits</font> </li>\n  <li><font size=\"2\">Credit card payments</font> </li>\n  <li><font size=\"2\">Ledger balance (unless the account is overdrawn)</font> </li>\n</ul>\n\n<p><font size=\"2\"><i>Amount: </i>All amount-valued tags are sent with a decimal point or\ncomma, as in \"XXXX.XX.\" There should not be any punctuation separating\nthousands, millions, and so forth. 
The maximum value accepted depends on the client.</font>\n</p>\n\n<p><font size=\"2\"><i>Quantity: </i>Use decimal notation.</font> </p>\n\n<p><font size=\"2\"><i>Unitprice: </i>Use decimal notation. Unless specifically noted,\nprices should always be positive.</font> </p>\n\n<p><font size=\"2\"><i>Rate: </i>Use decimal notation, with the rate specified out of 100%.\nFor example, 5.2 is 5.2%.</font> </p>\n\n<p><font size=\"2\">Some services define special values, such as INFLATION, which you can\nuse instead of a designated value. Open Financial Exchange refers to these as \"smart\ntypes,\" and identifies them in the specification.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493281\"><font size=\"4\" face=\"Arial\">Language</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange identifies human-readable language&mdash;for such\nthings as status messages and e-mail&mdash;with a three-letter code based on ISO-639.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493282\"><font size=\"4\" face=\"Arial\">Basic data types</font></a> </li>\n</ol>\n\n<p><font size=\"2\"><i>Boolean: </i>Y = yes or true, N = no or false.</font> </p>\n\n<p><font size=\"2\"><i>URL: </i>String form of a World Wide Web Uniform Resource Locator.\nIt should be fully qualified including protocol, host, and path.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493283\"><font size=\"6\" face=\"Arial\">Security</font></a> </li>\n  <li><a name=\"_Toc380493284\"><font size=\"5\" face=\"Arial\">Security Solutions</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange carries financial information over the Internet\nin such a way as to provide privacy, message integrity, and authentication for applications\nat the appropriate level. Each service within Open Financial Exchange requires a certain\nlevel of security. 
Online banking and payments require strong secrecy, whereas stock\nquotations consist of publicly available information and consequently have a much weaker\nsecrecy requirement.</font> </p>\n\n<p><font size=\"2\">Some Internet protocols, such as HTTPS (which uses Secure Socket Layer\nversion 3, SSLv3), provide channel-level security. When the security requirement exceeds\nthat provided by the channel, you must use an application-level protocol.</font> </p>\n\n<p><a name=\"_Toc371763498\"><font size=\"2\">To address these various needs, Open Financial\nExchange allows a range of security solutions. Open Financial Exchange 1.0 supports online\nbanking and payment functions for which strong channel security is currently appropriate.\nFuture releases will support a wider array of services, some of which will require more\nelaborate trust models. Application-level protection will secure the latter.</font></a> </p>\n\n<p><font size=\"2\">Open Financial Exchange security properties include:</font>\n\n</p><ul>\n  <li><font size=\"2\">SSL - protects information during transmission over the Internet between\n    a customer and an FI </font></li>\n  <li><font size=\"2\">Application layer security - encrypts and formats messages using RSA Data\n    Security PKCS#7 techniques </font></li>\n  <li><font size=\"2\">New cryptographic options and enhancements when available - will enhance\n    Open Financial Exchange to provide these facilities </font></li>\n</ul>\n\n<ol>\n  <li><a name=\"_Toc380493285\"><font size=\"4\" face=\"Arial\">Determining Security Levels\n    &lt;OFXSEC&gt; &lt;TRANSPSEC&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Two elements in the FI profile, &lt;OFXSEC&gt; and &lt;TRANSPSEC&gt;,\ncontain the security level a client should use to communicate with a server.</font> </p>\n\n<p><font size=\"2\">The valid values for &lt;OFXSEC&gt; are as follows:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font 
size=\"1\">Type</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\">NONE</font></td>\n    <td width=\"318\"><font size=\"2\">No application level security</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\">APPSEC</font></td>\n    <td width=\"318\"><font size=\"2\">Use application level security</font> </td>\n  </tr>\n</tbody></table>\n\n<p>The &lt;TRANSPSEC&gt; element value is Boolean. If the value is YES, use channel-level\nsecurity.\n\n</p><ol>\n  <li><a name=\"_Toc380493286\"><font size=\"5\" face=\"Arial\">Channel-Level Security</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Secure Socket Layer version 3 (SSLv3) provides channel-level security in\nOpen Financial Exchange. SSLv3 provides confidentiality, message integrity, and implicit\nauthentication. In Open Financial Exchange 1.0, channel-level security using SSLv3 is the\nprimary form of security.</font>\n\n</p><ol>\n  <li><font size=\"4\" face=\"Arial\"><a name=\"_Toc380493287\">Security Requirements</a> </font></li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange provides a method to exchange financial\ninformation over public networks. This necessitates strong security facilities and careful\nprotocol design. The most commonly used facility, and trusted method for accomplishing\nmany of these goals is SSL. The following sub-sections describe the most prominent\nrequirements for security and how Secure Socket Layer (SSL) addresses these.</font>\n\n</p><ol>\n  <li><font size=\"2\" face=\"Arial\">Privacy, Authentication, and Message Integrity</font> </li>\n</ol>\n\n<p><font size=\"2\">SSL provides a range of strong encryption methods for insuring\nconfidentiality, and strong measures to insure that messages are not altered as they\npropagate over the Internet. User authentication is usually addressed at the application\nlayer, not within SSL. 
Servers are configured with public key certificates that client\napplication software verifies. This provides some measure of server authentication.\nTesting certificate revocation lists is not commonly performed. However, as these\nfacilities emerge, client software will be written to support this need. </font>\n\n</p><ol>\n  <li><font size=\"2\" face=\"Arial\">Facilities for Authorization</font> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange messages typically provide user ID and password\nso that a service provider can authenticate the user. Once a system authenticates a user,\nthe service provider must insure that the user is authorized to perform the requested\nactions. For example, the service provider must decide if the specified user is authorized\nto perform a transfer from the specified account. The service provider must also determine\nwhether the user has exceeded allowed limits on withdrawals, whether the activity on this\naccount is unusual given past history, and other context-sensitive issues.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493288\"><font size=\"4\" face=\"Arial\">Using SSL 3.0 in Open Financial\n    Exchange</font></a> </li>\n</ol>\n\n<p><font size=\"2\">SSL version 3.0 provides a set of widely and commonly accepted methods\nfor securing Internet transactions. These common methods within SSL are called\nCipherSuites. You can secure applications appropriately within SSL by specifying an\nordered sequence of preferred CipherSuites (highest preference listed first). Servers\nselect the strongest supported CipherSuite from the list provided by the client. </font></p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> Passing username and password pairs in a weakly\nencrypted channel exposes this information to cryptographic attack. 
When implementing Open\nFinancial Exchange, use the strongest available ciphers.</i></font> </p>\n\n<p><font size=\"2\">You should not use the following CipherSuites because they are\nvulnerable to man-in-the-middle attacks during Open Financial Exchange message exchanges:</font>\n\n</p><menu>\n  <li><font size=\"2\">SSL_DH_anon_EXPORT_WITH_RC4_40_MD5 </font></li>\n  <li><font size=\"2\">SSL_DH_anon_WITH_RC4_128_MD5 </font></li>\n  <li><font size=\"2\">SSL_DH_anon_EXPORT_WITH_DES40_CBC_SHA </font></li>\n  <li><font size=\"2\">SSL_DH_anon_WITH_DES_CBC_SHA </font></li>\n  <li><font size=\"2\">SSL_DH_anon_WITH_3DES_EDE_CBC_SHA</font> </li>\n</menu>\n\n<p><a name=\"_Toc376002624\"><font size=\"2\">Setting tags to enable channel-level security in\nthe FI profile advises the Open Financial Exchange application to use this security\nmethod. Usually, the service provider of the Web server configures the allowed\nCipherSuites within SSL. </font></a>\n\n</p><ol>\n  <li><a name=\"_Toc380493289\"><font size=\"5\" face=\"Arial\">Application-Level Security</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">While strong channel-level security is sufficient for the current suite\nof Open Financial Exchange transactions, there are features that channel security does not\nprovide. These include (but are not limited to) data signing, non-repudiation, rational\ncertificate management and revocation, and trust proxy. Where the trust model for an\napplication requires such features to conduct the transaction safely, Open Financial\nExchange stipulates the use of an application-level protocol. A future implementation\nguide will publish this protocol. </font></p>\n\n<p><font size=\"2\">The standard method for providing application-level security is to rely\nupon the RSA Public Key Cryptography Standard (PKCS) message format. The PKCS #7 standard\nspecifies a message format that is both cryptographically strong and flexible enough to\nprovide sufficient facilities for evolution. 
</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493290\"><font size=\"4\" face=\"Arial\">Requirements for Application-Layer\n    Security</font></a> </li>\n  <li><font size=\"2\" face=\"Arial\">Privacy, Authentication, and Message Integrity</font> </li>\n</ol>\n\n<p><font size=\"2\">RSA Public Key Cryptography Standard #7 (PKCS#7) defines a rich set of\nmessage formats for securely exchanging information over public networks. These message\nformats provide for encrypting data using a combination of cryptographic techniques to\nleverage manageability of public key cryptography. It also utilizes the speed of block\nciphers into a hybrid, which exploits the best properties of each. </font></p>\n\n<p><font size=\"2\">PKCS#7 message encryption provides privacy. A digitally signed message\n(or applying HMAC) ensures message integrity.</font> </p>\n\n<p><font size=\"2\">Use one of the following to define PKCS#7 messages: Data, Digitally\nSigned-Data, Enveloped-Data, or Digitally Signed and Enveloped-Data (also referred to as\nSealed-Data). Open Financial Exchange can use Digested-Data, which digests application\ndata before it embeds data within an Enveloped-Data object. However, it should not\ntransmit this data over public networks without encryption applied.</font>\n\n</p><ol>\n  <li><font size=\"2\" face=\"Arial\">Facilities for Authorization</font> </li>\n</ol>\n\n<p><font size=\"2\">As stated previously in section 4.2, authentication and\nauthorization are the responsibility of the service provider. 
Open Financial Exchange\nmessages contain the information to enable authentication and authorization decisions.\nWith application-level security that uses a digitally signed format, the verification of\nthat signature provides an additional method of authenticating the user.</font>\n\n</p><ol>\n  <li><font size=\"4\" face=\"Arial\"><a name=\"_Toc380493291\">Using Application-level Encryption\n    in </a>Open Financial Exchange </font></li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange applications requiring a sophisticated trust\nmodel require more facilities than those provided by SSL. If an Open Financial Exchange\napplication requires only point-to-point security, SSL version 3.0 provides adequate\nfacilities for message security. However, if the application requires more directed,\nspecific forms of security, then use the appropriate PKCS#7 message formats for the\napplication. An example of this might be a stock trading application issuing orders whose\nvalues demand that the security level be high, and where Open Financial Exchange treats\nthe message with special handling instructions. </font></p>\n\n<p><font size=\"2\">Recommended cryptographic techniques for Open Financial Exchange\napplication security are: </font>\n\n</p><ul>\n  <li><font size=\"2\">RC4 for bulk encryption (using 40 bits for exportable applications, 128\n    for North America)</font> </li>\n  <li><font size=\"2\">RSA encryption of bulk encryption keys and digital signatures</font> </li>\n  <li><font size=\"2\">SHA-1 as a secure hash algorithm </font></li>\n</ul>\n\n<p><font size=\"2\">In the absence of digital signatures, Open Financial Exchange\napplications should utilize the HMAC keyed MAC algorithm, using SHA-1 as a secure hash\nfunction. </font></p>\n\n<p><font size=\"2\">When you set the tags for application-layer security-which determines\nwhether to use PKCS#7 message format-in the FI profile, the application software uses\nthese facilities. 
<br>\n</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493292\"><font size=\"6\" face=\"Arial\">International Support</font></a> </li>\n  <li><a name=\"_Toc380493293\"><font size=\"5\" face=\"Arial\">Language and Encoding</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Most of the content in Open Financial Exchange is language-neutral.\nHowever, some error messages, balance descriptions, and similar tags contain text meant to\nappear to the financial institution customers. There are also cases, such as e-mail\nrecords, where customers need to send text in other languages. To support world-wide\nlanguages, Open Financial Exchange must identify the basic text encoding, specific\ncharacter set, and the specific language.</font> </p>\n\n<p><font size=\"2\">The outer Open Financial Exchange headers specify the encoding and\ncharacter set, as described in Chapter 2. Current encoding values are ASCII and UNICODE. For\nASCII, character set values are code pages. Unicode ignores the character set <i>per se</i>\nalthough it still requires the syntax. Clients identify the language in the signon\nrequest. Open Financial Exchange specifies languages by three-letter codes as defined in\nISO-639. Servers report their supported languages in the profile (see Chapter 7). If a\nserver cannot support the requested language, it must return an error and not process\nthe rest of the transactions.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493294\"><font size=\"5\" face=\"Arial\">Currency &lt;CURDEF&gt;\n    &lt;CURRENCY&gt; &lt;ORIGCURRENCY&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">In each transaction involving amounts, responses include a default\ncurrency identification, &lt;CURDEF&gt;. The values are based on the ISO 4217 three-letter\ncurrency identifiers.</font><font size=\"1\"> </font></p>\n\n<p><font size=\"2\">Within each transaction, specific parts of the response might need to\nreport a different currency. 
Where appropriate, aggregates will include an optional\n&lt;CURRENCY&gt; aggregate. The scope of a &lt;CURRENCY&gt; aggregate is everything within\nthe same aggregate that the &lt;CURRENCY&gt; aggregate appears in, including nested\naggregates, unless overridden by a nested &lt;CURRENCY&gt; aggregate. For example,\nspecifying a &lt;CURRENCY&gt; aggregate in an investment statement detail means that the\nunit price, transaction total, commission, and all other amounts are in terms of the given\ncurrency, not the default currency. </font></p>\n\n<p><font size=\"2\">Note that there is no way for two or more individual elements that\nrepresent amounts-and are directly part of the same aggregate-to have different\ncurrencies. For example, there is no way in a statement download to have a different\ncurrency for the &lt;LEDGERBAL&gt; and the &lt;AVAILBAL&gt;, because they are both\ndirectly members of &lt;STMTRS&gt;. In most cases, you can use the optional &lt;BAL&gt;\nrecords to overcome this limitation, which do accept individual &lt;CURRENCY&gt;\naggregates.</font> </p>\n\n<p><font size=\"2\">The default currency for a request is the currency of the source\naccount. For example, the currency for &lt;BANKACCTFROM&gt;.</font> </p>\n\n<p><font size=\"2\">The &lt;CURRATE&gt; should be the one in effect throughout the scope of\nthe &lt;CURRENCY&gt; aggregate. It is not necessarily the current rate. Note that the\n&lt;CURRATE&gt; needs to take into account the choice of the FI for formatting of amounts\n(that is, where the decimal is) in both default and overriding currency, so that a client\ncan do math. 
This can mean that the rate is adjusted by orders of magnitude (up or down)\nfrom what is commonly reported in newspapers.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;CURRENCY&gt;</font></b><font size=\"1\" face=\"Arial\"> <i>or\n    <br>\n    </i><b>&lt;ORIGCURRENCY&gt;</b></font> </td>\n    <td width=\"354\"><font size=\"2\">Currency aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;CURSYM&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">ISO 4217 3-letter currency identifier, <i>A-3</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;CURRATE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Ratio of &lt;CURDEF&gt; currency to &lt;CURSYM&gt;\n    currency, in decimal form</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/CURRENCY&gt;</font></b><font size=\"1\" face=\"Arial\">\n    <i>or <br>\n    </i><b>&lt;/ORIGCURRENCY&gt;</b></font> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>In some cases, Open Financial Exchange will define transaction responses so that\namounts have been converted to the home currency. However, Open Financial Exchange will\nallow FIs to optionally report the original amount and the original (foreign) currency. In\nthese cases, transactions include a specific tag for the original amount, and then a\n&lt;ORIGCURRENCY&gt; tag to report the details of the foreign currency. </p>\n\n<p>Again, &lt;CURRENCY&gt; means that Open Financial Exchange <i>has not</i> converted\namounts. 
Whereas, &lt;ORIGCURRENCY&gt; means that Open Financial Exchange <i>has</i>\nalready converted amounts.\n\n</p><ol>\n  <li><a name=\"_Toc380493295\"><font size=\"5\" face=\"Arial\">Country-Specific Tag Values</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Some of the tags in Open Financial Exchange have values that are\ncountry-specific. For example, &lt;USPRODUCTTYPE&gt; is only useful within the United\nStates. Open Financial Exchange will extend in each country as needed to provide tags that\naccept values useful to that country. Clients in other countries that do not know about\nthese tags will simply skip them.</font> </p>\n\n<p><font size=\"2\">In some cases, a tag value represents a fundamental way of identifying\nsomething, yet there does not exist a world-wide standard for such identification.\nExamples include bank accounts and securities. In these cases, it is important that Open\nFinancial Exchange defines a single, extensible approach for identification. For example,\nCUSIPs are used within the U.S., but not in other countries. However, CUSIPs are\nfundamental to relating investment securities, holdings, and transactions. Thus, a\nsecurity ID consists of a two-part aggregate: one to identify the naming scheme, and one\nto provide a value. Open Financial Exchange will define valid naming schemes as necessary\nfor each country.<br>\n<br>\n<br>\n</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493296\"><font size=\"6\" face=\"Arial\">Data Synchronization</font></a> </li>\n  <li><a name=\"_Toc380493297\"><font size=\"5\" face=\"Arial\">Overview</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Currently, some systems provide only limited support for error recovery\nand no support for backup files or multiple clients. The Open Financial Exchange data\nsynchronization approach described in this chapter handles all of these situations. 
</font></p>\n\n<p><font size=\"2\">Open Financial Exchange defines a powerful form of data synchronization\nbetween clients and servers. </font></p>\n\n<p><font size=\"2\">Open Financial Exchange data synchronization addresses the following\nproblems:</font>\n\n</p><ul>\n  <li><font size=\"2\">Error recovery</font> </li>\n  <li><font size=\"2\">Use of multiple client applications</font> </li>\n  <li><font size=\"2\">Restoring from a backup file</font> </li>\n  <li><font size=\"2\">Multiple data files (for example, one copy at home, another at work).</font>\n  </li>\n</ul>\n\n<p><font size=\"2\">This chapter first provides a brief background of error recovery\nproblems and then presents the basic strategy used in Open Financial Exchange to perform\ndata synchronization. Each Open Financial Exchange service includes specific details for\ndata synchronization requests and responses.</font> </p>\n\n<p><font size=\"2\">Most of the information in this chapter concerns data synchronization,\nsince it is a relatively new concept. The final section in this chapter discusses\nalternatives to full synchronization, and summarizes the options for each.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493298\"><font size=\"5\" face=\"Arial\">Background</font></a> </li>\n</ol>\n\n<p><font size=\"2\">When a client begins a connection with a server for which the connection\ndoes not successfully complete, there are two main problems:</font>\n\n</p><ul>\n  <li><font size=\"2\">Unconfirmed requests</font> </li>\n</ul>\n\n<p><font size=\"2\">If a client does not receive a response to work it initiates, it has no\nway of knowing whether the server processed the request. It also will not have any\nserver-supplied information about the request, such as a server ID number.</font>\n\n</p><ul>\n  <li><font size=\"2\">Unsolicited data</font> </li>\n</ul>\n\n<p><font size=\"2\">Some banking protocols allow a server to send data to the client\nwhenever the client makes a connection. 
This specification assumes that the first client\nthat calls in after the unsolicited data is available for download receives the data. If\nthe connection fails, this information would be forever lost to the client. Examples of\nunsolicited data include updates in the status of a bill payment and e-mail responses.</font>\n</p>\n\n<p><font size=\"2\">Unsolicited data presents problems beyond error recovery. Because the\nfirst client that connects to a server is the only one to receive unsolicited data, this\nsituation precludes use of multiple clients without a data synchronization method. For\nexample, if a user has a computer at work and one at home, and wants to perform online\nbanking from both computers, a bank server could send unsolicited data to one but not the\nother. </font></p>\n\n<p><font size=\"2\">An even greater problem occurs when a user resorts to a backup copy of\nthe client data file. This backup file will be missing recent unsolicited data with no way\nto retrieve it from the server once the server sends it.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493299\"><font size=\"5\" face=\"Arial\">Data Synchronization Approach</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">A simple solution is to make sure that clients can always obtain\ninformation from the server for a reasonable length of time. Clients can request recent\nresponses-whether due to client-initiated work or other status changes on the server-by\nsupplying the previous endpoint in the response history. Servers always supply a new\nendpoint whenever they supply responses. </font></p>\n\n<p><font size=\"2\">If a client connection fails-or a client receives a response, but\ncrashes before updating its database-the client will not save the new endpoint. On the\nnext synchronization request, the server sends the same information (plus any further\nstatus changes). 
</font></p>\n\n<p><font size=\"2\">If a user switches to a backup file, again the client will use the older\nendpoint in the synchronization request. </font></p>\n\n<p><font size=\"2\">If multiple clients are in use, each will send requests based on its own\nendpoint, so that both clients will obtain complete information from the server. This is\none reason why Open Financial Exchange responses carry enough information from the request\nto enable them to be processed on their own. The diagram below shows the relationship\nbetween clients and servers.<br>\n<br>\n</font></p>\n\n<p><font size=\"2\">Open Financial Exchange relieves the server from maintaining any special\nerror-recovery state information. However, Open Financial Exchange requires the server to\nmaintain a history of individual responses it would have sent and a way to identify a\nposition in the history. This ID could be a timestamp, or be based on its existing state\ninformation. </font></p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> Open Financial Exchange does not require servers to\nstore these responses based on individual connections. Also, not all requests are subject\nto synchronization. For example, Open Financial Exchange does not require servers to store\nstatement-download responses separately for data synchronization. </i></font>\n\n</p><ol>\n  <li><a name=\"_Toc380493300\"><font size=\"5\" face=\"Arial\">Data Synchronization Specifics</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange does synchronization separately for each type of\nresponse. In addition, a synchronization request might include further identifying\ninformation, such as a specific account number. This specification defines the additional\ninformation for each synchronization request.</font> </p>\n\n<p><font size=\"2\">Each Open Financial Exchange service will identify the specific\nresponses that are subject to data synchronization. 
For example, a bank statement-download\nis a read-only operation from the server. A client can request again if it fails;\nconsequently, there is no special data synchronization for this type of response.</font> </p>\n\n<p><font size=\"2\">The basis for synchronization is a <i>token</i> as defined by the\n&lt;TOKEN&gt; tag. The server is free to create a token in any way it wishes. The client\nsimply holds the token for possible use in a future synchronization request. </font></p>\n\n<p><font size=\"2\">The server can derive a token from one of the following: </font>\n\n</p><ul>\n  <li><font size=\"2\">Timestamp</font> </li>\n  <li><font size=\"2\">Sequential number</font> </li>\n  <li><font size=\"2\">Unique non-sequential number</font> </li>\n  <li><font size=\"2\">Other convenient values for a server </font></li>\n</ul>\n\n<menu>\n  <li><font size=\"2\"><i><b>NOTE:</b> Open Financial Exchange reserves a &lt;TOKEN&gt; value of\n    zero for the first time each type of response does a synchronization task. </i></font></li>\n</menu>\n\n<p><font size=\"2\">Clients will send a &lt;TOKEN&gt; of zero on their first synchronization\nrequest. Servers should send all available history, allowing a new client to know about\nwork done by other clients. If a user's account has never been used with Open Financial\nExchange, the server returns no history.</font> </p>\n\n<p><font size=\"2\">The server can use different types of tokens for different types of\nresponses, if suitable for the server. 
</font></p>\n\n<p><font size=\"2\">Tokens will be subject to a maximum size; see Chapter 3, \"Common\nAggregates, Elements, and Data Types.\" Tokens need to be unique only with respect to\na specific type of synchronization request and the additional information in that request.\nFor example, a bill payment synchronization request takes an account number; therefore, a\ntoken needs to be unique only within payments for a specific account.</font> </p>\n\n<p><font size=\"2\">Servers will not have infinite history available, so synchronization\nresponses include a &lt;LOSTSYNC&gt; element with a value of Y (yes) if the old token in\nthe synchronization request was older than available history. This tag allows clients to\nalert users that some responses have been lost.</font> </p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> A token is unrelated to a &lt;TRNUID&gt;,\n&lt;SRVRTID&gt;, or &lt;FITID&gt;. Each of these serves a specific purpose, and has its\nown scope and lifetime. </i></font></p>\n\n<p><font size=\"2\">A &lt;SRVRTID&gt; is not appropriate as a &lt;TOKEN&gt; for bill\npayment. A single payment has a single &lt;SRVRTID&gt;, but it can undergo several state\nchanges over its life and thus have several entries in the token history.</font> </p>\n\n<p><font size=\"2\">There are three different ways a client and a server can conduct their\nrequests and responses:</font>\n\n</p><ul>\n  <li><font size=\"2\">Explicit synchronization - A client can request synchronization without\n    sending any (other) Open Financial Exchange requests. Clients will send a specific\n    synchronization request, including the current token for that request. The response will\n    be a set of individual responses more recent than the given token, along with a new token.\n    </font></li>\n  <li><font size=\"2\">Synchronization with new requests - A client can request synchronization\n    as part of the response to any new requests it has. It gives the old token. 
The response\n    should include responses to the new requests plus any others that became available since\n    the old token, along with a new token. An aggregate contains the requests so that the\n    server can process the new requests and update the token as an atomic action.</font> </li>\n  <li><font size=\"2\">New requests without synchronization - A client can make new requests\n    without providing the old token. In this case, it expects just responses to the new\n    requests. A subsequent request for synchronization will cause the server to send the same\n    response again, because the client did not update its token.</font> </li>\n</ul>\n\n<p><font size=\"2\">Each request and response that requires data synchronization will define\na synchronization aggregate. The aggregate tells the server which particular kind of data\nit should synchronize. By convention, these aggregates always have SYNC as part of their\ntag names, for example, &lt;PMT<b>SYNC</b>RQ&gt;. You can use these aggregates on their\nown to perform explicit synchronization, or as wrappers around one or more new\ntransactions. For example, you can use &lt;PMTSYNCRQ&gt; aggregates to request\nsynchronization in combination with new work. You can use the &lt;PMTTRNRQ&gt; by itself\nif you do not require synchronization.</font> </p>\n\n<p><font size=\"2\">Some clients can choose to perform an explicit synchronization before\nsending any new requests (with or without synchronization). This practice allows clients\nto be up-to-date and possibly interact with the user before sending any new requests.\nOther clients can simply send new requests as part of the synchronization request. </font></p>\n\n<p><font size=\"2\">If a client synchronizes in one file, then sends new work inside a\nsynchronization request in a second file, there is a small chance that additional\nresponses become available between the two connections. 
There is an even smaller chance\nthat these would be conflicting requests, such as modifications to the same object.\nHowever, some clients and some requests might require absolute control, so that the user\ncan be certain that they are changing known data. To support this, synchronization\nrequests can optionally specify &lt;REJECTIFMISSING&gt;. The tag tells a server that it\nshould reject all enclosed requests if the supplied &lt;TOKEN&gt; is out of date <i>before\nconsidering the new requests.</i> That is, if any new responses became available, whether\nrelated to the incoming requests or not (but part of the scope of the synchronization\nrequest), the server should immediately reject the requests. It should still return the\nnew responses. A client can then try again until it finds a stable window to submit the\nwork. See section 6.5 for more information about conflict detection and resolution.</font>\n</p>\n\n<p><font size=\"2\">The password change request and response present a special problem. See\nsection 2.5.2 for further information.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493301\"><font size=\"5\" face=\"Arial\">Conflict Detection and Resolution</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Conflicts arise whenever two or more users or servers modify the same\ndata. This can happen to any object that has a &lt;SRVRTID&gt; that supports change or\ndelete requests. For example, one spouse and the other might independently modify the same\nrecurring bill payment model. From a server perspective, there is usually no way to\ndistinguish between the same user making two intended changes and two separate users\nmaking perhaps unintended changes. Therefore, Open Financial Exchange provides enough\ntools to allow clients to carefully detect and resolve conflicts. 
Open Financial Exchange\nrequires only that a server process atomically all requests in a single &lt;OFX&gt; block.\n</font></p>\n\n<p><font size=\"2\">A careful client will always synchronize before sending any new\nrequests. If any responses come back that could affect a user's pending requests, the\nclient can ask the user whether it should still send those pending requests. Because there\nis a small chance for additional server actions to occur between the initial\nsynchronization request and sending the user's pending requests, extremely careful clients\ncan use the &lt;REJECTIFMISSING&gt; option. Clients can iterate sending pending requests\ninside a synchronization request with &lt;REJECTIFMISSING&gt; and testing the responses to\nsee if they conflict with pending requests. A client can continue to do this until a\nwindow of time exists wherein the client is the only agent trying to modify the server. In\nreality, this will almost always succeed on the first try.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493302\"><font size=\"5\" face=\"Arial\">Synchronization vs. Refresh</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">There are some situations, and some types of clients, where it is\npreferable for a client to ask a server to send everything it knows, rather than just\nreceive a set of changes. An example of such a situation is a client that has not connected\noften enough and has lost synchronization. An example of such a type of client is a\ncompletely stateless client, such as a web browser. This choice is made during client\nimplementation. Open Financial Exchange does not require a client to refresh just because\nit has lost synchronization.</font> </p>\n\n<p><font size=\"2\">Clients will mainly want to refresh lists of long-lived objects on the\nserver; generally objects with a &lt;SRVRTID&gt;. For example, Open Financial Exchange\nPayments has both individual payments and models of recurring payments. 
</font></p>\n\n<p><font size=\"2\">A brand new client, or a client that lost synchronization, might want to\nlearn about in-progress payments by doing a synchronization refresh of the payment\nrequests. It would almost certainly want to do a synchronization refresh of the recurring\npayment models, because these often live for months or years. </font></p>\n\n<p><font size=\"2\">A client might not perform a synchronization refresh on e-mail\nresponses.</font> </p>\n\n<p><font size=\"2\">A client can request a refresh by using the &lt;REFRESH&gt; tag with\nvalue of Y instead of the &lt;TOKEN&gt; tag. Server descriptions detail the exact behavior\nthat servers should follow. However, the general rule is that servers should send\nresponses that emulate a client creating or adding each of the objects governed by the\nparticular synchronization request. </font></p>\n\n<p><font size=\"2\">In these cases, you can set &lt;TRNUID&gt; to zero; the standard value\nfor server-generated responses. </font></p>\n\n<p><font size=\"2\">There is no need to recreate a stream of responses that emulate the\nentire history of the object, just an add response that reflects the current state. For\nexample, if you create a model and then modify it three times, even if this history would\nhave been available for a regular synchronization, servers should only send a single add\nthat reflects the current state. </font></p>\n\n<p><font size=\"2\">A client that just wants the current token, without refresh or\nsynchronization, makes requests with &lt;TOKENONLY&gt; and a value of Y.</font> </p>\n\n<p><font size=\"2\">In all cases, servers should send the current ending &lt;TOKEN&gt; for\nthe synchronization request in refresh responses. 
This allows a client to perform regular\nsynchronization requests in the future.</font> </p>\n\n<p><font size=\"2\">The following table summarizes the options in a client synchronization\nrequest:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;TOKEN&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Previous value of &lt;TOKEN&gt; received for this type of\n    synchronization request from server; 0 for first-time requests; <i>token</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;TOKENONLY&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Request for just the current &lt;TOKEN&gt; without the\n    history, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;REFRESH&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Request for refresh of current state, <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;REJECTIFMISSING&gt;</font> </td>\n    <td width=\"366\"><font size=\"2\">If Y, do not process requests if client &lt;TOKEN&gt; is\n    out of date, <i>Boolean</i></font> </td>\n  </tr>\n</tbody></table>\n\n<p><b>NOTE:</b> <i>Open Financial Exchange requires exactly one of</i> &lt;TOKEN&gt;,\n&lt;TOKENONLY&gt;, <i>or</i> &lt;REFRESH&gt;.\n\n</p><ol>\n  <li><a name=\"_Toc380493303\"><font size=\"5\" face=\"Arial\">Typical Server Architecture for\n    Synchronization</font></a> </li>\n</ol>\n\n<p><font size=\"2\">This section describes how an FI can approach supporting synchronization\nbased on the assumption that modifications to an existing financial server will be kept to\na minimum. </font></p>\n\n<p><font size=\"2\">The simplest approach is to create a history database separate from the\nexisting server. 
This history could consist of the actual Open Financial Exchange\ntransaction responses (&lt;TRNRS&gt; aggregates) that are available to a synchronization\nrequest. The history database could index records by token, response type, and any other\nidentifying information for that type, such as account number. </font></p>\n\n<p><font size=\"2\">The diagram below shows a high-level model of the Open Financial\nExchange architecture for a financial institution. Notice that the diagram shows the\npresence of a history journal. <br>\n<br>\n</font></p>\n\n<p><font size=\"2\">The server adds responses to the history journal for any action that\ntakes place on the existing server. This is true whether the Open Financial Exchange\nrequests initiate the action or, in the case of recurring payments, it happens\nautomatically on the server. Once added to the history journal, the server can forget\nthem.</font> </p>\n\n<p><font size=\"2\">The areas of the Open Financial Exchange server that process\nsynchronization requests need only search this history database for matching responses\nthat are more recent than the incoming token. </font></p>\n\n<p><font size=\"2\">For a refresh request, an Open Financial Exchange server would access\nthe actual bank server to obtain the current state rather than recent history. </font></p>\n\n<p><font size=\"2\">Periodically the bank server would purge the history server of older\nentries.</font> </p>\n\n<p><font size=\"2\">Only requests that are subject to synchronization need to have entries\nin the history database. Statement downloads do not involve synchronization; therefore,\nthe FI server should not add these responses to the history database. Since statement\ndownloads are usually the largest in space and the most frequent, eliminating these saves\nmuch of the space a response history might otherwise require.</font> </p>\n\n<p><font size=\"2\">More sophisticated implementations can save even more space. 
The history\ndatabase could save responses in a coded binary form that is more compact than the full\nOpen Financial Exchange response format. Some FIs might have much or all of the necessary\ndata already in their servers; consequently, they would not require new data. An FI could\nregenerate synchronization responses rather than recall them from a database.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493304\"><font size=\"5\" face=\"Arial\">Typical Client Processing of\n    Synchronization Results</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The diagram below shows a general flowchart of what an Open Financial\nExchange client would do with the results of a synchronization request. Most requests and\nresponses subject to data synchronization contain both &lt;TRNUID&gt; and &lt;SRVRTID&gt;.\n<br>\n<br>\n<br>\n</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493305\"><font size=\"5\" face=\"Arial\">Simultaneous Connections</font></a> </li>\n</ol>\n\n<p><font size=\"2\">It is increasingly common that a server can get simultaneous or\noverlapping requests from the same user over two different computers. Open Financial\nExchange requires a server to process each set of requests sent in a file as an atomic\naction. Servers can deal with the problems that arise with simultaneous use in two ways:</font>\n\n</p><ul>\n  <li><font size=\"2\">Allow simultaneous connections, insure each is processed atomically, and\n    use the data synchronization mechanism to bring the two clients up to date. 
This is the\n    preferred method.</font> </li>\n  <li><font size=\"2\">Lock out all but one user at a time, returning the error code for\n    multiple users.</font> </li>\n</ul>\n\n<ol>\n  <li><a name=\"_Toc380493306\"><font size=\"5\" face=\"Arial\">Synchronization Alternatives</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Although it is <b>RECOMMENDED </b>that Open Financial Exchange servers\nimplement full synchronization as described in this chapter, an alternate approach,\n\"lite synchronization,\" could be easier for some servers to support. This\napproach focuses only on error recovery and does not provide any support for multiple\nclients, multiple data files, or use of backup files. The approach is to preserve the\nmessage sets while simplifying the implementation.</font> </p>\n\n<p><font size=\"2\">In addition, some clients might prefer to use response-file based error\nrecovery with all servers, even if the client and some server both support full\nsynchronization. This section first describes lite synchronization, and then explains the\nrules that clients and servers use to decide how to communicate.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493307\"><font size=\"4\" face=\"Arial\">Lite Synchronization</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Lite synchronization requires servers to accept all synchronization\nmessages, but does not require them to keep any history or tokens. Responses need only be\nsent once and then the server can forget them. Responses to client requests, whether or\nnot they are made inside a synchronization request, are processed normally. Responses that\nrepresent server-initiated work, such as payment responses that arise from recurring\npayments, are sent only in response to synchronization requests. 
A server does not have to\nhold responses in case a second client makes a synchronization request.</font> </p>\n\n<p><font size=\"2\">Because full synchronization supports error recovery, an alternative is\nneeded for lite synchronization. Servers using lite synchronization keep a copy of the\nentire response file they last sent. Clients requesting that servers prepare for error\nrecovery generate a globally unique ID for each file they send. In the OFX headers, there\nare two tags associated with error recovery:</font>\n\n</p><ul>\n  <li><font size=\"2\">OLDFILEUID - UID of the last request and response that was successfully\n    received and processed by the client</font> </li>\n  <li><font size=\"2\">NEWFILEUID - UID of the current file</font> </li>\n</ul>\n\n<p><font size=\"2\">The format of these is the same as used with &lt;TRNUID&gt; as\ndocumented in Chapter 3.</font> </p>\n\n<p><font size=\"2\">Servers use the following rules:</font>\n\n</p><ul>\n  <li><font size=\"2\">If these tags are absent, the client is not requesting error recovery\n    protection for this file. The server does not need to save a copy of the response.</font> </li>\n  <li><font size=\"2\">If the NEWFILEUID matches a file saved on the server, then the client is\n    in error recovery. The server must ignore the new requests in this file and instead send\n    back the matching saved response file.</font> </li>\n  <li><font size=\"2\">If the OLDFILEUID matches a file saved on the server, then OLDFILEUID is\n    a file that the client has successfully processed and the server can delete it. The client\n    is also requesting that the response for the current file be saved under the NEWFILEUID\n    for possible error recovery.</font> </li>\n</ul>\n\n<p><font size=\"2\">A server will never need to save more than one file per client data\nfile, but because of possible multi-client or multi-datafile usage, it might need to save\nseveral files for a given user. 
A server should save as long as possible, but not\nindefinitely. A server cannot recognize an error recovery attempt if it comes after it has\npurged the error recovery file. A server would process it as a new request. In this case,\na server should recognize duplicate transaction UIDs for client-initiated work, such as\npayments, and then reject them individually. Server-generated responses would be lost to\nthe client.</font> </p>\n\n<p><font size=\"2\">For a server accustomed to sending unsolicited responses, lite\nsynchronization should closely match the current response-file based implementation. The\nonly difference is that a server should hold the unsolicited responses until the client\nmakes the first appropriate synchronization request; rather than automatically adding them\nto any response file. Once added, the server can mark them as delivered, relying on error\nrecovery to insure actual delivery.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493308\"><font size=\"4\" face=\"Arial\">Relating Synchronization and Error\n    Recovery</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Client and server developers should first decide whether they will\nsupport full synchronization or not. If they can, then they can support response-file\nerror recovery as well, or they can rely on synchronization to perform error recovery. If\nthey adopt only lite synchronization, Open Financial Exchange requires response-file error\nrecovery. A server describes each of these choices in its server profile records. 
The\nfollowing combinations are valid:</font>\n\n</p><ul>\n  <li><font size=\"2\">Full synchronization with response-file error recovery</font> </li>\n  <li><font size=\"2\">Full synchronization without separate response-file error recovery</font>\n  </li>\n  <li><font size=\"2\">Lite synchronization with response-file error recovery</font> </li>\n</ul>\n\n<p><font size=\"2\">Clients request response-file error recovery by including the old and\nnew session UIDs in the header. If they are absent, servers need not save the response\nfile for error recovery. Clients request synchronization by using those synchronization\nrequests defined throughout this specification.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493309\"><font size=\"5\" face=\"Arial\">Examples</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Here is an example of full synchronization using bill payment as the\nservice. Open Financial Exchange Payments provides two different synchronization requests\nand responses, each with their own token; one for payment requests and one for repeating\npayment model requests. See Chapter 12 for full details.</font> </p>\n\n<p><font size=\"2\">These examples are simplified and shown without the outer &lt;OFX&gt; layer,\n&lt;SIGNON&gt;, and so forth. Client A requests synchronization:</font> </p>\n\n<pre><font size=\"1\">&lt;PMTSYNCRQ&gt;\n\t&lt;TOKEN&gt;123\n\t&lt;BANKACCTFROM&gt;\n\t\t&lt;BANKID&gt;121000248\n\t\t&lt;ACCTID&gt;123456789\n\t\t&lt;ACCTTYPE&gt;CHECKING\n\t&lt;/BANKACCTFROM&gt;\n&lt;/PMTSYNCRQ&gt;</font><font size=\"2\">The server sends in response:\n</font><font size=\"1\" face=\"Courier New\">&lt;PMTSYNCRS&gt;\n\t&lt;TOKEN&gt;125\n\t&lt;LOSTSYNC&gt;N\n\t&lt;BANKACCTFROM&gt;\n\t\t&lt;BANKID&gt;121000248\n\t\t&lt;ACCTID&gt;123456789\n\t\t&lt;ACCTTYPE&gt;CHECKING\n\t&lt;/BANKACCTFROM&gt;\n\t&lt;PMTTRNRS&gt;\n\t\t&lt;STATUS&gt;\n\t\t\t... status details\n\t\t&lt;/STATUS&gt;\n\t\t&lt;TRNUID&gt;123\n\t\t&lt;PMTRS&gt;\n\t\t\t... 
details on a payment response\n\t\t&lt;/PMTRS&gt;\n\t&lt;/PMTTRNRS&gt;\n\t&lt;PMTTRNRS&gt;\n\t\t&lt;STATUS&gt;\n\t\t\t... status details\n\t\t&lt;/STATUS&gt;\n\t\t&lt;TRNUID&gt;546\n\t\t&lt;PMTRS&gt;\n\t\t\t... details on another payment response\n\t\t&lt;/PMTRS&gt;\n\t&lt;/PMTTRNRS&gt;\n&lt;/PMTSYNCRS&gt;</font>\n</pre>\n\n<p><font size=\"2\">Client A was missing two payment responses, which the server provides.\nAt this point, client A is synchronized with the server. Client A now makes a new payment\nrequest, and includes a synchronization update as part of the request. This update avoids\nhaving to re-synchronize the expected response at a later time.</font> </p>\n\n<pre><font size=\"1\">&lt;PMTSYNCRQ&gt;\n\t&lt;TOKEN&gt;125\n\t&lt;BANKACCTFROM&gt;\n\t\t&lt;BANKID&gt;121000248\n\t\t&lt;ACCTID&gt;123456789\n\t\t&lt;ACCTTYPE&gt;CHECKING\n\t&lt;/BANKACCTFROM&gt;\n\t&lt;PMTTRNRQ&gt;\n\t\t&lt;TRNUID&gt;12345\n\t\t&lt;PMTRQ&gt;\n\t\t\t... details of a new payment request\n\t\t&lt;/PMTRQ&gt;\n\t&lt;/PMTTRNRQ&gt;\n&lt;/PMTSYNCRQ&gt;</font><font size=\"2\">The response to this new\nrequest:\n</font><font size=\"1\" face=\"Courier New\">&lt;PMTSYNCRS&gt;\n\t&lt;TOKEN&gt;126\n\t&lt;LOSTSYNC&gt;N\n\t&lt;BANKACCTFROM&gt;\n\t\t&lt;BANKID&gt;121000248\n\t\t&lt;ACCTID&gt;123456789\n\t\t&lt;ACCTTYPE&gt;CHECKING\n\t&lt;/BANKACCTFROM&gt;\n\t&lt;PMTTRNRS&gt;\n\t\t... details on a payment response to the new request\n\t&lt;/PMTTRNRS&gt;\n&lt;/PMTSYNCRS&gt;</font>\n</pre>\n\n<p><font size=\"2\">The client now knows that the server has processed the payments request\nit just made, and that nothing else has happened on the server since it last synchronized\nwith the server.</font> </p>\n\n<p><font size=\"2\">Assume client B was synchronized with respect to payments for this\naccount up through token 125. If it called in now and synchronized-with or without making\nadditional requests-it would pick up the payment response associated with token 126. 
It\nrecords the same information that was in client A, which would give both clients a\ncomplete picture of payment status.<br>\n</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493310\"><font size=\"6\" face=\"Arial\">FI Profile</font></a> </li>\n  <li><a name=\"_Toc380493311\"><font size=\"5\" face=\"Arial\">Overview</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange clients use the profile to learn the\ncapabilities of an Open Financial Exchange server. This information includes general\nproperties such as account types supported, user password requirements, specific messages\nsupported, and how the client should batch requests and where to send the requests. A\nclient obtains a portion of the profile when a user first selects an FI. The client\nobtains the remaining information prior to sending any actual requests to that FI. The\nserver uses a timestamp to indicate whether the server has updated the profile, and the\nclient checks periodically to see if it should obtain a new profile.</font> </p>\n\n<p><font size=\"2\">In more detail, a profile response contains the following sections,\nwhich a client can request independently:</font>\n\n</p><ul>\n  <li><font size=\"2\">Message Sets - list of services and any general attributes of those\n    services. Message sets are collections of messages that are related functionally. They are\n    generally subsets of what users see as a service.</font> </li>\n  <li><font size=\"2\">Signon realms - FIs can require different signons (user ID and/or\n    password) for different message sets. Because there can only be one signon per &lt;OFX&gt;\n    block, a client needs to know which signon the server requires and then provide the right\n    signon for the right batch of messages.</font> </li>\n</ul>\n\n<p><font size=\"2\">The profile message is itself a message set. 
In files, Open Financial\nExchange uses the &lt;PROFMSGSV1&gt; aggregate to identify this profile message set.</font>\n</p>\n\n<p><font size=\"2\">The following sections describe the general use of profile information. </font>\n\n</p><ol>\n  <li><a name=\"_Toc380493312\"><font size=\"4\" face=\"Arial\">Message Sets</font></a> </li>\n</ol>\n\n<p><font size=\"2\">A message set is a collection of related messages. For example, Chapter\n11, \"Banking,\" defines several message sets: statement download, credit card\nstatement download, intrabank transfers, and so forth. A server routes all of the messages\nin a message set to a single URL and merges their versions together.</font> </p>\n\n<p><font size=\"2\">Clients and servers generally use message sets as the granularity to\ndecide what functionality they will support. A \"banking\" server can choose to\nsupport the statement download and intrabank transfer message sets, but not the wire\ntransfer message set. Attributes are available in many cases to further define how Open\nFinancial Exchange supports a message set.</font> </p>\n\n<p><font size=\"2\">Each portion of the Open Financial Exchange specification that defines\nmessages also defines the message set to which the messages belong. This includes\nwhat additional attributes are available for those messages, and whether Open Financial\nExchange requires the message set or it is optional.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493313\"><font size=\"4\" face=\"Arial\">Version Control</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Message sets are the basis of version control. Over time there will be\nnew versions of the message sets, and at any given time servers will likely want to\nsupport more than one version of a message set. Clients should also be capable of\nsupporting as many versions as possible. Through the profile, clients discover which\nversions are supported for each message set. 
Considering the client capabilities, it\nexchanges messages at the highest common level for each message set. </font></p>\n\n<p><font size=\"2\">For the Open Financial Exchange-SGML data format, there is a single DTD\nfor all message sets. Its version advances when any <i>syntactic</i> change is made to any\nof the message sets. (It is possible to make a <i>semantic</i> change that would not even\nrequire a change in syntax. A change in rules, for example, that would change the version\nof the message set without changing the DTD.) A single DTD cannot have two different\ndefinitions for the same aggregate. There are limitations to how a server that uses true\nDTD-based parsing can handle multiple versions of a message at the same time.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493314\"><font size=\"4\" face=\"Arial\">Batching and Routing</font></a> </li>\n</ol>\n\n<p><font size=\"2\">To allow FIs to set up different servers for different message sets,\ndifferent versions, or to directly route some messages to third party processors, message\nsets define the URL to which a server sends messages in that message set. Each version of\na message set can have a different URL. In the common case where many or all message sets\nare sent to a single URL, clients will consolidate messages across compatible message\nsets. Clients can consolidate when:</font>\n\n</p><ul>\n  <li><font size=\"2\">Message sets have the same URL</font> </li>\n  <li><font size=\"2\">Message sets have a common security level</font> </li>\n  <li><font size=\"2\">Message sets have the same signon realm</font> </li>\n</ul>\n\n<ol>\n  <li><a name=\"_Toc380493315\"><font size=\"5\" face=\"Arial\">Profile Request</font></a> </li>\n</ol>\n\n<p><font size=\"2\">A profile request indicates which profile components a client desires.\nIt also indicates what the client's routing capability is. 
Profiles returned by the FI\nmust be compatible with the requested routing style, or it returns an error.</font> </p>\n\n<p><font size=\"2\">Profile requests are not subject to synchronization. Use the\n&lt;PROFTRNRQ&gt; transaction tag.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;<b>PROFRQ</b>&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Profile-request aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;CLIENTROUTING&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Identifies client routing capabilities, see table below</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DTPROFUP&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Date and time client last received a profile update</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/PROFRQ&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">NONE</font></td>\n    <td width=\"366\"><font size=\"2\">Client cannot perform any routing. All URLs must be the\n    same. All message sets share a single signon realm.</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">SERVICE</font></td>\n    <td width=\"366\"><font size=\"2\">Client can perform limited routing. 
See details below.</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">MSGSET</font></td>\n    <td width=\"366\"><font size=\"2\">Client can route at the message-set level. Each message set\n    may have a different URL and/or signon realm.</font> </td>\n  </tr>\n</tbody></table>\n\n<p>The intent of the SERVICE option for client routing is to support clients that can\nroute bill payment messages to a separate URL from the rest of the messages. Because the\nexact mapping of message sets to the general concept of bill payment can vary by client\nand by locale, this specification does not provide precise rules for the SERVICE option.\nEach client will define its requirements. <br>\n\n</p><ol>\n  <li><a name=\"_Toc380493316\"><font size=\"5\" face=\"Arial\">Profile Response</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The response includes message set descriptions, signon information, and\ngeneral contact information.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;PROFRS&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Profile-response aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;MSGSETLIST&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Beginning list of message set information</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;<b><i>XXXMSGSET</i>&gt;</b></font></b> </td>\n    <td width=\"354\"><font size=\"2\">One or more message set aggregates</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/<b><i>XXXMSGSET</i>&gt;</b></font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/MSGSETLIST&gt;</font></b> </td>\n    <td 
width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SIGNONINFOLIST&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Beginning of signon information</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SIGNONINFO&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">One or more signon information aggregates</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/SIGNONINFO&gt;</font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/SIGNONINFOLIST&gt;</font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;DTPROFUP&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Time this was updated on server, <i>datetime</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;FINAME&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Name of institution, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;ADDR1&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">FI address, line 1</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;ADDR2&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">FI address, line 2</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;ADDR3&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">FI address, line 3</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;CITY&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">FI address city</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;STATE&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">FI address state</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;POSTALCODE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">FI 
address postal code</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;COUNTRY&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">FI address country</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;CSPHONE&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">Customer service telephone number, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;TSPHONE&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">Technical support telephone number, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;FAXPHONE&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">Fax number, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;URL&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">URL for general information about FI (not for sending data)\n    <i>URL</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\" face=\"Arial\">&lt;EMAIL&gt;</font> </td>\n    <td width=\"354\"><font size=\"2\">E-mail address for FI, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SYNCMODE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">FULL for full synchronization capability <br>\n    LITE for lite synchronization capability</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;RESPFILEER&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Y if server supports response-file based error recovery, <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/PROFRS&gt;</font></b></td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>See Chapter 6 for more information on &lt;SYNCMODE&gt; and &lt;RESPFILEER&gt;.\n\n</p><ol>\n  <li><font size=\"4\" face=\"Arial\"><a name=\"_Toc380493317\">Message Set</a> 
</font></li>\n</ol>\n\n<p><font size=\"2\">An aggregate describes each message set supported by an FI. Message sets\nin turn contain an aggregate for each version of the message set that is supported. For a\nmessage set named <i>XXX</i>, the convention is to name the outer aggregate &lt;<i>XXX</i>MSGSET&gt;\nand the tag for each version &lt;<i>XXX</i>MSGSETVn&gt;. The reason for message\nset-specific aggregates is that the set of attributes depends on the message set. These\ncan change from version to version, so there are version-specific aggregates as well.</font>\n</p>\n\n<p><font size=\"2\">The general form of the response is:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;<b><i>XXXMSGSET</i>&gt;</b></font></b> </td>\n    <td width=\"354\"><font size=\"2\">Service aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;<b><i>XXXMSGSETVn</i>&gt;</b></font></b> </td>\n    <td width=\"354\"><font size=\"2\">Version-of-message-set aggregate, 1 or more</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/<b><i>XXXMSGSETVn</i>&gt;</b></font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/<b><i>XXXMSGSET</i>&gt;</b></font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>The &lt;<b><i>XXX</i>MSGSETVn</b>&gt; aggregate has the following form:<br>\n</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;<b><i>XXX</i>MSGSETVn&gt;</b></font></b> </td>\n    <td 
width=\"354\"><font size=\"2\">Message-set-version aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;MSGSETCORE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Common message set information</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/MSGSETCORE&gt;</font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\">message-set specific</font></td>\n    <td width=\"354\"><font size=\"2\">Zero or more attributes specific to this version of this\n    message set, as defined by each message set</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/<b><i>XXX</i>MSGSETVn&gt;</b></font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>The common message set information &lt;MSGSETCORE&gt; is as follows: <br>\n</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;MSGSETCORE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Common-message-set-information aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;VER&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Version number, <i>N-5 </i>(version 1.0 formatted as 100)</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;URL&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">URL where messages in this set are to be sent</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;OFXSEC&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Security level required for this message set; see Chapter 4</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;TRANSPSEC&gt;</font></b> </td>\n    
<td width=\"354\"><font size=\"2\">Y if transport security must be used, N if not used; <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SIGNONREALM&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Signon realm to use with this message set</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;LANGUAGE&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">One or more languages supported</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/MSGSETCORE&gt;</font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493318\"><font size=\"4\" face=\"Arial\">Signon Realms</font></a> </li>\n</ol>\n\n<p><font size=\"2\">A signon realm identifies a set of messages that can be accessed using\nthe same password. Realms are used to disassociate signons from specific services,\nallowing FIs to require different signons for different message sets. 
In practice, FIs\nwill want to use the absolute minimum number of realms possible to reduce the user's\nworkload.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SIGNONINFO&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Signon-information aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SIGNONREALM&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Identifies this realm</font></td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;MIN&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Minimum number of password characters</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;MAX&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Max number of password characters</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;ALPHA&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Y if alphabetic characters are allowed, <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;NUMERIC&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Y if numeric characters are allowed, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;CASESEN&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Y if password is case-sensitive, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SPECIAL&gt;</font></b> </td>\n    <td width=\"354\"><font size=\"2\">Y if special characters are allowed, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;SPACES&gt;</font></b></td>\n    <td width=\"354\"><font size=\"2\">Y if spaces are allowed, 
<i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><b><font size=\"1\">&lt;/SIGNONINFO&gt;</font></b> </td>\n    <td width=\"354\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493319\"><font size=\"4\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"144\"><i><font size=\"1\">Value</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"354\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\">0</font></td>\n    <td width=\"354\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"144\"><font size=\"1\">2000</font></td>\n    <td width=\"354\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493320\"><font size=\"5\" face=\"Arial\">Profile Message Set Profile\n    Information</font></a> </li>\n</ol>\n\n<p><img src=\"./sample_complexe_files/stadyn_image8.gif\" width=\"214\" height=\"214\" alt=\"stadyn_image8.gif (22120 bytes)\"><br>\nstadyn_image8</p>\n\n<p><font size=\"2\">&nbsp;</font></p>\n\n<p><font size=\"2\">The profile message set functions the same way as all other message\nsets; therefore, it contains a profile description for that message set. Because\n&lt;PROFMSGSET&gt; is always part of a message set response, it is described here. Servers\nthat support profile information must include the &lt;PROFMSGSET&gt; as part of the\nprofile response &lt;MSGSETLIST&gt;. 
There are no attributes, but the aggregate must be\npresent to indicate support for the message set.<br>\n</font></p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;PROFMSGSET&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Message-set-profile-information aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;PROFMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Opening tag for V1 of the message set profile information</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Common message set information</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/PROFMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/PROFMSGSET&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493321\"><font size=\"6\" face=\"Arial\">Activation &amp; Account Information</font></a>\n  </li>\n  <li><a name=\"_Toc380493322\"><font size=\"5\" face=\"Arial\">Overview</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The Signup message set defines three messages to help users get set up\nwith their FI:</font>\n\n</p><ul>\n  <li><font size=\"2\">Enrollment - informs FI that a user wants to use Open Financial Exchange\n    and requests that a password be returned</font> </li>\n  <li><font size=\"2\">Accounts - asks the FI to return a list of accounts, and the services\n    supported for each account</font> </li>\n  
<li><font size=\"2\">Activation - allows a client to tell the FI which services a user wants\n    on each account</font> </li>\n</ul>\n\n<p><font size=\"2\">There is also a message to request name and address changes.</font> </p>\n\n<p><font size=\"2\">Clients use the account information request on a regular basis to look\nfor changes in a user's account information. A timestamp is part of the request so that a\nserver has only to report new changes. Account activation requests are subject to data\nsynchronization, and will allow multiple clients to learn how the other clients have been\nenabled.</font> </p>\n\n<p><font size=\"2\">In Open Financial Exchange files, the &lt;SIGNUPMSGSV1&gt; aggregate\nidentifies the Signup message.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493323\"><font size=\"5\" face=\"Arial\">Approaches to User Sign-Up with Open\n    Financial Exchange</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The message sets in this chapter are designed to allow both FIs and\nclients to support a variety of sign-up procedures. There are four basic steps a user\nneeds to go through to complete the sign-up:</font>\n\n</p><ol>\n  <li><font size=\"2\"><b>Select the FI</b>. Open Financial Exchange does not define this step\n    or provide message sets to support it. Client developers and FIs can let a user browse or\n    search this information on a web site, or might define additional message sets to do this\n    within the client. At the conclusion of this step, the client will have some minimal\n    profile information about the FI, including the set of services supported and the URL to\n    use for the next step.</font> </li>\n  <li><font size=\"2\"><b>Enrollment and password acquisition.</b> In this step, the user\n    identifies and authenticates itself to the FI <i>without a password</i>. In return, the\n    user obtains a password (possibly temporary) to use with Open Financial Exchange. 
FIs can\n    perform this entire step over the telephone, through a combination of telephone requests\n    and a mailed response, or at the FI web site. FIs can also use the Open Financial Exchange\n    enrollment message to do this by means of the client. The response can contain a temporary\n    password or users can wait for a mailed welcome letter containing the password.</font> </li>\n  <li><font size=\"2\"><b>Account Information.</b> In this step, the user obtains a list of\n    accounts available for use with Open Financial Exchange, and which specific services are\n    available for each account. Even if users have enrolled over the telephone, clients will\n    still use this message set to help users properly set up the accounts within the client.\n    Clients periodically check back with the FI for updates.</font> </li>\n  <li><font size=\"2\"><b>Service Activation.</b> The last step is to activate specific services\n    on specific accounts. The activation messages support this step. Synchronization is\n    applied to these messages to ensure that other clients are aware of activated services.</font>\n  </li>\n</ol>\n\n<p><font size=\"2\">The combination of media-interface through which an FI accomplishes\nthese steps can vary. FIs might wish to do steps two through four over the telephone.\nClients will still use Open Financial Exchange messages in steps 3 and 4 to automatically\nset up the client based on the choices made by the user over the phone. Other FIs might\nwish to have the entire user experience occur within the client. Either way, the Open\nFinancial Exchange sign-up messages support the process.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493324\"><font size=\"5\" face=\"Arial\">Users and Accounts</font></a> </li>\n</ol>\n\n<p><font size=\"2\">To support the widest possible set of FIs, Open Financial Exchange\nassumes that individual users and accounts are in a many-to-many relationship. 
Consider a\nhousehold with three accounts:</font>\n\n</p><ul>\n  <li><font size=\"2\">Checking 1 - held individually by one spouse</font> </li>\n  <li><font size=\"2\">Checking 2 - held jointly by both</font> </li>\n  <li><font size=\"2\">Checking 3 - held individually by the other spouse</font> </li>\n</ul>\n\n<p><font size=\"2\">Checking 2 should be available to either spouse, and the spouse holding\nChecking 1 should be able to see both Checking 1 and 2.</font> </p>\n\n<p><font size=\"2\">Open Financial Exchange expects FIs to give each user their own user ID\nand password. Each user will go through the enrollment step separately. A given account\nneed only be activated once for a service; not once for each user. Clients will use the\naccount information and activation messages to combine information about jointly-held\naccounts.</font> </p>\n\n<p><font size=\"2\">If an FI prefers to have a single user ID and password per household or\nper master account, they will have to make this clear to users through the enrollment\nprocess. It is up to the FI to assign a single user ID and password that can access all\nthree of the checking accounts described above.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493325\"><font size=\"5\" face=\"Arial\">Enrollment and Password Acquisition\n    &lt;ENROLLRQ&gt; &lt;ENROLLRS&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The main purpose of the enrollment message is to communicate a user's\nintent to access the FI by way of Open Financial Exchange and to acquire a password for\nfuture use with Open Financial Exchange. Some FIs might return a user ID and an initial\npassword in the enrollment response, while others will send them by way of regular mail. </font></p>\n\n<p><font size=\"2\"><i><b>NOTE:</b> Because the server does not know the user ID and\npassword when the client sends the enrollment request, the &lt;SONRQ&gt; will not contain\na valid user ID or password. 
The enrollment message accepts standard user identification\ninformation. </i></font></p>\n\n<p><font size=\"2\">Enrollment requests are not subject to synchronization. If the client\ndoes not receive a response, it will simply re-request the enrollment. If a user\nsuccessfully enrolls from another client before the first client obtains a response, the\nserver should respond to subsequent requests from the first client with status code: </font></p>\n\n<pre><font size=\"1\">13501 - user already enrolled.</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493326\"><font size=\"4\" face=\"Arial\">User IDs</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The Open Financial Exchange &lt;SONRQ&gt; requires a user ID to uniquely\nidentify a user to an FI. Many FIs in the United States use social security numbers (SSNs)\nas the ID. Others create IDs that are unrelated to the users' SSNs. FIs can have\nexisting user IDs that they use for other online activities that they wish to use for Open\nFinancial Exchange as well. They might also create new IDs specifically for Open Financial\nExchange. Finally, some FIs might assign IDs while others might allow users to create\nthem. </font></p>\n\n<p><font size=\"2\">Because users do not usually know either their Open Financial Exchange\nsign-on user ID or their password at time of enrollment, the enrollment response is\ndesigned to return both. The enrollment request allows users to optionally provide a user\nID, which an FI can interpret as their existing online ID or a suggestion for what their\nnew user ID should be. It is recommended that the enrollment process explains ID syntax to\nusers.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493327\"><font size=\"4\" face=\"Arial\">Enrollment Request</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The enrollment request captures enough information to identify and\nauthenticate a user as being legitimate and as having a relationship with the FI. 
</font></p>\n\n<p><font size=\"2\">FIs might require that an account number be entered as part of the\nidentification process. However, this is discouraged since the account information request\nis designed to automatically obtain all account information, avoiding the effort and\npotential mistakes of a user-supplied account number. </font></p>\n\n<p><font size=\"2\">It is <b>RECOMMENDED</b> that FIs provide detailed specifications for\nIDs and passwords along with information about the services available when a user is\nchoosing an FI.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;ENROLLRQ&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Enrollment-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;FIRSTNAME&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">First name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;MIDDLENAME&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Middle name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;LASTNAME&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Last name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;ADDR1&gt;</font></b></td>\n    <td width=\"318\"><font size=\"2\">Address line 1</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR2&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 2</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR3&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 3</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font 
size=\"1\">&lt;CITY&gt;</font></b></td>\n    <td width=\"318\"><font size=\"2\">City</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;STATE&gt;</font></b></td>\n    <td width=\"318\"><font size=\"2\">State or province</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;POSTALCODE&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Postal code</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;COUNTRY&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">3-letter country code from ISO/DIS-3166</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;DAYPHONE&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Daytime telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;EVEPHONE&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Evening telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;EMAIL&gt;</font></b></td>\n    <td width=\"318\"><font size=\"2\">Electronic e-mail address</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;USERID&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Actual user ID if already known, or preferred user ID if\n    user can pick</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;TAXID&gt;</font></b></td>\n    <td width=\"318\"><font size=\"2\">ID used for tax purposes (such as SSN), may be same as user\n    ID</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;SECURITYNAME&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Mother's maiden name or equivalent</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;DATEBIRTH&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Date of birth</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" 
face=\"Arial\">&lt;<i>ACCTFROM&gt;</i></font> </td>\n    <td width=\"318\"><font size=\"2\">An account description aggregate for one existing account\n    at the FI, for identification purposes only. Can be &lt;BANKACCTFROM&gt;,\n    &lt;INVACCTFROM&gt;, etc.</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;/<i>ACCTFROM</i>&gt;</font> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/ENROLLRQ&gt;</font></b> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>This enrollment request is intended for use only by individuals. Business enrollment\nwill be defined in a later release.\n\n</p><ol>\n  <li><a name=\"_Toc380493328\"><font size=\"4\" face=\"Arial\">Enrollment Response</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The main purpose of the enrollment response is to acknowledge the\nrequest. In those cases where FIs permit delivery of an ID and a temporary password, the\nresponse also provides for this. Otherwise the server will send the real response to the\nuser by way of regular mail, electronic mail, or over the telephone. 
If enrollment is\nsuccessful, but the server does not return the ID and password in the response, a server\nis REQUIRED to use status code 13000 and provide some information to the user by means of the\n&lt;MESSAGE&gt; element in the &lt;STATUS&gt; aggregate about what to expect next.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;ENROLLRS&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Enrollment-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;TEMPPASS&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Temporary password</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\">&lt;USERID&gt;</font></td>\n    <td width=\"318\"><font size=\"2\">User ID</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;DTEXPIRE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Time the temporary password expires (if &lt;TEMPPASS&gt;\n    included)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/ENROLLRS&gt;</font></b> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493329\"><font size=\"4\" face=\"Arial\">Enrollment Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error 
(ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13000</font></td>\n    <td width=\"300\"><font size=\"2\">User ID &amp; password will be sent out-of-band (INFO)</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13500</font></td>\n    <td width=\"300\"><font size=\"2\">Unable to enroll (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13501</font></td>\n    <td width=\"300\"><font size=\"2\">User already enrolled (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493330\"><font size=\"4\" face=\"Arial\">Examples</font></a><font size=\"2\">An\n    enrollment request:</font> </li>\n</ol>\n\n<pre><font size=\"1\">&lt;ENROLLTRNRQ&gt;\n\t&lt;TRNUID&gt;12345\n\t&lt;ENROLLRQ&gt;\n\t\t&lt;FIRSTNAME&gt;Joe\n\t\t&lt;MIDDLENAME&gt;Lee\n\t\t&lt;LASTNAME&gt;Smith\n\t\t&lt;ADDR1&gt;21 Main St.\n\t\t&lt;CITY&gt;Anytown\n\t\t&lt;STATE&gt;TX\n\t\t&lt;POSTALCODE&gt;87321\n\t\t&lt;COUNTRY&gt;USA\n\t\t&lt;DAYPHONE&gt;123-456-7890\n\t\t&lt;EVEPHONE&gt;987-654-3210\n\t\t&lt;EMAIL&gt;jsmith@isp.com\n\t\t&lt;USERID&gt;jls\n\t\t&lt;TAXID&gt;123-456-1234\n\t\t&lt;SECURITYNAME&gt;jbmam\n\t\t&lt;DATEBIRTH&gt;19530202\n\t&lt;/ENROLLRQ&gt;\n&lt;/ENROLLTRNRQ&gt;</font><font size=\"2\">And the reply might be:\n</font><font size=\"1\" face=\"Courier New\">&lt;ENROLLTRNRS&gt;\n\t&lt;TRNUID&gt;12345\n\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t&lt;/STATUS&gt;\n\t&lt;ENROLLRS&gt;\n\t\t&lt;TEMPPASS&gt;changeme\n\t\t&lt;USERID&gt;jls\n\t\t&lt;DTEXPIRE&gt;19970105\n\t&lt;/ENROLLRS&gt;\n&lt;/ENROLLTRNRS&gt;</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493331\"><font size=\"5\" face=\"Arial\">Account Information</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Account information requests ask a server to identify and describe all\nof the accounts accessible by the signed-on user. The definition of <i>all</i> is up to\nthe FI. 
At a minimum, it is <b>RECOMMENDED</b> that a server include information about all\naccounts that it can activate for one or more Open Financial Exchange services. To give\nthe user a complete picture of his relationship with an FI, FIs can give information on\nother accounts, even if those accounts are available only for limited Open Financial\nExchange services.</font> </p>\n\n<p><font size=\"2\">Some service providers will not have any prior knowledge of any user\naccount information. The profile allows these servers to report this, and clients will\nthen know to ask users for account information rather than reading it from the server.</font>\n</p>\n\n<p><font size=\"2\">Clients can perform several tasks for users with this account\ninformation. First, the information helps a client set up a user for online services by\ngiving it a precise list of its account information and available services for each.\nClients can set up their own internal state as well as prepare service activation requests\nwith no further typing by users. This can eliminate data entry mistakes in account\nnumbers, routing transit numbers, and so forth.</font> </p>\n\n<p><font size=\"2\">Second, FIs can provide limited information on accounts that would not\nordinarily be suitable to Open Financial Exchange services. For example, a balance-only\nstatement download would be useful for certificates of deposits even though a customer or\nan FI might not want or allow CDs to be used for full statement download.</font> </p>\n\n<p><font size=\"2\">For each account, there is one &lt;ACCTINFO&gt; aggregate returned. The\naggregate includes one service-specific account information aggregate for each available\nservice on that account. That, in turn, provides the service-specific account\nidentification. 
Common to each service-specific account information aggregate is the\n&lt;SVCSTATUS&gt; tag, which indicates the status of this service on this account.</font> </p>\n\n<p><font size=\"2\">A server should return joint accounts (accounts for which more than one\nuser ID can be used to access the account) for either user. Clients that wish to have a\nunified view will aggregate the results and remove duplicates before making specific\nrequests involving joint accounts.</font> </p>\n\n<p><font size=\"2\">Requests and responses include a &lt;DTACCTUP&gt; element. Responses\ncontain the last time a server updated the information. Clients can <b>OPTIONALLY </b>send\nthis in a subsequent request, and servers are <b>REQUIRED </b>to compare this to the\ncurrent modification time and only send information if it is more recent. The server sends\nthe entire account information response if the client's time is older; there is no attempt\nto incrementally update specific account information.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493332\"><font size=\"4\" face=\"Arial\">Request &lt;ACCTINFORQ&gt;</font></a>\n  </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"2\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;ACCTINFORQ&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Account-information-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;DTACCTUP&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Last &lt;DTACCTUP&gt; received in a response</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;INCIMAGES&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Y if server should include logo in response, N if client\n    will separately fetch them based on given URL; <i>Boolean</i></font> 
</td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/ACCTINFORQ&gt;</font></b> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493333\"><font size=\"4\" face=\"Arial\">Response &lt;ACCTINFORS&gt;</font></a>\n  </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"2\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;ACCTINFORS&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Account-information-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;DTACCTUP&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Date and time of last update to this information on the\n    server</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ACCTINFO&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Zero or more account information aggregates</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;/ACCTINFO&gt;</font> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/ACCTINFORS&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">End of account information response</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493334\"><font size=\"4\" face=\"Arial\">Account Information Aggregate\n    &lt;ACCTINFO&gt;</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"210\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"288\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;ACCTINFO&gt;</font></b> </td>\n    <td width=\"288\"><font size=\"2\">Account-information-record 
aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><font size=\"1\">&lt;DESC&gt;</font></td>\n    <td width=\"288\"><font size=\"2\">Description of the account, <i>A-80</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><font size=\"1\">&lt;PHONE&gt;</font></td>\n    <td width=\"288\"><font size=\"2\">Telephone number for the account, <i>A-20</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><font size=\"1\">&lt;LOGO&gt;</font></td>\n    <td width=\"288\"><font size=\"2\">URL to request the logo for the account (actual logos\n    should be included via multi-part MIME in the response file if requested), <i>URL</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;<i><b>XXXACCTINFO&gt;</b></i></font></b> </td>\n    <td width=\"288\"><font size=\"2\">Service-specific account information, defined in each\n    service chapter, one or more allowed</font> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;<b><i>XXXACCTFROM</i>&gt;</b></font></b> </td>\n    <td width=\"288\"><font size=\"2\">Service-specific account identification</font> </td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;/<b><i>XXXACCTFROM</i>&gt;</b></font></b> </td>\n    <td width=\"288\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;SVCSTATUS&gt;</font></b> </td>\n    <td width=\"288\"><font size=\"2\">AVAIL = Available, but not yet requested</font> <p><font size=\"2\">PEND = Requested, but not yet available</font> </p>\n    <p><font size=\"2\">ACTIVE = In use</font> </p></td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;/<b><i>XXXACCTINFO</i>&gt;</b></font></b> </td>\n    <td width=\"288\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"210\"><b><font size=\"1\">&lt;/ACCTINFO&gt;</font></b> </td>\n    <td width=\"288\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p><i><b>NOTE:</b> A server uses the &lt;DESC&gt; field to convey the FI's preferred name\nfor the 
account, such as \"PowerChecking.\" It should not include the account\nnumber. </i>\n\n</p><ol>\n  <li><a name=\"_Toc380493335\"><font size=\"4\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13001</font></td>\n    <td width=\"300\"><font size=\"2\">No change since supplied &lt;DTACCTUP&gt; (INFO)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493336\"><font size=\"4\" face=\"Arial\">Examples</font></a><font size=\"2\">An\n    account information request:</font> </li>\n</ol>\n\n<pre><font size=\"1\">&lt;ACCTINFOTRNRQ&gt;\n\t&lt;TRNUID&gt;12345\n\t&lt;ACCTINFORQ&gt;\n\t\t&lt;DTACCTUP&gt;19960101\n\t\t&lt;INCIMAGES&gt;N\n\t&lt;/ACCTINFORQ&gt;\n&lt;/ACCTINFOTRNRQ&gt;</font><font size=\"2\">And a response for a\nuser with access to one account, supporting banking:\n<a name=\"_Toc379956901\"><font size=\"1\">&lt;ACCTINFOTRNRS&gt;\n\t&lt;TRNUID&gt;12345\n\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t&lt;/STATUS&gt;\n\t&lt;ACCTINFORS&gt;\n\t\t&lt;DTACCTUP&gt;19960102\n\t\t&lt;ACCTINFO&gt;\n\t\t&lt;DESC&gt;Power 
Checking\n\t\t&lt;PHONE&gt;8002223333\n\t\t&lt;LOGO&gt;https://www.fi.com/ofx/logos/powercheck.jpg\n\t\t\t&lt;BANKACCTINFO&gt;\n\t\t\t\t&lt;BANKACCTFROM&gt;\n\t\t\t\t\t&lt;BANKID&gt;1234567789\n\t\t\t\t\t&lt;ACCTID&gt;12345\n\t\t\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t\t\t&lt;/BANKACCTFROM&gt;\n\t\t\t&lt;SUPTXDL&gt;Y\n\t\t\t&lt;XFERSRC&gt;Y\n\t\t\t&lt;XFERDEST&gt;Y\n\t\t\t&lt;SVCSTATUS&gt;ACTIVE\n\t\t\t&lt;/BANKACCTINFO&gt;\n\t\t&lt;/ACCTINFO&gt;\n\t&lt;/ACCTINFORS&gt;\n&lt;/ACCTINFOTRNRS&gt;</font></a></font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493337\"><font size=\"5\" face=\"Arial\">Service Activation</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Clients inform FIs that they wish to start, modify, or terminate a\nservice for an account by sending service activation requests. These are subject to data\nsynchronization, and servers should send responses to inform clients of any changes, even\nif the changes originated on the server. </font></p>\n\n<p><font size=\"2\">Clients use these records during the initial user sign-up process. Once\na client learns about the available accounts and services (by using the account\ninformation request above, or by having a user directly enter the required information),\nit sends a series of service ADD requests.</font> </p>\n\n<p><font size=\"2\">If a user changes any of the identifying information about an account,\nthe client sends a service activation request containing both the old and the new account\ninformation. Servers should interpret this as a change in the account, not a request to\ntransfer the service between two existing accounts, and all account-based information such\nas synchronization tokens should continue. If a user or FI is reporting that service\nshould be moved between two existing accounts, service must be terminated for the old\naccount and started for the new account. 
The new account will have reset token histories,\nas with any new service.</font> </p>\n\n<p><font size=\"2\">Each service to be added, changed, or removed is contained in its own\nrequest because the same real-world account might require different &lt;ACCTFROM&gt;\naggregates depending on the type of service.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493338\"><font size=\"4\" face=\"Arial\">Activation Request and Response</font></a>\n  </li>\n  <li><font size=\"2\" face=\"Arial\">Request &lt;ACCTRQ&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;ACCTRQ&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Account-service-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACTION&gt;</i></font></b> </td>\n    <td width=\"336\"><font size=\"2\">Action aggregate, either &lt;SVCADD&gt;, &lt;SVCCHG&gt;, or\n    &lt;SVCDEL&gt;</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACTION&gt;</i></font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SVC&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Service to be added/changed/deleted</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/ACCTRQ&gt;</font></b></td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Response &lt;ACCTRS&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font 
size=\"1\">&lt;ACCTRS&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Account-service-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACTION&gt;</i></font></b> </td>\n    <td width=\"336\"><font size=\"2\">Action aggregate, either &lt;SVCADD&gt;, &lt;SVCCHG&gt;, or\n    &lt;SVCDEL&gt;</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACTION&gt;</i></font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SVC&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Service to be added/changed:</font> <p><font size=\"2\">BANKSVC\n    = Banking service<br>\n    BPSVC = Payments service<br>\n    INVSVC = Investments</font> </p></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/ACCTRS&gt;</font></b></td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Service Add Aggregate &lt;SVCADD&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SVCADD&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Service-add aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACCTTO&gt;</i></font></b> </td>\n    <td width=\"336\"><font size=\"2\">Service-specific-account-identification aggregate (see\n    &lt;BANKACCTTO&gt;, &lt;INVACCTTO&gt;)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACCTTO</i>&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/SVCADD&gt;</font></b></td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font 
size=\"2\" face=\"Arial\">Service Change Aggregate &lt;SVCCHG&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SVCCHG&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Service-change aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACCTFROM&gt;</i></font></b> </td>\n    <td width=\"336\"><font size=\"2\">Service-specific-account-identification aggregate (see\n    &lt;BANKACCTFROM&gt;, &lt;INVACCTFROM&gt;)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACCTFROM</i>&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACCTTO&gt;</i></font></b> </td>\n    <td width=\"336\"><font size=\"2\">Service-specific-account-identification aggregate (see\n    &lt;BANKACCTTO&gt;, &lt;INVACCTTO&gt;)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACCTTO</i>&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/SVCCHG&gt;</font></b></td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Service Delete Aggregate &lt;SVCDEL&gt;</font> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SVCDEL&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Service-deletion aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;<i>ACCTFROM&gt;</i></font></b> </td>\n    
<td width=\"336\"><font size=\"2\">Service-specific-account-identification aggregate (see\n    &lt;BANKACCTFROM&gt;, &lt;INVACCTFROM&gt;)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/<i>ACCTFROM</i>&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/SVCDEL&gt;</font></b></td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc378489965\"><font size=\"2\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2002</font></td>\n    <td width=\"300\"><font size=\"2\">Other account error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2006</font></td>\n    <td width=\"300\"><font size=\"2\">Source (from) account not found (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2007</font></td>\n    <td width=\"300\"><font size=\"2\">Source (from) account closed (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2008</font></td>\n    <td width=\"300\"><font size=\"2\">Source (from) account not authorized (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2009</font></td>\n    <td width=\"300\"><font size=\"2\">Destination (to) account not found (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2010</font></td>\n    <td width=\"300\"><font size=\"2\">Destination 
(to) account closed (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2011</font></td>\n    <td width=\"300\"><font size=\"2\">Destination (to) account not authorized (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13502</font></td>\n    <td width=\"300\"><font size=\"2\">Invalid service (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493339\"><font size=\"4\" face=\"Arial\">Service Activation Synchronization</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Service activation requests are subject to the standard data\nsynchronization protocol. The scope of these requests and the &lt;TOKEN&gt; is the\nuser-ID. The request and response tags are &lt;ACCTSYNCRQ&gt; and &lt;ACCTSYNCRS&gt;.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493340\"><font size=\"4\" face=\"Arial\">Examples</font></a><font size=\"2\">Activating\n    a payment:</font> </li>\n</ol>\n\n<pre><font size=\"1\">&lt;ACCTTRNRQ&gt;\n\t&lt;TRNUID&gt;12345\n\t&lt;ACCTRQ&gt;\n\t\t&lt;SVCADD&gt;\n\t\t\t&lt;BANKACCTTO&gt;\n\t\t\t\t&lt;BANKID&gt;1234567789\n\t\t\t\t&lt;ACCTID&gt;12345\n\t\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t\t&lt;/BANKACCTTO&gt;\n\t\t&lt;/SVCADD&gt;\n\t\t&lt;SVC&gt;BPSVC\n\t&lt;/ACCTRQ&gt;\n&lt;/ACCTTRNRQ&gt;</font><font size=\"2\">A response:\n</font><font size=\"1\" face=\"Courier New\">&lt;ACCTTRNRS&gt;\n\t&lt;TRNUID&gt;12345\n\t\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t&lt;/STATUS&gt;\n\t&lt;ACCTRS&gt;\n\t\t&lt;SVCADD&gt;\n\t\t\t&lt;BANKACCTTO&gt;\n\t\t\t\t&lt;BANKID&gt;1234567789\n\t\t\t\t&lt;ACCTID&gt;12345\n\t\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t\t&lt;/BANKACCTTO&gt;\n\t\t&lt;/SVCADD&gt;\n\t\t&lt;SVC&gt;BPSVC\n\t&lt;/ACCTRS&gt;\n&lt;/ACCTTRNRS&gt;</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493341\"><font size=\"5\" face=\"Arial\">Name and Address Changes\n    &lt;CHGUSERINFORQ&gt; &lt;CHGUSERINFORS&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Users may request that an FI 
update the official name, address, phone,\nand e-mail information using the &lt;CHGUSERINFORQ&gt;. Only the fields that should be\nchanged are sent. The response reports all of the current values. For security reasons,\nsome of the fields in the &lt;ENROLLRQ&gt; cannot be changed online, such as tax ID. </font></p>\n\n<p><font size=\"2\">The transaction tags are &lt;CHGUSERINFOTRNRQ&gt; and\n&lt;CHGUSERINFOTRNRS&gt;. These methods are subject to synchronization,\n&lt;CHGUSERINFOSYNCRQ&gt; and &lt;CHGUSERINFOSYNCRS&gt;.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493342\"><font size=\"4\" face=\"Arial\">&lt;CHGUSERINFORQ&gt;</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;CHGUSERINFORQ&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Change-user-information-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;FIRSTNAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">First name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;MIDDLENAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Middle name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;LASTNAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Last name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR1&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 1</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR2&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 2</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" 
face=\"Arial\">&lt;ADDR3&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 3</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;CITY&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">City</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;STATE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">State or province</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;POSTALCODE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Postal code</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;COUNTRY&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">3-letter country code from ISO/DIS-3166</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;DAYPHONE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Daytime telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;EVEPHONE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Evening telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;EMAIL&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Electronic e-mail address</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/CHGUSERINFORQ&gt;</font></b> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493343\"><font size=\"4\" face=\"Arial\">&lt;CHGUSERINFORS&gt;</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"318\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;CHGUSERINFORS&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Change-user-information-response 
aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;FIRSTNAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">First name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;MIDDLENAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Middle name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;LASTNAME&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Last name of user</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR1&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 1</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;ADDR2&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Address line 2</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;CITY&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">City</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;STATE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">State or province</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;POSTALCODE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Postal code</font></td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;COUNTRY&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">3-letter country code from ISO/DIS-3166</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;DAYPHONE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Daytime telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" face=\"Arial\">&lt;EVEPHONE&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Evening telephone number</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><font size=\"1\" 
face=\"Arial\">&lt;EMAIL&gt;</font> </td>\n    <td width=\"318\"><font size=\"2\">Electronic e-mail address</font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;DTINFOCHG&gt;</font></b> </td>\n    <td width=\"318\"><font size=\"2\">Date and time of update <i>datetime</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"180\"><b><font size=\"1\">&lt;/CHGUSERINFORS&gt;</font></b> </td>\n    <td width=\"318\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493344\"><font size=\"4\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">13503</font></td>\n    <td width=\"300\"><font size=\"2\">Cannot change user information (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493345\"><font size=\"5\" face=\"Arial\">Signup Message Set Profile\n    Information</font></a> </li>\n</ol>\n\n<p><font size=\"2\">A server must include the following aggregates as part of the profile\n&lt;MSGSETLIST&gt; response, since every server must support at least the account\ninformation and service activation messages. 
In the &lt;ENROLLPROF&gt; aggregate, servers\nindicate how enrollment should proceed: via the client, a given web page, or a text\nmessage directing users to some other method (such as a phone call).</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SIGNUPMSGSET&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Signup-message-set-profile-information aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;SIGNUPMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Opening tag for V1 of the message set profile information</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Common message set information, defined in the profile\n    chapter</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\">&nbsp;</td>\n    <td width=\"336\"><font size=\"2\">Enrollment options - only one of &lt;CLIENTENROLL&gt;,\n    &lt;WEBENROLL&gt;, or &lt;OTHERENROLL&gt; is allowed</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;</font></b><font size=\"1\" face=\"Arial\">CLIENTENROLL<b>&gt;</b></font>\n    </td>\n    <td width=\"336\"><font size=\"2\">Client-based enrollment supported</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;ACCTREQUIRED&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Y if account number is required as part of enrollment <i>Boolean</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/</font></b><font size=\"1\" 
face=\"Arial\">CLIENTENROLL<b>&gt;</b></font>\n    </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;</font></b><font size=\"1\" face=\"Arial\">WEBENROLL<b>&gt;</b></font>\n    </td>\n    <td width=\"336\"><font size=\"2\">Web-based enrollment supported</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;URL&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">URL to start enrollment process</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/WEBENROLL&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;</font></b><font size=\"1\" face=\"Arial\">OTHERENROLL<b>&gt;</b></font>\n    </td>\n    <td width=\"336\"><font size=\"2\">Some other enrollment process</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;MESSAGE&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Message to give to consumer about what to do next (e.g. 
a\n    phone number) <i>A-80</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/</font></b><font size=\"1\" face=\"Arial\">OTHERENROLL<b>&gt;</b></font>\n    </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;CHGUSERINFO&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Y if server supports client-based user information changes</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;AVAILACCTS&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Y if server can provide information on accounts with\n    SVCSTATUS available, N means client should expect to ask user for specific account\n    information <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/SIGNUPMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/SIGNUPMSGSET&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p><br>\n\n</p><ol>\n  <li><a name=\"_Toc380493346\"><font size=\"6\" face=\"Arial\">Customer to FI Communication</font></a>\n  </li>\n  <li><a name=\"_Toc380493347\"><font size=\"5\" face=\"Arial\">The E-Mail Message Set</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The e-mail message set includes two messages: generic e-mail and generic\nMIME requests by way of URLs. 
In Open Financial Exchange files, the message set name is\nEMAILMSGSV1.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493348\"><font size=\"5\" face=\"Arial\">E-Mail Messages</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange allows consumers and FIs to exchange messages.\nThe message body is in HTML so that FIs can provide some graphic structure to the message.\nKeep in mind that, as with regular World Wide Web browsing, an Open Financial Exchange\nclient might not support some or all of the HTML formatting, so the text of the message\nmust be clear on its own. Clients can request that graphics (the images referenced in an\n&lt;IMG&gt; tag) be sent as part of the response file, or clients can separately request\nthose elements. If a server sends images, it should use the standard procedure for\nincorporating external data as described in Chapter 2. Servers are not required to support\nHTML or to send images, even if the client asks.</font> </p>\n\n<p><font size=\"2\">A user or an FI can originate a message. E-mail messages are subject to\ndata synchronization so that a server can send a response again if it is lost or if it is\nused by multiple clients.</font> </p>\n\n<p><font size=\"2\">Because e-mail messages cannot be replied to immediately, the response\nshould just echo back the original message (so that data synchronization will get this\noriginal e-mail message to other clients). 
When the FI is ready to reply, it should\ngenerate an unsolicited response (&lt;TRNUID&gt;0) and the client will pick this up during\nsynchronization.</font> </p>\n\n<table bordercolor=\"#000000\" border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#FFFFFF\" width=\"156\"><i><font size=\"1\">Client Sends</font></i> </td>\n    <td bgcolor=\"#FFFFFF\" width=\"168\"><i><font size=\"2\">Server Responds</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"156\"><font size=\"1\">Account information</font></td>\n    <td width=\"168\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"156\"><font size=\"1\">From, To</font></td>\n    <td width=\"168\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"156\"><font size=\"1\">Subject</font></td>\n    <td width=\"168\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"156\"><font size=\"1\">Message</font></td>\n    <td width=\"168\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"156\">&nbsp;</td>\n    <td width=\"168\"><font size=\"2\">Account information</font> </td>\n  </tr>\n  <tr>\n    <td width=\"156\">&nbsp;</td>\n    <td width=\"168\"><font size=\"2\">From, To</font> </td>\n  </tr>\n  <tr>\n    <td width=\"156\">&nbsp;</td>\n    <td width=\"168\"><font size=\"2\">Subject</font> </td>\n  </tr>\n  <tr>\n    <td width=\"156\">&nbsp;</td>\n    <td width=\"168\"><font size=\"2\">Message</font> </td>\n  </tr>\n  <tr>\n    <td width=\"156\">&nbsp;</td>\n    <td width=\"168\"><font size=\"2\">Type</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493349\"><font size=\"4\" face=\"Arial\">Regular vs. Specialized E-Mail</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">Several services with Open Financial Exchange define e-mail requests and\nresponses that contain additional information specific to that service. To simplify\nimplementation for both clients and servers, this section defines a &lt;MAIL&gt; aggregate\nthat Open Financial Exchange uses in all e-mail requests and responses. 
For regular\ne-mail, the only additional information is an account from aggregate and whether to\ninclude images in the e-mail response or not. </font>\n\n</p><ol>\n  <li><a name=\"_Toc380493350\"><font size=\"4\" face=\"Arial\">Basic &lt;MAIL&gt; Aggregate</font></a>\n  </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAIL&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Core e-mail aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USERID&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">User ID such as SSN</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;DTCREATED&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">When message was created <i>datetime</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;FROM&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Customer's input for whom message is from, <i>A-32</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;TO&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Who e-mail should be delivered to, <i>A-32</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;SUBJECT&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Subject of message (plain text, not HTML), <i>A-60</i></font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MSGBODY&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Body of message, HTML-encoded or plain text depending on\n    &lt;USEHTML&gt;, <i>A-10000</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MSGBODY&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">End of 
message</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;INCIMAGES&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Include images in response, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USEHTML&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Y if client wants an HTML response, N if client wants plain\n    text, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAIL&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>If using HTML for the message body, clients and servers are <b>REQUIRED</b> to wrap the\ndesired HTML in an SGML marked section to protect the HTML markup: &lt;![ CDATA [ ... html\n... ]]&gt;. See the example.\n\n</p><ol>\n  <li><a name=\"_Toc380493351\"><font size=\"4\" face=\"Arial\">E-Mail &lt;MAILRQ&gt; &lt;MAILRS&gt;</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">E-mail is subject to synchronization. 
The transaction tag is\n&lt;MAILTRNRQ&gt; / &lt;MAILTRNRS&gt; and the synchronization tag is &lt;MAILSYNCRQ&gt; /\n&lt;MAILSYNCRS&gt;.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAILRQ&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">E-mail-message-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAIL&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Core e-mail aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAIL&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAILRQ&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>In a response, the &lt;TRNUID&gt; is zero if this is an unsolicited message. Otherwise,\nit should contain the &lt;TRNUID&gt; of the user's original message. It is RECOMMENDED\nthat servers include the &lt;MESSAGE&gt; of the user's message as part of the reply\n&lt;MESSAGE&gt;. The &lt;MESSAGE&gt; contents can include carriage returns to identify\ndesired line breaks. 
</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAILRS&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">E-mail-message-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAIL&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Core e-mail aggregate</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAIL&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAILRS&gt;</font></b></td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc378489974\"><font size=\"2\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">16500</font></td>\n    <td width=\"300\"><font size=\"2\">HTML not allowed (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">16501</font></td>\n    <td width=\"300\"><font size=\"2\">Unknown mail To: (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493352\"><font size=\"4\" face=\"Arial\">E-Mail Synchronization\n    &lt;MAILSYNCRQ&gt; &lt;MAILSYNCRS&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange uses data 
synchronization to collect responses\nthat could have been lost due to communication problems, or that the servers previously\nsent to a different client or data file. All messages sent to the signed-on user ID are\ncovered by a single &lt;TOKEN&gt;. Note that this synchronization action expects only the\nbasic &lt;MAILRS&gt; responses. Specialized e-mail is received by means of their own\nsynchronization requests.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><font size=\"1\">Tag</font></td>\n    <td bgcolor=\"#000000\" width=\"366\"><font size=\"2\">Description</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAILSYNCRQ&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">E-mail-synchronization-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;TOKEN&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Client history marker</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;INCIMAGES&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Include images in response, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;USEHTML&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Y if client wants an HTML response, N if client wants plain\n    text, <i>Boolean</i></font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAILSYNCRQ&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><font size=\"1\">Tag</font></td>\n    <td bgcolor=\"#000000\" width=\"366\"><font size=\"2\">Description</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;MAILSYNCRS&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">E-mail-synchronization-response. 
aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;TOKEN&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">Server history marker</font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><font size=\"1\">&lt;MAILTRNRS&gt;</font></td>\n    <td width=\"366\"><font size=\"2\">Missing e-mail response transactions (0 or more)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/MAILSYNCRS&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493353\"><font size=\"4\" face=\"Arial\">Example</font></a> </li>\n</ol>\n\n<p><font size=\"2\">In this example, a consumer requests information from customer service\nabout the checking statement just downloaded. This example omits the &lt;OFX&gt; top level\nand the signon &lt;SONRQ&gt;. This example uses HTML for the message body, and so it must\nprotect the HTML content in an SGML CDATA marked section. The request:</font> </p>\n\n<pre><font size=\"1\">&lt;MAILTRNRQ&gt;\n\t&lt;TRNUID&gt;54321\n\t&lt;MAILRQ&gt;\n\t\t&lt;MAIL&gt;\n\t\t\t&lt;USERID&gt;123456789\n\t\t\t&lt;FROM&gt;James Hackleman\n\t\t\t&lt;TO&gt;Noelani Federal Savings\n\t\t\t&lt;SUBJECT&gt;What do I need to earn interest?\n\t\t\t&lt;DTCREATED&gt;19960305\n</font><font size=\"2\">\t\t\t&lt;MSGBODY&gt;&lt;![ CDATA [&lt;HTML&gt;&lt;BODY&gt;I didn't earn any interest this month. 
Can you please tell me what I need to do to earn interest on this account?&lt;/BODY&gt;&lt;/HTML&gt;\n]]&gt;&lt;/MSGBODY&gt;\n\t\t\t&lt;INCIMAGES&gt;N\n\t\t\t&lt;USEHTML&gt;Y\n\t\t&lt;/MAIL&gt;\n\t&lt;/MAILRQ&gt;\n&lt;/MAILTRNRQ&gt;The response from the FI:\n</font><font size=\"1\" face=\"Courier New\">&lt;MAILTRNRS&gt;\n\t&lt;TRNUID&gt;54321\n\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t\t&lt;/STATUS&gt;\n\t&lt;MAILRS&gt;\n\t\t&lt;MAIL&gt;\n\t\t\t&lt;USERID&gt;123456789\n\t\t\t&lt;DTCREATED&gt;19960307\n\t\t\t&lt;FROM&gt;Noelani Federal Savings\n\t\t\t&lt;TO&gt;James Hackleman\n\t\t\t&lt;SUBJECT&gt;Re: What do I need to earn interest?\n</font><font size=\"2\">\t\t\t&lt;MSGBODY&gt;&lt;![ CDATA [&lt;HTML&gt;&lt;BODY&gt;You need to maintain $1000 in this account to earn interest. Because your balance was only $750 this month, no interest was earned. You could also switch to our new Checking Extra plan that always pays interest. Call us or check our web page http://www.fi.com/check-plans.html for more information.\nSincerely,\nCustomer Service Department\n\nOriginal message:\nI didn't earn any interest this month. 
Can you please tell me what I need to do to earn interest on this account?&lt;/BODY&gt;&lt;/HTML&gt;\n]]&gt;&lt;/MSGBODY&gt;\n\t\t\t&lt;INCIMAGES&gt;N\n\t\t\t&lt;USEHTML&gt;Y\n\t\t&lt;/MAIL&gt;\n\t&lt;/MAILRS&gt;\n&lt;/MAILTRNRS&gt;</font>\n</pre>\n\n<ol>\n  <li><font size=\"2\" face=\"Arial\">Example of Synchronization Involving E-Mail </font></li>\n</ol>\n\n<p><font size=\"2\">In the following example the client did not receive the reply to the\nmessage sent in the previous example, so its &lt;TOKEN&gt; is one less than the server's.\nThe server replies by giving the current &lt;TOKEN&gt; and the missed response.</font> </p>\n\n<pre><font size=\"1\">&lt;MAILSYNCRQ&gt;\n\t&lt;TOKEN&gt;101\n&lt;/MAILSYNCRQ&gt;\n\n&lt;MAILSYNCRS&gt;\n\t&lt;TOKEN&gt;102\n\t&lt;MAILTRNRS&gt;\n\t\t&lt;TRNUID&gt;54321\n\t\t&lt;STATUS&gt;\n\t\t\t&lt;CODE&gt;0\n\t\t\t&lt;SEVERITY&gt;INFO\n\t\t&lt;/STATUS&gt;\n\t\t&lt;MAILRS&gt;\n\t\t\t... contents of e-mail message response as shown in previous example\n\t\t&lt;/MAILRS&gt;\n\t&lt;/MAILTRNRS&gt;\n&lt;/MAILSYNCRS&gt;</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493354\"><font size=\"5\" face=\"Arial\">Get HTML Page</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Some responses contain values that are URLs, intended to be separately\nfetched by clients if desired. Clients can use their own HTTP libraries to perform this\nfetch outside of the Open Financial Exchange specification. However, to insulate clients\nagainst changes in transport technology, and to allow for fetches that require the\nprotection of an authenticated signon by a specific user, Open Financial Exchange defines\na transaction roughly equivalent to an HTTP Get. 
Any MIME type can be retrieved, including\nimages as well as HTML pages.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493355\"><font size=\"4\" face=\"Arial\">MIME Get Request and Response\n    &lt;GETMIMERQ&gt; &lt;GETMIMERS&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The following table lists the components of a request:</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;GETMIMERQ&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Get-MIME-request aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;URL&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">URL, <i>URL</i></font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/GETMIMERQ&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p>The response simply echoes back the URL. The actual response, whether HTML, an image,\nor some other type, is always sent as a separate part of the file using multi-part MIME. 
</p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"132\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"366\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;GETMIMERS&gt;</font></b> </td>\n    <td width=\"366\"><font size=\"2\">Get-MIME-response aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;URL&gt;</font></b></td>\n    <td width=\"366\"><font size=\"2\">URL, <i>URL</i></font></td>\n  </tr>\n  <tr>\n    <td width=\"132\"><b><font size=\"1\">&lt;/GETMIMERS&gt;</font></b> </td>\n    <td width=\"366\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc378489978\"><font size=\"2\" face=\"Arial\">Status Codes</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"72\"><i><font size=\"1\">Code</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"300\"><i><font size=\"2\">Meaning</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">0</font></td>\n    <td width=\"300\"><font size=\"2\">Success (INFO)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2000</font></td>\n    <td width=\"300\"><font size=\"2\">General error (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">2019</font></td>\n    <td width=\"300\"><font size=\"2\">Duplicate transaction (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">16502</font></td>\n    <td width=\"300\"><font size=\"2\">Invalid URL (ERROR)</font> </td>\n  </tr>\n  <tr>\n    <td width=\"72\"><font size=\"1\">16503</font></td>\n    <td width=\"300\"><font size=\"2\">Unable to get URL (ERROR)</font> </td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493356\"><font size=\"4\" face=\"Arial\">Example</font></a><font size=\"2\">A\n    request:</font> </li>\n</ol>\n\n<pre><font 
size=\"1\">&lt;GETMIMETRNRQ&gt;\n\t&lt;TRNUID&gt;54321\n\t&lt;GETMIMERQ&gt;\n\t\t&lt;URL&gt;http://www.fi.com/apage.html\n\t&lt;/GETMIMERQ&gt;\n&lt;/GETMIMETRNRQ&gt;</font><font size=\"2\">A response - the full\nfile is shown here to illustrate the use of multi-part MIME:\n</font><font size=\"1\" face=\"Courier New\">HTTP 1.0 200 OK\nContent-Type: multipart/x-mixed-replace; boundary =--boundary-\n\n--boundary--\nContent-Type: application/x-ofx\nContent-Length: 8732\n\n</font><font size=\"2\">OFXHEADER:100\nDATA:OFXSGML\nVERSION:100\nENCRYPTION:1\nENCODING:USASCII\n\n&lt;OFX&gt;\n\t\t&lt;!-- signon not shown\n\t\tmessage set wrappers not shown --&gt;\n&lt;GETMIMETRNRS&gt;\n\t&lt;TRNUID&gt;54321\n\t&lt;STATUS&gt;\n\t\t&lt;CODE&gt;0\n\t\t&lt;SEVERITY&gt;INFO\n\t&lt;/STATUS&gt;\n\t&lt;GETMIMERS&gt;\n\t\t&lt;URL&gt;http://www.fi.com/apage.html\n\t&lt;/GETMIMERS&gt;\n&lt;/GETMIMETRNRS&gt;\n&lt;/OFX&gt;\n\n--boundary--\nContent-Type: text/html\n&lt;HTML&gt;\n\t&lt;!-- standard HTML page --&gt;\n&lt;/HTML&gt;\n\n--boundary--\n\n</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493357\"><font size=\"5\" face=\"Arial\">E-Mail Message Set Profile\n    Information</font></a> </li>\n</ol>\n\n<p><font size=\"2\">If either or both of the messages in the e-mail message set are\nsupported, the following aggregate must be included in the profile &lt;MSGSETLIST&gt;\nresponse.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"162\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"336\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;EMAILMSGSET&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">E-mail-message-set-profile-information aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;EMAILMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Opening tag for V1 of the message set profile 
information</font>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Common message set information, defined in the profile\n    chapter</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/MSGSETCORE&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;EMAIL&gt;</font></b></td>\n    <td width=\"336\"><font size=\"2\">Y if server supports generic e-mail message</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;GETMIME&gt;</font></b> </td>\n    <td width=\"336\"><font size=\"2\">Y if server supports get MIME message</font> </td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/EMAILMSGSETV1&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n  <tr>\n    <td width=\"162\"><b><font size=\"1\">&lt;/EMAILMSGSET&gt;</font></b> </td>\n    <td width=\"336\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<p><br>\n<br>\n\n</p><ol>\n  <li><a name=\"_Toc380493358\"><font size=\"6\" face=\"Arial\">Recurring Transactions</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Open Financial Exchange enables users to automate transactions that\noccur on a regular basis. Recurring transactions are useful when a customer has payments\nor transfers, for example, that repeat at regular intervals. The customer can create a\n\"model\" at the server for automatic generation of these instructions. The model\nin turn creates payments or transfers until it is canceled or expires. 
After the user\ncreates a recurring model at the server, the server can relieve the user from the burden\nof creating these transactions; it generates the transactions on its own, based on the\noperating parameters of the model.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493359\"><font size=\"5\" face=\"Arial\">Creating a Recurring Model</font></a>\n  </li>\n</ol>\n\n<p><font size=\"2\">The client must provide the following information to create a model:</font>\n\n</p><ul>\n  <li><font size=\"2\">Type of transaction generated by the model (payment or transfer) </font></li>\n  <li><font size=\"2\">Frequency of recurring transaction</font> </li>\n  <li><font size=\"2\">Total number of recurring transactions to generate</font> </li>\n  <li><font size=\"2\">Service-specific information, such as transfer date, payment amount,\n    payee address</font> </li>\n</ul>\n\n<p><font size=\"2\">The model creates each transaction some time before its due date,\nusually thirty days. This allows the user to retrieve the transactions in advance of\nposting. This also gives the user the opportunity to modify or cancel individual\ntransactions without changing the recurring model itself.</font> </p>\n\n<p><font size=\"2\">When a model is created, it can generate several transactions\nimmediately. The model does not automatically return responses for the newly created\ntransactions. It only returns a response to the request that was made to create the model.\nFor this reason, clients should send a synchronization request along with the request to\ncreate a model. This allows the server to return the newly created transaction responses,\nas well as the response to the request to set up a new model.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493360\"><font size=\"5\" face=\"Arial\">Recurring Instructions\n    &lt;RECURRINST&gt;</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The Recurring Instructions aggregate is used to specify the schedule for\na repeating instruction. 
It is passed to the server when a recurring transfer or payment\nmodel is first created.</font> </p>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"168\"><i><font size=\"1\">Tag</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"330\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"168\"><b><font size=\"1\">&lt;RECURRINST&gt;</font></b> </td>\n    <td width=\"330\"><font size=\"2\">Recurring-Instructions aggregate</font> </td>\n  </tr>\n  <tr>\n    <td width=\"168\"><b><font size=\"1\">&lt;FREQ&gt;</font></b></td>\n    <td width=\"330\"><font size=\"2\">Frequency, see section 10.2.1</font> </td>\n  </tr>\n  <tr>\n    <td width=\"168\"><font size=\"1\" face=\"Arial\">&lt;NINSTS&gt;</font> </td>\n    <td width=\"330\"><font size=\"2\">Number of instructions</font> <p><font size=\"2\">If this tag\n    is absent, the schedule is open-ended,<i> N-3</i></font> </p></td>\n  </tr>\n  <tr>\n    <td width=\"168\"><b><font size=\"1\">&lt;/RECURRINST&gt;</font></b> </td>\n    <td width=\"330\">&nbsp;</td>\n  </tr>\n</tbody></table>\n\n<ol>\n  <li><a name=\"_Toc380493361\"><font size=\"4\" face=\"Arial\">Values for &lt;FREQ&gt;</font></a> </li>\n</ol>\n\n<table border=\"1\">\n  <tbody><tr>\n    <td bgcolor=\"#000000\" width=\"126\"><i><font size=\"2\">Value</font></i> </td>\n    <td bgcolor=\"#000000\" width=\"180\"><i><font size=\"2\">Description</font></i> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">WEEKLY</font></td>\n    <td width=\"180\"><font size=\"2\">Weekly</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">BIWEEKLY</font></td>\n    <td width=\"180\"><font size=\"2\">Biweekly</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">TWICEMONTHLY</font></td>\n    <td width=\"180\"><font size=\"2\">Twice a month</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">MONTHLY</font></td>\n    <td width=\"180\"><font size=\"2\">Monthly</font> 
</td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">FOURWEEKS</font></td>\n    <td width=\"180\"><font size=\"2\">Every four weeks</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">BIMONTHLY</font></td>\n    <td width=\"180\"><font size=\"2\">Bimonthly</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">QUARTERLY</font></td>\n    <td width=\"180\"><font size=\"2\">Quarterly</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">SEMIANNUALLY</font></td>\n    <td width=\"180\"><font size=\"2\">Semiannually</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">TRIANNUALLY</font></td>\n    <td width=\"180\"><font size=\"2\">Triannually</font> </td>\n  </tr>\n  <tr>\n    <td width=\"126\"><font size=\"1\">ANNUALLY</font></td>\n    <td width=\"180\"><font size=\"2\">Annually</font> </td>\n  </tr>\n</tbody></table>\n\n<p>Rules for calculating recurring dates of WEEKLY, BIWEEKLY, and TWICEMONTHLY are as\nfollows:\n\n</p><ul>\n  <li>WEEKLY = starting date for first transaction, starting date + 7 days for the second </li>\n  <li>TWICEMONTHLY = starting date for first, starting date + 15 days for the second </li>\n  <li>BIWEEKLY = starting date for first, starting date + 14 days for the second </li>\n</ul>\n\n<p><b>Examples:</b> </p>\n\n<p>Start date of May 2: next transaction date for WEEKLY is May 9; TWICEMONTHLY is May 17;\nnext transfer date for BIWEEKLY is May 16. </p>\n\n<p>Start date of May 20: next date for WEEKLY is May 27; TWICEMONTHLY is June 4; next date\nfor BIWEEKLY is June 3. </p>\n\n<p>TWICEMONTHLY recurring transactions will occur each month on those days adjusting for\nweekends and holidays. BIWEEKLY will occur every 14 days.\n\n</p><ol>\n  <li><a name=\"_Toc380493362\"><font size=\"4\" face=\"Arial\">Examples</font></a> </li>\n</ol>\n\n<p><font size=\"2\">The following example illustrates the creation of a repeating payment.\nThe payment repeats on a monthly basis for 12 months. 
All payments are for $395.The\nrequest:</font> </p>\n\n<pre><font size=\"1\">.\n.\n.\n&lt;RECPMTRQ&gt;\n\t&lt;RECURRINST&gt;\n\t\t&lt;FREQ&gt;MONTHLY\n\t\t&lt;NINSTS&gt;12\n\t&lt;/RECURRINST&gt;\n\t&lt;PMTINFO&gt;\n\t\t&lt;BANKACCTFROM&gt;\n\t\t\t&lt;BANKID&gt;555432180\n\t\t\t&lt;ACCTID&gt;763984\n\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t&lt;/BANKACCTFROM&gt;\n\t\t&lt;TRNAMT&gt;395.00\n\t\t&lt;PAYEEID&gt;77810\n\t\t&lt;PAYACCT&gt;444-78-97572\n\t\t&lt;DTDUE&gt;19971115\n\t\t&lt;MEMO&gt;Auto loan payment\n\t&lt;/PMTINFO&gt;\n&lt;/RECPMTRQ&gt;\n.\n.\n.\n\n</font><font size=\"2\">The response includes the &lt;RECSRVRTID&gt;\nthat the client can use\n\nto cancel or modify the model:\n</font><font size=\"1\" face=\"Courier New\">.\n.\n.\n&lt;RECPMTRS&gt;\n</font><font size=\"2\">\t&lt;RECSRVRTID&gt;387687138\n\t&lt;RECURRINST&gt;\n\t\t&lt;FREQ&gt;MONTHLY\n\t\t&lt;NINSTS&gt;12\n\t&lt;/RECURRINST&gt;\n\t&lt;PMTINFO&gt;\n\t\t&lt;BANKACCTFROM&gt;\n\t\t\t&lt;BANKID&gt;555432180\n\t\t\t&lt;ACCTID&gt;763984\n\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t&lt;/BANKACCTFROM&gt;\n\t\t&lt;TRNAMT&gt;395.00\n\t\t&lt;PAYEEID&gt;77810\n\t\t&lt;PAYACCT&gt;444-78-97572\n\t\t&lt;DTDUE&gt;19971115\n\t\t&lt;MEMO&gt;Auto loan payment\n\t&lt;/PMTINFO&gt;\n&lt;/RECPMTRS&gt;\n.\n.\n.</font>\n</pre>\n\n<ol>\n  <li><a name=\"_Toc380493363\"><font size=\"5\" face=\"Arial\">Retrieving Transactions Generated by\n    a Recurring Model</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Once created, a recurring model independently generates instructions.\nSince the client has not directly generated these transactions, the client has no record\nof their creation. 
To enable users to modify and/or cancel pending instructions, the\nclient must use data synchronization in order to retrieve these transactions.</font> </p>\n\n<p><font size=\"2\">The client has two purposes for synchronizing state with the server with\nrespect to recurring models:</font>\n\n</p><ul>\n  <li><font size=\"2\">Retrieve any added, modified, or canceled recurring models</font> </li>\n  <li><font size=\"2\">Retrieve any added, modified, or canceled transactions generated by any\n    models</font> </li>\n</ul>\n\n<p><font size=\"2\">The client must be able to synchronize with the state of any models at\nthe server, as well as the state of any transactions generated by the server.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493364\"><font size=\"5\" face=\"Arial\">Modifying and Canceling Individual\n    Transactions</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Once created and retrieved by the customer, recurring payments and\ntransfers are almost identical to customer-created payments or transfers. As with ordinary\npayments or transfers, you can cancel or modify transactions individually. However,\nbecause servers generate these transfers, they are different in the following respects:</font>\n\n</p><ul>\n  <li><font size=\"2\">Recurring transactions must be retrieved as part of a synchronization\n    request.</font> </li>\n  <li><font size=\"2\">Recurring transactions are related to a model. A server can modify or\n    cancel transactions if the model is modified or canceled. </font></li>\n</ul>\n\n<ol>\n  <li><font size=\"5\" face=\"Arial\"><a name=\"_Toc380493365\">Modifying and Canceling Recurring\n    Model</a>s</font> </li>\n</ol>\n\n<p><font size=\"2\">A recurring model can be modified or canceled. When a model is modified,\nall transactions that it generates in the future will change as well. The client can\nindicate whether transactions that have been generated, but have not been sent, should be\nmodified as well. 
The actual elements within a transaction that can be modified differ by\nservice. See the recurring sections within the Banking and Payments chapters for details.</font>\n</p>\n\n<p><font size=\"2\">A user can cancel a model immediately or at a future date. If a user\ncancels the model immediately, the client cancels any transactions that it has not yet\nsent. If the client schedules the cancel for a future date, the client will not cancel\npending transactions.</font>\n\n</p><ol>\n  <li><a name=\"_Toc380493366\"><font size=\"4\" face=\"Arial\">Examples</font></a> </li>\n</ol>\n\n<p><font size=\"2\">Canceling a recurring payment model requires the client to pass the\n&lt;RECSRVRTID&gt; of the model. The client requests that pending payments also be\ncanceled. The server cancels the model immediately and notifies the client that both the\nmodel and any scheduled payments were canceled.The request:</font> </p>\n\n<pre><font size=\"1\">.\n.\n.\n\t&lt;RECPMTCANCRQ&gt;\n\t\t&lt;RECSRVRTID&gt;387687138\n\t\t&lt;CANPENDING&gt;Y\n\t&lt;/RECPMTCANCRQ&gt;\n.\n.\n.\n</font><font size=\"2\">The response:\n</font><font size=\"1\" face=\"Courier New\">.\n.\n.\n\t&lt;RECPMTCANCRS&gt;\n\t\t&lt;RECSRVRTID&gt;387687138\n\t\t&lt;CANPENDING&gt;Y\n\t&lt;/RECPMTCANCRS&gt;\n.\n.\n.\n</font>\n</pre>\n\n<pre><img src=\"./sample_complexe_files/stadyn_image9.gif\" width=\"382\" height=\"208\" alt=\"stadyn_image9.gif (7739 bytes)\">\nstadyn_image9</pre>\n\n<p><font size=\"2\">The server also cancels any payments that have been generated but not\nexecuted. In the example shown above, the client would not learn of this immediately. To\nreceive notification that the model and all generated payments were canceled, the client\nwould need to include a synchronization request in the file. 
The following example\nillustrates this alternate approach.The request file now includes a synchronization\nrequest:</font> </p>\n\n<pre><font size=\"1\">.\n.\n.\n\t&lt;RECPMTCANCRQ&gt;\n\t\t&lt;RECSRVRTID&gt;387687138\n\t\t&lt;CANPENDING&gt;Y\n\t&lt;/RECPMTCANCRQ&gt;\n\t&lt;PMTSYNCRQ&gt;\n\t\t&lt;TOKEN&gt;12345\n\t\t&lt;BANKACCTFROM&gt;\n\t\t\t&lt;BANKID&gt;123432123\n\t\t\t&lt;ACCTID&gt;516273\n\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t&lt;/BANKACCTFROM&gt;\n\t&lt;/PMTSYNCRQ&gt;\n.\n.\n.\n</font><font size=\"2\">The response file now contains two responses\n(assuming one payment was pending),\n\none for the canceled model and one for the canceled payment.\n</font><font size=\"1\" face=\"Courier New\">.\n.\n.\n\t&lt;RECPMTCANCRS&gt;\n\t\t&lt;RECSRVRTID&gt;387687138\n\t\t&lt;CANPENDING&gt;Y\n\t&lt;/RECPMTCANCRS&gt;\n\t&lt;PMTSYNCRS&gt;\n\t\t&lt;TOKEN&gt;3247989384\n\t\t&lt;BANKACCTFROM&gt;\n\t\t\t&lt;BANKID&gt;123432123\n\t\t\t&lt;ACCTID&gt;516273\n\t\t\t&lt;ACCTTYPE&gt;CHECKING\n\t\t&lt;/BANKACCTFROM&gt;\n\t\t&lt;PMTTRNRS&gt;\n\t\t\t&lt;TRNUID&gt;10103\n\t\t\t&lt;STATUS&gt;\n\t\t\t\t&lt;CODE&gt;0\n\t\t\t\t&lt;SEVERITY&gt;INFO\n\t\t\t&lt;/STATUS&gt;\n\t\t\t&lt;PMTCANCRS&gt;\n\t\t\t\t&lt;SRVRTID&gt;1030155\n\t\t\t&lt;/PMTCANCRS&gt;\n\t\t&lt;/PMTTRNRS&gt;\n\t&lt;/PMTSYNCRS&gt;\n.\n.\n.\n\n</font>\n</pre>\n\n<p><img src=\"./sample_complexe_files/stadyn_image10.gif\" width=\"174\" height=\"310\" alt=\"stadyn_image10.gif (4217 bytes)\"><br>\nstadyn_image10</p>\n\n\n</body><grammarly-desktop-integration data-grammarly-shadow-root=\"true\"><template shadowrootmode=\"open\"><style>\n      div.grammarly-desktop-integration {\n        position: absolute;\n        width: 1px;\n        height: 1px;\n        padding: 0;\n        margin: -1px;\n        overflow: hidden;\n        clip: rect(0, 0, 0, 0);\n        white-space: nowrap;\n        border: 0;\n        -moz-user-select: none;\n        -webkit-user-select: none;\n        -ms-user-select:none;\n        user-select:none;\n      
}\n\n      div.grammarly-desktop-integration:before {\n        content: attr(data-content);\n      }\n    </style><div aria-label=\"grammarly-integration\" role=\"group\" tabindex=\"-1\" class=\"grammarly-desktop-integration\" data-content=\"{&quot;mode&quot;:&quot;full&quot;,&quot;isActive&quot;:true,&quot;isUserDisabled&quot;:false}\"></div></template></grammarly-desktop-integration></html>\n"
  },
  {
    "path": "libs/megaparse/tests/test_endpoints.py",
    "content": "import pytest\n\n\n@pytest.mark.asyncio\nasync def test_parse_file_endpoint(test_client):\n    # Simulate a request to the parse endpoint\n    with open(\"./tests/pdf/sample_pdf.pdf\", \"rb\") as file:\n        response = await test_client.post(\n            \"/v1/file\",\n            files={\"file\": (\"test.pdf\", file)},\n            data={\n                \"method\": \"unstructured\",\n                \"strategy\": \"auto\",\n                \"language\": \"en\",\n                \"check_table\": False,\n            },\n        )\n    assert response.status_code == 200\n    assert response.json()[\"message\"] == \"File parsed successfully\"\n\n\n@pytest.mark.asyncio\nasync def test_parse_url_endpoint(test_client):\n    response = await test_client.post(\"/v1/url?url=https://www.quivr.com\")\n    assert response.status_code == 200\n    assert response.json() == {\n        \"message\": \"Website content parsed successfully\",\n        \"result\": \"Fake website content\",\n    }\n"
  },
  {
    "path": "libs/megaparse/tests/test_import.py",
    "content": "import pytest\nfrom megaparse import MegaParse\n\n\n@pytest.mark.skip(\"slow test\")\ndef test_load():\n    megaparse = MegaParse()\n    response = megaparse.load(\"./tests/data/dummy.pdf\")\n    print(response)\n    assert response.strip(\"\\n\") == \"Dummy PDF download\"\n"
  },
  {
    "path": "libs/megaparse/tests/test_parsers.py",
    "content": "import os\n\nimport pytest\nfrom megaparse.parser.doctr_parser import DoctrParser\nfrom megaparse.parser.llama import LlamaParser\nfrom megaparse.parser.megaparse_vision import MegaParseVision\nfrom megaparse.parser.unstructured_parser import UnstructuredParser\nfrom megaparse_sdk.schema.extensions import FileExtension\n\nPARSER_LIST = [\n    UnstructuredParser,\n    # DoctrParser,\n]\n\n\n@pytest.mark.parametrize(\"parser\", PARSER_LIST)\n@pytest.mark.parametrize(\"extension\", list(FileExtension))\ndef test_sync_parser(parser, extension):\n    directory = \"./tests/supported_docs\"\n    file_path = next(\n        (\n            os.path.join(root, file)\n            for root, _, files in os.walk(directory)\n            for file in files\n            if file.endswith(extension.value)\n        ),\n        None,\n    )\n    if file_path is None:\n        pytest.fail(f\"No file with extension {extension.value} found in {directory}\")\n\n    myparser = parser()\n    if extension in myparser.supported_extensions:\n        response = myparser.convert(file_path)\n\n        assert response\n        assert len(str(response)) > 0\n    else:\n        with pytest.raises(ValueError):\n            myparser.convert(file_path)\n"
  },
  {
    "path": "libs/megaparse_sdk/CHANGELOG.md",
    "content": "# Changelog\n\n## [0.1.12](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.11...megaparse-sdk-v0.1.12) (2025-02-13)\n\n\n### Features\n\n* add layout detection ([#228](https://github.com/QuivrHQ/MegaParse/issues/228)) ([77f7040](https://github.com/QuivrHQ/MegaParse/commit/77f7040c9c221a17effce089be7ec575cdd83468))\n\n## [0.1.11](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.10...megaparse-sdk-v0.1.11) (2025-02-11)\n\n\n### Features\n\n* add_layout_detection ([#220](https://github.com/QuivrHQ/MegaParse/issues/220)) ([2d2d0b4](https://github.com/QuivrHQ/MegaParse/commit/2d2d0b42bba4c883db423568e932eda42edd60d7))\n* Text detection in auto strategy ([#209](https://github.com/QuivrHQ/MegaParse/issues/209)) ([03c7ada](https://github.com/QuivrHQ/MegaParse/commit/03c7ada1dc245e13ef41ffd6fa3a8ed869269d37))\n\n\n### Bug Fixes\n\n* Add EngineConfig & StrategyHandler ([#211](https://github.com/QuivrHQ/MegaParse/issues/211)) ([2e1c6dd](https://github.com/QuivrHQ/MegaParse/commit/2e1c6ddd676227d1cbc4cff9771b20595259ba38))\n* add parse tests for every supported extensions ([#198](https://github.com/QuivrHQ/MegaParse/issues/198)) ([9dff0de](https://github.com/QuivrHQ/MegaParse/commit/9dff0de0c1de848151fe9a6519b658f0924c1228))\n* Strategy heuristic test & fix ([#203](https://github.com/QuivrHQ/MegaParse/issues/203)) ([7b7fb40](https://github.com/QuivrHQ/MegaParse/commit/7b7fb40cae4ed380a5f0ca0035a7bd2bcc9147c3))\n\n## [0.1.10](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.9...megaparse-sdk-v0.1.10) (2024-12-16)\n\n\n### Bug Fixes\n\n* hatchling version ([#193](https://github.com/QuivrHQ/MegaParse/issues/193)) ([f6070a5](https://github.com/QuivrHQ/MegaParse/commit/f6070a5483a20eeb83751a2dcfc01b7f0fb14473))\n\n## [0.1.9](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.8...megaparse-sdk-v0.1.9) (2024-12-13)\n\n\n### Features\n\n* small fixes ([#181](https://github.com/QuivrHQ/MegaParse/issues/181)) 
([004afe2](https://github.com/QuivrHQ/MegaParse/commit/004afe2f170570075bbebcd32dec5d15ddba4609))\n\n## [0.1.8](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.7...megaparse-sdk-v0.1.8) (2024-12-12)\n\n\n### Features\n\n* custom auto ([#131](https://github.com/QuivrHQ/MegaParse/issues/131)) ([3cb5be4](https://github.com/QuivrHQ/MegaParse/commit/3cb5be4a8c8eeb6dd6e9b87d7bbca24491db4c29))\n* faster ocr ([#180](https://github.com/QuivrHQ/MegaParse/issues/180)) ([5661cb2](https://github.com/QuivrHQ/MegaParse/commit/5661cb2d52d959cbca0f41339791129cd35d4036))\n\n## [0.1.7](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.6...megaparse-sdk-v0.1.7) (2024-11-25)\n\n\n### Bug Fixes\n\n* Update README.md ([#154](https://github.com/QuivrHQ/MegaParse/issues/154)) ([a103393](https://github.com/QuivrHQ/MegaParse/commit/a1033938184e20c24b0e54ee0db088b28075fd14))\n\n## [0.1.6](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.5...megaparse-sdk-v0.1.6) (2024-11-25)\n\n\n### Features\n\n* megaparse sdk tests ([#148](https://github.com/QuivrHQ/MegaParse/issues/148)) ([e030285](https://github.com/QuivrHQ/MegaParse/commit/e0302853fc2c1526b8e912bf3ef85b970a5b89bc))\n\n## [0.1.5](https://github.com/QuivrHQ/MegaParse/compare/megaparse-sdk-v0.1.4...megaparse-sdk-v0.1.5) (2024-11-21)\n\n\n### Features\n\n* refacto megaparse for service ([#132](https://github.com/QuivrHQ/MegaParse/issues/132)) ([ab9ad7f](https://github.com/QuivrHQ/MegaParse/commit/ab9ad7fb7db580a04a998d144dd2ba3407068334))\n* release plz ([#134](https://github.com/QuivrHQ/MegaParse/issues/134)) ([d8a221e](https://github.com/QuivrHQ/MegaParse/commit/d8a221e23f6e15e969c1328f183da3582d0d7925))\n"
  },
  {
    "path": "libs/megaparse_sdk/README.md",
    "content": "## MegaParse SDK\n\nWelcome to the MegaParse SDK! This SDK allows you to easily interact with the MegaParse API to upload URLs and files for processing.\n\n### Installation\n\nTo install the MegaParse SDK, use pip:\n\n```sh\npip install megaparse-sdk\n```\n\n### Usage\n\nHere is an example of how to use the MegaParse SDK:\n\n#### Uploading URLs\n\n```python\nimport asyncio\nimport os\n\nfrom megaparse.sdk import MegaParseSDK\n\nasync def upload_url():\n    api_key = str(os.getenv(\"MEGAPARSE_API_KEY\"))\n    megaparse = MegaParseSDK(api_key)\n\n    url = \"https://www.quivr.com\"\n\n    # Upload a URL\n    url_response = await megaparse.url.upload(url)\n    print(f\"\\n----- URL Response : {url} -----\\n\")\n    print(url_response)\n\n    await megaparse.close()\n\nif __name__ == \"__main__\":\n    asyncio.run(upload_url())\n```\n\n#### Uploading Files\n\n```python\nimport asyncio\nimport os\n\nfrom megaparse.sdk import MegaParseSDK\n\nasync def upload_file():\n    api_key = str(os.getenv(\"MEGAPARSE_API_KEY\"))\n    megaparse = MegaParseSDK(api_key)\n\n    file_path = \"your/file/path.pdf\"\n    # Upload a file\n    response = await megaparse.file.upload(\n        file_path=file_path,\n        method=\"unstructured\",  # unstructured, llama_parser, megaparse_vision\n        strategy=\"auto\",\n    )\n    print(f\"\\n----- File Response : {file_path} -----\\n\")\n    print(response)\n\n    await megaparse.close()\n\nif __name__ == \"__main__\":\n    asyncio.run(upload_file())\n```\n\n### Features\n\n- **Upload URLs**: Easily upload URLs for processing.\n- **Upload Files**: Upload files with different processing methods and strategies.\n\n### Getting Started\n\n1. **Set up your API key**: Make sure to set the `MEGAPARSE_API_KEY` environment variable with your MegaParse API key.\n2. 
**Run the example**: Use the provided example to see how to upload URLs and files.\n\nFor more details, refer to the [usage example](examples/usage_example.py).\n\nWe hope you find the MegaParse SDK useful for your projects!\n\nEnjoy, _Quivr Team_ !\n"
  },
  {
    "path": "libs/megaparse_sdk/__init__.py",
    "content": "\n"
  },
  {
    "path": "libs/megaparse_sdk/examples/usage_example.py",
    "content": "import asyncio\nimport os\n\nfrom megaparse.sdk.megaparse_sdk import MegaParseSDK\n\n\nasync def main():\n    api_key = str(os.getenv(\"MEGAPARSE_API_KEY\"))\n    megaparse = MegaParseSDK(api_key)\n\n    # url = \"https://www.quivr.com\"\n\n    # # Upload a URL\n    # url_response = await megaparse.url.upload(url)\n    # print(f\"\\n----- URL Response : {url} -----\\n\")\n    # print(url_response)\n\n    # file_path = \"megaparse/sdk/pdf/MegaFake_report.pdf\"\n    file_path = (\n        \"megaparse/sdk/examples/only_pdfs/4 The Language of Medicine  2024.07.21.pdf\"\n    )\n    # Upload a file\n    response = await megaparse.file.upload(\n        file_path=file_path,\n        method=\"unstructured\",  # type: ignore  # unstructured, llama_parser, megaparse_vision\n        strategy=\"auto\",  # type: ignore  # fast, auto, hi_res\n    )\n    print(f\"\\n----- File Response : {file_path} -----\\n\")\n    print(response)\n    await megaparse.close()\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/__init__.py",
    "content": "from .client import MegaParseClient\nfrom .endpoints.file_upload import FileUpload\nfrom .endpoints.url_upload import URLUpload\n\n\nclass MegaParseSDK:\n    def __init__(self, api_key: str | None = None, base_url: str | None = None):\n        self.client = MegaParseClient(api_key, base_url)\n        self.file = FileUpload(self.client)\n        self.url = URLUpload(self.client)\n\n    async def close(self):\n        await self.client.close()\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/client.py",
    "content": "import asyncio\nimport enum\nimport logging\nimport os\nfrom io import BytesIO\nfrom pathlib import Path\nfrom types import TracebackType\nfrom typing import Any, Self\n\nimport httpx\nimport nats\nfrom nats.errors import NoRespondersError, TimeoutError\n\nfrom megaparse_sdk.config import ClientNATSConfig, MegaParseSDKConfig\nfrom megaparse_sdk.schema.document import Document\nfrom megaparse_sdk.schema.mp_exceptions import (\n    DownloadError,\n    InternalServiceError,\n    MemoryLimitExceeded,\n    ModelNotSupported,\n    ParsingException,\n)\nfrom megaparse_sdk.schema.mp_inputs import (\n    FileInput,\n    MPInput,\n    ParseFileConfig,\n    ParseFileInput,\n    ParseUrlInput,\n)\nfrom megaparse_sdk.schema.mp_outputs import (\n    MPErrorType,\n    MPOutput,\n    MPOutputType,\n)\nfrom megaparse_sdk.utils.load_ssl import load_ssl_cxt\n\nlogger = logging.getLogger(\"megparse_sdk\")\n\n\nclass MegaParseClient:\n    def __init__(\n        self,\n        api_key: str | None = None,\n        base_url: str | None = None,\n    ):\n        config = MegaParseSDKConfig()\n        self.base_url = base_url or config.url\n        self.api_key = api_key or config.api_key\n        self.max_retries = config.max_retries\n        if self.api_key:\n            self.session = httpx.AsyncClient(\n                headers={\"x-api-key\": self.api_key}, timeout=config.timeout\n            )\n        else:\n            self.session = httpx.AsyncClient(timeout=config.timeout)\n\n    async def request(self, method: str, endpoint: str, **kwargs: Any) -> Any:\n        url = f\"{self.base_url}{endpoint}\"\n        client = self.session\n        for attempt in range(self.max_retries):\n            try:\n                response = await client.request(method, url, **kwargs)\n                response.raise_for_status()\n                return response.json()\n            except (httpx.HTTPStatusError, httpx.RequestError):\n                if attempt < self.max_retries - 1:\n   
                 await asyncio.sleep(2**attempt)  # Exponential backoff\n\n        raise RuntimeError(f\"Can't send request to the server: {url}\")\n\n    async def close(self):\n        await self.session.aclose()\n\n\nclass ClientState(enum.Enum):\n    # First state of the client\n    UNOPENED = 1\n    #   Client has either sent a request, or is within a `with` block.\n    OPENED = 2\n    #   Client has either exited the `with` block, or `close()` called.\n    CLOSED = 3\n\n\nclass MegaParseNATSClient:\n    def __init__(self, config: ClientNATSConfig):\n        self.nc_config = config\n        self.max_retries = self.nc_config.max_retries\n        self.backoff = self.nc_config.backoff\n        if self.nc_config.ssl_config:\n            self.ssl_ctx = load_ssl_cxt(self.nc_config.ssl_config)\n        else:\n            self.ssl_ctx = None\n        # Client connection\n        self._state = ClientState.UNOPENED\n        self._nc = None\n\n    async def _get_nc(self):\n        if self._nc is None:\n            self._nc = await nats.connect(\n                self.nc_config.endpoint,\n                tls=self.ssl_ctx,\n                connect_timeout=self.nc_config.connect_timeout,\n                reconnect_time_wait=self.nc_config.reconnect_time_wait,\n                max_reconnect_attempts=self.nc_config.max_reconnect_attempts,\n            )\n            return self._nc\n        return self._nc\n\n    async def __aenter__(self: Self) -> Self:\n        if self._state != ClientState.UNOPENED:\n            msg = {\n                ClientState.OPENED: \"Cannot open a client instance more than once.\",\n                ClientState.CLOSED: (\n                    \"Cannot reopen a client instance, client was closed.\"\n                ),\n            }[self._state]\n            raise RuntimeError(msg)\n\n        self._state = ClientState.OPENED\n\n        await self._get_nc()\n        return self\n\n    async def __aexit__(\n        self,\n        exc_type: 
type[BaseException] | None = None,\n        exc_value: BaseException | None = None,\n        traceback: TracebackType | None = None,\n    ) -> None:\n        self._state = ClientState.CLOSED\n        await self.aclose()\n\n    async def parse_url(self, url: str):\n        url_inp = ParseUrlInput(url=url)\n        return await self._send_req(MPInput(input=url_inp))\n\n    async def parse_file(\n        self, file: Path | BytesIO, file_name: str | None = None\n    ) -> str | Document:\n        if isinstance(file, Path):\n            with open(file, \"rb\") as f:\n                data = f.read()\n            file_name = os.path.basename(file)\n        else:\n            file.seek(0)\n            data = file.read()\n            if file_name is None:\n                raise ValueError(\"please provide file_name if passing ByteIO stream\")\n\n        file_input = ParseFileInput(\n            file_input=FileInput(file_name=file_name, file_size=len(data), data=data),\n            parse_config=ParseFileConfig(),\n        )\n\n        inp = MPInput(input=file_input)\n        return await self._send_req(inp)\n\n    async def _send_req(self, inp: MPInput) -> str | Document:\n        logger.debug(f\"Sending {inp} to megaparse service.\")\n\n        for attempt in range(self.max_retries):\n            try:\n                return await self._send_req_inner(inp)\n            except (TimeoutError, NoRespondersError) as e:\n                logger.error(f\"Sending req error: {e}. 
Retrying for {attempt} time\")\n                if attempt < self.max_retries - 1:\n                    logger.debug(f\"Backoff for {2**self.backoff}s\")\n                    await asyncio.sleep(2**self.backoff)\n        raise ParsingException\n\n    async def _send_req_inner(self, inp: MPInput):\n        nc = await self._get_nc()\n        raw_response = await nc.request(\n            self.nc_config.subject,\n            inp.model_dump_json().encode(\"utf-8\"),\n            timeout=self.nc_config.timeout,\n        )\n        response = MPOutput.model_validate_json(raw_response.data.decode(\"utf-8\"))\n        return self._handle_mp_output(response)\n\n    def _handle_mp_output(self, response: MPOutput) -> str | Document:\n        if response.output_type == MPOutputType.PARSE_OK:\n            assert response.result, \"Parsing OK but response is None\"\n            return response.result\n        elif response.output_type == MPOutputType.PARSE_ERR:\n            assert response.err, \"Parsing OK but response is None\"\n            match response.err.mp_err_code:\n                case MPErrorType.MEMORY_LIMIT:\n                    raise MemoryLimitExceeded\n                case MPErrorType.INTERNAL_SERVER_ERROR:\n                    raise InternalServiceError\n                case MPErrorType.MODEL_NOT_SUPPORTED:\n                    raise ModelNotSupported\n                case MPErrorType.DOWNLOAD_ERROR:\n                    raise DownloadError\n                case MPErrorType.PARSING_ERROR:\n                    raise ParsingException\n        raise ValueError(f\"unknown service response type: {response}\")\n\n    async def aclose(self):\n        nc = await self._get_nc()\n        await nc.close()\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/config.py",
    "content": "from pydantic import BaseModel, FilePath\nfrom pydantic_settings import BaseSettings, SettingsConfigDict\n\n\nclass MegaParseSDKConfig(BaseSettings):\n    \"\"\"\n    Configuration for the Megaparse SDK.\n    \"\"\"\n\n    model_config = SettingsConfigDict(env_prefix=\"MEGAPARSE_SDK_\")\n    api_key: str | None = None\n    url: str = \"https://megaparse.tooling.quivr.app\"\n    timeout: int = 600\n    max_retries: int = 3\n\n\nclass SSLConfig(BaseModel):\n    ssl_key_file: FilePath\n    ssl_cert_file: FilePath\n    ca_cert_file: FilePath | None = None\n\n\nclass ClientNATSConfig(BaseSettings):\n    model_config = SettingsConfigDict(\n        env_prefix=\"MEGAPARSE_NATS_\",\n        env_file=(\".env.local\", \".env\"),\n        env_nested_delimiter=\"__\",\n        extra=\"ignore\",\n    )\n    subject: str = \"parsing\"\n    endpoint: str = \"https://tests@nats.tooling.quivr.app:4222\"\n    timeout: float = 20\n    max_retries: int = 5\n    backoff: float = 3\n    connect_timeout: int = 5\n    reconnect_time_wait: int = 1\n    max_reconnect_attempts: int = 20\n    ssl_config: SSLConfig | None = None\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/endpoints/__init__.py",
    "content": "\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/endpoints/file_upload.py",
    "content": "from typing import Optional\n\nfrom httpx import Response\nfrom pydantic import BaseModel\n\nfrom megaparse_sdk.client import MegaParseClient\nfrom megaparse_sdk.schema.languages import Language\nfrom megaparse_sdk.schema.parser_config import ParserType, StrategyEnum\n\n\nclass UploadFileConfig(BaseModel):\n    method: ParserType\n    strategy: StrategyEnum\n    check_table: bool\n    language: Language\n    parsing_instruction: str | None = None\n    model_name: str = \"gpt-4o\"\n\n\nclass FileUpload:\n    def __init__(self, client: MegaParseClient):\n        self.client = client\n\n    async def upload(\n        self,\n        file_path: str,\n        method: ParserType = ParserType.UNSTRUCTURED,\n        strategy: StrategyEnum = StrategyEnum.AUTO,\n        check_table: bool = False,\n        language: Language = Language.ENGLISH,\n        parsing_instruction: Optional[str] = None,\n        model_name: str = \"gpt-4o\",\n    ) -> Response:\n        data = UploadFileConfig(\n            method=method,\n            strategy=strategy,\n            check_table=check_table,\n            language=language,\n            parsing_instruction=parsing_instruction,\n            model_name=model_name,\n        )\n        with open(file_path, \"rb\") as file:\n            files = {\"file\": (file_path, file)}\n\n            response = await self.client.request(\n                \"POST\",\n                \"/v1/file\",\n                files=files,\n                data=data.model_dump(mode=\"json\"),\n            )\n            return response\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/endpoints/url_upload.py",
    "content": "from httpx import Response\n\nfrom megaparse_sdk.client import MegaParseClient\n\n\nclass URLUpload:\n    def __init__(self, client: MegaParseClient):\n        self.client = client\n\n    async def upload(self, url: str, max_retries: int = 3) -> Response:\n        endpoint = f\"/v1/url?url={url}\"\n        headers = {\"accept\": \"application/json\"}\n        response = await self.client.request(\"POST\", endpoint, headers=headers, data=\"\")\n        return response\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/__init__.py",
    "content": "\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/document.py",
    "content": "import uuid\nfrom enum import Enum\nfrom typing import Any, Dict, List, Literal, NamedTuple, Optional, Self, Tuple\n\nimport numpy as np\nfrom PIL import Image, ImageDraw\nfrom pydantic import BaseModel, Field, field_validator\n\n\nclass Point2D(NamedTuple):\n    x: float\n    y: float\n\n\nclass BlockType(str, Enum):\n    TEXT = \"text\"\n\n\nclass BBOX(NamedTuple):\n    top_left: Point2D\n    bottom_right: Point2D\n\n    def to_numpy(self):\n        return np.array(\n            [self.top_left.x, self.top_left.y, self.bottom_right.x, self.bottom_right.y]\n        )\n\n    def iou(self, other: Self):\n        x1 = max(self.top_left.x, other.top_left.x)\n        y1 = max(self.top_left.y, other.top_left.y)\n        x2 = min(self.bottom_right.x, other.bottom_right.x)\n        y2 = min(self.bottom_right.y, other.bottom_right.y)\n        intersection = max(0, x2 - x1) * max(0, y2 - y1)\n        area_self = (self.bottom_right.x - self.top_left.x) * (\n            self.bottom_right.y - self.top_left.y\n        )\n        area_other = (other.bottom_right.x - other.top_left.x) * (\n            other.bottom_right.y - other.top_left.y\n        )\n        union = area_self + area_other - intersection\n        return intersection / union\n\n\nclass BlockLayout(BaseModel):\n    bbox: BBOX\n    objectness_score: float\n    block_type: BlockType\n\n\nclass TextDetection:\n    __slots__ = [\n        \"bboxes\",\n        \"page_index\",\n        \"dimensions\",\n        \"orientation\",\n        \"origin_page_shape\",\n    ]\n\n    def __init__(\n        self,\n        bboxes: List[BlockLayout],\n        page_index: int,\n        dimensions: Tuple[int, ...],\n        orientation: Tuple[int, float] | Literal[0],\n        origin_page_shape,\n    ):\n        self.bboxes = bboxes\n        self.page_index = page_index\n        self.dimensions = dimensions\n        self.orientation = orientation\n        self.origin_page_shape = origin_page_shape\n\n    def __repr__(self) 
-> str:\n        return f\"PageLayout(bboxes={self.bboxes}, page_index={self.page_index}, dimensions={self.dimensions}, orientation={self.orientation})\"\n\n    def render(\n        self, page_array: np.ndarray, output_path: Optional[str] = \"page_layout.png\"\n    ):\n        \"\"\"\n        Render the page layout with bounding boxes on the original page image.\n\n        Args:\n            page_array (np.ndarray): The original page image as a NumPy array.\n            output_path (str): The path to save the rendered image.\n        \"\"\"\n        # Convert the NumPy array to a PIL image\n        image = Image.fromarray(page_array)\n        draw = ImageDraw.Draw(image)\n        width, height = self.dimensions\n\n        # Draw each bounding box\n        for block in self.bboxes:\n            bbox = block.bbox\n            top_left = (bbox[0][0] * height, bbox[0][1] * width)\n            bottom_right = (bbox[1][0] * height, bbox[1][1] * width)\n            draw.rectangle([top_left, bottom_right], outline=\"red\", width=2)\n\n        if output_path:\n            # Save the image\n            image.save(output_path)\n            print(f\"Page layout saved to {output_path}\")\n        return image\n\n    def get_loc_preds(self) -> np.ndarray:\n        \"\"\"\n        Get the location predictions of the bounding boxes.\n\n        Returns:\n            np.ndarray: The location predictions as a NumPy array.\n        \"\"\"\n        loc_preds = np.array([block.bbox.to_numpy() for block in self.bboxes])\n        return loc_preds\n\n    def get_objectness_scores(self) -> np.ndarray:\n        \"\"\"\n        Get the objectness scores of the bounding boxes.\n\n        Returns:\n            np.ndarray: The objectness scores as a NumPy array.\n        \"\"\"\n        objectness_scores = np.array([block.objectness_score for block in self.bboxes])\n        return objectness_scores\n\n    def get_origin_page_shapes(self) -> np.ndarray:\n        \"\"\"\n        Get the original 
page shapes.\n\n        Returns:\n            np.ndarray: The original page shapes as a NumPy array.\n        \"\"\"\n        origin_page_shapes = np.array([self.origin_page_shape for _ in self.bboxes])\n        return origin_page_shapes\n\n    def get_orientations(self) -> np.ndarray:\n        \"\"\"\n        Get the orientations of the bounding boxes.\n\n        Returns:\n            np.ndarray: The orientations as a NumPy array.\n        \"\"\"\n        orientations = np.array([self.orientation for _ in self.bboxes])\n        return orientations\n\n\nclass Block(BaseModel):\n    \"\"\"\n    A class to represent a block\n\n    \"\"\"\n\n    block_id: Optional[uuid.UUID] = Field(default_factory=uuid.uuid4)\n    metadata: Dict[str, Any]  # FIXME: TBD @Amine\n    bbox: Optional[BBOX] = (\n        None  # (x0,y0),(x1, y1) Coordinates are given as Relative positions to the page they are in\n    )\n    page_range: Optional[Tuple[int, int]] = Field(\n        default=None\n    )  # (start_page, end_page)\n\n    @field_validator(\"page_range\")\n    def validate_range(cls, value):\n        if value is None:\n            return None\n        start, end = value\n        if start > end:\n            raise ValueError(\n                \"The first value of the page range must be less than the second value\"\n            )\n        return value\n\n\nclass TextBlock(Block):\n    \"\"\"\n    A class to represent a text block\n\n    \"\"\"\n\n    text: str\n\n    def __str__(self):\n        return self.text\n\n\nclass UndefinedBlock(TextBlock):\n    \"\"\"\n    A class to represent a text block\n\n    \"\"\"\n\n    pass\n\n\nclass TitleBlock(TextBlock):\n    \"\"\"\n    A class to represent a title block\n\n    \"\"\"\n\n    def __str__(self):\n        return f\"# {self.text}\"\n\n\nclass SubTitleBlock(TextBlock):\n    \"\"\"\n    A class to represent a subtitle block\n    \"\"\"\n\n    depth: int = 0\n\n    def __str__(self):\n        heading_level = min(self.depth + 1, 6)\n      
  return f\"{'#' * heading_level} {self.text}\"\n\n\nclass CaptionBlock(TextBlock):\n    \"\"\"\n    A class to represent a caption block\n    \"\"\"\n\n    pass\n\n\nclass ImageBlock(Block):\n    \"\"\"\n    A class to represent an image block\n    \"\"\"\n\n    text: Optional[str] = None\n    caption: Optional[str] = \"unknown\"\n\n    def __str__(self) -> str:\n        return f\"[Image: {self.caption}]\"\n\n\nclass TableBlock(ImageBlock):\n    \"\"\"\n    A class to represent a table block\n\n    \"\"\"\n\n    def __str__(self):\n        return self.text if self.text else f\"[Table : {self.caption}]\"\n\n\nclass ListElementBlock(TextBlock):\n    \"\"\"\n    A class to represent a list element\n\n    \"\"\"\n\n    depth: int = 0\n\n\nclass ListBlock(Block):\n    \"\"\"\n    A class to represent a list block\n\n    \"\"\"\n\n    list_elements: List[ListElementBlock]\n\n    # rajouter fonction pydantic pour compute l attribut\n\n    def __str__(self):\n        return \"\\n\".join(\n            f\"{' ' * (2 * element.depth)}* {element.text}\"\n            for element in self.list_elements\n        )\n\n\nclass HeaderBlock(TextBlock):\n    \"\"\"\n    A class to represent a header block\n\n    \"\"\"\n\n    def __str__(self):\n        return f\"{'='*len(self.text)}\\n\\n{self.text}\\n\\n{'='*len(self.text)}\"\n\n\nclass FooterBlock(TextBlock):\n    \"\"\"\n    A class to represent a footer block\n\n    \"\"\"\n\n    def __str__(self):\n        return f\"{'='*len(self.text)}\\n\\n{self.text}\\n\\n{'='*len(self.text)}\"\n\n\nclass SectionBlock(Block):\n    \"\"\"\n    A class to represent a section block\n\n    \"\"\"\n\n    title: str\n    depth: int\n    content: List[Block]\n\n    def __str__(self):\n        lines = []\n        lines.extend(str(block) for block in self.content)\n        return \"\\n\".join(lines)\n\n\nclass TOCItem(BaseModel):\n    title: str\n    depth: int\n    page_range: Tuple[int, int] = Field(...)  
# (start_page, end_page)\n\n    @field_validator(\"page_range\")\n    def validate_range(cls, value):\n        start, end = value\n        if start >= end:\n            raise ValueError(\n                \"The first value of the page range must be less than the second value\"\n            )\n        return value\n\n    def __str__(self):\n        start_page, end_page = self.page_range\n        page_info = (\n            f\"page {start_page}\"\n            if start_page == end_page\n            else f\"pages {start_page}-{end_page}\"\n        )\n        return f\"{' ' * (2 * self.depth)}* {self.title} ({page_info})\"\n\n\nclass TOC(BaseModel):\n    content: List[TOCItem]\n\n    @property\n    def text(self) -> str:\n        return \"\\n\".join(str(item) for item in self.content)\n\n    def __str__(self):\n        return self.text\n\n\nclass Document(BaseModel):\n    \"\"\"\n\n    A class to represent a document\n\n    \"\"\"\n\n    file_name: Optional[str] = None\n    table_of_contents: Optional[TOC] = None\n    content: List[Block]\n    detection_origin: str\n    metadata: Dict[str, Any]\n\n    def __str__(self) -> str:\n        lines = []\n\n        # If there's a table of contents, include it\n        if self.table_of_contents:\n            lines.append(\"Table of Contents:\")\n            # Use TOC’s own string-building property or method\n            lines.append(self.table_of_contents.text)\n\n        # Print each block’s text representation\n        lines.extend(str(block) + \"\\n\" for block in self.content)\n\n        return \"\\n\".join(lines)\n\n    def clean(self):\n        \"\"\"\n        Clean the Document element by :\n        - Merging Caption in ImageBlock\n        - Merging continuous list items elements into ListBlock\n        - Add Depth to Title / SubTitle / ListElementBlock\n        - Creating sections\n        - Creating TOC\n        \"\"\"\n\n        # Merge caption in ImageBlock simplified\n        i = 0\n        list_elements_stack = []\n   
     while i < len(self.content) - 1:\n            if isinstance(self.content[i], ListElementBlock):\n                list_elements_stack.append(self.content[i])\n                self.content.pop(i)\n                continue\n            else:\n                if list_elements_stack:\n                    self.content.insert(\n                        i, ListBlock(list_elements=list_elements_stack, metadata={})\n                    )\n                    list_elements_stack = []\n\n            if isinstance(self.content[i], ImageBlock) and isinstance(\n                self.content[i + 1], CaptionBlock\n            ):\n                self.content[i].caption = str(self.content[i + 1])  # type: ignore\n                self.content.pop(i + 1)\n            elif isinstance(self.content[i], CaptionBlock) and isinstance(\n                self.content[i + 1], ImageBlock\n            ):\n                self.content[i + 1].caption = str(self.content[i])  # type: ignore\n                self.content.pop(i)\n\n            i += 1\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/extensions.py",
    "content": "from enum import Enum\n\n\nclass FileExtension(str, Enum):\n    \"\"\"Supported file extension enumeration.\"\"\"\n\n    _mimetype: str\n\n    def __new__(cls, value: str, mimetype: str):\n        obj = str.__new__(cls, value)\n        obj._value_ = value\n        obj._mimetype = mimetype\n        return obj\n\n    PDF = (\".pdf\", \"application/pdf\")\n    DOCX = (\n        \".docx\",\n        \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n    )\n    TXT = (\".txt\", \"text/plain\")\n    OTF = (\".odt\", \"application/vnd.oasis.opendocument.text\")\n    EPUB = (\".epub\", \"application/epub\")\n    HTML = (\".html\", \"text/html\")\n    XML = (\".xml\", \"application/xml\")\n    CSV = (\".csv\", \"text/csv\")\n    XLSX = (\n        \".xlsx\",\n        \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\",\n    )\n    XLS = (\".xls\", \"application/vnd.ms-excel\")\n    PPTX = (\n        \".pptx\",\n        \"application/vnd.openxmlformats-officedocument.presentationml.presentation\",\n    )\n    MD = (\".md\", \"text/markdown\")\n    MARKDOWN = (\".markdown\", \"text/markdown\")\n\n    @property\n    def mimetype(self) -> str:\n        return self._mimetype\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/languages.py",
    "content": "from enum import Enum\n\n\nclass Language(str, Enum):\n    BAZA = \"abq\"\n    ADYGHE = \"ady\"\n    AFRIKAANS = \"af\"\n    ANGIKA = \"ang\"\n    ARABIC = \"ar\"\n    ASSAMESE = \"as\"\n    AVAR = \"ava\"\n    AZERBAIJANI = \"az\"\n    BELARUSIAN = \"be\"\n    BULGARIAN = \"bg\"\n    BIHARI = \"bh\"\n    BHOJPURI = \"bho\"\n    BENGALI = \"bn\"\n    BOSNIAN = \"bs\"\n    SIMPLIFIED_CHINESE = \"ch_sim\"\n    TRADITIONAL_CHINESE = \"ch_tra\"\n    CHECHEN = \"che\"\n    CZECH = \"cs\"\n    WELSH = \"cy\"\n    DANISH = \"da\"\n    DARGWA = \"dar\"\n    GERMAN = \"de\"\n    ENGLISH = \"en\"\n    SPANISH = \"es\"\n    ESTONIAN = \"et\"\n    PERSIAN_FARSI = \"fa\"\n    FRENCH = \"fr\"\n    IRISH = \"ga\"\n    GOAN_KONKANI = \"gom\"\n    HINDI = \"hi\"\n    CROATIAN = \"hr\"\n    HUNGARIAN = \"hu\"\n    INDONESIAN = \"id\"\n    INGUSH = \"inh\"\n    ICELANDIC = \"is\"\n    ITALIAN = \"it\"\n    JAPANESE = \"ja\"\n    KABARDIAN = \"kbd\"\n    KANNADA = \"kn\"\n    KOREAN = \"ko\"\n    KURDISH = \"ku\"\n    LATIN = \"la\"\n    LAK = \"lbe\"\n    LEZGHIAN = \"lez\"\n    LITHUANIAN = \"lt\"\n    LATVIAN = \"lv\"\n    MAGAHI = \"mah\"\n    MAITHILI = \"mai\"\n    MAORI = \"mi\"\n    MONGOLIAN = \"mn\"\n    MARATHI = \"mr\"\n    MALAY = \"ms\"\n    MALTESE = \"mt\"\n    NEPALI = \"ne\"\n    NEWARI = \"new\"\n    DUTCH = \"nl\"\n    NORWEGIAN = \"no\"\n    OCCITAN = \"oc\"\n    PALI = \"pi\"\n    POLISH = \"pl\"\n    PORTUGUESE = \"pt\"\n    ROMANIAN = \"ro\"\n    RUSSIAN = \"ru\"\n    SERBIAN_CYRILLIC = \"rs_cyrillic\"\n    SERBIAN_LATIN = \"rs_latin\"\n    NAGPURI = \"sck\"\n    SLOVAK = \"sk\"\n    SLOVENIAN = \"sl\"\n    ALBANIAN = \"sq\"\n    SWEDISH = \"sv\"\n    SWAHILI = \"sw\"\n    TAMIL = \"ta\"\n    TABASSARAN = \"tab\"\n    TELUGU = \"te\"\n    THAI = \"th\"\n    TAJIK = \"tjk\"\n    TAGALOG = \"tl\"\n    TURKISH = \"tr\"\n    UYGHUR = \"ug\"\n    UKRAINIAN = \"uk\"\n    URDU = \"ur\"\n    UZBEK = \"uz\"\n    VIETNAMESE = \"vi\"\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_exceptions.py",
    "content": "class ModelNotSupported(Exception):\n    def __init__(\n        self,\n        message: str = \"The requested model is not supported yet.\",\n    ):\n        super().__init__(message)\n\n\nclass MemoryLimitExceeded(Exception):\n    def __init__(self, message=\"The service is under high memory pressure\"):\n        super().__init__(message)\n\n\nclass InternalServiceError(Exception):\n    def __init__(self, message=\"Internal service error occured\"):\n        super().__init__(message)\n\n\nclass DownloadError(Exception):\n    def __init__(self, message=\"Failed to download the file\"):\n        super().__init__(message)\n\n\nclass ParsingException(Exception):\n    def __init__(self, message=\"An error occurred during parsing\"):\n        super().__init__(message)\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_inputs.py",
    "content": "import base64\nfrom enum import Enum\nfrom typing import Literal, Union\n\nfrom pydantic import BaseModel, Field, field_serializer, field_validator\n\nfrom .parser_config import ParseFileConfig\n\n\nclass FileInput(BaseModel):\n    file_name: str\n    file_size: int\n    data: bytes\n\n    @field_validator(\"data\", mode=\"before\")\n    def decode_data(cls, value):\n        if isinstance(value, str):\n            try:\n                return base64.b64decode(value)\n            except Exception:\n                raise ValueError(\"Invalid Base64 encoding for the 'data' field.\")\n        return value\n\n    # TODO: this is slow !!! Move to reading bytes directly from bucket storage\n    # append bytes with CRC32\n    @field_serializer(\"data\", return_type=str)\n    def serialize_data(self, data: bytes, _info):\n        return base64.b64encode(data).decode(\"utf-8\")\n\n\nclass MPParseType(str, Enum):\n    PARSE_FILE = \"parse_file\"\n    PARSE_URL = \"parse_url\"\n\n\nclass ParseFileInput(BaseModel):\n    mp_parse_type: Literal[MPParseType.PARSE_FILE] = MPParseType.PARSE_FILE\n    file_input: FileInput\n    parse_config: ParseFileConfig\n\n\nclass ParseUrlInput(BaseModel):\n    mp_parse_type: Literal[MPParseType.PARSE_URL] = MPParseType.PARSE_URL\n    url: str\n\n\nclass MPInput(BaseModel):\n    input: Union[ParseFileInput, ParseUrlInput] = Field(\n        ..., discriminator=\"mp_parse_type\"\n    )\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/mp_outputs.py",
    "content": "from enum import Enum, auto\nfrom typing import Dict\n\nfrom pydantic import BaseModel, Field\n\nfrom megaparse_sdk.schema.document import Document\n\n\nclass MPErrorType(Enum):\n    MEMORY_LIMIT = auto()\n    INTERNAL_SERVER_ERROR = auto()\n    MODEL_NOT_SUPPORTED = auto()\n    DOWNLOAD_ERROR = auto()\n    PARSING_ERROR = auto()\n\n\nclass ParseError(BaseModel):\n    mp_err_code: MPErrorType\n    message: str\n\n\nclass MPOutputType(str, Enum):\n    PARSE_OK = \"parse_file_ok\"\n    PARSE_ERR = \"parse_file_err\"\n\n\nclass MPOutput(BaseModel):\n    output_type: MPOutputType\n    result: str | Document | None\n    err: ParseError | None = None\n    metadata: Dict[str, str] = Field(default_factory=dict)\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/parser_config.py",
    "content": "from enum import Enum\nfrom typing import Optional\n\nfrom pydantic import BaseModel\n\nfrom .languages import Language\nfrom .supported_models import SupportedModel\n\n\nclass ParserType(str, Enum):\n    \"\"\"Parser type enumeration.\"\"\"\n\n    UNSTRUCTURED = \"unstructured\"\n    LLAMA_PARSER = \"llama_parser\"\n    MEGAPARSE_VISION = \"megaparse_vision\"\n\n\nclass StrategyEnum(str, Enum):\n    \"\"\"Method to use for the conversion\"\"\"\n\n    FAST = \"fast\"\n    AUTO = \"auto\"\n    HI_RES = \"hi_res\"\n\n\nclass ParseFileConfig(BaseModel):\n    llm_model_name: SupportedModel = SupportedModel.GPT_4\n    method: ParserType = ParserType.UNSTRUCTURED\n    strategy: StrategyEnum = StrategyEnum.AUTO\n    check_table: bool = False\n    language: Language = Language.ENGLISH\n    parsing_instruction: Optional[str] = None\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/schema/supported_models.py",
    "content": "from enum import Enum\n\n\nclass SupportedModel(str, Enum):\n    \"\"\"Supported models enumeration.\"\"\"\n\n    # OpenAI Models\n    GPT_4 = \"gpt-4\"\n    GPT_4_TURBO = \"gpt-4-turbo\"\n    GPT_3_5_TURBO = \"gpt-3.5-turbo\"\n    GPT_4O = \"gpt-4o\"\n    GPT_4O_MINI = \"gpt-4o-mini\"\n\n    # Anthropic Models\n    CLAUDE_3_5_SONNET_LATEST = \"claude-3-5-sonnet-latest\"\n    CLAUDE_3_5_SONNET = \"claude-3-5-sonnet-20241022\"\n    CLAUDE_3_5_HAIKU = \"claude-3-5-haiku-20241022\"\n    CLAUDE_3_5_HAIKU_LATEST = \"claude-3-5-haiku-latest\"\n    CLAUDE_3_OPUS = \"claude-3-opus-20240229\"\n    CLAUDE_3_OPUS_LATEST = \"claude-3-opus-latest\"\n    CLAUDE_3_SONNET = \"claude-3-sonnet-20240229\"\n    CLAUDE_3_HAIKU = \"claude-3-haiku-20240307\"\n\n    def __str__(self):\n        return self.value\n\n    @classmethod\n    def is_supported(cls, model_name: str) -> bool:\n        \"\"\"Check if the model is supported.\"\"\"\n        return model_name in cls.__members__.values()\n\n    @classmethod\n    def get_supported_models(cls) -> list[str]:\n        \"\"\"Get the list of supported models.\"\"\"\n        return list(cls.__members__.values())\n"
  },
  {
    "path": "libs/megaparse_sdk/megaparse_sdk/utils/load_ssl.py",
    "content": "import ssl\n\nfrom megaparse_sdk.config import SSLConfig\n\n\ndef load_ssl_cxt(ssl_config: SSLConfig):\n    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)\n    if ssl_config.ca_cert_file:\n        context.load_verify_locations(cafile=ssl_config.ca_cert_file)\n    context.load_cert_chain(\n        certfile=ssl_config.ssl_cert_file, keyfile=ssl_config.ssl_key_file\n    )\n    return context\n"
  },
  {
    "path": "libs/megaparse_sdk/pyproject.toml",
    "content": "[project]\nname = \"megaparse-sdk\"\nversion = \"0.1.12\"\ndescription = \"Megaparse SDK\"\ndependencies = [\n    \"python-dotenv>=1.0.0\",\n    \"pycryptodome>=3.21.0\",\n    \"psutil>=6.1.0\",\n    \"httpx>=0.27.0\",\n    \"nats-py>=2.9.0\",\n    \"loguru>=0.7.2\",\n]\n\nreadme = \"README.md\"\nrequires-python = \">= 3.11\"\n\n[build-system]\nrequires = [\"hatchling==1.26.3\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.rye]\nmanaged = true\ndev-dependencies = []\nuniversal = true\n\n[tool.hatch.metadata]\nallow-direct-references = true\n\n[tool.hatch.build.targets.wheel]\npackages = [\"megaparse_sdk\"]\n"
  },
  {
    "path": "libs/megaparse_sdk/tests/README.md",
    "content": ""
  },
  {
    "path": "libs/megaparse_sdk/tests/certs/client-cert.pem",
    "content": "-----BEGIN CERTIFICATE-----\nMIIEqDCCAxCgAwIBAgIRAITvq6ZEk6paYFDRbueJhEMwDQYJKoZIhvcNAQELBQAw\ngZ0xHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTE5MDcGA1UECwwwYW1p\nbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChhbWluZSBkaXJob3Vzc2kpMUAw\nPgYDVQQDDDdta2NlcnQgYW1pbmVAYW1pbmVzLU1hY0Jvb2stUHJvLmxvY2FsIChh\nbWluZSBkaXJob3Vzc2kpMB4XDTI0MTExOTEwNDgwN1oXDTI3MDIxOTEwNDgwN1ow\nZDEnMCUGA1UEChMebWtjZXJ0IGRldmVsb3BtZW50IGNlcnRpZmljYXRlMTkwNwYD\nVQQLDDBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhv\ndXNzaSkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5\nKDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH\nqmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN\ngLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8\nghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT\nWWVVcNfJAgMBAAGjgZowgZcwDgYDVR0PAQH/BAQDAgWgMCcGA1UdJQQgMB4GCCsG\nAQUFBwMCBggrBgEFBQcDAQYIKwYBBQUHAwQwHwYDVR0jBBgwFoAUV2w3gvQM5La1\n2fk80tJXoM/14l4wOwYDVR0RBDQwMoIJbG9jYWxob3N0gRNtZWdhcGFyc2VAcXVp\ndnIuYXBwhxAAAAAAAAAAAAAAAAAAAAABMA0GCSqGSIb3DQEBCwUAA4IBgQAYq4VZ\n6spwGvcqg8kCOghu6o54UPYo/NLzh3oYewJnDJ+2XD786TpTgjZMGA6Ms+det6oV\nHdT5s77VFgJiJloHlD0fpKkRxjzyBOk5/bQcCKkTMBVfgJbMoAfa2gq+/7zxmLcn\nAmNg7BkmsTtHWPsLyN3rYI4dkkDKWkxp8Sezm9WPEa9OGJDJSYf4Dq9pN1lUoP1p\nvxsq7sW0HDWnx/I2zWuz3AaT9b4UayRnk4IRYxAuYYN/k0GNjVmmDveywNoNlkmW\n0Az6ycPN+vvz8Jpm3CbZSIQLO8Yn57H/aU4DmOtunm3VLUiLucmfOggv8Sq5n2g9\nze61UJu9lr2/nWOXnErl3V9UL3kJ1OlbFzTWDGm9zX7boo6MLXy+fAj+Tw0sCeMr\ndrdxo8IUYYU6HUdtuLGMFznBFFUNhfFSwFANGPB38NyofwLPSZM0hYntQqBMt/P7\n/E+wQ67hSEutkIbOD3kGkGREIk3dVyUeajO9DFTaQ+yTnNtnuUbxs5LkRlw=\n-----END CERTIFICATE-----\n"
  },
  {
    "path": "libs/megaparse_sdk/tests/certs/client-key.pem",
    "content": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC2fDlGlKYIj8bp\ntlDYh8ooc56Zt+R1HF1GcqF0Gv+oub/dDvsIZnun5bBnA7W3tJ4M6virwg6cBiA5\nKDkIbwWfzHatsvFM0gMX3ZEfAwemo9Egi8udOsuAkP0OYlxzAB1PqOKCRfcfzFcH\nqmOb/JNlI82LBDLOqJDfGG4cRBYWqWRTYDxHsswSKFr/QHOHpImtrAyqo8qsXobN\ngLWSm1cNNtHa5XiCCJ7NUCVZh5cyEeCv1fS2297N+H0W9BxKpb1f9sAQ2N3ZLei8\nghHuQVA8yhUB1YCO/8jsywvXb8EnZctPLvhuLxeCN7A4TESPk5i0LsITqJcl4vQT\nWWVVcNfJAgMBAAECggEBAIK2AlSzHyacze8UH16qDTzibGVRGjxkf895Rnqi6COU\nQYD3PQrsVYCS/sMbHiujHV7FZC+rRcmufaBTVl7bH10yGIQc28iZ2YtbsppTEkTj\nrGUynTtXJPNHZ2vJOs1I9LXdk7maogPN2zzraIQP7AgTGCSOclIi3fpfRmfKwUOj\nBkEzj7CbaAGtW9vTamPJG/+wgaaBcPhplQk4cD2mjdaMLfGQXNXiYgp09kf0hJ2k\n0QbsQBC85bMSfmPAsoTRLxi94S12at3SABgF0oOCy9FZs/sWsdJRI6nbfvZ3C4xo\n8y+rH7Yaej7AYK+jbU3Uk/1473cuCAnNKg65UyU4+gECgYEA2/ZQYRDU3JWNHQGy\ndJXZRl6hSFCw9y9RUc/QjcRs+VlnXE5UK1eLwfcKh0YYRhIWSE8z3mZmK09M/FG0\nxbU4qIZbDYcAI2nCiUeT8HmTjVSPMS1oWZrt7rh00gcyoLQt2TUS3bo2tsmdPyWW\nOgEiYfb4MoG/KCdYlACE6O4GMMECgYEA1GIMIHM2x4B1wgLnKeI3X2wYWuYCHtFB\nPx56GUFTZytBsHghxtovVlLh88FNS5rthvXuE0FHE9RljKhZaNgqrPOrlAZSuv18\nvK7RmG/NPJl2osbs677a/xoxNuVkfrRcxl4cvYOBL5huHo1D5sOitGFW+IlscgWY\nnWzXlY7AYQkCgYA6H96hp7b4CzTc42Pq1uYxaDQqTdhVmVVdzxKHQ86gHXXouHIZ\neereeI95q5YifgkRVoyYSmrZKv1m95hTXk34inhpHLF2qi3T5Ow88YOCJ0QndJ5M\nf1o8aNXF4k0IllQ/P30axmhK6P/6fc4yybXyOTbg8dQ3oh4QDgsRGkTcgQKBgQCG\nqLgJpyN3cPK5FYAeJUl4nh//GlED2yekbp15/9py0pFu42x/GX3kHN8Y31oz8sJh\nzPKrkLsRTp0ohuFRwaWlTUZfr3arCugY9jr8jP6zSpZW9QvpGXTfRGsp5F5Im/Eq\n8ScF3ih91gcUJfuEiExUVFeBdBinXvb58bXrJLzDiQKBgG+Z06uj2dWxtK4nqJvP\nHllTocAGVm+fEmupVsLU6ksVVrOl8O9TapMbY8pUj9J5oBYJvY+KFGoIoxYwhZrz\n4NqY7iv8w+LQ7mQIwcQ4B67pDAQMJZTShR5v57FlAZldP5UpE5ASt22isBW31sYI\n1OaXIqrCA/V43NydDezh0ylQ\n-----END PRIVATE KEY-----\n"
  },
  {
    "path": "libs/megaparse_sdk/tests/certs/rootCA.pem",
    "content": "-----BEGIN CERTIFICATE-----\nMIIFCzCCA3OgAwIBAgIQESt0eck2KvFrAMyiDyceujANBgkqhkiG9w0BAQsFADCB\nnTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu\nZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+\nBgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt\naW5lIGRpcmhvdXNzaSkwHhcNMjQxMTE5MTAwMTA5WhcNMzQxMTE5MTAwMTA5WjCB\nnTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMTkwNwYDVQQLDDBhbWlu\nZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFtaW5lIGRpcmhvdXNzaSkxQDA+\nBgNVBAMMN21rY2VydCBhbWluZUBhbWluZXMtTWFjQm9vay1Qcm8ubG9jYWwgKGFt\naW5lIGRpcmhvdXNzaSkwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCw\n6TX1kvqVMb8ZUQVT/vuDsedmbYgSFn68yJRlmE9BsqG7TLQHl2Kw6VQqZBSIkeZG\nCypmUysX/3qrvICeArIdmmsrWUTDYPoauw/a/RY0I07rALj3YR0Y7039Hxf/UPT9\nxlUtnM2NafkZyp6WRjEN0N4ETvJDIbUQiosiiPilxhwRbJURhT/JPskaw+OM2Sw5\ndFAT20zkYC5VIc4wJBFLAMG0XzI6Sy/4wI1WdRBXd2UMpQU4u7TyD0RB4mnHorV6\nkXjtLKD/KWSrSG1nnum9SB9eVatbRD+TUgoclwAKedrlCDEM4EsXVVuUuYCizQNb\n+H3BSPfj1upUW5eKfgAyB+8r4QGf2yCY9O8NMMrJ1K5Qv4vSuWAU2tZqAyE8Z4Ke\nUtHsl/M0zIvIKwyki2N/rieL/m6lTzS3dwSf9vv7eePEvxd8SBClSF07MUzyxkZ5\nUYNxaK5t2ZRADZ6n/9/hAQsMscCkHiX1N2ypBFV+86Pr78BC48JgIyCMwuiBN4sC\nAwEAAaNFMEMwDgYDVR0PAQH/BAQDAgIEMBIGA1UdEwEB/wQIMAYBAf8CAQAwHQYD\nVR0OBBYEFFdsN4L0DOS2tdn5PNLSV6DP9eJeMA0GCSqGSIb3DQEBCwUAA4IBgQBj\nKosfLfW/ZH80NM16pvpyRF3mCi+q+I+P8zrfilMYJBH4EEdEGAUgTO5do1kJXeel\nWky+FNxaP6KCNiT+0amypKg+yjBlnqLKVdnEgR5s12ZfmerV59stx1A/c/bYMEAS\nre6xskBkowP2cVQHAC2dy/0Ov+lZsiNaPV2bQx6KUJurveebUQsH3uF3ZEhnUVQ6\nrt5+JGY4x9Tr1YMhvHqEDTrsipPdDB1MyW1SnCkqSXrz+DPXGd8BW0O0hpM5la81\nJ+rfZGinbcUgXM6JMLIHDxLc4Xxzm4NijFzXhbR3XPXqEwsnZOuxcYYFgUGs3FwS\n4ro+34a/O4uKS2KV8wsUWj/tWD2rLpduDgag4WSipCvWtaNve8gPdUiyPxUqxyoZ\naFAFg/izXwmRntogJtV0Zvo3fqAaQQDl8t2s21IIx0wmgHzgmkswb5OwFg3dOn/S\nlmaH8v7FCBP7jHx/NCPTT5Sy/1EMRATmhFDUZ8Bod/TIlV3e+FCVqlX3kBBRbAU=\n-----END CERTIFICATE-----\n"
  },
  {
    "path": "libs/megaparse_sdk/tests/test_nats_client.py",
    "content": "import asyncio\nimport logging\nfrom pathlib import Path\n\nimport nats\nimport pytest\nimport pytest_asyncio\nfrom megaparse_sdk.client import ClientState, MegaParseNATSClient\nfrom megaparse_sdk.config import ClientNATSConfig, SSLConfig\nfrom megaparse_sdk.schema.mp_exceptions import (\n    DownloadError,\n    InternalServiceError,\n    MemoryLimitExceeded,\n    ModelNotSupported,\n    ParsingException,\n)\nfrom megaparse_sdk.schema.mp_inputs import MPInput, ParseFileInput, ParseUrlInput\nfrom megaparse_sdk.schema.mp_outputs import (\n    MPErrorType,\n    MPOutput,\n    MPOutputType,\n    ParseError,\n)\nfrom nats.aio.client import Client\n\nlogger = logging.getLogger(__name__)\n\nNATS_URL = \"nats://test@127.0.0.1:4222\"\nNATS_SUBJECT = \"parsing\"\nSSL_CERT_FILE = \"./tests/certs/client-cert.pem\"\nSSL_KEY_FILE = \"./tests/certs/client-key.pem\"\nCA_CERT_FILE = \"./tests/certs/rootCA.pem\"\n\n\n@pytest.fixture(scope=\"session\")\ndef ssl_config() -> SSLConfig:\n    return SSLConfig(\n        ca_cert_file=CA_CERT_FILE,\n        ssl_key_file=SSL_KEY_FILE,\n        ssl_cert_file=SSL_CERT_FILE,\n    )\n\n\n@pytest.fixture(scope=\"session\")\ndef nc_config(ssl_config: SSLConfig) -> ClientNATSConfig:\n    config = ClientNATSConfig(\n        subject=NATS_SUBJECT,\n        endpoint=NATS_URL,\n        ssl_config=ssl_config,\n        timeout=0.5,\n        max_retries=1,\n        backoff=-1,\n        connect_timeout=1,\n        reconnect_time_wait=1,\n        max_reconnect_attempts=1,\n    )\n    return config\n\n\n@pytest_asyncio.fixture(scope=\"function\")\nasync def nats_service(nc_config: ClientNATSConfig):\n    # TODO: fix TLS handshake to work in CI\n    # ssl_config = load_ssl_cxt(nc_config.ssl_config)\n    nc = await nats.connect(\n        nc_config.endpoint,\n        tls=ssl_config,\n        connect_timeout=nc_config.connect_timeout,\n        reconnect_time_wait=nc_config.reconnect_time_wait,\n        
max_reconnect_attempts=nc_config.max_reconnect_attempts,\n    )\n    yield nc\n    await nc.drain()\n\n\n@pytest.mark.asyncio\nasync def test_client_state_transition(nc_config: ClientNATSConfig):\n    mpc = MegaParseNATSClient(nc_config)\n    assert mpc._state == ClientState.UNOPENED\n    async with mpc:\n        assert mpc._state == ClientState.OPENED\n    assert mpc._state == ClientState.CLOSED\n\n    with pytest.raises(RuntimeError):\n        async with mpc:\n            pass\n\n\n@pytest.mark.asyncio(loop_scope=\"session\")\nasync def test_client_parse_file(nats_service: Client, nc_config: ClientNATSConfig):\n    async def message_handler(msg):\n        parsed_input = MPInput.model_validate_json(msg.data.decode(\"utf-8\")).input\n        assert isinstance(parsed_input, ParseFileInput)\n        output = MPOutput(output_type=MPOutputType.PARSE_OK, result=\"test\")\n        await nats_service.publish(msg.reply, output.model_dump_json().encode(\"utf-8\"))\n\n    await nats_service.subscribe(NATS_SUBJECT, \"worker\", cb=message_handler)\n\n    file_path = Path(\"./tests/pdf/sample_table.pdf\")\n    async with MegaParseNATSClient(nc_config) as mp_client:\n        resp = await mp_client.parse_file(file=file_path)\n        assert resp == \"test\"\n\n\n@pytest.mark.asyncio(loop_scope=\"session\")\nasync def test_client_parse_url(nats_service: Client, nc_config: ClientNATSConfig):\n    async def message_handler(msg):\n        parsed_input = MPInput.model_validate_json(msg.data.decode(\"utf-8\")).input\n        assert isinstance(parsed_input, ParseUrlInput)\n        output = MPOutput(output_type=MPOutputType.PARSE_OK, result=\"url\")\n        await nats_service.publish(msg.reply, output.model_dump_json().encode(\"utf-8\"))\n\n    await nats_service.subscribe(NATS_SUBJECT, \"worker\", cb=message_handler)\n\n    async with MegaParseNATSClient(nc_config) as mp_client:\n        resp = await mp_client.parse_url(url=\"this://this\")\n        assert resp == 
\"url\"\n\n\n@pytest.mark.asyncio(loop_scope=\"session\")\nasync def test_client_parse_timeout(nats_service: Client, ssl_config: SSLConfig):\n    nc_config = ClientNATSConfig(\n        subject=NATS_SUBJECT,\n        endpoint=NATS_URL,\n        ssl_config=ssl_config,\n        timeout=0.1,\n        max_retries=1,\n        backoff=1,\n    )\n\n    async def service(msg):\n        await asyncio.sleep(2 * nc_config.timeout)\n\n    await nats_service.subscribe(NATS_SUBJECT, \"worker\", cb=service)\n\n    file_path = Path(\"./tests/pdf/sample_table.pdf\")\n    with pytest.raises(ParsingException):\n        async with MegaParseNATSClient(nc_config) as mp_client:\n            await mp_client.parse_file(file=file_path)\n\n\n@pytest.mark.asyncio(loop_scope=\"session\")\nasync def test_client_parse_timeout_retry(nats_service: Client, ssl_config: SSLConfig):\n    nc_config = ClientNATSConfig(\n        subject=NATS_SUBJECT,\n        endpoint=NATS_URL,\n        ssl_config=ssl_config,\n        timeout=0.1,\n        max_retries=2,\n        backoff=-5,\n    )\n\n    msgs = []\n\n    async def service(msg):\n        msgs.append(msg)\n        await asyncio.sleep(2 * nc_config.timeout)\n\n    await nats_service.subscribe(NATS_SUBJECT, \"worker\", cb=service)\n\n    file_path = Path(\"./tests/pdf/sample_table.pdf\")\n    with pytest.raises(ParsingException):\n        async with MegaParseNATSClient(nc_config) as mp_client:\n            await mp_client.parse_file(file=file_path)\n    assert len(msgs) == 2\n\n\n@pytest.mark.asyncio(loop_scope=\"session\")\n@pytest.mark.parametrize(\n    \"mp_error_type, exception_class\",\n    [\n        (\"MEMORY_LIMIT\", MemoryLimitExceeded),\n        (\"INTERNAL_SERVER_ERROR\", InternalServiceError),\n        (\"MODEL_NOT_SUPPORTED\", ModelNotSupported),\n        (\"DOWNLOAD_ERROR\", DownloadError),\n        (\"PARSING_ERROR\", ParsingException),\n    ],\n)\nasync def test_client_parse_file_excp(\n    nats_service: Client, nc_config: ClientNATSConfig, 
mp_error_type, exception_class\n):\n    async def message_handler(msg):\n        parsed_input = MPInput.model_validate_json(msg.data.decode(\"utf-8\")).input\n        assert isinstance(parsed_input, ParseFileInput)\n        err = ParseError(mp_err_code=MPErrorType[mp_error_type], message=\"\")\n        output = MPOutput(\n            output_type=MPOutputType.PARSE_ERR,\n            err=err,\n            result=None,\n        )\n        await nats_service.publish(msg.reply, output.model_dump_json().encode(\"utf-8\"))\n\n    await nats_service.subscribe(NATS_SUBJECT, \"worker\", cb=message_handler)\n\n    file_path = Path(\"./tests/pdf/sample_table.pdf\")\n    with pytest.raises(exception_class):\n        async with MegaParseNATSClient(nc_config) as mp_client:\n            await mp_client.parse_file(file=file_path)\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"megaparse-monorepo\"\nversion = \"0.0.1\"\ndescription = \"Megaparse monorepo\"\nauthors = [\n    { name = \"Stan Girard\", email = \"stan@quivr.app\" },\n    { name = \"Chloé Daems\", email = \"chloe@quivr.app\" },\n    { name = \"Amine Dirhoussi\", email = \"amine@quivr.app\" },\n    { name = \"Jacopo Chevallard\", email = \"jacopo@quivr.app\" },\n]\nreadme = \"README.md\"\nrequires-python = \">= 3.11\"\ndependencies = [\n    \"packaging>=22.0\",\n]\n\n[build-system]\nrequires = [\"hatchling==1.26.3\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.rye]\npython = \">= 3.11\"\nmanaged = true\nuniversal = true\ndev-dependencies = [\n    \"mypy>=1.11.1\",\n    \"pre-commit>=3.8.0\",\n    \"ipykernel>=6.29.5\",\n    \"ruff>=0.6.0\",\n    \"flake8>=7.1.1\",\n    \"flake8-black>=0.3.6\",\n    \"pytest-asyncio>=0.23.8\",\n    \"pytest>=8.3.3\",\n    \"pytest-xdist>=3.6.1\",\n    \"pytest-cov>=5.0.0\",\n    \"pytest-profiling>=1.8.1\",\n]\n\n[tool.rye.workspace]\nmembers = [\"libs/*\"]\n\n[tool.hatch.metadata]\nallow-direct-references = true\n\n[tool.hatch.build.targets.wheel]\npackages = [\"src/megaparse\"]\n\n[tool.ruff]\nline-length = 88\nexclude = [\".git\", \"__pycache__\", \".mypy_cache\", \".pytest_cache\"]\n\n[tool.ruff.lint]\nselect = [\n    \"E\", # pycodestyle errors\n    \"W\", # pycodestyle warnings\n    \"F\", # pyflakes\n    \"I\", # isort\n    \"C\", # flake8-comprehensions\n    \"B\", # flake8-bugbear\n]\nignore = [\n    \"B904\",\n    \"B006\",\n    \"E501\", # line too long, handled by black\n    \"B008\", # do not perform function calls in argument defaults\n    \"C901\", # too complex\n]\n\n[tool.ruff.lint.isort]\norder-by-type = true\nrelative-imports-order = \"closest-to-furthest\"\nextra-standard-library = [\"typing\"]\nsection-order = [\n    \"future\",\n    \"standard-library\",\n    \"third-party\",\n    \"first-party\",\n    \"local-folder\",\n]\nknown-first-party = []\n\n\n[tool.pytest.ini_options]\naddopts 
= \"--tb=short -ra -v\"\nasyncio_default_fixture_loop_scope = \"session\"\nfilterwarnings = [\"ignore::DeprecationWarning\"]\nmarkers = [\n    \"slow: marks tests as slow (deselect with '-m \\\"not slow\\\"')\",\n    \"base: these tests require quivr-core with extra `base` to be installed\",\n    \"tika: these tests require a tika server to be running\",\n    \"unstructured: these tests require `unstructured` dependency\",\n]\n"
  },
  {
    "path": "release-please-config.json",
    "content": "{\n  \"$schema\": \"https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json\",\n  \"separate-pull-requests\": true,\n  \"include-v-in-tag\": true,\n  \"bump-patch-for-minor-pre-major\": true,\n  \"include-component-in-tag\": true,\n  \"packages\": {\n    \"libs/megaparse\": {\n      \"release-type\": \"python\",\n      \"package-name\": \"megaparse\",\n      \"changelog-notes-type\": \"github\"\n    },\n    \"libs/megaparse_sdk\": {\n      \"release-type\": \"python\",\n      \"package-name\": \"megaparse-sdk\",\n      \"changelog-notes-type\": \"github\"\n    }\n  }\n}\n"
  }
]