[
  {
    "path": ".coveragerc",
    "content": "[run]\nomit =\n    */__init__.py"
  },
  {
    "path": ".devcontainer/devcontainer.json",
    "content": "{\n\t\"name\": \"RAG Experiment Accelerator\",\n\t\"image\": \"mcr.microsoft.com/devcontainers/python:1-3.11-bullseye\",\n\t\"features\": {\n\t\t\"ghcr.io/devcontainers/features/azure-cli:1\": {\n\t\t\t\"version\": \"latest\"\n\t\t},\n\t\t\"ghcr.io/azure/azure-dev/azd:latest\": {}\n\t},\n\t\"postCreateCommand\": \"./.devcontainer/post-create.sh\",\n\t\"customizations\": {\n\t\t\"vscode\": {\n\t\t\t\"extensions\": [\n\t\t\t\t\"github.vscode-pull-request-github\",\n\t\t\t\t\"ms-vscode.azure-account\",\n\t\t\t\t\"ms-python.python\",\n\t\t\t\t\"ms-python.flake8\",\n\t\t\t\t\"ms-azuretools.vscode-bicep\",\n\t\t\t\t\"prompt-flow.prompt-flow\",\n\t\t\t\t\"ms-azuretools.azure-dev\",\n\t\t\t\t\"streetsidesoftware.code-spell-checker\"\n\t\t\t]\n\t\t}\n\t}\n}\n"
  },
  {
    "path": ".devcontainer/post-create.sh",
    "content": "#!/bin/bash\n\npip install --upgrade pip\n\npip install -r requirements.txt\n\npip install -r dev-requirements.txt\n\npython -m spacy download en_core_web_sm\n\n# install the rag-accelerator packages in editable mode (required for pre-commit to work properly with pytest)\npip install -e .\n\npre-commit install\n"
  },
  {
    "path": ".flake8",
    "content": "[flake8]\nmax-line-length = 120\nextend-ignore = E203, E501"
  },
  {
    "path": ".github/actions/configure_azureml_agent/action.yml",
    "content": "name: Prepare build environment\n\ndescription: Prepares build environment for python and prompt flow related workflow execution.\n\ninputs:\n  versionSpec:\n    description: \"The Python version to use in the environment.\"\n    default: \"3.11\"\n\n\nruns:\n  using: composite\n  steps:\n    - name: Checkout\n      uses: actions/checkout@v4\n\n    - uses: actions/setup-python@v4\n      with:\n        python-version: ${{ inputs.versionSpec }}\n\n    - name: Load all prompt flow and related dependencies\n      shell: bash\n      run: |\n        set -e # fail on error\n        python -m pip install --upgrade pip\n        python -m pip install .\n\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where the package manifests are located.\n# Please see the documentation for all configuration options:\n# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\"\n    schedule:\n      interval: \"weekly\"\n  - package-ecosystem: \"pip\"\n    directory: \"/\"\n    schedule:\n      interval: \"weekly\"\n    open-pull-requests-limit: 50\n"
  },
  {
    "path": ".github/workflows/build_validation_workflow.yml",
    "content": "name: Build validation\n\non:\n  workflow_call:\n  workflow_dispatch:\n  pull_request:\n    branches:\n      - main\n      - development\n      - prerelease\n  push:\n    branches:\n      - main\n      - development\n      - prerelease\n  merge_group:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: ${{ github.ref != 'refs/heads/development' && github.ref != 'refs/heads/main' && github.ref != 'refs/heads/prerelease' }}\n\njobs:\n  validate-code:\n    name: job for validating code and structure\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout Actions\n        uses: actions/checkout@v4\n      - uses: actions/setup-python@v5\n        with:\n          python-version: \"3.11\"\n      - name: Load all build validation related dependencies\n        shell: bash\n        run: |\n          set -e # fail on error\n          python -m pip install --upgrade pip\n          python -m pip install -e . -r requirements.txt -r dev-requirements.txt\n\n      - name: Download spacy model\n        shell: bash\n        run: |\n          python -m spacy download en_core_web_sm\n\n      - name: Run flake\n        shell: bash\n        run: |\n          flake8 --extend-ignore=E501\n\n      - name: Execute Unit Tests\n        shell: bash\n        run: |\n          pytest . --cov=. --cov-report=html --cov-config=.coveragerc\n\n      - name: Publish Unit Test Results\n        uses: actions/upload-artifact@v4\n        with:\n          name: unit-test-results\n          path: \"htmlcov/**\"\n"
  },
  {
    "path": ".github/workflows/rag_exp_acc_ci.yml",
    "content": "name: RAG Experiment Accelerator CI\n\non:\n  workflow_call:\n  workflow_dispatch:\n  pull_request:\n    types: [opened, ready_for_review, synchronize]\n    branches:\n      - main\n      - development\n      - prerelease\n  push:\n    branches:\n      - main\n      - development\n      - prerelease\n  merge_group:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: ${{ github.ref != 'refs/heads/development' && github.ref != 'refs/heads/main' && github.ref != 'refs/heads/prerelease'}}\n\njobs:\n  execute-code-and-check:\n    env:\n      AZURE_SEARCH_ADMIN_KEY: ${{ secrets.AZURE_SEARCH_ADMIN_KEY }}\n      AZURE_SEARCH_SERVICE_ENDPOINT: ${{ secrets.AZURE_SEARCH_SERVICE_ENDPOINT }}\n      AZURE_SEARCH_USE_SEMANTIC_SEARCH: \"true\"\n      AZURE_LANGUAGE_SERVICE_KEY: ${{ secrets.AZURE_LANGUAGE_SERVICE_KEY }}\n      AZURE_LANGUAGE_SERVICE_ENDPOINT: ${{ secrets.AZURE_LANGUAGE_SERVICE_ENDPOINT }}\n      OPENAI_API_TYPE: \"azure\"\n      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n      OPENAI_API_VERSION: ${{ secrets.OPENAI_API_VERSION }}\n      OPENAI_ENDPOINT: ${{ secrets.OPENAI_ENDPOINT }}\n      AML_RESOURCE_GROUP_NAME: ${{ secrets.RESOURCE_GROUP_NAME }}\n      AML_SUBSCRIPTION_ID: ${{ secrets.SUBSCRIPTION_ID }}\n      AML_WORKSPACE_NAME: ${{ secrets.WORKSPACE_NAME }}\n      AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT: \"\"\n      AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY: \"\"\n    name: code validation through execution\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout Actions\n        uses: actions/checkout@v4\n      - name: Azure login\n        uses: azure/login@v2\n        with:\n          creds: ${{ secrets.azure_credentials }}\n      - name: Configure Azure ML Agent\n        uses: ./.github/actions/configure_azureml_agent\n      - name: execute index creation step\n        shell: bash\n        run: |\n          python 01_index.py --data_dir='data-ci' --config_path=${{ github.workspace 
}}/.github/workflows/config.json\n      - name: execute qna step\n        shell: bash\n        run: |\n          python 02_qa_generation.py --data_dir='data-ci' --config_path=${{ github.workspace }}/.github/workflows/config.json\n      - name: execute querying step\n        shell: bash\n        run: |\n          python 03_querying.py --data_dir='data-ci' --config_path=${{ github.workspace }}/.github/workflows/config.json\n      - name: execute evaluation step\n        shell: bash\n        run: |\n          python 04_evaluation.py --data_dir='data-ci' --config_path=${{ github.workspace }}/.github/workflows/config.json\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   
https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/#use-with-ide\n.pdm.toml\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n.idea/\n\nartifacts/\n# OSX system folders\n**/.DS_Store\n\n# Data files\ndata/*\n# Keep folders\n!data/pdf/\n!data/html/\n!data/markdown/\n!data/json/\n!data/text/\n# Keep sample files\n!data/pdf/sample-pdf.pdf\n!data/html/sample-html.html\n!data/docx/sample-docx.docx\n!data/markdown/sample-markdown.md\n!data/markdown/sample-txt.txt\n!data/json/sample-json.json\n\n#  promptflow folders\n.promptflow\n.azure\n\n.vscode\n\n<MagicMock*\ncheckpoints/\nmlruns\n\n# AzureML config folder\n.azureml/*\n\n# Generated Conda file\nconda.generated.yaml\n\n# User-defined config\nconfig.json\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/psf/black\n    rev: 23.12.1\n    hooks:\n      - id: black\n        language_version: python3\n  \n  - repo: https://github.com/pycqa/flake8\n    rev: 7.0.0\n    hooks:\n    -   id: flake8\n        args: [--extend-ignore=E501]\n\n  - repo: https://github.com/python-jsonschema/check-jsonschema\n    rev: 0.28.5\n    hooks:\n      - id: check-jsonschema\n        files: ^(config\\.sample\\.json|\\.github/workflows/config\\.json)$\n        types: [json]\n        args: [\"--schemafile\", \"config.schema.json\"]\n\n  - repo: local\n    hooks:\n      - id: bicep\n        name: bicep\n        description: Lint and build Bicep files\n        entry: ./infra/generate_arm_template.sh\n        language: script\n        files: \\.bicep$\n        require_serial: true\n        args: # Bicep files that we want to generate ARM templates from\n          - -f=./infra/main.bicep\n"
  },
  {
    "path": "01_index.py",
    "content": "import json\nimport argparse\nimport mlflow\n\nfrom azureml.pipeline import initialise_mlflow_client\n\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\nfrom rag_experiment_accelerator.run.index import run\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths, mlflow_run_name\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\"--data_dir\", type=str, help=\"input: path to the input data\")\n    parser.add_argument(\n        \"-s\",\n        \"--sampling\",\n        action=\"store_true\",\n        help=\"input: run sampling. Avoid running on distributed compute\",\n    )\n    args, _ = parser.parse_known_args()\n\n    environment = Environment.from_env_or_keyvault()\n    config = Config.from_path(environment, args.config_path, args.data_dir)\n    init_checkpoint(config)\n    file_paths = get_all_file_paths(config.path.data_dir)\n    mlflow_client = initialise_mlflow_client(environment, config)\n    mlflow.set_experiment(config.experiment_name)\n\n    do_sample = args.sampling\n    index_dict = {\"indexes\": []}\n\n    for index_config in config.index.flatten():\n        with mlflow.start_run(run_name=mlflow_run_name(f\"index_job_{config.job_name}\")):\n            index_name = run(\n                environment, config, index_config, file_paths, mlflow_client, do_sample\n            )\n            index_dict[\"indexes\"].append(index_name)\n\n    # saves the list of index names locally, not used afterwards\n    with open(config.path.generated_index_names_file, \"w\") as index_names_file:\n        json.dump(index_dict, index_names_file, indent=4)\n"
  },
  {
    "path": "02_qa_generation.py",
    "content": "import argparse\n\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\nfrom rag_experiment_accelerator.run.qa_generation import run\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\"--data_dir\", type=str, help=\"input: path to the input data\")\n    args, _ = parser.parse_known_args()\n\n    environment = Environment.from_env_or_keyvault()\n    config = Config.from_path(environment, args.config_path, args.data_dir)\n    init_checkpoint(config)\n\n    run(environment, config, get_all_file_paths(config.path.data_dir))\n"
  },
  {
    "path": "03_querying.py",
    "content": "import argparse\nimport mlflow\nfrom azureml.pipeline import initialise_mlflow_client\n\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.config.paths import mlflow_run_name\nfrom rag_experiment_accelerator.run.querying import run\nfrom rag_experiment_accelerator.data_assets.data_asset import create_data_asset\nfrom rag_experiment_accelerator.artifact.handlers.query_output_handler import (\n    QueryOutputHandler,\n)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\n        \"--data_dir\",\n        type=str,\n        help=\"input: path to the input data\",\n        default=None,  # default is initialized in Config\n    )\n    args, _ = parser.parse_known_args()\n\n    environment = Environment.from_env_or_keyvault()\n    config = Config.from_path(\n        environment,\n        args.config_path,\n        args.data_dir,\n    )\n    mlflow_client = initialise_mlflow_client(environment, config)\n    mlflow.set_experiment(config.experiment_name)\n\n    handler = QueryOutputHandler(config.path.query_data_dir)\n    init_checkpoint(config)\n\n    for index_config in config.index.flatten():\n        with mlflow.start_run(run_name=mlflow_run_name(config.job_name)):\n            run(environment, config, index_config, mlflow_client)\n\n            index_name = index_config.index_name()\n            create_data_asset(\n                data_path=handler.get_output_path(\n                    index_name, config.experiment_name, config.job_name\n                ),\n                data_asset_name=index_name,\n                environment=environment,\n            )\n"
  },
  {
    "path": "04_evaluation.py",
    "content": "import argparse\n\nimport mlflow\n\nfrom azureml.pipeline import initialise_mlflow_client\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.run.evaluation import run\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.paths import (\n    mlflow_run_name,\n    formatted_datetime_suffix,\n)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\n        \"--data_dir\",\n        type=str,\n        help=\"input: path to the input data\",\n        default=None,  # default is initialized in Config\n    )\n    args, _ = parser.parse_known_args()\n\n    environment = Environment.from_env_or_keyvault()\n    config = Config.from_path(environment, args.config_path, args.data_dir)\n    name_suffix = formatted_datetime_suffix()\n    mlflow_client = initialise_mlflow_client(environment, config)\n    mlflow.set_experiment(config.experiment_name)\n\n    for index_config in config.index.flatten():\n        with mlflow.start_run(run_name=mlflow_run_name(config.job_name, name_suffix)):\n            run(\n                environment,\n                config,\n                index_config,\n                mlflow_client,\n                name_suffix=name_suffix,\n            )\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Microsoft Open Source Code of Conduct\n\nThis project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).\n\nResources:\n\n- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)\n- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)\n- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns\n"
  },
  {
    "path": "LICENSE",
    "content": "    MIT License\n\n    Copyright (c) Microsoft Corporation.\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n    SOFTWARE\n\n\n    requests-oauthlib\n\n    Copyright (c) 2014 Kenneth Reitz.\n\n    Permission to use, copy, modify, and/or distribute this software for any\n    purpose with or without fee is hereby granted, provided that the above\n    copyright notice and this permission notice appear in all copies.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES\n    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF\n    MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR\n    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN\n    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF\n    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n\n    threadpoolctl\n\n    Copyright (c) 2019, threadpoolctl contributors\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n        * Redistributions of source code must retain the above copyright notice,\n        this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in the\n        documentation and/or other materials provided with the distribution.\n        * Neither the name of copyright holder nor the names of its contributors\n        may be used to endorse or promote products derived from this software\n        without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    isodate\n\n    Copyright (c) 2021, Hugo van Kemenade and contributors\n    Copyright (c) 2009-2018, Gerhard Weis and contributors\n    Copyright (c) 2009, Gerhard Weis\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n        * Redistributions of source code must retain the above copyright\n        notice, this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in the\n        documentation and/or other materials provided with the distribution.\n        * Neither the name of the <organization> nor the\n        names of its contributors may be used to endorse or promote products\n        derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY\n    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    PyPDF2\n\n    Copyright (c) 2006-2008, Mathieu Fenniak\n    Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>\n    Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n    * Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n    * The name of the author may not be used to endorse or promote products\n    derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n    ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\n    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n    POSSIBILITY OF SUCH DAMAGE.\n\n    wasabi\n\n    Copyright (C) 2018 Ines Montani\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in\n    all copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n    THE SOFTWARE.\n\n    toolz\n\n    Copyright (c) 2013 Matthew Rocklin\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    a. 
Redistributions of source code must retain the above copyright notice,\n        this list of conditions and the following disclaimer.\n    b. Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in the\n        documentation and/or other materials provided with the distribution.\n    c. Neither the name of toolz nor the names of its contributors\n        may be used to endorse or promote products derived from this software\n        without specific prior written permission.\n\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n    ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\n    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\n    OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH\n    DAMAGE.\n\n    llvmlite\n\n    Copyright (c) 2014-, Continuum Analytics, Inc.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n    Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in the\n    documentation and/or other materials provided with the distribution.\n    THIS 
SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    scikit-learn\n\n    BSD 3-Clause License\n\n    Copyright (c) 2007-2023 The scikit-learn developers.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    partd\n\n    Copyright (c) 2015, Continuum Analytics, Inc. and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    Neither the name of Continuum Analytics nor the names of any contributors\n    may be used to endorse or promote products derived from this software\n    without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n    ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\n    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n    THE POSSIBILITY OF SUCH DAMAGE.\n\n    Pygments\n\n    Copyright (c) 2006-2022 by the respective authors (see AUTHORS file).\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n    * Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in the\n    documentation and/or other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    pandas\n\n    BSD 3-Clause License\n\n    Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team\n    All rights reserved.\n\n    Copyright (c) 2011-2024, Open source contributors.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    Markdown\n\n    BSD 3-Clause License\n\n    Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)\n    Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)\n    Copyright 2004 Manfred Stienstra (the original version)\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    tiktoken\n\n    Copyright (c) 2022 OpenAI, Shantanu Jain\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n    SOFTWARE.\n\n    dask\n\n    BSD 3-Clause License\n\n    Copyright (c) 2014, Anaconda, Inc. 
and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    blis\n\n    NOTE: Portions of this project's code are copyrighted by\n\n    The University of Texas at Austin\n\n    while other portions are copyrighted by\n\n    Hewlett Packard Enterprise Development LP\n    Advanced Micro Devices, Inc.\n    ExplosionAI GmbH\n\n    with some overlap. Please see file-level license headers for file-specific\n    copyright info. 
All parties provide their portions of the code under the\n    3-clause BSD license, found below.\n\n    ---\n\n    Copyright (C) 2018, The University of Texas at Austin\n    Copyright (C) 2016, Hewlett Packard Enterprise Development LP\n    Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n    - Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n    - Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in the\n    documentation and/or other materials provided with the distribution.\n    - Neither the name(s) of the copyright holder(s) nor the names of its\n    contributors may be used to endorse or promote products derived\n    from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    portalocker\n\n    Copyright 2022 Rick van Hattem\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    Pillow\n\n    The Python Imaging Library (PIL) is\n\n    Copyright © 1997-2011 by Secret Labs AB\n    Copyright © 1995-2011 by Fredrik Lundh\n\n    Pillow is the friendly PIL fork. It is\n\n        Copyright © 2010-2024 by Jeffrey A. Clark (Alex) and contributors.\n\n    Like PIL, Pillow is licensed under the open source HPND License:\n\n    By obtaining, using, and/or copying this software and/or its associated\n    documentation, you agree that you have read, understood, and will comply\n    with the following terms and conditions:\n\n    Permission to use, copy, modify and distribute this software and its\n    documentation for any purpose and without fee is hereby granted,\n    provided that the above copyright notice appears in all copies, and that\n    both that copyright notice and this permission notice appear in supporting\n    documentation, and that the name of Secret Labs AB or the author not be\n    used in advertising or publicity pertaining to distribution of the software\n    without specific, written prior permission.\n\n    SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS\n    SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.\n    IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,\n    INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\n    LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE\n    OR OTHER TORTIOUS 
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\n    PERFORMANCE OF THIS SOFTWARE.\n\n    bsdiff4\n\n    Copyright (c) 2011-2019, Ilan Schnell\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    The views and conclusions contained in the software and documentation are those\n    of the authors and should not be interpreted as representing official policies,\n    either expressed or implied, of the (Python) bsdiff4 project.\n\n\n    /***************************************************************************\n    *  the module bsdiff4/core.c has been derived from cx_bsdiff written by   *\n    *  Anthony Tuininga (http://cx-bsdiff.sourceforge.net/), which was        *\n    *  
released under the BSD Protection License below                        *\n    ***************************************************************************/\n\n\n    BSD Protection License\n    February 2002\n\n    Preamble\n    --------\n\n    The Berkeley Software Distribution (\"BSD\") license has proven very effective\n    over the years at allowing for a wide spread of work throughout both\n    commercial and non-commercial products.  For programmers whose primary\n    intention is to improve the general quality of available software, it is\n    arguable that there is no better license than the BSD license, as it\n    permits improvements to be used wherever they will help, without idealogical\n    or metallic constraint.\n\n    This is of particular value to those who produce reference implementations\n    of proposed standards: The case of TCP/IP clearly illustrates that freely\n    and universally available implementations leads the rapid acceptance of\n    standards -- often even being used instead of a de jure standard (eg, OSI\n    network models).\n\n    With the rapid proliferation of software licensed under the GNU General\n    Public License, however, the continued success of this role is called into\n    question.  Given that the inclusion of a few lines of \"GPL-tainted\" work\n    into a larger body of work will result in restricted distribution -- and\n    given that further work will likely build upon the \"tainted\" portions,\n    making them difficult to remove at a future date -- there are inevitable\n    circumstances where authors would, in order to protect their goal of\n    providing for the widespread usage of their work, wish to guard against\n    such \"GPL-taint\".\n\n    In addition, one can imagine that companies which operate by producing and\n    selling (possibly closed-source) code would wish to protect themselves\n    against the rise of a GPL-licensed competitor.  
While under existing\n    licenses this would mean not releasing their code under any form of open\n    license, if a license existed under which they could incorporate any\n    improvements back into their own (commercial) products then they might be\n    far more willing to provide for non-closed distribution.\n\n    For the above reasons, we put forth this \"BSD Protection License\": A\n    license designed to retain the freedom granted by the BSD license to use\n    licensed works in a wide variety of settings, both non-commercial and\n    commercial, while protecting the work from having future contributors\n    restrict that freedom.\n\n    The precise terms and conditions for copying, distribution, and\n    modification follow.\n\n    BSD PROTECTION LICENSE\n    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION, AND MODIFICATION\n    ----------------------------------------------------------------\n\n    0. Definitions.\n    a) \"Program\", below, refers to any program or work distributed under\n        the terms of this license.\n    b) A \"work based on the Program\", below, refers to either the Program\n        or any derivative work under copyright law.\n    c) \"Modification\", below, refers to the act of creating derivative works.\n    d) \"You\", below, refers to each licensee.\n\n    1. Scope.\n    This license governs the copying, distribution, and modification of the\n    Program.  Other activities are outside the scope of this license; The\n    act of running the Program is not restricted, and the output from the\n    Program is covered only if its contents constitute a work based on the\n    Program.\n\n    2. 
Verbatim copies.\n    You may copy and distribute verbatim copies of the Program as you\n    receive it, in any medium, provided that you conspicuously and\n    appropriately publish on each copy an appropriate copyright notice; keep\n    intact all the notices that refer to this License and to the absence of\n    any warranty; and give any other recipients of the Program a copy of this\n    License along with the Program.\n\n    3. Modification and redistribution under closed license.\n    You may modify your copy or copies of the Program, and distribute\n    the resulting derivative works, provided that you meet the\n    following conditions:\n    a) The copyright notice and disclaimer on the Program must be reproduced\n        and included in the source code, documentation, and/or other materials\n        provided in a manner in which such notices are normally distributed.\n    b) The derivative work must be clearly identified as such, in order that\n        it may not be confused with the original work.\n    c) The license under which the derivative work is distributed must\n        expressly prohibit the distribution of further derivative works.\n\n    4. Modification and redistribution under open license.\n    You may modify your copy or copies of the Program, and distribute\n    the resulting derivative works, provided that you meet the\n    following conditions:\n    a) The copyright notice and disclaimer on the Program must be reproduced\n        and included in the source code, documentation, and/or other materials\n        provided in a manner in which such notices are normally distributed.\n    b) You must clearly indicate the nature and date of any changes made\n        to the Program.  
The full details need not necessarily be included in\n        the individual modified files, provided that each modified file is\n        clearly marked as such and instructions are included on where the\n        full details of the modifications may be found.\n    c) You must cause any work that you distribute or publish, that in whole\n        or in part contains or is derived from the Program or any part\n        thereof, to be licensed as a whole at no charge to all third\n        parties under the terms of this License.\n\n    5. Implied acceptance.\n    You may not copy or distribute the Program or any derivative works except\n    as expressly provided under this license.  Consequently, any such action\n    will be taken as implied acceptance of the terms of this license.\n\n    6. NO WARRANTY.\n    THIS SOFTWARE IS PROVIDED \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES,\n    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY\n    AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL\n    THE COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR\n    REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE FOR ANY DIRECT,\n    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING, BUT\n    NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\n    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR\n    TORT, EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE\n    POSSIBILITY OF SUCH DAMAGES.\n\n    MarkupSafe\n\n    Copyright 2010 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    jsonpickle\n\n    Copyright (C) 2008 John Paulett (john -at- paulett.org)\n    Copyright (C) 2009-2021 David Aguilar (davvid -at- gmail.com)\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from this\n    software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n    IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,\n    THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n    PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR\n    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;\n    OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\n    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR\n    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,\n    EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    paho-mqtt\n\n    This project is dual licensed under the Eclipse Public License 2.0 and the\n    Eclipse Distribution License 1.0 as described in the epl-v20 and edl-v10 files.\n\n    chardet\n\n    GNU LESSER GENERAL PUBLIC LICENSE\n    Version 2.1, February 1999\n\n    Copyright (C) 1991, 1999 Free Software Foundation, Inc.\n    51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\n    Everyone is permitted to copy and distribute verbatim copies\n    of this license document, but changing it is not allowed.\n\n    [This is the first released version of the Lesser GPL.  It also counts\n    as the successor of the GNU Library Public License, version 2, hence\n    the version number 2.1.]\n\n    Preamble\n\n    The licenses for most software are designed to take away your\n    freedom to share and change it.  By contrast, the GNU General Public\n    Licenses are intended to guarantee your freedom to share and change\n    free software--to make sure the software is free for all its users.\n\n    This license, the Lesser General Public License, applies to some\n    specially designated software packages--typically libraries--of the\n    Free Software Foundation and other authors who decide to use it.  
You\n    can use it too, but we suggest you first think carefully about whether\n    this license or the ordinary General Public License is the better\n    strategy to use in any particular case, based on the explanations below.\n\n    When we speak of free software, we are referring to freedom of use,\n    not price.  Our General Public Licenses are designed to make sure that\n    you have the freedom to distribute copies of free software (and charge\n    for this service if you wish); that you receive source code or can get\n    it if you want it; that you can change the software and use pieces of\n    it in new free programs; and that you are informed that you can do\n    these things.\n\n    To protect your rights, we need to make restrictions that forbid\n    distributors to deny you these rights or to ask you to surrender these\n    rights.  These restrictions translate to certain responsibilities for\n    you if you distribute copies of the library or if you modify it.\n\n    For example, if you distribute copies of the library, whether gratis\n    or for a fee, you must give the recipients all the rights that we gave\n    you.  You must make sure that they, too, receive or can get the source\n    code.  If you link other code with the library, you must provide\n    complete object files to the recipients, so that they can relink them\n    with the library after making changes to the library and recompiling\n    it.  And you must show them these terms so they know their rights.\n\n    We protect your rights with a two-step method: (1) we copyright the\n    library, and (2) we offer you this license, which gives you legal\n    permission to copy, distribute and/or modify the library.\n\n    To protect each distributor, we want to make it very clear that\n    there is no warranty for the free library.  
Also, if the library is\n    modified by someone else and passed on, the recipients should know\n    that what they have is not the original version, so that the original\n    author's reputation will not be affected by problems that might be\n    introduced by others.\n    \f\n    Finally, software patents pose a constant threat to the existence of\n    any free program.  We wish to make sure that a company cannot\n    effectively restrict the users of a free program by obtaining a\n    restrictive license from a patent holder.  Therefore, we insist that\n    any patent license obtained for a version of the library must be\n    consistent with the full freedom of use specified in this license.\n\n    Most GNU software, including some libraries, is covered by the\n    ordinary GNU General Public License.  This license, the GNU Lesser\n    General Public License, applies to certain designated libraries, and\n    is quite different from the ordinary General Public License.  We use\n    this license for certain libraries in order to permit linking those\n    libraries into non-free programs.\n\n    When a program is linked with a library, whether statically or using\n    a shared library, the combination of the two is legally speaking a\n    combined work, a derivative of the original library.  The ordinary\n    General Public License therefore permits such linking only if the\n    entire combination fits its criteria of freedom.  The Lesser General\n    Public License permits more lax criteria for linking other code with\n    the library.\n\n    We call this license the \"Lesser\" General Public License because it\n    does Less to protect the user's freedom than the ordinary General\n    Public License.  It also provides other free software developers Less\n    of an advantage over competing non-free programs.  These disadvantages\n    are the reason we use the ordinary General Public License for many\n    libraries.  
However, the Lesser license provides advantages in certain\n    special circumstances.\n\n    For example, on rare occasions, there may be a special need to\n    encourage the widest possible use of a certain library, so that it becomes\n    a de-facto standard.  To achieve this, non-free programs must be\n    allowed to use the library.  A more frequent case is that a free\n    library does the same job as widely used non-free libraries.  In this\n    case, there is little to gain by limiting the free library to free\n    software only, so we use the Lesser General Public License.\n\n    In other cases, permission to use a particular library in non-free\n    programs enables a greater number of people to use a large body of\n    free software.  For example, permission to use the GNU C Library in\n    non-free programs enables many more people to use the whole GNU\n    operating system, as well as its variant, the GNU/Linux operating\n    system.\n\n    Although the Lesser General Public License is Less protective of the\n    users' freedom, it does ensure that the user of a program that is\n    linked with the Library has the freedom and the wherewithal to run\n    that program using a modified version of the Library.\n\n    The precise terms and conditions for copying, distribution and\n    modification follow.  Pay close attention to the difference between a\n    \"work based on the library\" and a \"work that uses the library\".  The\n    former contains code derived from the library, whereas the latter must\n    be combined with the library in order to run.\n    \f\n                    GNU LESSER GENERAL PUBLIC LICENSE\n    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION\n\n    0. 
This License Agreement applies to any software library or other\n    program which contains a notice placed by the copyright holder or\n    other authorized party saying it may be distributed under the terms of\n    this Lesser General Public License (also called \"this License\").\n    Each licensee is addressed as \"you\".\n\n    A \"library\" means a collection of software functions and/or data\n    prepared so as to be conveniently linked with application programs\n    (which use some of those functions and data) to form executables.\n\n    The \"Library\", below, refers to any such software library or work\n    which has been distributed under these terms.  A \"work based on the\n    Library\" means either the Library or any derivative work under\n    copyright law: that is to say, a work containing the Library or a\n    portion of it, either verbatim or with modifications and/or translated\n    straightforwardly into another language.  (Hereinafter, translation is\n    included without limitation in the term \"modification\".)\n\n    \"Source code\" for a work means the preferred form of the work for\n    making modifications to it.  For a library, complete source code means\n    all the source code for all modules it contains, plus any associated\n    interface definition files, plus the scripts used to control compilation\n    and installation of the library.\n\n    Activities other than copying, distribution and modification are not\n    covered by this License; they are outside its scope.  The act of\n    running a program using the Library is not restricted, and output from\n    such a program is covered only if its contents constitute a work based\n    on the Library (independent of the use of the Library in a tool for\n    writing it).  Whether that is true depends on what the Library does\n    and what the program that uses the Library does.\n\n    1. 
You may copy and distribute verbatim copies of the Library's\n    complete source code as you receive it, in any medium, provided that\n    you conspicuously and appropriately publish on each copy an\n    appropriate copyright notice and disclaimer of warranty; keep intact\n    all the notices that refer to this License and to the absence of any\n    warranty; and distribute a copy of this License along with the\n    Library.\n\n    You may charge a fee for the physical act of transferring a copy,\n    and you may at your option offer warranty protection in exchange for a\n    fee.\n    \f\n    2. You may modify your copy or copies of the Library or any portion\n    of it, thus forming a work based on the Library, and copy and\n    distribute such modifications or work under the terms of Section 1\n    above, provided that you also meet all of these conditions:\n\n        a) The modified work must itself be a software library.\n\n        b) You must cause the files modified to carry prominent notices\n        stating that you changed the files and the date of any change.\n\n        c) You must cause the whole of the work to be licensed at no\n        charge to all third parties under the terms of this License.\n\n        d) If a facility in the modified Library refers to a function or a\n        table of data to be supplied by an application program that uses\n        the facility, other than as an argument passed when the facility\n        is invoked, then you must make a good faith effort to ensure that,\n        in the event an application does not supply such function or\n        table, the facility still operates, and performs whatever part of\n        its purpose remains meaningful.\n\n        (For example, a function in a library to compute square roots has\n        a purpose that is entirely well-defined independent of the\n        application.  
Therefore, Subsection 2d requires that any\n        application-supplied function or table used by this function must\n        be optional: if the application does not supply it, the square\n        root function must still compute square roots.)\n\n    These requirements apply to the modified work as a whole.  If\n    identifiable sections of that work are not derived from the Library,\n    and can be reasonably considered independent and separate works in\n    themselves, then this License, and its terms, do not apply to those\n    sections when you distribute them as separate works.  But when you\n    distribute the same sections as part of a whole which is a work based\n    on the Library, the distribution of the whole must be on the terms of\n    this License, whose permissions for other licensees extend to the\n    entire whole, and thus to each and every part regardless of who wrote\n    it.\n\n    Thus, it is not the intent of this section to claim rights or contest\n    your rights to work written entirely by you; rather, the intent is to\n    exercise the right to control the distribution of derivative or\n    collective works based on the Library.\n\n    In addition, mere aggregation of another work not based on the Library\n    with the Library (or with a work based on the Library) on a volume of\n    a storage or distribution medium does not bring the other work under\n    the scope of this License.\n\n    3. You may opt to apply the terms of the ordinary GNU General Public\n    License instead of this License to a given copy of the Library.  To do\n    this, you must alter all the notices that refer to this License, so\n    that they refer to the ordinary GNU General Public License, version 2,\n    instead of to this License.  (If a newer version than version 2 of the\n    ordinary GNU General Public License has appeared, then you can specify\n    that version instead if you wish.)  
Do not make any other change in\n    these notices.\n    \f\n    Once this change is made in a given copy, it is irreversible for\n    that copy, so the ordinary GNU General Public License applies to all\n    subsequent copies and derivative works made from that copy.\n\n    This option is useful when you wish to copy part of the code of\n    the Library into a program that is not a library.\n\n    4. You may copy and distribute the Library (or a portion or\n    derivative of it, under Section 2) in object code or executable form\n    under the terms of Sections 1 and 2 above provided that you accompany\n    it with the complete corresponding machine-readable source code, which\n    must be distributed under the terms of Sections 1 and 2 above on a\n    medium customarily used for software interchange.\n\n    If distribution of object code is made by offering access to copy\n    from a designated place, then offering equivalent access to copy the\n    source code from the same place satisfies the requirement to\n    distribute the source code, even though third parties are not\n    compelled to copy the source along with the object code.\n\n    5. A program that contains no derivative of any portion of the\n    Library, but is designed to work with the Library by being compiled or\n    linked with it, is called a \"work that uses the Library\".  Such a\n    work, in isolation, is not a derivative work of the Library, and\n    therefore falls outside the scope of this License.\n\n    However, linking a \"work that uses the Library\" with the Library\n    creates an executable that is a derivative of the Library (because it\n    contains portions of the Library), rather than a \"work that uses the\n    library\".  
The executable is therefore covered by this License.\n    Section 6 states terms for distribution of such executables.\n\n    When a \"work that uses the Library\" uses material from a header file\n    that is part of the Library, the object code for the work may be a\n    derivative work of the Library even though the source code is not.\n    Whether this is true is especially significant if the work can be\n    linked without the Library, or if the work is itself a library.  The\n    threshold for this to be true is not precisely defined by law.\n\n    If such an object file uses only numerical parameters, data\n    structure layouts and accessors, and small macros and small inline\n    functions (ten lines or less in length), then the use of the object\n    file is unrestricted, regardless of whether it is legally a derivative\n    work.  (Executables containing this object code plus portions of the\n    Library will still fall under Section 6.)\n\n    Otherwise, if the work is a derivative of the Library, you may\n    distribute the object code for the work under the terms of Section 6.\n    Any executables containing that work also fall under Section 6,\n    whether or not they are linked directly with the Library itself.\n    \f\n    6. As an exception to the Sections above, you may also combine or\n    link a \"work that uses the Library\" with the Library to produce a\n    work containing portions of the Library, and distribute that work\n    under terms of your choice, provided that the terms permit\n    modification of the work for the customer's own use and reverse\n    engineering for debugging such modifications.\n\n    You must give prominent notice with each copy of the work that the\n    Library is used in it and that the Library and its use are covered by\n    this License.  You must supply a copy of this License.  
If the work\n    during execution displays copyright notices, you must include the\n    copyright notice for the Library among them, as well as a reference\n    directing the user to the copy of this License.  Also, you must do one\n    of these things:\n\n        a) Accompany the work with the complete corresponding\n        machine-readable source code for the Library including whatever\n        changes were used in the work (which must be distributed under\n        Sections 1 and 2 above); and, if the work is an executable linked\n        with the Library, with the complete machine-readable \"work that\n        uses the Library\", as object code and/or source code, so that the\n        user can modify the Library and then relink to produce a modified\n        executable containing the modified Library.  (It is understood\n        that the user who changes the contents of definitions files in the\n        Library will not necessarily be able to recompile the application\n        to use the modified definitions.)\n\n        b) Use a suitable shared library mechanism for linking with the\n        Library.  
A suitable mechanism is one that (1) uses at run time a\n        copy of the library already present on the user's computer system,\n        rather than copying library functions into the executable, and (2)\n        will operate properly with a modified version of the library, if\n        the user installs one, as long as the modified version is\n        interface-compatible with the version that the work was made with.\n\n        c) Accompany the work with a written offer, valid for at\n        least three years, to give the same user the materials\n        specified in Subsection 6a, above, for a charge no more\n        than the cost of performing this distribution.\n\n        d) If distribution of the work is made by offering access to copy\n        from a designated place, offer equivalent access to copy the above\n        specified materials from the same place.\n\n        e) Verify that the user has already received a copy of these\n        materials or that you have already sent this user a copy.\n\n    For an executable, the required form of the \"work that uses the\n    Library\" must include any data and utility programs needed for\n    reproducing the executable from it.  However, as a special exception,\n    the materials to be distributed need not include anything that is\n    normally distributed (in either source or binary form) with the major\n    components (compiler, kernel, and so on) of the operating system on\n    which the executable runs, unless that component itself accompanies\n    the executable.\n\n    It may happen that this requirement contradicts the license\n    restrictions of other proprietary libraries that do not normally\n    accompany the operating system.  Such a contradiction means you cannot\n    use both them and the Library together in an executable that you\n    distribute.\n    \f\n    7. 
You may place library facilities that are a work based on the\n    Library side-by-side in a single library together with other library\n    facilities not covered by this License, and distribute such a combined\n    library, provided that the separate distribution of the work based on\n    the Library and of the other library facilities is otherwise\n    permitted, and provided that you do these two things:\n\n        a) Accompany the combined library with a copy of the same work\n        based on the Library, uncombined with any other library\n        facilities.  This must be distributed under the terms of the\n        Sections above.\n\n        b) Give prominent notice with the combined library of the fact\n        that part of it is a work based on the Library, and explaining\n        where to find the accompanying uncombined form of the same work.\n\n    8. You may not copy, modify, sublicense, link with, or distribute\n    the Library except as expressly provided under this License.  Any\n    attempt otherwise to copy, modify, sublicense, link with, or\n    distribute the Library is void, and will automatically terminate your\n    rights under this License.  However, parties who have received copies,\n    or rights, from you under this License will not have their licenses\n    terminated so long as such parties remain in full compliance.\n\n    9. You are not required to accept this License, since you have not\n    signed it.  However, nothing else grants you permission to modify or\n    distribute the Library or its derivative works.  These actions are\n    prohibited by law if you do not accept this License.  Therefore, by\n    modifying or distributing the Library (or any work based on the\n    Library), you indicate your acceptance of this License to do so, and\n    all its terms and conditions for copying, distributing or modifying\n    the Library or works based on it.\n\n    10. 
Each time you redistribute the Library (or any work based on the\n    Library), the recipient automatically receives a license from the\n    original licensor to copy, distribute, link with or modify the Library\n    subject to these terms and conditions.  You may not impose any further\n    restrictions on the recipients' exercise of the rights granted herein.\n    You are not responsible for enforcing compliance by third parties with\n    this License.\n    \f\n    11. If, as a consequence of a court judgment or allegation of patent\n    infringement or for any other reason (not limited to patent issues),\n    conditions are imposed on you (whether by court order, agreement or\n    otherwise) that contradict the conditions of this License, they do not\n    excuse you from the conditions of this License.  If you cannot\n    distribute so as to satisfy simultaneously your obligations under this\n    License and any other pertinent obligations, then as a consequence you\n    may not distribute the Library at all.  For example, if a patent\n    license would not permit royalty-free redistribution of the Library by\n    all those who receive copies directly or indirectly through you, then\n    the only way you could satisfy both it and this License would be to\n    refrain entirely from distribution of the Library.\n\n    If any portion of this section is held invalid or unenforceable under any\n    particular circumstance, the balance of the section is intended to apply,\n    and the section as a whole is intended to apply in other circumstances.\n\n    It is not the purpose of this section to induce you to infringe any\n    patents or other property right claims or to contest validity of any\n    such claims; this section has the sole purpose of protecting the\n    integrity of the free software distribution system which is\n    implemented by public license practices.  
Many people have made\n    generous contributions to the wide range of software distributed\n    through that system in reliance on consistent application of that\n    system; it is up to the author/donor to decide if he or she is willing\n    to distribute software through any other system and a licensee cannot\n    impose that choice.\n\n    This section is intended to make thoroughly clear what is believed to\n    be a consequence of the rest of this License.\n\n    12. If the distribution and/or use of the Library is restricted in\n    certain countries either by patents or by copyrighted interfaces, the\n    original copyright holder who places the Library under this License may add\n    an explicit geographical distribution limitation excluding those countries,\n    so that distribution is permitted only in or among countries not thus\n    excluded.  In such case, this License incorporates the limitation as if\n    written in the body of this License.\n\n    13. The Free Software Foundation may publish revised and/or new\n    versions of the Lesser General Public License from time to time.\n    Such new versions will be similar in spirit to the present version,\n    but may differ in detail to address new problems or concerns.\n\n    Each version is given a distinguishing version number.  If the Library\n    specifies a version number of this License which applies to it and\n    \"any later version\", you have the option of following the terms and\n    conditions either of that version or of any later version published by\n    the Free Software Foundation.  If the Library does not specify a\n    license version number, you may choose any version ever published by\n    the Free Software Foundation.\n    \f\n    14. If you wish to incorporate parts of the Library into other free\n    programs whose distribution conditions are incompatible with these,\n    write to the author to ask for permission.  
For software which is\n    copyrighted by the Free Software Foundation, write to the Free\n    Software Foundation; we sometimes make exceptions for this.  Our\n    decision will be guided by the two goals of preserving the free status\n    of all derivatives of our free software and of promoting the sharing\n    and reuse of software generally.\n\n                                NO WARRANTY\n\n    15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO\n    WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.\n    EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR\n    OTHER PARTIES PROVIDE THE LIBRARY \"AS IS\" WITHOUT WARRANTY OF ANY\n    KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n    PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE\n    LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME\n    THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n    16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN\n    WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY\n    AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU\n    FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR\n    CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE\n    LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING\n    RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A\n    FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF\n    SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH\n    DAMAGES.\n\n                        END OF TERMS AND CONDITIONS\n    \f\n            How to Apply These Terms to Your New Libraries\n\n    If you develop a new library, and you want it to be of the greatest\n    possible use to the public, we recommend making it free software that\n    everyone can redistribute and change.  You can do so by permitting\n    redistribution under these terms (or, alternatively, under the terms of the\n    ordinary General Public License).\n\n    To apply these terms, attach the following notices to the library.  
It is\n    safest to attach them to the start of each source file to most effectively\n    convey the exclusion of warranty; and each file should have at least the\n    \"copyright\" line and a pointer to where the full notice is found.\n\n        <one line to give the library's name and a brief idea of what it does.>\n        Copyright (C) <year>  <name of author>\n\n        This library is free software; you can redistribute it and/or\n        modify it under the terms of the GNU Lesser General Public\n        License as published by the Free Software Foundation; either\n        version 2.1 of the License, or (at your option) any later version.\n\n        This library is distributed in the hope that it will be useful,\n        but WITHOUT ANY WARRANTY; without even the implied warranty of\n        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n        Lesser General Public License for more details.\n\n        You should have received a copy of the GNU Lesser General Public\n        License along with this library; if not, write to the Free Software\n        Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\n\n    Also add information on how to contact you by electronic and paper mail.\n\n    You should also get your employer (if you work as a programmer) or your\n    school, if any, to sign a \"copyright disclaimer\" for the library, if\n    necessary.  
Here is a sample; alter the names:\n\n    Yoyodyne, Inc., hereby disclaims all copyright interest in the\n    library `Frob' (a library for tweaking knobs) written by James Random Hacker.\n\n    <signature of Ty Coon>, 1 April 1990\n    Ty Coon, President of Vice\n\n    That's all there is to it!\n\n\n    itsdangerous\n\n    Copyright 2011 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    pyasn1-modules\n\n    Copyright (c) 2005-2020, Ilya Etingof <etingof@gmail.com>\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, \n        this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n        this list of conditions and the following disclaimer in the documentation\n        and/or other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \n    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE\n    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n    POSSIBILITY OF SUCH DAMAGE. 
\n\n    contourpy\n\n    BSD 3-Clause License\n\n    Copyright (c) 2021-2024, ContourPy Developers.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    GitPython\n\n    Copyright (C) 2008, 2009 Michael Trier and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions\n    are met:\n\n    * Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in the\n    documentation and/or other materials provided with the distribution.\n\n    * Neither the name of the GitPython project nor the names of\n    its contributors may be used to endorse or promote products derived\n    from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\n    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    Flask\n\n    Copyright 2010 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    joblib\n\n    BSD 3-Clause License\n\n    Copyright (c) 2008-2021, The joblib developers.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    cycler\n\n    Copyright (c) 2015, matplotlib project\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the matplotlib project nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    mkdocs-altlink-plugin\n\n    Copyright (c) 2018 Zach Hannum \n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n    SOFTWARE.\n\n    mkdocs-video\n\n    The MIT License (MIT)\n\n    Copyright (c) 2023 Mikalai Lisitsa\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n\n    pycparser\n\n    pycparser -- A C parser in Python\n\n    Copyright (c) 2008-2022, Eli Bendersky\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this \n    list of conditions and the following disclaimer.\n    * Redistributions in binary form must reproduce the above copyright notice, \n    this list of conditions and the following disclaimer in the documentation \n    and/or other materials provided with the distribution.\n    * Neither the name of the copyright holder nor the names of its contributors may \n    be used to endorse or promote products derived from this software without \n    specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND \n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED \n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE \n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE \n    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR \n    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE \n    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \n    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \n    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT \n    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    oauthlib\n\n    Copyright (c) 2019 The OAuthlib Community\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n        1. Redistributions of source code must retain the above copyright notice,\n        this list of conditions and the following disclaimer.\n\n        2. Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in the\n        documentation and/or other materials provided with the distribution.\n\n        3. Neither the name of this project nor the names of its contributors may\n        be used to endorse or promote products derived from this software without\n        specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    shap\n\n    The MIT License (MIT)\n\n    Copyright (c) 2018 Scott Lundberg\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n    SOFTWARE.\n\n    idna\n\n    BSD 3-Clause License\n\n    Copyright (c) 2013-2023, Kim Davies and contributors. 
All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    filelock\n\n    This is free and unencumbered software released into the public domain.\n\n    Anyone is free to copy, modify, publish, use, compile, sell, or\n    distribute this software, either in source code form or as a compiled\n    binary, for any purpose, commercial or non-commercial, and by any\n    means.\n\n    In jurisdictions that recognize copyright laws, the author or authors\n    of this software dedicate any and all copyright interest in the\n    software to the public domain. 
We make this dedication for the benefit\n    of the public at large and to the detriment of our heirs and\n    successors. We intend this dedication to be an overt act of\n    relinquishment in perpetuity of all present and future rights to this\n    software under copyright law.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n    IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n    OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n    OTHER DEALINGS IN THE SOFTWARE.\n\n    For more information, please refer to <http://unlicense.org>\n\n    sqlparse\n\n    Copyright (c) 2016, Andi Albrecht <albrecht.andi@gmail.com>\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n        * Redistributions of source code must retain the above copyright notice,\n        this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above copyright notice,\n        this list of conditions and the following disclaimer in the documentation\n        and/or other materials provided with the distribution.\n        * Neither the name of the authors nor the names of its contributors may be\n        used to endorse or promote products derived from this software without\n        specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    paramiko\n\n    GNU LESSER GENERAL PUBLIC LICENSE\n\tVersion 2.1, February 1999\n\n    Copyright (C) 1991, 1999 Free Software Foundation, Inc.\n        51 Franklin Street, Suite 500, Boston, MA  02110-1335  USA\n    Everyone is permitted to copy and distribute verbatim copies\n    of this license document, but changing it is not allowed.\n\n    [This is the first released version of the Lesser GPL.  It also counts\n    as the successor of the GNU Library Public License, version 2, hence\n    the version number 2.1.]\n\n                    Preamble\n\n    The licenses for most software are designed to take away your\n    freedom to share and change it.  By contrast, the GNU General Public\n    Licenses are intended to guarantee your freedom to share and change\n    free software--to make sure the software is free for all its users.\n\n    This license, the Lesser General Public License, applies to some\n    specially designated software packages--typically libraries--of the\n    Free Software Foundation and other authors who decide to use it.  You\n    can use it too, but we suggest you first think carefully about whether\n    this license or the ordinary General Public License is the better\n    strategy to use in any particular case, based on the explanations below.\n\n    When we speak of free software, we are referring to freedom of use,\n    not price.  
Our General Public Licenses are designed to make sure that\n    you have the freedom to distribute copies of free software (and charge\n    for this service if you wish); that you receive source code or can get\n    it if you want it; that you can change the software and use pieces of\n    it in new free programs; and that you are informed that you can do\n    these things.\n\n    To protect your rights, we need to make restrictions that forbid\n    distributors to deny you these rights or to ask you to surrender these\n    rights.  These restrictions translate to certain responsibilities for\n    you if you distribute copies of the library or if you modify it.\n\n    For example, if you distribute copies of the library, whether gratis\n    or for a fee, you must give the recipients all the rights that we gave\n    you.  You must make sure that they, too, receive or can get the source\n    code.  If you link other code with the library, you must provide\n    complete object files to the recipients, so that they can relink them\n    with the library after making changes to the library and recompiling\n    it.  And you must show them these terms so they know their rights.\n\n    We protect your rights with a two-step method: (1) we copyright the\n    library, and (2) we offer you this license, which gives you legal\n    permission to copy, distribute and/or modify the library.\n\n    To protect each distributor, we want to make it very clear that\n    there is no warranty for the free library.  Also, if the library is\n    modified by someone else and passed on, the recipients should know\n    that what they have is not the original version, so that the original\n    author's reputation will not be affected by problems that might be\n    introduced by others.\n    \f\n    Finally, software patents pose a constant threat to the existence of\n    any free program.  
We wish to make sure that a company cannot\n    effectively restrict the users of a free program by obtaining a\n    restrictive license from a patent holder.  Therefore, we insist that\n    any patent license obtained for a version of the library must be\n    consistent with the full freedom of use specified in this license.\n\n    Most GNU software, including some libraries, is covered by the\n    ordinary GNU General Public License.  This license, the GNU Lesser\n    General Public License, applies to certain designated libraries, and\n    is quite different from the ordinary General Public License.  We use\n    this license for certain libraries in order to permit linking those\n    libraries into non-free programs.\n\n    When a program is linked with a library, whether statically or using\n    a shared library, the combination of the two is legally speaking a\n    combined work, a derivative of the original library.  The ordinary\n    General Public License therefore permits such linking only if the\n    entire combination fits its criteria of freedom.  The Lesser General\n    Public License permits more lax criteria for linking other code with\n    the library.\n\n    We call this license the \"Lesser\" General Public License because it\n    does Less to protect the user's freedom than the ordinary General\n    Public License.  It also provides other free software developers Less\n    of an advantage over competing non-free programs.  These disadvantages\n    are the reason we use the ordinary General Public License for many\n    libraries.  However, the Lesser license provides advantages in certain\n    special circumstances.\n\n    For example, on rare occasions, there may be a special need to\n    encourage the widest possible use of a certain library, so that it becomes\n    a de-facto standard.  To achieve this, non-free programs must be\n    allowed to use the library.  
A more frequent case is that a free\n    library does the same job as widely used non-free libraries.  In this\n    case, there is little to gain by limiting the free library to free\n    software only, so we use the Lesser General Public License.\n\n    In other cases, permission to use a particular library in non-free\n    programs enables a greater number of people to use a large body of\n    free software.  For example, permission to use the GNU C Library in\n    non-free programs enables many more people to use the whole GNU\n    operating system, as well as its variant, the GNU/Linux operating\n    system.\n\n    Although the Lesser General Public License is Less protective of the\n    users' freedom, it does ensure that the user of a program that is\n    linked with the Library has the freedom and the wherewithal to run\n    that program using a modified version of the Library.\n\n    The precise terms and conditions for copying, distribution and\n    modification follow.  Pay close attention to the difference between a\n    \"work based on the library\" and a \"work that uses the library\".  The\n    former contains code derived from the library, whereas the latter must\n    be combined with the library in order to run.\n\f\n\t\t  GNU LESSER GENERAL PUBLIC LICENSE\n   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION\n\n    0. 
This License Agreement applies to any software library or other\n    program which contains a notice placed by the copyright holder or\n    other authorized party saying it may be distributed under the terms of\n    this Lesser General Public License (also called \"this License\").\n    Each licensee is addressed as \"you\".\n\n    A \"library\" means a collection of software functions and/or data\n    prepared so as to be conveniently linked with application programs\n    (which use some of those functions and data) to form executables.\n\n    The \"Library\", below, refers to any such software library or work\n    which has been distributed under these terms.  A \"work based on the\n    Library\" means either the Library or any derivative work under\n    copyright law: that is to say, a work containing the Library or a\n    portion of it, either verbatim or with modifications and/or translated\n    straightforwardly into another language.  (Hereinafter, translation is\n    included without limitation in the term \"modification\".)\n\n    \"Source code\" for a work means the preferred form of the work for\n    making modifications to it.  For a library, complete source code means\n    all the source code for all modules it contains, plus any associated\n    interface definition files, plus the scripts used to control compilation\n    and installation of the library.\n\n    Activities other than copying, distribution and modification are not\n    covered by this License; they are outside its scope.  The act of\n    running a program using the Library is not restricted, and output from\n    such a program is covered only if its contents constitute a work based\n    on the Library (independent of the use of the Library in a tool for\n    writing it).  Whether that is true depends on what the Library does\n    and what the program that uses the Library does.\n    \n    1. 
You may copy and distribute verbatim copies of the Library's\n    complete source code as you receive it, in any medium, provided that\n    you conspicuously and appropriately publish on each copy an\n    appropriate copyright notice and disclaimer of warranty; keep intact\n    all the notices that refer to this License and to the absence of any\n    warranty; and distribute a copy of this License along with the\n    Library.\n\n    You may charge a fee for the physical act of transferring a copy,\n    and you may at your option offer warranty protection in exchange for a\n    fee.\n    \f\n    2. You may modify your copy or copies of the Library or any portion\n    of it, thus forming a work based on the Library, and copy and\n    distribute such modifications or work under the terms of Section 1\n    above, provided that you also meet all of these conditions:\n\n        a) The modified work must itself be a software library.\n\n        b) You must cause the files modified to carry prominent notices\n        stating that you changed the files and the date of any change.\n\n        c) You must cause the whole of the work to be licensed at no\n        charge to all third parties under the terms of this License.\n\n        d) If a facility in the modified Library refers to a function or a\n        table of data to be supplied by an application program that uses\n        the facility, other than as an argument passed when the facility\n        is invoked, then you must make a good faith effort to ensure that,\n        in the event an application does not supply such function or\n        table, the facility still operates, and performs whatever part of\n        its purpose remains meaningful.\n\n        (For example, a function in a library to compute square roots has\n        a purpose that is entirely well-defined independent of the\n        application.  
Therefore, Subsection 2d requires that any\n        application-supplied function or table used by this function must\n        be optional: if the application does not supply it, the square\n        root function must still compute square roots.)\n\n    These requirements apply to the modified work as a whole.  If\n    identifiable sections of that work are not derived from the Library,\n    and can be reasonably considered independent and separate works in\n    themselves, then this License, and its terms, do not apply to those\n    sections when you distribute them as separate works.  But when you\n    distribute the same sections as part of a whole which is a work based\n    on the Library, the distribution of the whole must be on the terms of\n    this License, whose permissions for other licensees extend to the\n    entire whole, and thus to each and every part regardless of who wrote\n    it.\n\n    Thus, it is not the intent of this section to claim rights or contest\n    your rights to work written entirely by you; rather, the intent is to\n    exercise the right to control the distribution of derivative or\n    collective works based on the Library.\n\n    In addition, mere aggregation of another work not based on the Library\n    with the Library (or with a work based on the Library) on a volume of\n    a storage or distribution medium does not bring the other work under\n    the scope of this License.\n\n    3. You may opt to apply the terms of the ordinary GNU General Public\n    License instead of this License to a given copy of the Library.  To do\n    this, you must alter all the notices that refer to this License, so\n    that they refer to the ordinary GNU General Public License, version 2,\n    instead of to this License.  (If a newer version than version 2 of the\n    ordinary GNU General Public License has appeared, then you can specify\n    that version instead if you wish.)  
Do not make any other change in\n    these notices.\n    \f\n    Once this change is made in a given copy, it is irreversible for\n    that copy, so the ordinary GNU General Public License applies to all\n    subsequent copies and derivative works made from that copy.\n\n    This option is useful when you wish to copy part of the code of\n    the Library into a program that is not a library.\n\n    4. You may copy and distribute the Library (or a portion or\n    derivative of it, under Section 2) in object code or executable form\n    under the terms of Sections 1 and 2 above provided that you accompany\n    it with the complete corresponding machine-readable source code, which\n    must be distributed under the terms of Sections 1 and 2 above on a\n    medium customarily used for software interchange.\n\n    If distribution of object code is made by offering access to copy\n    from a designated place, then offering equivalent access to copy the\n    source code from the same place satisfies the requirement to\n    distribute the source code, even though third parties are not\n    compelled to copy the source along with the object code.\n\n    5. A program that contains no derivative of any portion of the\n    Library, but is designed to work with the Library by being compiled or\n    linked with it, is called a \"work that uses the Library\".  Such a\n    work, in isolation, is not a derivative work of the Library, and\n    therefore falls outside the scope of this License.\n\n    However, linking a \"work that uses the Library\" with the Library\n    creates an executable that is a derivative of the Library (because it\n    contains portions of the Library), rather than a \"work that uses the\n    library\".  
The executable is therefore covered by this License.\n    Section 6 states terms for distribution of such executables.\n\n    When a \"work that uses the Library\" uses material from a header file\n    that is part of the Library, the object code for the work may be a\n    derivative work of the Library even though the source code is not.\n    Whether this is true is especially significant if the work can be\n    linked without the Library, or if the work is itself a library.  The\n    threshold for this to be true is not precisely defined by law.\n\n    If such an object file uses only numerical parameters, data\n    structure layouts and accessors, and small macros and small inline\n    functions (ten lines or less in length), then the use of the object\n    file is unrestricted, regardless of whether it is legally a derivative\n    work.  (Executables containing this object code plus portions of the\n    Library will still fall under Section 6.)\n\n    Otherwise, if the work is a derivative of the Library, you may\n    distribute the object code for the work under the terms of Section 6.\n    Any executables containing that work also fall under Section 6,\n    whether or not they are linked directly with the Library itself.\n    \f\n    6. As an exception to the Sections above, you may also combine or\n    link a \"work that uses the Library\" with the Library to produce a\n    work containing portions of the Library, and distribute that work\n    under terms of your choice, provided that the terms permit\n    modification of the work for the customer's own use and reverse\n    engineering for debugging such modifications.\n\n    You must give prominent notice with each copy of the work that the\n    Library is used in it and that the Library and its use are covered by\n    this License.  You must supply a copy of this License.  
If the work\n    during execution displays copyright notices, you must include the\n    copyright notice for the Library among them, as well as a reference\n    directing the user to the copy of this License.  Also, you must do one\n    of these things:\n\n        a) Accompany the work with the complete corresponding\n        machine-readable source code for the Library including whatever\n        changes were used in the work (which must be distributed under\n        Sections 1 and 2 above); and, if the work is an executable linked\n        with the Library, with the complete machine-readable \"work that\n        uses the Library\", as object code and/or source code, so that the\n        user can modify the Library and then relink to produce a modified\n        executable containing the modified Library.  (It is understood\n        that the user who changes the contents of definitions files in the\n        Library will not necessarily be able to recompile the application\n        to use the modified definitions.)\n\n        b) Use a suitable shared library mechanism for linking with the\n        Library.  
A suitable mechanism is one that (1) uses at run time a\n        copy of the library already present on the user's computer system,\n        rather than copying library functions into the executable, and (2)\n        will operate properly with a modified version of the library, if\n        the user installs one, as long as the modified version is\n        interface-compatible with the version that the work was made with.\n\n        c) Accompany the work with a written offer, valid for at\n        least three years, to give the same user the materials\n        specified in Subsection 6a, above, for a charge no more\n        than the cost of performing this distribution.\n\n        d) If distribution of the work is made by offering access to copy\n        from a designated place, offer equivalent access to copy the above\n        specified materials from the same place.\n\n        e) Verify that the user has already received a copy of these\n        materials or that you have already sent this user a copy.\n\n    For an executable, the required form of the \"work that uses the\n    Library\" must include any data and utility programs needed for\n    reproducing the executable from it.  However, as a special exception,\n    the materials to be distributed need not include anything that is\n    normally distributed (in either source or binary form) with the major\n    components (compiler, kernel, and so on) of the operating system on\n    which the executable runs, unless that component itself accompanies\n    the executable.\n\n    It may happen that this requirement contradicts the license\n    restrictions of other proprietary libraries that do not normally\n    accompany the operating system.  Such a contradiction means you cannot\n    use both them and the Library together in an executable that you\n    distribute.\n    \f\n    7. 
You may place library facilities that are a work based on the\n    Library side-by-side in a single library together with other library\n    facilities not covered by this License, and distribute such a combined\n    library, provided that the separate distribution of the work based on\n    the Library and of the other library facilities is otherwise\n    permitted, and provided that you do these two things:\n\n        a) Accompany the combined library with a copy of the same work\n        based on the Library, uncombined with any other library\n        facilities.  This must be distributed under the terms of the\n        Sections above.\n\n        b) Give prominent notice with the combined library of the fact\n        that part of it is a work based on the Library, and explaining\n        where to find the accompanying uncombined form of the same work.\n\n    8. You may not copy, modify, sublicense, link with, or distribute\n    the Library except as expressly provided under this License.  Any\n    attempt otherwise to copy, modify, sublicense, link with, or\n    distribute the Library is void, and will automatically terminate your\n    rights under this License.  However, parties who have received copies,\n    or rights, from you under this License will not have their licenses\n    terminated so long as such parties remain in full compliance.\n\n    9. You are not required to accept this License, since you have not\n    signed it.  However, nothing else grants you permission to modify or\n    distribute the Library or its derivative works.  These actions are\n    prohibited by law if you do not accept this License.  Therefore, by\n    modifying or distributing the Library (or any work based on the\n    Library), you indicate your acceptance of this License to do so, and\n    all its terms and conditions for copying, distributing or modifying\n    the Library or works based on it.\n\n    10. 
Each time you redistribute the Library (or any work based on the\n    Library), the recipient automatically receives a license from the\n    original licensor to copy, distribute, link with or modify the Library\n    subject to these terms and conditions.  You may not impose any further\n    restrictions on the recipients' exercise of the rights granted herein.\n    You are not responsible for enforcing compliance by third parties with\n    this License.\n    \f\n    11. If, as a consequence of a court judgment or allegation of patent\n    infringement or for any other reason (not limited to patent issues),\n    conditions are imposed on you (whether by court order, agreement or\n    otherwise) that contradict the conditions of this License, they do not\n    excuse you from the conditions of this License.  If you cannot\n    distribute so as to satisfy simultaneously your obligations under this\n    License and any other pertinent obligations, then as a consequence you\n    may not distribute the Library at all.  For example, if a patent\n    license would not permit royalty-free redistribution of the Library by\n    all those who receive copies directly or indirectly through you, then\n    the only way you could satisfy both it and this License would be to\n    refrain entirely from distribution of the Library.\n\n    If any portion of this section is held invalid or unenforceable under any\n    particular circumstance, the balance of the section is intended to apply,\n    and the section as a whole is intended to apply in other circumstances.\n\n    It is not the purpose of this section to induce you to infringe any\n    patents or other property right claims or to contest validity of any\n    such claims; this section has the sole purpose of protecting the\n    integrity of the free software distribution system which is\n    implemented by public license practices.  
Many people have made\n    generous contributions to the wide range of software distributed\n    through that system in reliance on consistent application of that\n    system; it is up to the author/donor to decide if he or she is willing\n    to distribute software through any other system and a licensee cannot\n    impose that choice.\n\n    This section is intended to make thoroughly clear what is believed to\n    be a consequence of the rest of this License.\n\n    12. If the distribution and/or use of the Library is restricted in\n    certain countries either by patents or by copyrighted interfaces, the\n    original copyright holder who places the Library under this License may add\n    an explicit geographical distribution limitation excluding those countries,\n    so that distribution is permitted only in or among countries not thus\n    excluded.  In such case, this License incorporates the limitation as if\n    written in the body of this License.\n\n    13. The Free Software Foundation may publish revised and/or new\n    versions of the Lesser General Public License from time to time.\n    Such new versions will be similar in spirit to the present version,\n    but may differ in detail to address new problems or concerns.\n\n    Each version is given a distinguishing version number.  If the Library\n    specifies a version number of this License which applies to it and\n    \"any later version\", you have the option of following the terms and\n    conditions either of that version or of any later version published by\n    the Free Software Foundation.  If the Library does not specify a\n    license version number, you may choose any version ever published by\n    the Free Software Foundation.\n    \f\n    14. If you wish to incorporate parts of the Library into other free\n    programs whose distribution conditions are incompatible with these,\n    write to the author to ask for permission.  
For software which is\n    copyrighted by the Free Software Foundation, write to the Free\n    Software Foundation; we sometimes make exceptions for this.  Our\n    decision will be guided by the two goals of preserving the free status\n    of all derivatives of our free software and of promoting the sharing\n    and reuse of software generally.\n\n                    NO WARRANTY\n\n    15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO\n    WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.\n    EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR\n    OTHER PARTIES PROVIDE THE LIBRARY \"AS IS\" WITHOUT WARRANTY OF ANY\n    KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n    PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE\n    LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME\n    THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n    16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN\n    WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY\n    AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU\n    FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR\n    CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE\n    LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING\n    RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A\n    FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF\n    SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH\n    DAMAGES.\n\n\t\t     END OF TERMS AND CONDITIONS\n\f\n           How to Apply These Terms to Your New Libraries\n\n    If you develop a new library, and you want it to be of the greatest\n    possible use to the public, we recommend making it free software that\n    everyone can redistribute and change.  
You can do so by permitting\n    redistribution under these terms (or, alternatively, under the terms of the\n    ordinary General Public License).\n\n    To apply these terms, attach the following notices to the library.  It is\n    safest to attach them to the start of each source file to most effectively\n    convey the exclusion of warranty; and each file should have at least the\n    \"copyright\" line and a pointer to where the full notice is found.\n\n        <one line to give the library's name and a brief idea of what it does.>\n        Copyright (C) <year>  <name of author>\n\n        This library is free software; you can redistribute it and/or\n        modify it under the terms of the GNU Lesser General Public\n        License as published by the Free Software Foundation; either\n        version 2.1 of the License, or (at your option) any later version.\n\n        This library is distributed in the hope that it will be useful,\n        but WITHOUT ANY WARRANTY; without even the implied warranty of\n        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n        Lesser General Public License for more details.\n\n        You should have received a copy of the GNU Lesser General Public\n        License along with this library; if not, write to the Free Software\n        Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA  02110-1335  USA\n\n    Also add information on how to contact you by electronic and paper mail.\n\n    You should also get your employer (if you work as a programmer) or your\n    school, if any, to sign a \"copyright disclaimer\" for the library, if\n    necessary.  
Here is a sample; alter the names:\n\n    Yoyodyne, Inc., hereby disclaims all copyright interest in the\n    library `Frob' (a library for tweaking knobs) written by James Random Hacker.\n\n    <signature of Ty Coon>, 1 April 1990\n    Ty Coon, President of Vice\n\nThat's all there is to it!\n\n    scipy\n\n    Copyright (c) 2001-2002 Enthought, Inc. 2003-2024, SciPy Developers.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions\n    are met:\n\n    1. Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above\n    copyright notice, this list of conditions and the following\n    disclaimer in the documentation and/or other materials provided\n    with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived\n    from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    smmap\n\n    Copyright (C) 2010, 2011 Sebastian Thiel and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without \n    modification, are permitted provided that the following conditions \n    are met:\n\n    * Redistributions of source code must retain the above copyright \n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright \n    notice, this list of conditions and the following disclaimer in the \n    documentation and/or other materials provided with the distribution.\n\n    * Neither the name of the async project nor the names of \n    its contributors may be used to endorse or promote products derived \n    from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT \n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR \n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT \n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, \n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED \n    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR \n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF \n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING \n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS \n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    Babel\n\n    Copyright (c) 2013-2023 by the Babel Team, see AUTHORS for more information.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions\n    are met:\n\n    1. Redistributions of source code must retain the above copyright\n        notice, this list of conditions and the following disclaimer.\n    2. Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in\n        the documentation and/or other materials provided with the\n        distribution.\n    3. Neither the name of the copyright holder nor the names of its\n        contributors may be used to endorse or promote products derived\n        from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    click\n\n    Copyright 2014 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    PySocks\n\n    Copyright 2006 Dan-Haim. All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n    3. Neither the name of Dan Haim nor the names of his contributors may be used\n    to endorse or promote products derived from this software without specific\n    prior written permission.\n    \n    THIS SOFTWARE IS PROVIDED BY DAN HAIM \"AS IS\" AND ANY EXPRESS OR IMPLIED\n    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\n    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO\n    EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,\n    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA\n    OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\n    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMANGE.\n\n    cloudpickle\n\n    This module was extracted from the `cloud` package, developed by\n    PiCloud, Inc.\n\n    Copyright (c) 2015, Cloudpickle contributors.\n    Copyright (c) 2012, Regents of the University of California.\n    Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions\n    are met:\n        * Redistributions of source code must retain the above copyright\n        notice, this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above copyright\n        notice, this list of conditions and the following disclaimer in the\n        documentation and/or other materials provided with the distribution.\n        * Neither the name of the University of California, Berkeley nor the\n        names of its contributors may be used to endorse or promote\n        products derived from this software without specific prior written\n        permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\n    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    pyasn1\n\n    Copyright (c) 2005-2020, Ilya Etingof <etingof@gmail.com> All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    numba\n\n    Copyright (c) 2012, Anaconda, Inc.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n    Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in the\n    documentation and/or other materials provided with the distribution.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    kiwisolver\n\n    =========================\n    The Kiwi licensing terms\n    =========================\n    Kiwi is licensed under the terms of the Modified BSD License (also known as\n    New or Revised BSD), as follows:\n\n    Copyright (c) 2013, Nucleic Development Team\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice, this\n    list of conditions and the following disclaimer in the documentation and/or\n    other materials provided with the distribution.\n\n    Neither the name of the Nucleic Development Team nor the names of its\n    contributors may be used to endorse or promote products derived from this\n    software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    About Kiwi\n    ----------\n    Chris Colbert began the Kiwi project in December 2013 in an effort to\n    create a blisteringly fast UI constraint solver. Chris is still the\n    project lead.\n\n    The Nucleic Development Team is the set of all contributors to the Nucleic\n    project and its subprojects.\n\n    The core team that coordinates development on GitHub can be found here:\n    http://github.com/nucleic. The current team consists of:\n\n    * Chris Colbert\n\n    Our Copyright Policy\n    --------------------\n    Nucleic uses a shared copyright model. Each contributor maintains copyright\n    over their contributions to Nucleic. But, it is important to note that these\n    contributions are typically only changes to the repositories. Thus, the Nucleic\n    source code, in its entirety is not the copyright of any single person or\n    institution. Instead, it is the collective copyright of the entire Nucleic\n    Development Team. 
If individual contributors want to maintain a record of what\n    changes/contributions they have specific copyright on, they should indicate\n    their copyright in the commit message of the change, when they commit the\n    change to one of the Nucleic repositories.\n\n    With this in mind, the following banner should be used in any source code file\n    to indicate the copyright and license terms:\n\n    #------------------------------------------------------------------------------\n    # Copyright (c) 2013, Nucleic Development Team.\n    #\n    # Distributed under the terms of the Modified BSD License.\n    #\n    # The full license is in the file LICENSE, distributed with this software.\n    #------------------------------------------------------------------------------\n\n    locket\n\n    Copyright (c) 2012, Michael Williamson\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    protobuf\n\n    Copyright 2021 Google Inc.  All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n        * Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above\n    copyright notice, this list of conditions and the following disclaimer\n    in the documentation and/or other materials provided with the\n    distribution.\n        * Neither the name of Google Inc. nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    Code generated by the Protocol Buffer compiler is owned by the owner\n    of the input file used when generating it.  This code is not\n    standalone and requires a support library to be linked with it.  This\n    support library is itself covered by the above license.\n\n    dask-mpi\n\n    Copyright (c) 2018, Anaconda, Inc. and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice,\n    this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    Neither the name of Anaconda nor the names of any contributors may be used to\n    endorse or promote products derived from this software without specific prior\n    written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n    ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\n    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n    THE POSSIBILITY OF SUCH DAMAGE.\n\n    nodeenv\n\n    Copyright (c) 2011, Eugene Kalinin.\n\n    Some rights reserved.\n\n    Redistribution and use in source and binary forms of the software as well\n    as documentation, with or without modification, are permitted provided\n    that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above\n    copyright notice, this list of conditions and the following\n    disclaimer in the documentation and/or other materials provided\n    with the distribution.\n\n    * The names of the contributors may not be used to endorse or\n    promote products derived from this software without specific\n    prior written permission.\n\n    THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND\n    CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT\n    NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER\n    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH\n    DAMAGE.\n\n    networkx\n\n    Copyright (C) 2004-2024, NetworkX Developers\n    Aric Hagberg <hagberg@lanl.gov>\n    Dan Schult <dschult@colgate.edu>\n    Pieter Swart <swart@lanl.gov>\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n     * Redistributions of source code must retain the above copyright\n       notice, this list of conditions and the following disclaimer.\n\n     * Redistributions in binary form must reproduce the above\n       copyright notice, this list of conditions and the following\n       disclaimer in the documentation and/or other materials provided\n       with the distribution.\n\n     * Neither the name of the NetworkX Developers nor the names of its\n       contributors may be used to endorse or promote products derived\n       from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    python-dateutil\n\n    Copyright 2017- Paul Ganssle <paul@ganssle.io>\n    Copyright 2017- dateutil contributors (see AUTHORS file)\n\n    Licensed under the Apache License, Version 2.0 (the \"License\");\n    you may not use this file except in compliance with the License.\n    You may obtain a copy of the License at\n\n        http://www.apache.org/licenses/LICENSE-2.0\n\n    Unless required by applicable law or agreed to in writing, software\n    distributed under the License is distributed on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n    See the License for the specific language governing permissions and\n    limitations under the License.\n\n    The above license applies to all contributions after 2017-12-01, as well as\n    all contributions that have been re-licensed (see AUTHORS file for the list of\n    contributors who have re-licensed their code).\n    --------------------------------------------------------------------------------\n    dateutil - Extensions to the standard Python datetime module.\n\n    Copyright (c) 2003-2011 - Gustavo Niemeyer <gustavo@niemeyer.net>\n    Copyright (c) 2012-2014 - Tomi Pieviläinen <tomi.pievilainen@iki.fi>\n    Copyright (c) 2014-2016 - Yaron de Leeuw <me@jarondl.net>\n    Copyright (c) 2015-     - Paul Ganssle <paul@ganssle.io>\n    Copyright (c) 2015-     - dateutil contributors (see AUTHORS file)\n\n    All rights 
reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n        * Redistributions of source code must retain the above copyright notice,\n        this list of conditions and the following disclaimer.\n        * Redistributions in binary form must reproduce the above copyright notice,\n        this list of conditions and the following disclaimer in the documentation\n        and/or other materials provided with the distribution.\n        * Neither the name of the copyright holder nor the names of its\n        contributors may be used to endorse or promote products derived from\n        this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    The above BSD License Applies to all code, even that also covered by Apache 2.0.\n\n    colorama\n\n    Copyright (c) 2010 Jonathan Hartley\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holders, nor those of its contributors\n    may be used to endorse or promote products derived from this software without\n    specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    mkdocs-plugin-inline-svg\n\n    Copyright (c) 2021 Craig Roberts\n    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n    The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n\n    Jinja2\n\n    Copyright 2007 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    numpy\n\n    Copyright (c) 2005-2024, NumPy Developers.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n        * Redistributions of source code must retain the above copyright\n        notice, this list of conditions and the following disclaimer.\n\n        * Redistributions in binary form must reproduce the above\n        copyright notice, this list of conditions and the following\n        disclaimer in the documentation and/or other materials provided\n        with the distribution.\n\n        * Neither the name of the NumPy Developers nor the names of any\n        contributors may be used to endorse or promote products derived\n        from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    fsspec\n\n    BSD 3-Clause License\n\n    Copyright (c) 2018, Martin Durant\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    spacy-loggers\n\n    MIT License\n\n    Copyright (c) 2021 ExplosionAI GmbH\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n    SOFTWARE.\n\n    Werkzeug\n\n    Copyright 2007 Pallets\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \n\n    mkdocs\n\n    Copyright © 2014-present, Tom Christie. 
All rights reserved.\n\n    Redistribution and use in source and binary forms, with or\n    without modification, are permitted provided that the following\n    conditions are met:\n\n    Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n    Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in\n    the documentation and/or other materials provided with the\n    distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND\n    CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES,\n    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\n    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR\n    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\n    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED\n    AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\n    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN\n    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n    POSSIBILITY OF SUCH DAMAGE.\n\n    gitdb\n\n    Copyright (C) 2010, 2011 Sebastian Thiel and contributors\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without \n    modification, are permitted provided that the following conditions \n    are met:\n\n    * Redistributions of source code must retain the above copyright \n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright \n    notice, this list of conditions and the following disclaimer in the \n    documentation and/or other materials provided with the distribution.\n\n 
   * Neither the name of the GitDB project nor the names of \n    its contributors may be used to endorse or promote products derived \n    from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT \n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR \n    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT \n    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, \n    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED \n    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR \n    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF \n    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING \n    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS \n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n\n    Additional Licenses\n    -------------------\n    The files at \n    gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx\n    and \n    gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack \n    are licensed under GNU GPL as part of the git source repository, \n    see http://en.wikipedia.org/wiki/Git_%28software%29 for more information.\n\n    They are not required for the actual operation, which is why they are not found \n    in the distribution package.\n\n    traitlets\n\n    - Copyright (c) 2001-, IPython Development Team\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. 
Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    psutil\n\n    Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the psutil authors nor the names of its contributors\n    may be used to endorse or promote products derived 
from this software without\n    specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\n    ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    ipykernel\n\n    Copyright (c) 2015, IPython Development Team\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    comm\n\n    BSD 3-Clause License\n\n    Copyright (c) 2022, Jupyter\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    pexpect\n\n    This license is approved by the OSI and FSF as GPL-compatible.\n        http://opensource.org/licenses/isc-license.txt\n\n    Copyright (c) 2013-2014, Pexpect development team\n    Copyright (c) 2012, Noah Spurrier <noah@noah.org>\n\n    Permission to use, copy, modify, and/or distribute this software for any\n    purpose with or without fee is hereby granted, provided that the above\n    copyright notice and this permission notice appear in all copies.\n    \n    THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES\n    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF\n    MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR\n    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN\n    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF\n    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n\n    pyzmq\n\n    BSD 3-Clause License\n\n    Copyright (c) 2009-2012, Brian Granger, Min Ragan-Kelley\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n    Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    matplotlib-inline\n\n    BSD 3-Clause License\n\n    Copyright (c) 2019-2022, IPython Development Team.\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    backcall\n\n    Copyright (c) 2014, Thomas Kluyver\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice, this\n    list of conditions and the following disclaimer in the documentation and/or\n    other materials provided with the distribution.\n\n    * Neither the name of the {organization} nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\n    ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    jupyter_client\n\n    BSD 3-Clause License\n\n    - Copyright (c) 2001-2015, IPython Development Team\n    - Copyright (c) 2015-, Jupyter Development Team\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    nest-asyncio\n\n    BSD 2-Clause License\n\n    Copyright (c) 2018-2020, Ewald de Wit\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    ipython\n\n    BSD 3-Clause License\n\n    - Copyright (c) 2008-Present, IPython Development Team\n    - Copyright (c) 2001-2007, Fernando Perez <fernando.perez@colorado.edu>\n    - Copyright (c) 2001, Janko Hauser <jhauser@zscout.de>\n    - Copyright (c) 2001, Nathaniel Gray <n8gray@caltech.edu>\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    * Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    jupyter_core\n\n    BSD 3-Clause License\n\n    - Copyright (c) 2015-, Jupyter Development Team\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    1. Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    2. Redistributions in binary form must reproduce the above copyright notice,\n    this list of conditions and the following disclaimer in the documentation\n    and/or other materials provided with the distribution.\n\n    3. Neither the name of the copyright holder nor the names of its\n    contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    asttokens\n\n    Apache License\n    Version 2.0, January 2004\n    http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. 
For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. 
For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"{}\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright {yyyy} {name of copyright owner}\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n    debugpy\n\n    Copyright (c) Microsoft Corporation\n    All rights reserved.\n\n    MIT License\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy of\n    this software and associated documentation files (the \"Software\"), to deal in\n    the Software without restriction, including without limitation the rights to\n    use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of\n    the Software, and to permit persons to whom the Software is furnished to do so,\n    subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in all\n    copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS\n    FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR\n    COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER\n    IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\n    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n\n    ptyprocess\n\n    ISC LICENSE\n\n    This license is approved by the OSI and FSF as GPL-compatible.\n        http://opensource.org/licenses/isc-license.txt\n\n    Copyright (c) 2013-2014, Pexpect development team\n    Copyright (c) 2012, Noah Spurrier <noah@noah.org>\n\n    Permission to use, copy, modify, and/or distribute this software for any\n    purpose with or without fee is hereby granted, provided that the above\n    copyright notice and this permission notice appear in all copies.\n    \n    THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES\n    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF\n    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR\n    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN\n    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF\n    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n\n    prompt-toolkit\n\n    Copyright (c) 2014, Jonathan Slenders\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without modification,\n    are permitted provided that the following conditions are met:\n\n    * Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright notice, this\n    list of conditions and the following disclaimer in the documentation and/or\n    other materials provided with the distribution.\n\n    * Neither the name of the {organization} nor the names of its\n    
contributors may be used to endorse or promote products derived from\n    this software without specific prior written permission.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR\n    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON\n    ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    decorator\n\n    Copyright (c) 2005-2018, Michele Simionato\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are\n    met:\n\n    * Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n    notice, this list of conditions and the following disclaimer in\n    the documentation and/or other materials provided with the\n    distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n    \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n    A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n    HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,\n    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS\n    OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR\n    TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\n    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH\n    DAMAGE.\n\n    appnope\n\n    this project is licensed under 2-clause BSD\n\n    Copyright (c) 2013, Min Ragan-Kelley\n\n    All rights reserved.\n\n    Redistribution and use in source and binary forms, with or without\n    modification, are permitted provided that the following conditions are met:\n\n    Redistributions of source code must retain the above copyright notice, this\n    list of conditions and the following disclaimer.\n\n    Redistributions in binary form must reproduce the above copyright notice, this\n    list of conditions and the following disclaimer in the documentation and/or\n    other materials provided with the distribution.\n\n    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n    DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n    HuggingFace\n\n    Apache License\n    Version 2.0, January 2004\n    http://www.apache.org/licenses/\n\n    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n    Definitions.\n\n    \"License\" shall mean the terms and conditions for use, reproduction,\n    and distribution as defined by Sections 1 through 9 of this document.\n\n    \"Licensor\" shall mean the copyright owner or entity authorized by\n    the copyright owner that is granting the License.\n\n    \"Legal Entity\" shall mean the union of the acting entity and all\n    other entities that control, are controlled by, or are under common\n    control with that entity. 
For the purposes of this definition,\n    \"control\" means (i) the power, direct or indirect, to cause the\n    direction or management of such entity, whether by contract or\n    otherwise, or (ii) ownership of fifty percent (50%) or more of the\n    outstanding shares, or (iii) beneficial ownership of such entity.\n\n    \"You\" (or \"Your\") shall mean an individual or Legal Entity\n    exercising permissions granted by this License.\n\n    \"Source\" form shall mean the preferred form for making modifications,\n    including but not limited to software source code, documentation\n    source, and configuration files.\n\n    \"Object\" form shall mean any form resulting from mechanical\n    transformation or translation of a Source form, including but\n    not limited to compiled object code, generated documentation,\n    and conversions to other media types.\n\n    \"Work\" shall mean the work of authorship, whether in Source or\n    Object form, made available under the License, as indicated by a\n    copyright notice that is included in or attached to the work\n    (an example is provided in the Appendix below).\n\n    \"Derivative Works\" shall mean any work, whether in Source or Object\n    form, that is based on (or derived from) the Work and for which the\n    editorial revisions, annotations, elaborations, or other modifications\n    represent, as a whole, an original work of authorship. 
For the purposes\n    of this License, Derivative Works shall not include works that remain\n    separable from, or merely link (or bind by name) to the interfaces of,\n    the Work and Derivative Works thereof.\n\n    \"Contribution\" shall mean any work of authorship, including\n    the original version of the Work and any modifications or additions\n    to that Work or Derivative Works thereof, that is intentionally\n    submitted to Licensor for inclusion in the Work by the copyright owner\n    or by an individual or Legal Entity authorized to submit on behalf of\n    the copyright owner. For the purposes of this definition, \"submitted\"\n    means any form of electronic, verbal, or written communication sent\n    to the Licensor or its representatives, including but not limited to\n    communication on electronic mailing lists, source code control systems,\n    and issue tracking systems that are managed by, or on behalf of, the\n    Licensor for the purpose of discussing and improving the Work, but\n    excluding communication that is conspicuously marked or otherwise\n    designated in writing by the copyright owner as \"Not a Contribution.\"\n\n    \"Contributor\" shall mean Licensor and any individual or Legal Entity\n    on behalf of whom a Contribution has been received by Licensor and\n    subsequently incorporated within the Work.\n\n    Grant of Copyright License. Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    copyright license to reproduce, prepare Derivative Works of,\n    publicly display, publicly perform, sublicense, and distribute the\n    Work and such Derivative Works in Source or Object form.\n\n    Grant of Patent License. 
Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    (except as stated in this section) patent license to make, have made,\n    use, offer to sell, sell, import, and otherwise transfer the Work,\n    where such license applies only to those patent claims licensable\n    by such Contributor that are necessarily infringed by their\n    Contribution(s) alone or by combination of their Contribution(s)\n    with the Work to which such Contribution(s) was submitted. If You\n    institute patent litigation against any entity (including a\n    cross-claim or counterclaim in a lawsuit) alleging that the Work\n    or a Contribution incorporated within the Work constitutes direct\n    or contributory patent infringement, then any patent licenses\n    granted to You under this License for that Work shall terminate\n    as of the date such litigation is filed.\n\n    Redistribution. 
You may reproduce and distribute copies of the\n    Work or Derivative Works thereof in any medium, with or without\n    modifications, and in Source or Object form, provided that You\n    meet the following conditions:\n\n    (a) You must give any other recipients of the Work or\n    Derivative Works a copy of this License; and\n\n    (b) You must cause any modified files to carry prominent notices\n    stating that You changed the files; and\n\n    (c) You must retain, in the Source form of any Derivative Works\n    that You distribute, all copyright, patent, trademark, and\n    attribution notices from the Source form of the Work,\n    excluding those notices that do not pertain to any part of\n    the Derivative Works; and\n\n    (d) If the Work includes a \"NOTICE\" text file as part of its\n    distribution, then any Derivative Works that You distribute must\n    include a readable copy of the attribution notices contained\n    within such NOTICE file, excluding those notices that do not\n    pertain to any part of the Derivative Works, in at least one\n    of the following places: within a NOTICE text file distributed\n    as part of the Derivative Works; within the Source form or\n    documentation, if provided along with the Derivative Works; or,\n    within a display generated by the Derivative Works, if and\n    wherever such third-party notices normally appear. The contents\n    of the NOTICE file are for informational purposes only and\n    do not modify the License. 
You may add Your own attribution\n    notices within Derivative Works that You distribute, alongside\n    or as an addendum to the NOTICE text from the Work, provided\n    that such additional attribution notices cannot be construed\n    as modifying the License.\n\n    You may add Your own copyright statement to Your modifications and\n    may provide additional or different license terms and conditions\n    for use, reproduction, or distribution of Your modifications, or\n    for any such Derivative Works as a whole, provided Your use,\n    reproduction, and distribution of the Work otherwise complies with\n    the conditions stated in this License.\n\n    Submission of Contributions. Unless You explicitly state otherwise,\n    any Contribution intentionally submitted for inclusion in the Work\n    by You to the Licensor shall be under the terms and conditions of\n    this License, without any additional terms or conditions.\n    Notwithstanding the above, nothing herein shall supersede or modify\n    the terms of any separate license agreement you may have executed\n    with Licensor regarding such Contributions.\n\n    Trademarks. This License does not grant permission to use the trade\n    names, trademarks, service marks, or product names of the Licensor,\n    except as required for reasonable and customary use in describing the\n    origin of the Work and reproducing the content of the NOTICE file.\n\n    Disclaimer of Warranty. Unless required by applicable law or\n    agreed to in writing, Licensor provides the Work (and each\n    Contributor provides its Contributions) on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n    implied, including, without limitation, any warranties or conditions\n    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n    PARTICULAR PURPOSE. 
You are solely responsible for determining the\n    appropriateness of using or redistributing the Work and assume any\n    risks associated with Your exercise of permissions under this License.\n\n    Limitation of Liability. In no event and under no legal theory,\n    whether in tort (including negligence), contract, or otherwise,\n    unless required by applicable law (such as deliberate and grossly\n    negligent acts) or agreed to in writing, shall any Contributor be\n    liable to You for damages, including any direct, indirect, special,\n    incidental, or consequential damages of any character arising as a\n    result of this License or out of the use or inability to use the\n    Work (including but not limited to damages for loss of goodwill,\n    work stoppage, computer failure or malfunction, or any and all\n    other commercial damages or losses), even if such Contributor\n    has been advised of the possibility of such damages.\n\n    Accepting Warranty or Additional Liability. While redistributing\n    the Work or Derivative Works thereof, You may choose to offer,\n    and charge a fee for, acceptance of support, warranty, indemnity,\n    or other liability obligations and/or rights consistent with this\n    License. However, in accepting such obligations, You may act only\n    on Your own behalf and on Your sole responsibility, not on behalf\n    of any other Contributor, and only if You agree to indemnify,\n    defend, and hold each Contributor harmless for any liability\n    incurred by, or claims asserted against, such Contributor by reason\n    of your accepting any such warranty or additional liability.\n\n    END OF TERMS AND CONDITIONS\n\n    APPENDIX: How to apply the Apache License to your work.\n\n    To apply the Apache License to your work, attach the following\n    boilerplate notice, with the fields enclosed by brackets \"[]\"\n    replaced with your own identifying information. (Don't include\n    the brackets!)  
The text should be enclosed in the appropriate\n    comment syntax for the file format. We also recommend that a\n    file or class name and description of purpose be included on the\n    same \"printed page\" as the copyright notice for easier\n    identification within third-party archives.\n    Copyright [yyyy] [name of copyright owner]\n\n    Licensed under the Apache License, Version 2.0 (the \"License\");\n    you may not use this file except in compliance with the License.\n    You may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n    Unless required by applicable law or agreed to in writing, software\n    distributed under the License is distributed on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n    See the License for the specific language governing permissions and\n    limitations under the License.\n\n    spaCy\n    The MIT License (MIT)\n\n    Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal\n\n    Permission is hereby granted, free of charge, to any person obtaining a copy\n    of this software and associated documentation files (the \"Software\"), to deal\n    in the Software without restriction, including without limitation the rights\n    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n    copies of the Software, and to permit persons to whom the Software is\n    furnished to do so, subject to the following conditions:\n\n    The above copyright notice and this permission notice shall be included in\n    all copies or substantial portions of the Software.\n\n    THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n    THE SOFTWARE.\n\n"
  },
  {
    "path": "Makefile",
    "content": ".PHONY: all index qnagen query eval help azureml clear_docs clear_artifacts test flake updatekv\n.DEFAULT_GOAL := help\n\n# Load .env file if exists and export all variables before running any target\nENV_FILE := .env\nifeq ($(filter $(MAKECMDGOALS),config clean),)\n\tifneq ($(strip $(wildcard $(ENV_FILE))),)\n\t\tifneq ($(MAKECMDGOALS),config)\n\t\t\tinclude $(ENV_FILE)\n\t\t\texport\n\t\tendif\n\tendif\nendif\n\nSHELL := /bin/bash\ntarget_title = @echo -e \"\\n\\e[34m»»» 🧩 \\e[96m$(1)\\e[0m...\"\n\nhelp: ## 💬 This help message :)\n\t@grep -E '[a-zA-Z_-]+:.*?## .*$$' $(firstword $(MAKEFILE_LIST)) | awk 'BEGIN {FS = \":.*?## \"}; {printf \"\\033[36m%-25s\\033[0m %s\\n\", $$1, $$2}'\n\nall: index qnagen query eval ## 🔠 Run all steps in sequence: load_env index qnagen query eval\nquery_eval: query eval ## 🔍👓 Run query and eval steps in sequence: load_env query eval\n\nload_env: ## 📃 Load .env file\n\t$(call target_title, \"loading env file\") \\\n\t&& source .env\n\nindex: ## 📚 Index documents (download documents from blob storage, split to chunks, generate embeddings, create and upload to azure search index)\n\t$(call target_title, \"indexing\")\n\tpython3 01_index.py $(if $(dd),--data_dir $(dd), --data_dir ./data) $(if $(cp),--config_path $(cp))\n\nqnagen: ## ❓ Generate questions and answers for all document chunks in configured index\n\t$(call target_title, \"question and answer generation\")\n\tpython3 02_qa_generation.py $(if $(dd),--data_dir $(dd), --data_dir ./data) $(if $(cp),--config_path $(cp))\n\nquery: ## 🔍 Query the index for all questions in jsonl file configured in config.json and generate answers using LLM\n\t$(call target_title, \"querying\")\n\tpython3 03_querying.py $(if $(dd),--data_dir $(dd), --data_dir ./data) $(if $(cp),--config_path $(cp))\n\neval: ## 👓 Evaluate metrics for all answers compared to ground truth\n\t$(call target_title, \"evaluating\")\n\tpython3 04_evaluation.py $(if $(dd),--data_dir $(dd), --data_dir ./data) 
$(if $(cp),--config_path $(cp))\n\n\nazureml: ## 🚀 Run all steps in sequence on Azure ML\n\t$(call target_title, \"running on Azure ML\")\n\tpython3 azureml/pipeline.py $(if $(dd),--data_dir $(dd), --data_dir ./data)  $(if $(cp),--config_path $(cp), --config_path ./config.json)\n\n\nclear_docs: ## ❌ Delete all downloaded documents from data folder\n\t$(call target_title, \"deleting all downloaded documents from data folder\")\n\trm -rf data\n\nclear_artifacts: ## ❌ Delete all document chunks, index data and evaluation scores from artifacts folder\n\t$(call target_title, \"clearing artifacts folder\") \\\n\t&& rm -rf ./artifacts/docs_data \\\n\t&& rm -rf ./artifacts/eval_score \\\n\t&& rm -rf ./artifacts/index_data \\\n\t&& rm -rf ./artifacts/outputs\n\nclear_checkpoints: ## ❌ Delete all checkpoints data\n\t$(call target_title, \"clearing checkpoints\") \\\n\t&& rm -rf ./artifacts/checkpoints \\\n\ntest: ## 🧪 Run tests\n\t$(call target_title, \"running tests\")\n\tpytest . --cov=. --cov-report=html --cov-config=.coveragerc\n\nflake: ## 🧹 Run flake8\n\t$(call target_title, \"running flake8\")\n\tflake8 --extend-ignore=E501\n\nupdatekv: ## 🔄 Update keyvault secrets\n\t$(call target_title, \"updating keyvault secrets\")\n\tpython3 env_to_keyvault.py\n"
  },
  {
    "path": "README.md",
    "content": "# RAG Experiment Accelerator\n\n## Overview\n\nThe **RAG Experiment Accelerator** is a versatile tool that helps you conduct experiments and evaluations using Azure AI Search and RAG pattern. This document provides a comprehensive guide that covers everything you need to know about this tool, such as its purpose, features, installation, usage, and more.\n\n## Purpose\n\nThe main goal of the **RAG Experiment Accelerator** is to make it easier and faster to run experiments and evaluations of search queries and quality of response from OpenAI. This tool is useful for researchers, data scientists, and developers who want to:\n\n- Test the performance of different Search and OpenAI related hyperparameters.\n- Compare the effectiveness of various search strategies.\n- Fine-tune and optimize parameters.\n- Find the best combination of hyperparameters.\n- Generate detailed reports and visualizations from experiment results.\n\n## Latest changes\n\n18 March 2024: Content sampling has been added. This functionality will allow the dataset to be sampled by a specified percentage. The data is clustered by content and then the sample percentage is taken across each cluster to attempt even distribution of the sampled data.\n\nThis is done to ensure representative results in the sample that one would get across the entire dataset.\n\n**Note**: It is recommended to rebuild your environment if you have used this tool before due to new dependencies.\n\n## Features\n\nThe **RAG Experiment Accelerator** is config driven and offers a rich set of features to support its purpose:\n\n1. **Experiment Setup**: You can define and configure experiments by specifying a range of search engine parameters, search types, query sets, and evaluation metrics.\n\n1. **Integration**: It integrates seamlessly with Azure AI Search, Azure Machine Learning, MLFlow and Azure OpenAI.\n\n1. 
**Rich Search Index**: It creates multiple search indexes based on hyperparameter configurations available in the config file.\n\n1. **Multiple Document Loaders**: The tool supports multiple document loaders, including loading via Azure Document Intelligence and basic LangChain loaders. This gives you the flexibility to experiment with different extraction methods and evaluate their effectiveness.\n\n1. **Custom Document Intelligence Loader**: When selecting the 'prebuilt-layout' API model for Document Intelligence, the tool utilizes a custom Document Intelligence loader to load the data. This custom loader supports formatting of tables with column headers into key-value pairs (to enhance readability for the LLM), excludes irrelevant parts of the file for the LLM (such as page numbers and footers), removes recurring patterns in the file using regex, and more. Since each table row is transformed into a text line, to avoid breaking a row in the middle, chunking is done recursively by paragraph and line.\nThe custom loader resorts to the simpler 'prebuilt-layout' API model as a fallback when the 'prebuilt-layout' fails. Any other API model will utilize LangChain's implementation, which returns the raw response from Document Intelligence's API.\n\n1. **Query Generation**: The tool can generate a variety of diverse and customizable query sets, which can be tailored for specific experimentation needs.\n\n1. **Multiple Search Types**: It supports multiple search types, including pure text, pure vector, cross-vector, multi-vector, hybrid, and more. This gives you the ability to conduct comprehensive analysis on search capabilities and results.\n\n1. **Sub-Querying**: The pattern evaluates the user query and if it finds it complex enough, it breaks it down into smaller sub-queries to generate relevant context.\n\n1. 
**Re-Ranking**: The query responses from Azure AI Search are re-evaluated using LLM and ranked according to the relevance between the query and the context.\n\n1. **Metrics and Evaluation**: It supports end-to-end metrics comparing the generated answers (actual) against the ground-truth answers (expected), including distance-based, cosine and semantic similarity metrics. It also includes component-based metrics to assess retrieval and generation performance using LLMs as judges, such as context recall or answer relevance, as well as retrieval metrics to assess search results (e.g. MAP@k).\n\n1. **Report Generation**: The **RAG Experiment Accelerator** automates the process of report generation, complete with visualizations that make it easy to analyze and share experiment findings.\n\n1. **Multi-Lingual**: The tool supports language analyzers for linguistic support on individual languages and specialized (language-agnostic) analyzers for user-defined patterns on search indexes. For more information, see [Types of Analyzers](https://learn.microsoft.com/en-us/azure/search/search-analyzers#types-of-analyzers).\n\n1. **Sampling**: If you have a large dataset and/or want to speed up the experimentation, a sampling process is available to create a small but representative sample of the data for the percentage specified. The data will be clustered by content and a percentage of each cluster will be selected as part of the sample. Results obtained should be roughly indicative of the full dataset within a ~10% margin. 
Once an approach has been identified, running on the full dataset is recommended for accurate results.\n\n## Products used\n\n- [Azure AI Search Service](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal) (Note: [Semantic Search](https://learn.microsoft.com/en-us/azure/search/search-get-started-semantic?tabs=dotnet) is available in Azure AI Search, at Basic tier or higher.)\n- [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#how-do-i-get-access-to-azure-openai) or access to the [OpenAI API](https://platform.openai.com/docs/quickstart?context=python)\n- [Azure Machine Learning Resources](https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-azure-ml-in-a-day?view=azureml-api-2)\n\n## Compute setup\n\nAt the moment, the RAG Experiment Accelerator can be run locally leveraging one of the following:\n\n* [Run within a development container](#1-run-within-a-development-container)\n* [Install locally on your host machine](#2-local-install).\n\n### 1. Run within a Development Container\n\nUsing a development container will mean that all of the required software is installed for you. This will require WSL. For more information about development containers visit [containers.dev](https://containers.dev/)\n\n#### Install the Pre-Requisite Software\n\nInstall the following software on the host machine you will perform the deployment from:\n\n>1. For Windows - [Windows Store Ubuntu 22.04.3 LTS](https://www.microsoft.com/store/productId/9pn20msr04dw)\n>2. [Docker Desktop](https://www.docker.com/products/docker-desktop)\n>3. [Visual Studio Code](https://visualstudio.microsoft.com/downloads/)\n>4. [VS Code Extension: Remote-Containers](vscode:extension/ms-vscode-remote.remote-containers)\n\n#### Developing in a DevContainer\n\nFurther guidance of setting up WSL can be found [here](./docs/wsl.md). Now you have the prerequisites, you can:\n1. 
**Clone the repository**: Clone the accelerator's repository in your WSL terminal.\n\n```bash\ngit clone https://github.com/microsoft/rag-experiment-accelerator.git\ncode .\n```\n\nOnce the project opens in vscode it should ask you if you would like to \"Reopen this in a development container\". Say yes.\n\n---\n\n### 2. Local install\n\nYou can of course run the **RAG Experiment Accelerator** on a Windows/Mac machine if you like; you are responsible for installing the correct tooling. Follow these installation steps:\n\n1. **Clone the repository**: Clone the accelerator's repository from [GitHub].\n\n```bash\ngit clone https://github.com/microsoft/rag-experiment-accelerator.git\n```\n\n2. Execute the requirements.txt in a conda (first install Anaconda/Miniconda) or virtual environment (then install a couple of dependencies - prompted on the run) to install the dependencies.\n\n```bash\nconda create -n rag-experiment python=3.11\nconda init bash\n```\nClose your terminal, open a new one, and run:\n```bash\nconda activate rag-experiment\npip install .\n```\n\n3. Install Azure CLI and authorize:\n```bash\naz login\naz account set  --subscription=\"<your_subscription_guid>\"\naz account show\n```\n\n## Provision Infrastructure\n\nThere are 3 options to install all the required azure services:\n\n### 1. Install with Azure Developer CLI\nThis project supports Azure Developer CLI.\n```sh\nazd provision\n```\n- You can also use `azd up` if you prefer as this calls `azd provision` anyway\n- Please use the up/down arrows to select your Subscription and Region\n\n![azd](./docs/azd.png)\n\nOnce this has completed you can use the launch configuration to run, or debug the 4 steps and the current environment provisioned by `azd` will be loaded with the correct values.\n\n![alt text](./docs/launch.png)\n\n### 2. 
One click Azure deployment via Azure portal UI\n\nIf you want to deploy the infrastructure yourself from template you can also click here:\n\n[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2Frag-experiment-accelerator%2Fdevelopment%2Finfra%2Fmain.json)\n\n### 3. Deploy with Azure CLI\n\nIf you don't want to use `azd` you can use the normal `az` cli too.\n\nUse following command to deploy.\n\n```bash\naz login\naz deployment sub create --subscription <subscription-id> --location <location>  --template-file infra/main.bicep\n```\n\nOr\n\nTo deploy with isolated network use following command. Replace the parameter values with the specifics of your isolated network. You **must** supply all three parameters (i.e. `vnetAddressSpace`, `proxySubnetAddressSpace` and `subnetAddressSpace`) if you wish to deploy to an isolated network.\n\n```bash\naz login\naz deployment sub create --location <location> --template-file infra/main.bicep \\\n--parameters vnetAddressSpace=<vnet-address-space> \\\n--parameters proxySubnetAddressSpace=<proxy-subnet-address-space> \\\n--parameters subnetAddressSpace=<azure-subnet-address-space>\n```\n\nhere is an example with parameter values:\n\n```bash\naz deployment sub create --location uksouth --template-file infra/main.bicep \\\n--parameters vnetAddressSpace='10.0.0.0/16' \\\n--parameters proxySubnetAddressSpace='10.0.1.0/24' \\\n--parameters subnetAddressSpace='10.0.2.0/24'\n```\n\n## How to use\n\nTo use the **RAG Experiment Accelerator** locally, follow these steps:\n\n1. Copy the provided `.env.template` file to a file named `.env` and update all of the [required values](./docs/environment-variables.md). 
Many of the required values for the `.env` file will come from resources which have previously been configured and/or can be gathered from resources provisioned in the [Provision Infrastructure](#provision-infrastructure) section. Also note, by default, `LOGGING_LEVEL` is set to `INFO` but can be changed to any of the following levels: `NOTSET`, `DEBUG`, `INFO`, `WARN`, `ERROR`, `CRITICAL`.\n    ```bash\n    cp .env.template .env\n    # change parameters manually\n    ```\n1. Copy the provided `config.sample.json` file to a file named `config.json` and change any hyperparameters to tailor to your experiment.\n    ```bash\n    cp config.sample.json config.json\n    # change parameters manually\n    ```\n1. Copy any files for ingestion (PDF, HTML, Markdown, Text, JSON or DOCX format) into the `data` folder.\n\n1. Run `01_index.py` (python 01_index.py) to create Azure AI Search indexes and load data into them.\n    ```bash\n    python 01_index.py\n    -d \"The directory holding the configuration files and data. Defaults to current working directory\"\n    -dd \"The directory holding the data. Defaults to data\"\n    -cf \"JSON config filename. Defaults to config.json\"\n    ```\n1. Run `02_qa_generation.py` (python 02_qa_generation.py) to generate question-answer pairs using Azure OpenAI.\n    ```bash\n    python 02_qa_generation.py\n    -d \"The directory holding the configuration files and data. Defaults to current working directory\"\n    -dd \"The directory holding the data. Defaults to data\"\n    -cf \"JSON config filename. Defaults to config.json\"\n    ```\n1. Run `03_querying.py` (python 03_querying.py) to query Azure AI Search to generate context, re-rank items in context, and get response from Azure OpenAI using the new context.\n    ```bash\n    python 03_querying.py\n    -d \"The directory holding the configuration files and data. Defaults to current working directory\"\n    -cf \"JSON config filename. Defaults to config.json\"\n    ```\n1. 
Run `04_evaluation.py` (python 04_evaluation.py) to calculate metrics using various methods and generate charts and reports in Azure Machine Learning using MLFlow integration.\n    ```bash\n    python 04_evaluation.py\n    -d \"The directory holding the configuration files and data. Defaults to current working directory\"\n    -cf \"JSON config filename. Defaults to config.json\"\n    ```\n\nAlternatively, you can run the above steps (apart from `02_qa_generation.py`) using an Azure ML pipeline. To do so, follow [the guide here](./docs/azureml-pipeline.md).\n\n### Running with sampling\n\nSampling will be run locally to create a small but representative slice of the data. This helps with rapid experimentation and keeps costs down. Results obtained should be roughly indicative of the full dataset within a ~10% margin. Once an approach has been identified, running on the full dataset is recommended for accurate results.\n\n**Note**: Sampling can only be run locally, at this stage it is not supported on a distributed AML compute cluster. 
So the process would be to run sampling locally and then use the generated sample dataset to run on AML.\n\nIf you have a very large dataset and want to run a similar approach to sample the data, you can use the pyspark in-memory distributed implementation in the [Data Discovery Toolkit](https://github.com/microsoft/Data-Discovery-Toolkit) for [Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/get-started/microsoft-fabric-overview) or [Azure Synapse Analytics](https://learn.microsoft.com/en-gb/azure/synapse-analytics/).\n\n#### Available sampling parameters in the config.json file\n\n```json\n    \"sampling\": {\n        \"sample_data\": \"Set to true to enable sampling\",\n        \"only_run_sampling\": \"If set to true, this will only run the sampling step and will not create an index or any subsequent steps, use this if you want to build a small sampled dataset to run in AML\",\n        \"sample_percentage\": \"Percentage of the document corpus to sample\",\n        \"optimum_k\": \"Set to 'auto' to automatically determine the optimum cluster number or set to a specific value e.g. 15\",\n        \"min_cluster\": \"Used by the automated optimum cluster process, this is the minimum number of clusters e.g. 2\",\n        \"max_cluster\": \"Used by the automated optimum cluster process, this is the maximum number of clusters e.g. 30\",\n    },\n```\n\n\nThe sampling process will produce the following artifacts in the sampling directory:\n\n1. A directory named after the config value ```job_name``` containing the subset of files sampled, these can be specified as ```--data_dir``` argument when running the entire process on AML.\n2. A 2 dimensional scatter plot of the clustered files (by content) selected as the sampled dataset in the sampling folder.\n![images/all_cluster_predictions_cluster_number_5.jpg](images/all_cluster_predictions_cluster_number_5.jpg)\n3. 
A .csv file of the entire dataset with cluster predictions named \"all_cluster_predictions...\" and a csv file with the sampled cluster predictions named \"sampled_cluster_predictions...\". This can be used for further enriching the dataset, for example, creating a meaningful label per cluster and updating all records. See the [Heuristics classifier in the Data Discovery Toolkit as an example](https://github.com/microsoft/Data-Discovery-Toolkit/blob/main/walkthroughs/heuristics/standalone_text_heuristics.ipynb) or [Pixplotml for image data](https://github.com/microsoft/Data-Discovery-Toolkit?tab=readme-ov-file#using-pixplotml-to-rapidly-visualise-and-label-data-for-training).\n4. If the ```optimum_k``` config value is set to ```\"auto\"```, the sampling process will attempt to set the optimum number of clusters automatically. This can be overridden if you know roughly how many broad buckets of content exist in your data. An elbow graph will be generated in the sampling folder.\n![Optimum k elbow graph](images/elbow_5.png)\n\nTwo options exist for running sampling, namely:\n\n1. Run the entire process locally with sampling, including the index generation and subsequent steps\n2. Run only the sampling locally and then use the created sampled dataset to execute on AML\n\n#### Run the entire process locally\n\nSet the following values to run the indexing process locally:\n\n```json\n    \"sampling\": {\n        \"sample_data\": true,\n        \"only_run_sampling\": false,\n        \"sample_percentage\": 10,\n        \"optimum_k\": \"auto\",\n        \"min_cluster\": 2,\n        \"max_cluster\": 30\n    },\n```\n\n#### Run only the sampling locally and the subsequent steps on AML\n\nIf the ```only_run_sampling``` config value is set to true, this will only run the sampling step, no index will be created and any other subsequent steps will not be executed. 
Set the ```--data_dir``` argument to the directory created by the sampling process which will be:\n\n```artifacts/sampling/config.[job_name]``` and execute the [AML pipeline step.](docs/azureml-pipeline.md)\n\n# Description of configuration elements\n\nAll the values can be lists of elements, including the nested configurations.\nEvery array will produce the combinations of flat configurations when the method `flatten()` is called on a particular node, to select 1 random combination - call the method `sample()`.\n\n```json\n{\n    \"experiment_name\": \"If provided, this will be the experiment name in Azure ML and it will group all job runs under the same experiment, otherwise (if left empty) index_name_prefix will be used and there may be more than one experiment\",\n    \"job_name\": \"If provided, all job runs in Azure ML will be named with this property value plus timestamp, otherwise (if left empty) each job will be named only with timestamp\",\n    \"job_description\": \"You may provide a description for the current job run which describes in words what you are about to experiment with\",\n    \"data_formats\": \"Specifies the supported data formats for the application. You can choose from a variety of formats such as JSON, CSV, PDF, and more. [*] - means all formats included\",\n    \"main_instruction\": \"Defines the main instruction prompt coming with queries to LLM\",\n    \"use_checkpoints\": \"A boolean. If true, enables use of checkpoints to load data and skip processing that was already done in previous executions.\",\n    \"index\": {\n        \"index_name_prefix\": \"Search index name prefix\",\n        \"ef_construction\": \"ef_construction value determines the value of Azure AI Search vector configuration.\",\n        \"ef_search\": \"ef_search value determines the value of Azure AI Search vector configuration.\",\n        \"chunking\": {\n            \"preprocess\": \"A boolean. 
If true, preprocess documents, split into smaller chunks, embed and enrich them, and finally upload documents chunks for retrieval into Azure Search Index.\",\n            \"chunk_size\": \"Size of each chunk e.g. [500, 1000, 2000]\",\n            \"overlap_size\": \"Overlap Size for each chunk e.g. [100, 200, 300]\",\n            \"generate_title\": \"A boolean. If true, a title is generated for the chunk of content and an embedding is created for it\",\n            \"generate_summary\": \"A boolean. If true, a summary is generated for the chunk of content and an embedding is created for it\",\n            \"override_content_with_summary\": \"A boolean. If true, The chunk content is replaced with its summary\",\n            \"chunking_strategy\": \"determines the chunking strategy. Valid values are 'azure-document-intelligence' or 'basic'\",\n            \"azure_document_intelligence_model\": \"represents the Azure Document Intelligence Model. Used when chunking strategy is 'azure-document-intelligence'. When set to 'prebuilt-layout', provides additional features (see above)\"\n        },\n        \"embedding_model\": \"see 'Description of embedding models config' below\",\n        \"sampling\": {\n            \"sample_data\": \"Set to true to enable sampling\",\n            \"percentage\": \"Percentage of the document corpus to sample\",\n            \"optimum_k\": \"Set to 'auto' to automatically determine the optimum cluster number or set to a specific value e.g. 15\",\n            \"min_cluster\": \"Used by the automated optimum cluster process, this is the minimum number of clusters e.g. 2\",\n            \"max_cluster\": \"Used by the automated optimum cluster process, this is the maximum number of clusters e.g. 30\"\n        }\n    },\n    \"language\": {\n        \"analyzer\": {\n            \"analyzer_name\": \"name of the analyzer to use for the field. 
This option can be used only with searchable fields and it can't be set together with either searchAnalyzer or indexAnalyzer.\",\n            \"index_analyzer_name\": \"name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.\",\n            \"search_analyzer_name\": \"name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.\",\n            \"char_filters\": \"The character filters for the index\",\n            \"tokenizers\": \"The tokenizers for the index\",\n            \"token_filters\": \"The token filters for the index\"\n        },\n        \"query_language\": \"The language of the query. Possible values: en-us, en-gb, fr-fr etc.\"\n    },\n    \"rerank\": {\n        \"enabled\": \"determines if search results should be re-ranked. Valid values are TRUE or FALSE\",\n        \"type\": \"determines the type of re-ranking. Valid values are llm or cross_encoder\",\n        \"llm_rerank_threshold\": \"determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10.\",\n        \"cross_encoder_at_k\": \"determines the threshold when using cross-encoding re-ranking. Chunks with given rank value are selected.\",\n        \"cross_encoder_model\": \"determines the model used for cross-encoding re-ranking step. Valid value is cross-encoder/stsb-roberta-base\"\n    },\n    \"search\": {\n        \"retrieve_num_of_documents\": \"determines the number of chunks to retrieve from the search index\",\n        \"search_type\": \"determines the search types used for experimentation. 
Valid values are search_for_match_semantic, search_for_match_Hybrid_multi, search_for_match_Hybrid_cross, search_for_match_text, search_for_match_pure_vector, search_for_match_pure_vector_multi, search_for_match_pure_vector_cross, search_for_manual_hybrid. e.g. ['search_for_manual_hybrid', 'search_for_match_Hybrid_multi','search_for_match_semantic']\",\n        \"search_relevancy_threshold\": \"the similarity threshold to determine if a doc is relevant. Valid ranges are from 0.0 to 1.0\"\n    },\n    \"query_expansion\": {\n        \"expand_to_multiple_questions\": \"whether the system should expand a single question into multiple related questions. By enabling this feature, you can generate a set of alternative related questions that may improve the retrieval process and provide more accurate results.\",\n        \"query_expansion\": \"determines if query expansion feature is on. Valid values are TRUE or FALSE\",\n        \"hyde\": \"this feature allows you to experiment with various query expansion approaches which may improve the retrieval metrics. The possible values are 'disabled' (default), 'generated_hypothetical_answer', 'generated_hypothetical_document_to_answer'. Reference article - Precise Zero-Shot Dense Retrieval without Relevance Labels (HyDE - Hypothetical Document Embeddings) - https://arxiv.org/abs/2212.10496\",\n        \"min_query_expansion_related_question_similarity_score\": \"minimum similarity score in percentage between LLM generated related queries to the original query using cosine similarity score. Default is 90%\"\n    },\n    \"openai\": {\n        \"azure_oai_chat_deployment_name\": \"determines the Azure OpenAI deployment name\",\n        \"azure_oai_eval_deployment_name\": \"determines the Azure OpenAI deployment name used for evaluation\",\n        \"temperature\": \"determines the OpenAI temperature. 
Valid value ranges from 0 to 1.\"\n    },\n    \"eval\": {\n        \"metric_types\": \"determines the metrics used for evaluation (end-to-end or component-wise metrics using LLMs). Valid values for end-to-end metrics are lcsstr, lcsseq, cosine, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2. Valid values for component-wise LLM-based metrics are llm_answer_relevance, llm_context_precision and llm_context_recall. e.g. ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens', 'llm_answer_relevance']\"\n    }\n}\n```\n\n> NOTE: When changing the config, remember to change:\n- `config.sample.json` (the example config to be copied by others)\n- [GitHub actions config file](.github/workflows/config.json) to be used by tests on CI\n- [Unit tests config file](run/tests/data/config.json)\n\n## Description of embedding models config\n\n`embedding_model` is an array containing the configuration for the embedding models to use. Embedding model `type` must be `azure` for Azure OpenAI models and `sentence-transformer` for HuggingFace sentence transformer models.\n\n### Azure OpenAI embedding model config\n\n```json\n{\n    \"type\": \"azure\",\n    \"model_name\": \"the name of the Azure OpenAI model\",\n    \"dimension\": \"the dimension of the embedding model. 
For example, 1536 which is the dimension of text-embedding-ada-002\"\n}\n```\n\nIf you are using a model other than `text-embedding-ada-002`, you must specify the corresponding dimension for the model in the `dimension` field; for example:\n\n```json\n{\n    \"type\": \"azure\",\n    \"model_name\": \"text-embedding-3-large\",\n    \"dimension\": 3072\n}\n```\n\nThe dimensions for the different Azure OpenAI embeddings models can be found in the [Azure OpenAI Service models](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#embeddings-models) documentation.\n\nWhen using the [newer embeddings models (v3)](https://openai.com/blog/new-embedding-models-and-api-updates), you can also leverage their support for shortening embeddings. In this case, specify the number of dimensions you require, and add the `shorten_dimensions` flag to indicate that you want to shorten the embeddings. For example:\n\n```json\n{\n    \"type\": \"azure\",\n    \"model_name\": \"text-embedding-3-large\",\n    \"dimension\": 256,\n    \"shorten_dimensions\": true\n}\n```\n\n### Sentence Transformer embedding model\n\n```json\n{\n    \"type\": \"sentence-transformer\",\n    \"model_name\": \"the name of the sentence transformer model\",\n    \"dimension\": \"the dimension of the model. 
This field is not required if model name is one of ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'bert-large-nli-mean-tokens']\"\n}\n```\n\n## Query Expansion\n\nProviding a hypothetical answer to the question in the query, a hypothetical passage which holds an answer to the query, or a few alternative related questions might improve retrieval and thus yield more accurate chunks of docs to pass into the LLM context.\nBased on the reference article - [Precise Zero-Shot Dense Retrieval without Relevance Labels (HyDE - Hypothetical Document Embeddings)](https://arxiv.org/abs/2212.10496).\n\nThe following configuration options turn on these experimentation approaches:\n\n\n### Generate hypothetical answer for the question in query\n```json\n{\n    \"hyde\": \"generated_hypothetical_answer\"\n}\n```\n\n### Generate hypothetical document which includes an answer for the question in query\n```json\n{\n    \"hyde\": \"generated_hypothetical_document_to_answer\"\n}\n```\n\n### Generate related question for the question in query\n\nThis feature will generate fine related questions, filter out those which are less than `min_query_expansion_related_question_similarity_score` percent from the original query (using cosine similarity score), and search documents for each one of them along with the original query,\ndeduplicate results and return them to the reranker and top k steps.\n\nThe default value for `min_query_expansion_related_question_similarity_score` is 90%; you may change this in the `config.json`.\n\n```json\n{\n    \"query_expansion\": true,\n    \"min_query_expansion_related_question_similarity_score\": 90\n}\n```\n\n## Reports\n\nThe solution integrates with Azure Machine Learning and uses MLFlow to manage experiments, jobs, and artifacts. 
You can view the following reports as part of the evaluation process:\n\n### Metric Comparison\n`all_metrics_current_run.html` shows average scores across questions and search types for each selected metric:\n\n![Metric Comparison](./images/metric_comparison.png)\n\n### Metric Analysis\nThe computation of each metric and fields used for evaluation are tracked for each question and search type in the output csv file:\n\n![Metric Analysis](./images/metric_analysis.png)\n\n### Hyper Parameters\n\n![Hyper Parameters](./images/hyper_parameters.png)\n\n### Sample Metrics\nMetrics can be compared across runs:\n\n![Sample Metrics](./images/sample_metric.png)\n\n### Search evaluation\nMetrics can be compared across different search strategies:\n\n![Search evaluation](./images/search_chart.png)\n\n### Retrieval evaluation\nMean average precision scores are tracked and average MAP scores can be compared across search types:\n\n![Retrieval evaluation](./images/map_at_k.png)\n![Retrieval evaluation](./images/map_scores.png)\n\n### Pitfalls\n\nThis section outlines common gotchas or pitfalls that engineers/developers/data scientists may encounter while working with RAG Experiment Accelerator.\n\n#### Azure Authentication and Authorization\n\nTo successfully utilize this solution, you must first authenticate yourself by logging in to your Azure account. This essential step ensures you have the required permissions to access and manage Azure resources used by it. You might encounter errors related to storing QnA data into Azure Machine Learning Data Assets, executing the query and evaluation step as a result of inappropriate authorization and authentication to Azure. Refer to Point 4 in this document for authentication and authorization.\n\nThere might be situations in which the solution would still generate errors despite valid authentication and authorization. 
In such cases, start a new session with a brand new terminal instance, log in to Azure using the steps mentioned in step 4 and also check if the user has Contributor access to the Azure resources related to the solution.\n\n#### Configuration\n\nThis solution utilizes several configuration parameters in `config.json` that directly impact its functionality and performance. Please pay close attention to these settings:\n\n**retrieve_num_of_documents:** This config controls the initial number of documents retrieved for analysis. Excessively high or low values can lead to \"index out of range\" errors due to rank processing of Search AI results.\n\n**cross_encoder_at_k:** This config influences the ranking process. A high value might result in irrelevant documents being included in the final results.\n\n**llm_rerank_threshold:** This config determines which documents are passed to the language model (LLM) for further processing. Setting this value too high could create an overly large context for the LLM to handle, potentially leading to processing errors or degraded results. This might also result in an exception from the Azure OpenAI endpoint.\n\n#### Azure OpenAI Model and Deployment\n\nBefore running this solution, please ensure you've correctly set up your Azure OpenAI deployment name within the config.json file and added the relevant secrets to environment variables (.env file). This information is crucial for the application to connect to the appropriate Azure OpenAI resources and function as designed. If you're unsure about the configuration data, please refer to the .env.template and config.json files. The solution has been tested with the GPT 3.5 turbo model and needs further tests for any other model.\n\n#### QnA Generation and Querying step\n\nDuring the QnA generation step, you may occasionally encounter errors related to the JSON output received from Azure OpenAI. These errors can prevent the successful generation of a few questions and answers. 
Here's what you need to know:\n\n##### Possible Causes:\n\n**Incorrect Formatting:** The JSON output from Azure OpenAI may not adhere to the expected format, causing issues with the QnA generation process.\n\n**Content Filtering:** Azure OpenAI has content filters in place. If the input text or generated responses are deemed inappropriate, it could lead to errors.\n\n**API Limitations:** The Azure OpenAI service has token and rate limitations that affect the output.\n\n#### Evaluation step\n\n**End-to-end evaluation metrics:** not all the metrics comparing the generated and ground-truth answers are able to capture differences in semantics. For example, metrics such as `levenshtein` or `jaro_winkler` only measure edit distances. The `cosine` metric doesn't allow the comparison of semantics either: it uses the *textdistance* token-based implementation based on term frequency vectors. To calculate the semantic similarity between the generated answers and the expected responses, consider using embedding-based metrics such as Bert scores (`bert_`).\n\n**Component-wise evaluation metrics:** evaluation metrics using LLM-as-judges aren't deterministic. The `llm_` metrics included in the accelerator use the model indicated in the `azure_oai_eval_deployment_name` config field. The prompts used for evaluation instruction can be adjusted and are included in the `prompts.py` file (`llm_answer_relevance_instruction`, `llm_context_recall_instruction`, `llm_context_precision_instruction`).\n\n**Retrieval-based metrics:** MAP scores are computed by comparing each retrieved chunk against the question and the chunk used to generate the qna pair. To assess whether a retrieved chunk is relevant or not, the similarity between the retrieved chunk and the concatenation of the end user question and the chunk used in the qna step (`02_qa_generation.py`) is computed using the SpacyEvaluator. 
Spacy similarity defaults to the average of the token vectors, meaning that the computation is insensitive to the order of the words. By default, the similarity threshold is set to 80% (`spacy_evaluator.py`).\n\n## Contributing\n\nWe welcome your contributions and suggestions. To contribute, you need to agree to a\nContributor License Agreement (CLA) that confirms you have the right to, and actually do, grant us\nthe rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>.\n\nWhen you submit a pull request, a CLA bot will automatically check whether you need to provide\na CLA and give you instructions (for example, status check, comment). Follow the instructions\nfrom the bot. You only need to do this once for all repos that use our CLA.\n\nBefore you contribute, make sure to run\n\n```\npip install -e .\npre-commit install\n```\n\nThis project follows the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).\nFor more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or\ncontact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any questions or comments.\n\n### Developer Contribution Guidelines\n\n- **Branch Naming Convention**:\n    - Use the GitHub UI to include a tag in the branch name, then create the branch directly from the UI. 
Here are some examples:\n        - `bug/11-short-description`\n        - `feature/22-short-description`\n- **Merging Changes**:\n    - When merging, squash your commits to have up to 3 incremental commits for Pull Requests (PRs) and merges.\n    - Project maintainers can merge accepted code changes from contributors or contributors can request Write Permissions to the repo to merge a pull request once it has been reviewed by project maintainers.\n    - Project Maintainers can be contacted via [email](mailto:ISE_AI_PLATFORM_TD@microsoft.com).\n- **Branch Hygiene**:\n    - Delete the branch after it has been merged.\n- **Testing Changes Locally**:\n    - Before merging, test your changes locally.\n- **Naming Conventions**:\n    - Use snake case for metric names and configuration variables, like `example_snake_case`.\n    - Set up your Git username to be your first and last name, like this: `git config --global user.name \"First Last\"`\n- **Issue Tracking**:\n    - Working on a contribution to the RAG Experiment Accelerator? Before opening a new issue, make sure to check if the feature has already been requested by searching for it in the associated [project issue tracker](https://github.com/orgs/microsoft/projects/991), and consider adding to that discussion instead. Otherwise, please open an issue for it using the feature request template or create a PR and make sure it is associated with the [project](https://github.com/orgs/microsoft/projects/991).\n\n\n## Trademarks\n\nThis project might contain trademarks or logos for projects, products, or services. You must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general) to use Microsoft\ntrademarks or logos correctly.\nDon't use Microsoft trademarks or logos in modified versions of this project in a way that causes confusion or implies Microsoft sponsorship.\nFollow the policies of any third-party trademarks or logos that this project contains.\n"
  },
  {
    "path": "SECURITY.md",
    "content": "<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->\n\n## Security\n\nMicrosoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).\n\nIf you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.\n\n## Reporting Security Issues\n\n**Please do not report security vulnerabilities through public GitHub issues.**\n\nInstead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).\n\nIf you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).\n\nYou should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). \n\nPlease include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:\n\n  * Type of issue (e.g. 
buffer overflow, SQL injection, cross-site scripting, etc.)\n  * Full paths of source file(s) related to the manifestation of the issue\n  * The location of the affected source code (tag/branch/commit or direct URL)\n  * Any special configuration required to reproduce the issue\n  * Step-by-step instructions to reproduce the issue\n  * Proof-of-concept or exploit code (if possible)\n  * Impact of the issue, including how an attacker might exploit the issue\n\nThis information will help us triage your report more quickly.\n\nIf you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.\n\n## Preferred Languages\n\nWe prefer all communications to be in English.\n\n## Policy\n\nMicrosoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).\n\n<!-- END MICROSOFT SECURITY.MD BLOCK -->\n"
  },
  {
    "path": "SUPPORT.md",
    "content": "# TODO: The maintainer of this repo has not yet edited this file\r\n\r\n**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?\r\n\r\n- **No CSS support:** Fill out this template with information about how to file issues and get help.\r\n- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.\r\n- **Not sure?** Fill out an intake as though the answer were \"Yes\". CSS will help you decide.\r\n\r\n*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*\r\n\r\n# Support\r\n\r\n## How to file issues and get help  \r\n\r\nThis project uses GitHub Issues to track bugs and feature requests. Please search the existing \r\nissues before filing new issues to avoid duplicates.  For new issues, file your bug or \r\nfeature request as a new Issue.\r\n\r\nFor help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE \r\nFOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER\r\nCHANNEL. WHERE WILL YOU HELP PEOPLE?**.\r\n\r\n## Microsoft Support Policy  \r\n\r\nSupport for this **PROJECT or PRODUCT** is limited to the resources listed above.\r\n"
  },
  {
    "path": "azure.yaml",
    "content": "# yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json\n\nname: rag-experiment-accelerator\nmetadata:\n    template: azd-init@1.6.1\n"
  },
  {
    "path": "azureml/eval.py",
    "content": "import os\nimport sys\nimport argparse\nimport mlflow\n\nproject_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), \"..\"))\nsys.path.append(project_dir)\n\nfrom rag_experiment_accelerator.config.environment import Environment  # noqa: E402\nfrom rag_experiment_accelerator.config.config import Config  # noqa: E402\nfrom rag_experiment_accelerator.config.index_config import IndexConfig  # noqa: E402\nfrom rag_experiment_accelerator.run.evaluation import run as eval_run  # noqa: E402\n\n\ndef main():\n    \"\"\"Main function of the script.\"\"\"\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\n        \"--index_name_path\",\n        type=str,\n        help=\"input: path to a file containing index name\",\n    )\n    parser.add_argument(\n        \"--query_result_dir\",\n        type=str,\n        help=\"input: path to read results of querying from\",\n    )\n    parser.add_argument(\n        \"--keyvault\", type=str, help=\"keyvault to load the environment from\"\n    )\n    parser.add_argument(\n        \"--mlflow_tracking_uri\",\n        type=str,\n        help=\"input: mlflow tracking uri to log to\",\n    )\n    parser.add_argument(\n        \"--mlflow_parent_run_id\",\n        type=str,\n        help=\"input: mlflow parent run id to connect nested run to\",\n    )\n    parser.add_argument(\n        \"--eval_result_dir\",\n        type=str,\n        help=\"output: path to write results of evaluation to\",\n    )\n    args = parser.parse_args()\n\n    environment = Environment.from_keyvault(args.keyvault)\n    config = Config.from_path(environment, config_path=args.config_path)\n    with open(args.index_name_path, \"r\") as f:\n        index_name = f.readline()\n    index_config = IndexConfig.from_index_name(index_name)\n\n    config.path.query_data_dir = args.query_result_dir\n    config.path.eval_data_dir 
= args.eval_result_dir\n\n    mlflow_client = mlflow.MlflowClient(args.mlflow_tracking_uri)\n    eval_run(environment, config, index_config, mlflow_client, name_suffix=\"_result\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "azureml/index.py",
    "content": "from rag_experiment_accelerator.checkpoint import init_checkpoint\nimport os\nimport sys\nimport argparse\n\nimport mlflow\n\nproject_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), \"..\"))\nsys.path.append(project_dir)\n\nfrom rag_experiment_accelerator.config.environment import Environment  # noqa: E402\nfrom rag_experiment_accelerator.config.config import Config  # noqa: E402\nfrom rag_experiment_accelerator.config.index_config import IndexConfig  # noqa: E402\nfrom rag_experiment_accelerator.run.index import run as index_run  # noqa: E402\n\n\ndef init():\n    \"\"\"Main function of the script.\"\"\"\n\n    global args\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\"--data_dir\", type=str, help=\"input: path to the data\")\n    parser.add_argument(\"--index_name\", type=str, help=\"input: experiment index name\")\n    parser.add_argument(\n        \"--keyvault\",\n        type=str,\n        help=\"input: keyvault to load the environment from\",\n    )\n    parser.add_argument(\n        \"--mlflow_tracking_uri\",\n        type=str,\n        help=\"input: mlflow tracking uri to log to\",\n    )\n    parser.add_argument(\n        \"--index_name_path\",\n        type=str,\n        help=\"output: path to write a file with index name\",\n    )\n\n    args, _ = parser.parse_known_args()\n\n    global config\n    global environment\n    global index_config\n    global mlflow_client\n\n    environment = Environment.from_keyvault(args.keyvault)\n    config = Config.from_path(environment, args.config_path, args.data_dir)\n    init_checkpoint(config)\n\n    index_config = IndexConfig.from_index_name(args.index_name)\n    mlflow_client = mlflow.MlflowClient(args.mlflow_tracking_uri)\n\n\ndef run(input_paths: list[str]) -> list[str]:\n    global args\n    global config\n    global environment\n    global 
index_config\n    global mlflow_client\n\n    index_run(environment, config, index_config, input_paths, mlflow_client)\n\n    return [args.index_name]\n"
  },
  {
    "path": "azureml/pipeline.py",
    "content": "import os\nimport sys\nimport argparse\nfrom azure.ai.ml import MLClient, Input, Output, dsl, command\nfrom azure.ai.ml.parallel import parallel_run_function, RunFunction\nimport azure.ai.ml.entities\nfrom azure.ai.ml.entities import Job\nimport mlflow\nimport warnings\n\n\nfrom rag_experiment_accelerator.config.paths import mlflow_run_name  # noqa: E402\nfrom rag_experiment_accelerator.config.environment import Environment  # noqa: E402\nfrom rag_experiment_accelerator.config.config import (\n    Config,\n    ExecutionEnvironment,\n)  # noqa: E402\nfrom rag_experiment_accelerator.config.index_config import IndexConfig  # noqa: E402\nfrom rag_experiment_accelerator.utils.auth import get_default_az_cred  # noqa: E402\nfrom rag_experiment_accelerator.utils.logging import get_logger  # noqa: E402\n\n\nproject_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), \"..\"))\nsys.path.append(project_dir)\nlogger = get_logger(__name__)\nAML_ENVIRONMENT_NAME = \"CliV2AnonymousEnvironment\"\nINDEX_STEP_RETRIES = 3\nINDEX_STEP_TIMEOUT_SECONDS = 10 * 3600\nINDEX_STEP_FILES_PER_BATCH = 4\nINDEX_STEP_ERROR_THRESHOLD = 1\n\n\ndef generate_conda_file():\n    \"\"\"Generates a file describing a conda environment compatible with AML.\"\"\"\n    conda_config = \"\"\"name: aml-env\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.11\n  - pip\n  - pkg-config\n  - cmake\n  - spacy\n  - spacy-model-en_core_web_lg==3.5.0\n  - pip:\n\"\"\"\n    with open(\"requirements.txt\", \"r\") as requirements_file:\n        pip_dependencies = requirements_file.readlines()\n    conda_config += \"\".join([\"    - \" + dependency for dependency in pip_dependencies])\n\n    conda_filename = \"conda.generated.yaml\"\n    with open(conda_filename, \"w\") as conda_file:\n        conda_file.write(conda_config)\n    return conda_filename\n\n\ndef initialise_mlflow_client(environment: Environment, config: Config):\n    \"\"\"\n    Initializes the ML client and sets the MLflow 
tracking URI.\n    \"\"\"\n    ml_client = MLClient(\n        get_default_az_cred(),\n        environment.aml_subscription_id,\n        environment.aml_resource_group_name,\n        environment.aml_workspace_name,\n    )\n    mlflow_tracking_uri = ml_client.workspaces.get(\n        ml_client.workspace_name\n    ).mlflow_tracking_uri\n    mlflow.set_tracking_uri(mlflow_tracking_uri)\n\n    return mlflow.MlflowClient(mlflow_tracking_uri)\n\n\ndef start_pipeline(\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    config_path: str,\n    mlflow_client: mlflow.MlflowClient,\n) -> Job:\n    warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"azure.ai.ml\")\n\n    ml_client = MLClient(\n        credential=get_default_az_cred(),\n        subscription_id=environment.aml_subscription_id,\n        resource_group_name=environment.aml_resource_group_name,\n        workspace_name=environment.aml_workspace_name,\n    )\n\n    # Generate conda file\n    conda_filename = generate_conda_file()\n\n    # Create environment for AML\n    pipeline_job_env = azure.ai.ml.entities.Environment(\n        name=AML_ENVIRONMENT_NAME,\n        description=\"Environment for RAG Experiment Accelerator\",\n        conda_file=conda_filename,\n        image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest\",\n    )\n    pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n\n    index_pipeline_component = parallel_run_function(\n        name=\"index_job\",\n        experiment_name=config.experiment_name,\n        display_name=f\"Index documents for {config.experiment_name} experiment\",\n        description=\"Preprocess documents, split into smaller chunks, embed and enrich them, and finally upload documents chunks for retrieval into Azure Search Index\",\n        inputs={\n            \"data\": Input(type=\"uri_folder\"),\n            \"config_path\": Input(type=\"uri_file\"),\n            \"mlflow_tracking_uri\": 
Input(type=\"string\"),\n        },\n        outputs={\"index_name\": Output(type=\"uri_file\", mode=\"rw_mount\")},\n        input_data=\"${{inputs.data}}\",\n        instance_count=int(environment.aml_compute_instances_number),\n        mini_batch_size=str(INDEX_STEP_FILES_PER_BATCH),\n        mini_batch_error_threshold=INDEX_STEP_ERROR_THRESHOLD,\n        retry_settings=dict(\n            max_retries=INDEX_STEP_RETRIES, timeout=INDEX_STEP_TIMEOUT_SECONDS\n        ),\n        task=RunFunction(\n            code=\"./\",\n            entry_script=\"azureml/index.py\",\n            program_arguments=\"\"\"--data_dir ${{inputs.data}} \\\n                --index_name_path ${{outputs.index_name}} \\\n                --config_path ${{inputs.config_path}} \\\n                --mlflow_tracking_uri ${{inputs.mlflow_tracking_uri}}\"\"\"\n            + f\" --keyvault {environment.azure_key_vault_endpoint}\"\n            + f\" --index_name {index_config.index_name()}\",\n            environment=pipeline_job_env,\n            append_row_to=\"${{outputs.index_name}}\",\n        ),\n        environment_variables={\n            \"LOGGING_LEVEL\": os.getenv(\"LOGGING_LEVEL\", \"\"),\n            \"MAX_WORKER_THREADS\": os.getenv(\"MAX_WORKER_THREADS\", \"\"),\n        },\n    )\n\n    query_pipeline_component = command(\n        name=\"query_job\",\n        experiment_name=config.experiment_name,\n        display_name=\"Query documents for the experiment\",\n        description=\"Query documents for the experiment\",\n        inputs={\n            \"index_name\": Input(type=\"uri_file\"),\n            \"config_path\": Input(type=\"uri_file\"),\n            \"eval_data\": Input(type=\"uri_file\"),\n            \"mlflow_tracking_uri\": Input(type=\"string\"),\n        },\n        outputs={\"query_result\": Output(type=\"uri_folder\", mode=\"rw_mount\")},\n        code=\"./\",\n        command=\"\"\"python ./azureml/query.py \\\n            --eval_data_path ${{inputs.eval_data}} \\\n 
           --config_path ${{inputs.config_path}} \\\n            --index_name_path ${{inputs.index_name}} \\\n            --query_result_dir ${{outputs.query_result}} \\\n            --mlflow_tracking_uri ${{inputs.mlflow_tracking_uri}}\"\"\"\n        + f\" --keyvault {environment.azure_key_vault_endpoint}\",\n        environment=pipeline_job_env,\n        environment_variables={\n            \"LOGGING_LEVEL\": os.getenv(\"LOGGING_LEVEL\", \"\"),\n            \"MAX_WORKER_THREADS\": os.getenv(\"MAX_WORKER_THREADS\", \"\"),\n        },\n    )\n\n    eval_pipeline_component = command(\n        name=\"eval_job\",\n        experiment_name=config.experiment_name,\n        display_name=\"Evaluate experiment\",\n        description=\"Evaluate experiment\",\n        inputs={\n            \"index_name\": Input(type=\"uri_file\"),\n            \"config_path\": Input(type=\"uri_file\"),\n            \"query_result\": Input(type=\"uri_folder\"),\n            \"mlflow_tracking_uri\": Input(type=\"string\"),\n        },\n        outputs=dict(eval_result=Output(type=\"uri_folder\", mode=\"rw_mount\")),\n        code=\"./\",\n        command=\"\"\"python ./azureml/eval.py \\\n                --config_path ${{inputs.config_path}} \\\n                --index_name_path ${{inputs.index_name}} \\\n                --query_result_dir ${{inputs.query_result}} \\\n                --eval_result_dir ${{outputs.eval_result}} \\\n                --mlflow_tracking_uri ${{inputs.mlflow_tracking_uri}}\"\"\"\n        + f\" --keyvault {environment.azure_key_vault_endpoint}\",\n        environment=pipeline_job_env,\n        environment_variables={\n            \"LOGGING_LEVEL\": os.getenv(\"LOGGING_LEVEL\", \"\"),\n            \"MAX_WORKER_THREADS\": os.getenv(\"MAX_WORKER_THREADS\", \"\"),\n        },\n    )\n\n    job_name = mlflow_run_name(config.job_name)\n\n    @dsl.pipeline(\n        name=job_name,\n        experiment_name=config.experiment_name,\n        
compute=environment.aml_compute_name,\n        description=config.job_description or \"RAG Experiment Pipeline\",\n        display_name=job_name,\n    )\n    def rag_pipeline(\n        config_path_input,\n        data_input,\n        eval_data_input,\n        mlflow_tracking_uri,\n        mlflow_parent_run_id,\n    ):\n        index_job = index_pipeline_component(\n            data=data_input,\n            config_path=config_path_input,\n            mlflow_tracking_uri=mlflow_tracking_uri,\n        )\n\n        query_job = query_pipeline_component(\n            index_name=index_job.outputs.index_name,\n            config_path=config_path_input,\n            eval_data=eval_data_input,\n            mlflow_tracking_uri=mlflow_tracking_uri,\n        )\n\n        eval_job = eval_pipeline_component(\n            index_name=index_job.outputs.index_name,\n            config_path=config_path_input,\n            query_result=query_job.outputs.query_result,\n            mlflow_tracking_uri=mlflow_tracking_uri,\n        )\n\n        return {\"eval_result\": eval_job.outputs.eval_result}\n\n    # Save the environment into Keyvault for the pipeline steps to retrieve later\n    environment.to_keyvault()\n\n    pipeline = rag_pipeline(\n        config_path_input=Input(type=\"uri_file\", path=config_path),\n        data_input=Input(type=\"uri_folder\", path=config.path.data_dir),\n        eval_data_input=Input(type=\"uri_file\", path=config.path.eval_data_file),\n        mlflow_tracking_uri=mlflow_client.tracking_uri,\n    )\n    return ml_client.jobs.create_or_update(\n        pipeline, experiment_name=config.experiment_name\n    )\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--data_dir\", type=str, help=\"path to the data folder\", default=\"./data\"\n    )\n    parser.add_argument(\n        \"--config_path\",\n        type=str,\n        help=\"relative path to the config file\",\n        
default=\"./config.json\",\n    )\n    args = parser.parse_args()\n\n    environment = Environment.from_env_or_keyvault()\n    config = Config.from_path(environment, args.config_path, args.data_dir)\n    config.execution_environment = ExecutionEnvironment.AZURE_ML\n\n    if config.index.sampling.sample_data:\n        logger.error(\n            \"Can't sample data when running on AzureML pipeline. Please run the pipeline locally\"\n        )\n        exit()\n\n    mlflow_client = initialise_mlflow_client(environment, config)\n\n    # Starting multiple pipelines hence unable to stream them\n    for index_config in config.index.flatten():\n        # with mlflow.start_run(run_name=config.job_name, description=config.job_description, experiment_id=experiment.experiment_id) as run:\n        logger.info(f\"Starting pipeline for index: {index_config.index_name()}\")\n        job = start_pipeline(\n            environment, config, index_config, args.config_path, mlflow_client\n        )\n        logger.info(\n            f\"Pipeline job started...\\nIndex name: {index_config.index_name()}\\nMonitoring url: {job.studio_url}\"\n        )\n"
  },
  {
    "path": "azureml/query.py",
    "content": "import os\nimport sys\nimport argparse\n\nimport mlflow\n\nproject_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), \"..\"))\nsys.path.append(project_dir)\n\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint  # noqa: E402\nfrom rag_experiment_accelerator.config.environment import Environment  # noqa: E402\nfrom rag_experiment_accelerator.config.config import Config  # noqa: E402\nfrom rag_experiment_accelerator.config.index_config import IndexConfig  # noqa: E402\nfrom rag_experiment_accelerator.run.querying import run as query_run  # noqa: E402\n\n\ndef main():\n    \"\"\"Main function of the script.\"\"\"\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--config_path\", type=str, help=\"input: path to the config file\"\n    )\n    parser.add_argument(\n        \"--eval_data_path\", type=str, help=\"input: path to the data to evaluate on\"\n    )\n    parser.add_argument(\n        \"--index_name_path\",\n        type=str,\n        help=\"input: path to a file containing index name\",\n    )\n    parser.add_argument(\n        \"--keyvault\",\n        type=str,\n        help=\"input: keyvault to load the environment from\",\n    )\n    parser.add_argument(\n        \"--mlflow_tracking_uri\",\n        type=str,\n        help=\"input: mlflow tracking uri to log to\",\n    )\n    parser.add_argument(\n        \"--mlflow_parent_run_id\",\n        type=str,\n        help=\"input: mlflow parent run id to connect nested run to\",\n    )\n    parser.add_argument(\n        \"--query_result_dir\",\n        type=str,\n        help=\"output: path to write results of querying to\",\n    )\n    args = parser.parse_args()\n\n    environment = Environment.from_keyvault(args.keyvault)\n\n    config = Config.from_path(environment, args.config_path)\n    config.path.eval_data_file = args.eval_data_path\n    config.path.query_data_dir = args.query_result_dir\n    init_checkpoint(config)\n\n    with 
open(args.index_name_path, \"r\") as f:\n        index_name = f.readline()\n    index_config = IndexConfig.from_index_name(index_name)\n\n    mlflow_client = mlflow.MlflowClient(args.mlflow_tracking_uri)\n    query_run(environment, config, index_config, mlflow_client)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "config.sample.json",
    "content": "{\n    \"$schema\": \"https://raw.githubusercontent.com/microsoft/rag-experiment-accelerator/development/config.schema.json\",\n    \"experiment_name\": \"exp-name\",\n    \"job_name\": \"job-name\",\n    \"job_description\": \"\",\n    \"data_formats\": [\"*\"],\n    \"main_instruction\": \"\",\n    \"use_checkpoints\": true,\n    \"path\": {},\n    \"index\": {\n        \"index_name_prefix\": \"idx\",\n        \"ef_construction\": [400],\n        \"ef_search\": [400],\n        \"chunking\": {\n            \"preprocess\": false,\n            \"chunk_size\": [1000],\n            \"overlap_size\": [200],\n            \"generate_title\": false,\n            \"generate_summary\": false,\n            \"override_content_with_summary\": false,\n            \"chunking_strategy\": \"basic\",\n            \"azure_document_intelligence_model\": \"prebuilt-read\"\n        },\n        \"embedding_model\": [\n            {\n                \"type\": \"sentence-transformer\",\n                \"model_name\": \"all-mpnet-base-v2\"\n            }\n        ],\n        \"sampling\": {\n            \"sample_data\": false,\n            \"percentage\": 5,\n            \"optimum_k\": \"auto\",\n            \"min_cluster\": 2,\n            \"max_cluster\": 30,\n            \"only_sample\": false\n        }\n    },\n    \"language\": {\n        \"analyzer\": {\n            \"analyzer_name\": \"en.microsoft\",\n            \"index_analyzer_name\": \"\",\n            \"search_analyzer_name\": \"\",\n            \"char_filters\": [],\n            \"tokenizers\": [],\n            \"token_filters\": []\n        },\n        \"query_language\": \"en-us\"\n    },\n    \"rerank\": {\n        \"enabled\": true,\n        \"type\": \"cross_encoder\",\n        \"llm_rerank_threshold\": 3,\n        \"cross_encoder_at_k\": 4,\n        \"cross_encoder_model\": \"cross-encoder/stsb-roberta-base\"\n    },\n    \"search\": {\n        \"retrieve_num_of_documents\": 5,\n        
\"search_type\": [\n            \"search_for_manual_hybrid\",\n            \"search_for_match_Hybrid_multi\",\n            \"search_for_match_semantic\"\n        ],\n        \"search_relevancy_threshold\": 0.8\n    },\n    \"query_expansion\": {\n        \"hyde\": \"disabled\",\n        \"query_expansion\": false,\n        \"min_query_expansion_related_question_similarity_score\": 90,\n        \"expand_to_multiple_questions\": false\n    },\n    \"openai\": {\n        \"azure_oai_chat_deployment_name\": \"gpt-35-turbo\",\n        \"azure_oai_eval_deployment_name\": \"gpt-35-turbo\",\n        \"temperature\": 0\n    },\n    \"eval\": {\n        \"metric_types\": [\n            \"fuzzy_score\",\n            \"bert_all_MiniLM_L6_v2\",\n            \"cosine_ochiai\",\n            \"bert_distilbert_base_nli_stsb_mean_tokens\",\n            \"llm_answer_relevance\",\n            \"llm_context_precision\"\n        ]\n    }\n}\n"
  },
  {
    "path": "config.schema.json",
    "content": "{\n    \"$schema\": \"http://json-schema.org/draft-06/schema#\",\n    \"type\": \"object\",\n    \"properties\": {\n        \"experiment_name\": {\n            \"type\": \"string\",\n            \"minLength\": 1,\n            \"description\": \"Azure ML Experiment name\"\n        },\n        \"job_name\": {\n            \"type\": \"string\",\n            \"minLength\": 1,\n            \"description\": \"Azure ML Job name prefix\"\n        },\n        \"job_description\": {\n            \"type\": \"string\",\n            \"description\": \"Azure ML Job description for the current experimentation\"\n        },\n        \"data_formats\": {\n            \"type\": \"array\",\n            \"items\": {\n                \"type\": \"string\",\n                \"enum\": [\n                    \"*\",\n                    \"pdf\",\n                    \"html\",\n                    \"markdown\",\n                    \"json\",\n                    \"text\",\n                    \"docx\"\n                ]\n            },\n            \"default\": \"*\",\n            \"description\": \"List of formats, or '*' to allow any supported format\"\n        },\n        \"main_instruction\": {\n            \"type\": \"string\",\n            \"description\": \"The main instruction prompt coming with queries to LLM\"\n        },\n        \"use_checkpoints\": {\n            \"type\": \"boolean\",\n            \"description\": \"If checkpoints are enabled\"\n        },\n        \"path\": {\n            \"type\": \"object\",\n            \"properties\": {},\n            \"required\": []\n        },\n        \"index\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"index_name_prefix\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1,\n                    \"description\": \"Search Index name prefix\"\n                },\n                \"ef_construction\": {\n                    \"type\": \"array\",\n   
                 \"items\": {\n                        \"type\": \"integer\",\n                        \"minimum\": 100,\n                        \"maximum\": 1000\n                    },\n                    \"minItems\": 1,\n                    \"description\": \"Number of `ef_construction` to use for Azure AI Search vector configuration\"\n                },\n                \"ef_search\": {\n                    \"type\": \"array\",\n                    \"items\": {\n                        \"type\": \"integer\",\n                        \"minimum\": 100,\n                        \"maximum\": 1000\n                    },\n                    \"minItems\": 1,\n                    \"description\": \"Number of `ef_search` to use for Azure AI Search vector configuration\"\n                },\n                \"chunking\": {\n                    \"type\": \"object\",\n                    \"properties\": {\n                        \"preprocess\": {\n                            \"type\": \"boolean\",\n                            \"default\": false,\n                            \"description\": \"Whether or not to preprocess the text\"\n                        },\n                        \"chunk_size\": {\n                            \"type\": \"array\",\n                            \"items\": {\n                                \"type\": \"integer\"\n                            },\n                            \"minItems\": 1,\n                            \"description\": \"A list of integers representing the chunk sizes for chunking documents\"\n                        },\n                        \"overlap_size\": {\n                            \"type\": \"array\",\n                            \"items\": {\n                                \"type\": \"integer\"\n                            },\n                            \"minItems\": 1,\n                            \"description\": \"A list of integers representing the overlap sizes for chunking documents\"\n            
            },\n                        \"generate_title\": {\n                            \"type\": \"boolean\",\n                            \"default\": false,\n                            \"description\": \"Whether or not to generate title for chunk content\"\n                        },\n                        \"generate_summary\": {\n                            \"type\": \"boolean\",\n                            \"default\": false,\n                            \"description\": \"Whether or not to generate summary for chunk content\"\n                        },\n                        \"override_content_with_summary\": {\n                            \"type\": \"boolean\",\n                            \"default\": false,\n                            \"description\": \"Whether or not to override chunk content with generated summary\"\n                        },\n                        \"chunking_strategy\": {\n                            \"type\": \"string\",\n                            \"enum\": [\n                                \"azure-document-intelligence\",\n                                \"basic\"\n                            ],\n                            \"description\": \"Chunking strategy to use for the document chunking process\"\n                        },\n                        \"azure_document_intelligence_model\": {\n                            \"type\": \"string\",\n                            \"minLength\": 1,\n                            \"default\": \"prebuilt-read\",\n                            \"description\": \"Azure Document Intelligence model name to use for chunking strategy `azure-document-intelligence`\"\n                        }\n                    },\n                    \"required\": [\n                        \"chunk_size\",\n                        \"overlap_size\",\n                        \"chunking_strategy\"\n                    ]\n                },\n                \"embedding_model\": {\n                    
\"type\": \"array\",\n                    \"uniqueItems\": true,\n                    \"minItems\": 1,\n                    \"description\": \"Array containing the configuration for the embedding models to use\",\n                    \"items\": {\n                        \"properties\": {\n                            \"type\": {\n                                \"type\": \"string\",\n                                \"enum\": [\n                                    \"azure\",\n                                    \"sentence-transformer\"\n                                ],\n                                \"description\": \"Type of the embedding model. `azure` for Azure OpenAI models and `sentence-transformer` for HuggingFace sentence transformer models.\"\n                            },\n                            \"model_name\": {\n                                \"type\": \"string\",\n                                \"minLength\": 1,\n                                \"description\": \"Deployment name of the Azure OpenAI model\"\n                            },\n                            \"dimension\": {\n                                \"type\": \"integer\",\n                                \"description\": \"Dimension of the embedding model\"\n                            }\n                        },\n                        \"required\": [\n                            \"type\",\n                            \"model_name\"\n                        ]\n                    }\n                },\n                \"sampling\": {\n                    \"type\": \"object\",\n                    \"properties\": {\n                        \"sample_data\": {\n                            \"type\": \"boolean\",\n                            \"description\": \"Set to true to enable sampling\"\n                        },\n                        \"percentage\": {\n                            \"type\": \"number\",\n                            \"minimum\": 0,\n                     
       \"maximum\": 100,\n                            \"description\": \"Percentage of the document corpus to sample\"\n                        },\n                        \"optimum_k\": {\n                            \"description\": \"Set to 'auto' to automatically determine the optimum cluster number or set to a specific value e.g. 15\",\n                            \"oneOf\": [\n                                {\n                            \"type\": \"string\",\n                                    \"enum\": [\n                                        \"auto\"\n                                    ]\n                                },\n                                {\n                                    \"type\": \"integer\",\n                                    \"minimum\": 1\n                                }\n                            ]\n                        },\n                        \"min_cluster\": {\n                            \"type\": \"integer\",\n                            \"minimum\": 1,\n                            \"description\": \"Minimum number of clusters to use for sampling\"\n                        },\n                        \"max_cluster\": {\n                            \"type\": \"integer\",\n                            \"minimum\": 1,\n                            \"description\": \"Maximum number of clusters to use for sampling\"\n                        }\n                    },\n                    \"required\": [\n                        \"sample_data\",\n                        \"percentage\",\n                        \"optimum_k\",\n                        \"min_cluster\",\n                        \"max_cluster\"\n                    ]\n                }\n            },\n            \"required\": [\n                \"index_name_prefix\",\n                \"ef_construction\",\n                \"ef_search\",\n                \"chunking\",\n                \"embedding_model\"\n            ]\n        },\n        \"language\": 
{\n            \"type\": \"object\",\n            \"properties\": {\n                \"analyzer\": {\n                    \"type\": \"object\",\n                    \"properties\": {\n                        \"char_filters\": {\n                            \"type\": \"array\",\n                            \"items\": {\n                                \"type\": \"object\",\n                                \"properties\": {\n                                    \"name\": {\n                                        \"type\": \"string\",\n                                        \"minLength\": 1,\n                                        \"description\": \"Name of the char filter\"\n                                    },\n                                    \"mappings\": {\n                                        \"type\": \"array\",\n                                        \"items\": {\n                                            \"type\": \"string\"\n                                        }\n                                    }\n                                },\n                                \"required\": [\n                                    \"name\",\n                                    \"mappings\"\n                                ]\n                            }\n                        },\n                        \"tokenizers\": {\n                            \"type\": \"array\",\n                            \"items\": {\n                                \"type\": \"object\",\n                                \"properties\": {\n                                    \"name\": {\n                                        \"type\": \"string\",\n                                        \"minLength\": 1,\n                                        \"description\": \"Name of the tokenizer\"\n                                    },\n                                    \"token_chars\": {\n                                        \"type\": \"array\",\n                             
           \"items\": {\n                                            \"type\": \"string\"\n                                        }\n                                    }\n                                },\n                                \"required\": [\n                                    \"name\",\n                                    \"token_chars\"\n                                ]\n                            }\n                        },\n                        \"token_filters\": {\n                            \"type\": \"array\",\n                            \"items\": {\n                                \"type\": \"string\"\n                            }\n                        }\n                    },\n                    \"required\": [\n                        \"analyzer_name\",\n                        \"index_analyzer_name\",\n                        \"search_analyzer_name\",\n                        \"char_filters\",\n                        \"tokenizers\",\n                        \"token_filters\"\n                    ],\n                    \"oneOf\": [\n                        {\n                            \"properties\": {\n                                \"analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"minLength\": 1,\n                                    \"description\": \"Name of the analyzer to use for the field\"\n                                },\n                                \"index_analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"enum\": [\n                                        \"\"\n                                    ]\n                                },\n                                \"search_analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"enum\": [\n                                        \"\"\n                         
           ]\n                                }\n                            }\n                        },\n                        {\n                            \"properties\": {\n                                \"analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"description\": \"Name of the analyzer to use for the field\",\n                                    \"enum\": [\n                                        \"\"\n                                    ]\n                                },\n                                \"index_analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"minLength\": 1\n                                },\n                                \"search_analyzer_name\": {\n                                    \"type\": \"string\",\n                                    \"minLength\": 1\n                                }\n                            }\n                        }\n                    ]\n                },\n                \"query_language\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                        \"none\",\n                        \"en-us\",\n                        \"en-gb\",\n                        \"en-in\",\n                        \"en-ca\",\n                        \"en-au\",\n                        \"fr-fr\",\n                        \"fr-ca\",\n                        \"de-de\",\n                        \"es-es\",\n                        \"es-mx\",\n                        \"zh-cn\",\n                        \"zh-tw\",\n                        \"pt-br\",\n                        \"pt-pt\",\n                        \"it-it\",\n                        \"ja-jp\",\n                        \"ko-kr\",\n                        \"ru-ru\",\n                        \"cs-cz\",\n                        \"nl-be\",\n                        \"nl-nl\",\n         
               \"hu-hu\",\n                        \"pl-pl\",\n                        \"sv-se\",\n                        \"tr-tr\",\n                        \"hi-in\",\n                        \"ar-sa\",\n                        \"ar-eg\",\n                        \"ar-ma\",\n                        \"ar-kw\",\n                        \"ar-jo\",\n                        \"da-dk\",\n                        \"no-no\",\n                        \"bg-bg\",\n                        \"hr-hr\",\n                        \"hr-ba\",\n                        \"ms-my\",\n                        \"ms-bn\",\n                        \"sl-sl\",\n                        \"ta-in\",\n                        \"vi-vn\",\n                        \"el-gr\",\n                        \"ro-ro\",\n                        \"is-is\",\n                        \"id-id\",\n                        \"th-th\",\n                        \"lt-lt\",\n                        \"uk-ua\",\n                        \"lv-lv\",\n                        \"et-ee\",\n                        \"ca-es\",\n                        \"fi-fi\",\n                        \"sr-ba\",\n                        \"sr-me\",\n                        \"sr-rs\",\n                        \"sk-sk\",\n                        \"nb-no\",\n                        \"hy-am\",\n                        \"bn-in\",\n                        \"eu-es\",\n                        \"gl-es\",\n                        \"gu-in\",\n                        \"he-il\",\n                        \"ga-ie\",\n                        \"kn-in\",\n                        \"ml-in\",\n                        \"mr-in\",\n                        \"fa-ae\",\n                        \"pa-in\",\n                        \"te-in\",\n                        \"ur-pk\"\n                    ],\n                    \"description\": \"The language of the query\"\n                }\n            },\n            \"required\": [\n                \"analyzer\",\n                
\"query_language\"\n            ]\n        },\n        \"rerank\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"enabled\": {\n                    \"type\": \"boolean\",\n                    \"description\": \"If search results should be re-ranked\"\n                },\n                \"type\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                        \"llm\",\n                        \"cross_encoder\"\n                    ],\n                    \"description\": \"Type of re-ranking\"\n                },\n                \"llm_rerank_threshold\": {\n                    \"type\": \"integer\",\n                    \"description\": \"Threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10.\"\n                },\n                \"cross_encoder_at_k\": {\n                    \"type\": \"integer\",\n                    \"description\": \"Threshold when using cross-encoding re-ranking. 
Chunks with given rank value are selected\"\n                },\n                \"cross_encoder_model\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                        \"cross-encoder/stsb-roberta-base\"\n                    ],\n                    \"description\": \"Model used for cross-encoding re-ranking step\"\n                }\n            }\n        },\n        \"search\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"retrieve_num_of_documents\": {\n                    \"type\": \"integer\",\n                    \"description\": \"Number of chunks to retrieve from the search index\"\n                },\n                \"search_type\": {\n                    \"type\": \"array\",\n                    \"items\": {\n                        \"type\": \"string\",\n                        \"enum\": [\n                            \"search_for_match_semantic\",\n                            \"search_for_match_Hybrid_multi\",\n                            \"search_for_match_Hybrid_cross\",\n                            \"search_for_match_text\",\n                            \"search_for_match_pure_vector\",\n                            \"search_for_match_pure_vector_multi\",\n                            \"search_for_match_pure_vector_cross\",\n                            \"search_for_manual_hybrid\"\n                        ]\n                    },\n                    \"description\": \"Search types used for experimentation\"\n                },\n                \"search_relevancy_threshold\": {\n                    \"type\": \"number\",\n                    \"minimum\": 0,\n                    \"maximum\": 1,\n                    \"default\": 0.8,\n                    \"description\": \"Similarity threshold to determine if a document is relevant\"\n                }\n            },\n            \"required\": [\n                \"retrieve_num_of_documents\",\n                
\"search_type\"\n            ]\n        },\n        \"query_expansion\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"hyde\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                        \"disabled\",\n                        \"generated_hypothetical_answer\",\n                        \"generated_hypothetical_document_to_answer\"\n                    ],\n                    \"default\": \"disabled\",\n                    \"description\": \"Whether or not to generate hypothetical answer or document which holds an answer for the query using LLM\"\n                },\n                \"query_expansion\": {\n                    \"type\": \"boolean\",\n                    \"default\": false,\n                    \"description\": \"Whether or not to expand the query using related questions\"\n                },\n                \"min_query_expansion_related_question_similarity_score\": {\n                    \"type\": \"integer\",\n                    \"default\": 90,\n                    \"description\": \"Minimum similarity score for query expansion generated related questions\"\n                },\n                \"expand_to_multiple_questions\": {\n                    \"type\": \"boolean\",\n                    \"default\": false,\n                    \"description\": \"\"\n                }\n            },\n            \"required\": [\n                \"hyde\",\n                \"query_expansion\",\n                \"min_query_expansion_related_question_similarity_score\",\n                \"expand_to_multiple_questions\"\n            ]\n        },\n        \"openai\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"azure_oai_chat_deployment_name\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1,\n                    \"description\": \"Azure OpenAI deployment name\"\n                },\n                
\"azure_oai_eval_deployment_name\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1,\n                    \"description\": \"Azure OpenAI evaluation deployment name\"\n                },\n                \"temperature\": {\n                    \"type\": \"number\",\n                    \"minimum\": 0,\n                    \"maximum\": 1,\n                    \"description\": \"Temperature for OpenAI API\"\n                }\n            },\n            \"required\": [\n                \"azure_oai_chat_deployment_name\",\n                \"azure_oai_eval_deployment_name\",\n                \"temperature\"\n            ]\n        },\n        \"eval\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"metric_types\": {\n                    \"type\": \"array\",\n                    \"items\": {\n                        \"type\": \"string\",\n                        \"enum\": [\n                            \"lcsstr\",\n                            \"lcsseq\",\n                            \"jaro_winkler\",\n                            \"hamming\",\n                            \"jaccard\",\n                            \"levenshtein\",\n                            \"fuzzy_score\",\n                            \"cosine_ochiai\",\n                            \"rouge1_precision\",\n                            \"rouge1_recall\",\n                            \"rouge1_fmeasure\",\n                            \"rouge2_precision\",\n                            \"rouge2_recall\",\n                            \"rouge2_fmeasure\",\n                            \"rougeL_precision\",\n                            \"rougeL_recall\",\n                            \"rougeL_fmeasure\",\n                            \"bert_all_MiniLM_L6_v2\",\n                            \"bert_base_nli_mean_tokens\",\n                            \"bert_large_nli_mean_tokens\",\n                            
\"bert_large_nli_stsb_mean_tokens\",\n                            \"bert_distilbert_base_nli_stsb_mean_tokens\",\n                            \"bert_paraphrase_multilingual_MiniLM_L12_v2\",\n                            \"llm_answer_relevance\",\n                            \"llm_context_precision\",\n                            \"llm_context_recall\"\n                        ]\n                    },\n                    \"description\": \"Metrics used for evaluation\"\n                }\n            }\n        }\n    },\n    \"required\": [\n        \"experiment_name\",\n        \"job_name\",\n        \"job_description\",\n        \"data_formats\",\n        \"main_instruction\",\n        \"use_checkpoints\",\n        \"path\",\n        \"index\",\n        \"language\",\n        \"rerank\",\n        \"search\",\n        \"openai\",\n        \"eval\"\n    ]\n}"
  },
  {
    "path": "cspell.json",
    "content": "{\n    \"version\": \"0.2\",\n    \"ignorePaths\": [],\n    \"dictionaryDefinitions\": [],\n    \"dictionaries\": [],\n    \"words\": [\n        \"AOAI\",\n        \"azuretools\",\n        \"dataframe\",\n        \"groundtruth\",\n        \"jaccard\",\n        \"keyvault\",\n        \"spacy\",\n        \"textdistance\"\n    ],\n    \"ignoreWords\": [\n        \"BLAS\",\n        \"CNTK\",\n        \"CUDA\",\n        \"Caffe\",\n        \"Chainer\",\n        \"Cython\",\n        \"Hogwild\",\n        \"Jetson\",\n        \"LAPACK\",\n        \"NCCL\",\n        \"NOTSET\",\n        \"Numba\",\n        \"OPENAI\",\n        \"ROCM\",\n        \"Theano\",\n        \"aarch\",\n        \"autograd\",\n        \"azureml\",\n        \"coveragerc\",\n        \"devcontainer\",\n        \"distilbert\",\n        \"distro\",\n        \"htmlcov\",\n        \"keyvault\",\n        \"libuv\",\n        \"mlflow\",\n        \"mpnet\",\n        \"ndarray\",\n        \"promptflow\",\n        \"ptrblck\",\n        \"pytest\",\n        \"rerank\",\n        \"scikit\"\n    ],\n    \"import\": []\n}\n"
  },
  {
    "path": "dev-requirements.txt",
    "content": "promptflow==1.15.0\npromptflow-tools==1.4.0\npytest==8.3.3\npytest-cov==5.0.0\nflake8==7.1.1\npre-commit==3.8.0\nblack==24.8.0\n"
  },
  {
    "path": "docs/azureml-pipeline.md",
    "content": "# AzureML Pipeline\r\n\r\n## What is it\r\n\r\nRAG Experiment Accelerator supports running the pipeline on Azure ML compute, in addition to running it on a local machine. This document describes the steps to deploy the necessary resources on Azure, and to run the pipeline on Azure ML compute.\r\n\r\n## Architecture diagram\r\n\r\n![Architecture diagram](../images/AzureMLPipeline.drawio.png)\r\n\r\n![AzureML pipeline overview](../images/azureml_pipeline_overview.png)\r\n\r\n## Deploying the cloud resources \r\n\r\nTo run the steps of RAG experiment accelerator on Azure ML compute, you will need to perform the steps from the [installation](../README.md#installation) section of the README file. This will take you through the setup to deploy all necessary Azure configuration for you.\r\n\r\nAfter this, follow these steps to deploy additional resources to be able to run the Azure ML pipeline:\r\n\r\n1. Deploy an AML Compute cluster. In your Azure ML workspace, navigate to Compute -> Compute clusters, and click New. Pick the virtual machine required, give it a name, and set minimum and maximum number of nodes. It is recommended to set the minimum number of nodes to 0, so that the cluster can scale down to 0. In Advanced settings, assign a system-assigned managed identity.\r\n\r\n![Screenshot detailing the steps to create AML Compute cluster](../images/create_compute_cluster.png)\r\n\r\n2. When you created an Azure ML workspace, Azure will have created an associated keyvault for you. In the keyvault, create two Access Policies: one for yourself (or for the principal who is going to run the pipeline), and another one for the system-assigned managed identity created in the previous step. Both of these principals need Get, Set and List permissions on secrets.\r\n\r\n![Screenshot detailing the steps to create access policies](../images/create_access_policies.png)\r\n\r\n3. 
Add the following environment variables to the .env file:\r\n\r\n```bash\r\nAML_COMPUTE_NAME=<name of the compute you have created>\r\nAML_COMPUTE_INSTANCES_NUMBER=<maximum number of instances in the cluster>\r\nAZURE_KEY_VAULT_ENDPOINT=<keyvault endpoint, e.g. https://<keyvault-name>.vault.azure.net>\r\n```\r\n\r\n4. Populate the keyvault with the environment configuration. To do so, run the script:\r\n\r\n```bash\r\npython env_to_keyvault.py\r\n```\r\n\r\nThis script assumes you already have a `.env` file populated with values, and will use this file to create a secret in the keyvault for each environment variable in the `.env` file.\r\n\r\n## Configuring input for the pipeline\r\n\r\n### Generating the QA data for the pipeline\r\n\r\nThe QA data is not generated as part of the AzureML pipeline, so it must be generated locally prior to\r\nrunning the pipeline in AzureML. (Upon completion of generation this data is uploaded to AzureML so it\r\nwill be accessible to the pipeline when it runs in AzureML.)\r\n\r\nIt is important to re-generate QA data whenever the input data is changed. Otherwise, the previous QA data\r\nwill be used to evaluate the revised data, which is usually not what is expected.\r\n\r\n### Providing data input\r\n\r\nData input for the pipeline can be provided on the command line:\r\n\r\n```bash\r\npython azureml/pipeline.py --data_dir <path to data>\r\n```\r\n\r\nOr it can be provided in the config file:\r\n\r\n```json\r\n{\r\n  \"data_dir\": \"data input\"\r\n}\r\n```\r\n\r\n### Types of data input\r\n\r\nThe pipeline accepts input in formats understood by AzureML. For more information, see [docs on data inputs and outputs](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-inputs-outputs-pipeline?view=azureml-api-2&tabs=cli#path-and-mode-for-data-inputsoutputs).\r\n\r\nThis means that local file paths as input for the pipeline are supported. 
Additionally, you can create, for example, an AzureML Data Asset and provide a path to it as input for the pipeline:\r\n\r\n```bash\r\npython azureml/pipeline.py --data_dir azureml://<datastore-name>/<path-to-data>\r\n```\r\n\r\n## Running the pipeline\r\n\r\nTo run the pipeline, run the following command:\r\n\r\n```bash\r\npython azureml/pipeline.py\r\n```\r\n\r\nThis will submit the pipeline to Azure ML compute, and you will be able to monitor the run in the Azure ML workspace.\r\n\r\nOn AzureML, head to your workspace and select Jobs on the left. Find the name for your experiment as defined by the name in the config. You will see all runs for your experiment. Click on the latest run to see the details of the run.\r\n\r\n![Screenshot detailing the steps to view the run](../images/view_list_of_runs.png)\r\n\r\nYou also can visually compare the metrics for each run by using Dashboard tab. \r\n\r\n## Troubleshooting the pipeline\r\n\r\nIf the pipeline fails, you can check the logs of the run by clicking on the failed run, and then clicking on the logs tab. This will show you the logs for each step of the pipeline under `Outputs + logs` tab.\r\n\r\n![Screenshot detailing the steps to view logs](../images/view_logs.png)\r\n\r\nIf the first step (the indexing step) of the pipeline fails, you may not be able to see the logs under that tab. You may need to look at the error logs for each individual compute instance or even each individual node.\r\n\r\n![Screenshot detailing the steps to view logs](../images/view_logs_parallel_step.png)"
  },
  {
    "path": "docs/configs-appendix.md",
    "content": "# Understanding the config files\n\n## Prerequisites\nFamiliarity with [ReadMe configuration of elements](/README.md#Description-of-configuration-elements)\n\n## Configuration links for more reading. \n- Search Types\n    - [Semantic Search][semantic search]\n    - [Vector Search][vector search]\n    - [Hybrid Search][hybrid search]\n- Chunking Strategies\n    - [Size][Chunk Size]\n    - [Overlap][Overlap]\n- [Embedding][Embeddings]\n    - Models: The accelerator uses [Sentence Transformer][Sentence Transformer] to generate the embeddings which utilizes [Pre-Trained Models][Transformer Models] based on embedding dimensions.\n    - Dimensions: Each valid value maps to different models for embedding.\n        - 384: [all-MiniLM-L6-v2][all-MiniLM-L6-v2]\n        - 768: [all-mpnet-base-v2][all-mpnet-base-v2]\n        - 1024:[bert-large-nli-mean-tokens][bert-large-nli-mean-tokens]\n- LLM Metrics calculated using scikit-learn in combination with `Math.mean`\n    - [Precision][precision score]\n    - [Recall][recall score]\n- [Prompt Engineering][prompts]\n\n\n<!--- Link references --->\n[Chunk Size]: https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-chunk-documents#common-chunking-techniques\n[Overlap]: https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-chunk-documents#content-overlap-considerations\n[Embeddings]: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/understand-embeddings\n[Sentence Transformer]: https://www.sbert.net/\n[Transformer Models]: https://www.sbert.net/docs/pretrained_models.html\n[all-MiniLM-L6-v2]: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2\n[all-mpnet-base-v2]: https://huggingface.co/sentence-transformers/all-mpnet-base-v2\n[bert-large-nli-mean-tokens]: https://huggingface.co/sentence-transformers/bert-large-nli-mean-tokens\n[prompts]: 
https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/advanced-prompt-engineering?pivots=programming-language-chat-completions\n[recall score]: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html\n[precision score]: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score\n[vector search]: https://learn.microsoft.com/en-us/azure/search/vector-search-overview\n[hybrid search]: https://learn.microsoft.com/en-us/azure/search/hybrid-search-overview\n[semantic search]: https://learn.microsoft.com/en-us/azure/search/semantic-search-overview"
  },
  {
    "path": "docs/environment-variables.md",
    "content": "# Resource Deployment and Environment Variables\n\n## Required Resources\n\nTo use the rag-experiment-accelerator, create the following resources:\n- [Azure AI Search Service](https://azure.microsoft.com/en-us/products/ai-services/ai-search/)\n    - Turning on [Semantic Ranking](https://learn.microsoft.com/en-us/azure/search/semantic-search-overview) is optional\n- [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview)\n    - Create new deployments using models `gpt-35-turbo` and `text-embedding-ada-002`\n- [Azure ML Workspace](https://learn.microsoft.com/en-us/azure/machine-learning/concept-workspace?view=azureml-api-2)\n- [Azure AI Service for Language](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-language-detection) is optional\n\n\n\n## Environment Variables\n\nBelow are the required environment variables, to be added to a local .env file at the root of the repo.\n| Variable                      | Value                                      | Note                                                                     |\n|-------------------------------|--------------------------------------------|--------------------------------------------------------------------------|\n| AZURE_SEARCH_SERVICE_ENDPOINT |                                            | Azure AI Search API Endpoint                                      |\n| AZURE_SEARCH_ADMIN_KEY        |                                            | Azure AI Search Key                                               |\n| AZURE_SEARCH_USE_SEMANTIC_SEARCH         | True / False                                             | Whether to use semantic search (not all regions support this feature)                                               |\n| OPENAI_API_KEY                |                                            | OpenAI API key                                                           |\n| OPENAI_API_TYPE               | azure, open_ai                 
            | Must be `azure` for Azure OpenAI or `open_ai` for OpenAI.                |\n| OPENAI_ENDPOINT               |                                            | Azure OpenAI API endpoint.                                               |\n| OPENAI_API_VERSION            | 2023-03-15-preview                         | Azure OpenAI API version. See https://learn.microsoft.com/en-us/azure/ai-services/openai/reference. |\n| AML_SUBSCRIPTION_ID           |                                            | Azure Machine Learning subscription ID                                   |\n| AML_WORKSPACE_NAME            |                                            | Name of deployed Azure Machine Learning Workspace                        |\n| AML_RESOURCE_GROUP_NAME       |                                            | Azure Machine Learning resource group name                               |\n| LOGGING_LEVEL                 | NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL | LOGGING_LEVEL is INFO by default                                         |\n\nBelow are optional environment variables, to be added to a local .env file at the root of the repo.\n| Variable                        | Value                                      | Note                                                                     |\n|---------------------------------|--------------------------------------------|--------------------------------------------------------------------------|\n| AZURE_LANGUAGE_SERVICE_ENDPOINT |                                            | Azure AI Service for Language API Endpoint                               |\n| AZURE_LANGUAGE_SERVICE_KEY      |                                            | Azure AI Service for Language Key                                        |"
  },
  {
    "path": "docs/evaluation-metrics.md",
    "content": "# Evaluation Metrics\n\nThe following is an overview of the available evaluation metrics that can be used to evaluate end-to-end performance of\na RAG application by measuring a distance between the ground truth answer and the predicted answer.\n\nThese metrics are calculated as part of  the `04_evaluation.py` script based on the `actual`, `expected` and `context` fields of\nthe `.jsonl` output file (referred to as \"calculation base\"), generated by `03_querying.py` script. See the [script inputs and outputs\nguide](/docs/script-inputs-outputs.md#03_queryingpy) for more information.\n\nYou can choose which metrics should be calculated in your experiment by updating the `metric_types` field in the\n`search_config.json` configuration file.\n\n## Configuration Example\n\n```json\n\"metric_types\": [\n    \"lcsstr\",\n    \"lcsseq\",\n    \"cosine\",\n    \"jaro_winkler\",\n    \"hamming\",\n    \"jaccard\",\n    \"levenshtein\",\n    \"fuzzy_score\",\n    \"rouge1_precision\",\n    \"rouge1_recall\",\n    \"rouge1_fmeasure\",\n    \"rouge2_precision\",\n    \"rouge2_recall\",\n    \"rouge2_fmeasure\",\n    \"rougeL_precision\",\n    \"rougeL_recall\",\n    \"rougeL_fmeasure\",\n    \"bert_all_MiniLM_L6_v2\",\n    \"bert_base_nli_mean_tokens\",\n    \"bert_large_nli_mean_tokens\",\n    \"bert_large_nli_stsb_mean_tokens\",\n    \"bert_distilbert_base_nli_stsb_mean_tokens\",\n    \"bert_paraphrase_multilingual_MiniLM_L12_v2\",\n    \"llm_answer_relevance\",\n    \"llm_context_precision\",\n    \"llm_context_recall\"\n]\n```\n\n## Algorithm-based Metrics\n\nThe following metrics are calculated by using different string similarity algorithms mostly backed by the [TextDistance\nPython package](https://pypi.org/project/textdistance/).\n\n### Longest common substring\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `lcsstr`          | `actual`, `expected` | Percentage 
(0-100) |\n\nCalculates the longest common substring (LCS) similarity score between two strings.\n\n### Longest common subsequence\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `lcsseq`          | `actual`, `expected` | Percentage (0-100) |\n\nComputes the longest common subsequence (LCS) similarity score between two input strings.\n\n### Cosine similarity (Ochiai coefficient)\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `cosine`          | `actual`, `expected` | Percentage (0-100) |\n\nThis coefficient is calculated as the intersection of the term-frequency vectors of the generated answer (actual) and the ground-truth answer (expected) divided by the geometric mean of the sizes of these vectors.\n\n### Jaro-Winkler distance\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `jaro_winkler`    | `actual`, `expected` | Percentage (0-100) |\n\nThe Jaro-Winkler similarity score is a measure of similarity between two strings. The Jaro-Winkler similarity score is\ncalculated as the number of characters that are different between the two strings divided by the number of characters\nthat are the same between the two strings.\n\n### Hamming distance\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `hamming`         | `actual`, `expected` | Percentage (0-100) |\n\nThe Hamming distance is a measure of similarity between two strings. 
The Hamming distance is calculated as the number of\ncharacters that are different between the two strings.\n\n### Jaccard similarity\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `jaccard`         | `actual`, `expected` | Percentage (0-100) |\n\nThe Jaccard similarity is calculated as the number of elements in the intersection of the two sets divided by the number\nof elements in the union of the two sets.\n\n### Levenshtein distance\n\n| Configuration Key | Calculation Base     | Possible Values    |\n| ----------------- | -------------------- | ------------------ |\n| `levenshtein`     | `actual`, `expected` | Percentage (0-100) |\n\nThe Levenshtein distance is a measure of similarity between two strings. The Levenshtein distance is calculated as the\nminimum number of insertions, deletions, or substitutions required to transform one string into the other.\n\n### RapidFuzz similarity\n\n| Configuration Key | Calculation Base     | Possible Values       |\n| ----------------- | -------------------- | --------------------- |\n| `fuzzy_score`           | `actual`, `expected` | Percentage (0 - 100) |\n\nThis metric is backed by the [RapidFuzz Python package](https://github.com/rapidfuzz/RapidFuzz).\nCalculates the fuzzy score between two documents using the Levenshtein distance.\n\n### Rouge retrieval metrics (Token based)\n\n**Rouge**, short for Recall-Oriented Understudy for Gisting Evaluation, is typically used in summarization evaluation tasks, comparing human produced references and system generated summaries. The core idea is to compare and validate sufficient overlap of common words or phrases in both reference and prediction. String metrics look at character level differences, whereas Rouge can help us compare token level matches. We use the [`rouge-score`](https://pypi.org/project/rouge-score/) package to compute these measures. 
Here are some of the metrics we capture.\n\n| Configuration Key                            | Calculation Base             | Possible Values       |\n| -------------------------------------------- | ---------------------------- | --------------------- |\n| `rouge{1 \\| 2 \\| L}_{precision \\| recall \\| fmeasure}` | `ground_truth`, `prediction` | Percentage (0 - 100)  |\n\n\n- **rouge1_precision**: The ROUGE-1 precision score is the number of overlapping unigrams between the predicted and ground_truth strings divided by the number of unigrams in the predicted string.\n- **rouge1_recall**: The ROUGE-1 recall score is the number of overlapping unigrams between the predicted and ground_truth strings divided by the number of unigrams in the ground_truth string.\n- **rouge1_fmeasure**: This is the harmonic mean of the ROUGE-1 precision and recall scores.\n- **rouge2_precision**: The ROUGE-2 precision score is the number of overlapping bigrams between the predicted and ground_truth strings divided by the number of bigrams in the predicted string.\n- **rouge2_recall**: The ROUGE-2 recall score is the number of overlapping bigrams between the predicted and ground_truth strings divided by the number of bigrams in the ground_truth string.\n- **rouge2_fmeasure**: This is the harmonic mean of the ROUGE-2 precision and recall scores.\n- **rougeL_precision**: The ROUGE-L precision score is the length of overlapping longest common subsequence between the predicted and ground_truth strings divided by the number of unigrams in the predicted string.\n- **rougeL_recall**: The ROUGE-L recall score is the length of overlapping longest common subsequence between the predicted and ground_truth strings divided by the number of unigrams in the ground truth string.\n- **rougeL_fmeasure**: This is the harmonic mean of the ROUGE-L precision and recall scores.\n\n## BERT-based semantic similarity\n\nThe following set of metrics calculates semantic similarity between two strings as percentage of 
differences based on\nembeddings created by different BERT models. Backed by the [sentence-transformers Python\npackage](https://pypi.org/project/sentence-transformers/).\n\n| Calculation Base     | Possible Values    |\n| -------------------- | ------------------ |\n| `actual`, `expected` | Percentage (0-100) |\n\n| Configuration Key                          | BERT Model                                   |\n| ------------------------------------------ | -------------------------------------------- |\n| bert_all_MiniLM_L6_v2                      | MiniLM L6 v2 model                           |\n| bert_base_nli_mean_tokens                  | Base model, mean tokens                      |\n| bert_large_nli_mean_tokens                 | Large model, mean tokens                     |\n| bert_large_nli_stsb_mean_tokens            | Large model, STS-B, mean tokens              |\n| bert_distilbert_base_nli_stsb_mean_tokens  | DistilBERT base model, STS-B, mean tokens    |\n| bert_paraphrase_multilingual_MiniLM_L12_v2 | Multilingual paraphrase model, MiniLM L12 v2 |\n\n## LLM-based Metrics\n\nThe following metrics are calculated based on LLM reasoning. These metrics require the OpenAI endpoint to be configured\n(see [Environment Variables](./environment-variables.md)).\n\nThese metrics also require the `chat_model_name` property to be set in the `search_config.json` configuration file. See\n[Description of configuration elements](../README.md#description-of-configuration-elements) for details.\n\n### LLM Answer relevance\n\n| Configuration Key  | Calculation Base     | Possible Values                   |\n| ------------------ | -------------------- | --------------------------------- |\n| `llm_answer_relevance` | `actual`, `expected` | From 0 to 1 with 1 being the best |\n\nScores the relevancy of the answer according to the given question. 
Answers with incomplete, redundant or unnecessary\ninformation are penalized.\n\n### LLM Context precision\n\n| Configuration Key   | Calculation Base    | Possible Values                                                   |\n| ------------------- | ------------------- | ----------------------------------------------------------------- |\n| `llm_context_precision` | `question`, `retrieved_contexts` | Percentage (0-100) |\n\nProportion of retrieved contexts relevant to the question. Evaluates whether or not the context generated by the RAG solution is useful for answering a question.\n\n### LLM Context recall\n\n| Configuration Key   | Calculation Base    | Possible Values                                                   |\n| ------------------- | ------------------- | ----------------------------------------------------------------- |\n| `llm_context_recall` | `question`, `expected`, `retrieved_contexts` | Percentage (0-100) |\n\nEstimates context recall by estimating true positives (TP) and false negatives (FN) using the annotated answer (ground truth) and retrieved contexts. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.\n"
  },
  {
    "path": "docs/script-inputs-outputs.md",
    "content": "# Script Overview\n\nThis document provides an overview of the scripts that are used to run the RAG Experiment Accelerator. The scripts are numbered in the order of execution, and each script has its own inputs and outputs.\n\n## Prerequisites\nBefore running the scripts, you need to:\n\n- Install the required packages and dependencies by following the instructions in the installation guide.\n- Set up the required resources, such as Azure AI Search, Azure ML Studio, and Azure OpenAI by following the steps in the [required resources guide](/docs/environment-variables.md#required-resources).\n- Create a `.env` file that contains the environment variables for the resources, such as subscription ID, resource group name, and service credentials. For more information on the `.env` file, see the [environment variables guide](/docs/environment-variables.md#environment-variables).\n\n## 01_Index.py\nThis script creates and populates a search index in your Azure AI Search resource.\n\nInputs:\n\n- A `/data` folder that contains the documents that you want to index. The `/data` folder (and its subfolders) can have documents in the following formats: PDF, HTML, Markdown, Text, JSON and Word (DOCX). For more information on how the documents are loaded, see the [documentLoader.py](/rag_experiment_accelerator/doc_loader/documentLoader.py) file.\n\nOutputs:\n\n- A `artifacts/generated_index_names.jsonl` file that contains the names of the generated search indexes.\n- A populated search index in your Azure AI Search resource that contains the indexed documents and their metadata.\n\n## 02_qa_generation.py\nThis script generates question-answer pairs from the indexed documents using the RAG model.\n\nInputs:\n\n- The same `/data` folder as the previous step.\n\nOutputs:\n\n- A `artifacts/eval_data.jsonl` file that contains a list of JSON objects. Each JSON object has three fields: `user_prompt`, `output_prompt`, and `context`. 
The `user_prompt` field contains the generated question, the `output_prompt` field contains the generated answer, and the `context` field contains the document sections from which the question-answer pair was generated.\n\n## 03_querying.py\nThis script queries the search index using the generated question-answer pairs and evaluates the search results.\n\nInputs:\n\n- The `artifacts/eval_data.jsonl` file from the previous step.\n- You can choose to provide your own `.jsonl` file (with same format as the one generated in the previous step) as input and update `config.json` with the full path of your file in the `eval_data_file` field.\n- `prompts_config.json` (optional)\n  - You can provide a custom prompt to be used as the main prompt for the questions generated to search the data. The custom prompt can be provided as a string as follows:\n\n```json\n{\"main_prompt_instruction\": \"<custom_main_prompt_instruction>\"}\n```\n\n\n\nOutputs:\n\n- A `artifacts/outputs/<CONFIG_VALUES>.jsonl` file that contains a list of JSON objects. Each JSON object has eleven fields: `actual`, `expected`, and nine fields relating to configuration information. The name of the output file is based on the values in the `config.json` file.\n- The same output file is also uploaded to Azure Machine Learning Studio under `Assets -> Data`.\n\n## 04_evaluation.py\nThis script calculates and displays the overall evaluation scores for the search index using the output files from the previous step.\n\nInputs:\n\n- The `artifacts/outputs/<CONFIG_VALUES>.jsonl` file that was uploaded to Azure Machine Learning Studio.\n\nOutputs:\n\n- Several `artifacts/eval_score/*.csv` files that contain the calculated scores for different evaluation metrics, such as bert score, fuzzy score, and mean average precision. 
The name of the CSV file indicates the evaluation metric and the configuration values.\n- The same CSV files are also found in Azure Machine Learning Studio at `Jobs -> <CONFIG_NAME> -> Outputs + Logs`.\n"
  },
  {
    "path": "docs/wsl.md",
    "content": "# Setting up WSL\n\nThere are numerous guides to setting up WSL, and this is not a comprehensive guide. Instead this might help you setup the basics.\n\n#### If you are using Docker Desktop \n\nTo enable **Developing inside a Container** you must configure the integration between Docker Desktop and WSL on your machine.\n\n>1. Launch Docker Desktop\n>2. Open **Settings > General**. Make sure the *Use the WSL 2 based engine\" is enabled.\n>3. Navigate to **Settings > Resources > WSL INTEGRATION**.\n>      - Ensure *Enable Integration with my default WSL distro\" is enabled.\n>      - Enable the Ubuntu-18.04 option.\n>4. Select **Apply & Restart**\n\n## Configure Git in Ubuntu WSL environment\n\nThe next step is to configure Git for your Ubuntu WSL environment. We will use the bash prompt from the previous step to issue the following commands:\n\nSet Git User Name and Email\n\n``` bash\ngit config --global user.name \"Your Name\"\ngit config --global user.email \"youremail@yourdomain.com\"\n```\n\nSet Git [UseHttps](https://github.com/microsoft/Git-Credential-Manager-Core/blob/main/docs/configuration.md#credentialusehttppath)\n\n``` bash\ngit config --global credential.useHttpPath true\n```\n\nConfigure Git to use the Windows Host Credential Manager\n\n``` bash\ngit config --global credential.helper \"/mnt/c/Program\\ Files/Git/mingw64/libexec/git-core/git-credential-manager-core.exe\"\n```\n\n## Install Azure CLI On WSL\n\nIn your Ubuntu 18.04(WSL) terminal from the previous step, follow the directions [here](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli-linux) to install Azure CLI.\n\n\nInstall Azure CLI and authorize:\n```bash\naz login\naz account set  --subscription=\"<your_subscription_guid>\"\naz account show\n```\n"
  },
  {
    "path": "env_to_keyvault.py",
    "content": "\"\"\"\nThis script is used to create secrets in Azure Keyvault from the environment variables.\n\nFor the list of environment parameters that will be created as secrets, please refer to the Environment class in rag_experiment_accelerator/config/environment.py.\n\"\"\"\n\nimport argparse\n\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n\n    environment = Environment.from_env_or_keyvault()\n    logger.info(\"Creating secrets in Keyvault from the environment\")\n    logger.info(\"The following secrets will be created:\")\n    for secret in environment.fields():\n        logger.info(f\"  - {secret[0]}\")\n\n    environment.to_keyvault()\n    logger.info(\n        f\"Secrets in Keyvault {environment.azure_key_vault_endpoint} have been created successfully.\"\n    )\n"
  },
  {
    "path": "experimental/readme.md",
    "content": "\nRAG EXPERIMENT ACCELERATOR EXPERIMENTAL\n\nThis is the experimental version of the RAG Experiment Accelerator. It is a work in progress and is not yet ready for production use."
  },
  {
    "path": "images/AzureMLPipeline.drawio",
    "content": "<mxfile host=\"app.diagrams.net\" modified=\"2024-03-15T10:47:30.734Z\" agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36\" etag=\"f9RPtMtE0Jcu6UtH4gO5\" version=\"24.0.7\" type=\"device\">\n  <diagram name=\"Page-1\" id=\"c9OLLCgnKFHFIHMQfsV-\">\n    <mxGraphModel dx=\"1528\" dy=\"2131\" grid=\"1\" gridSize=\"10\" guides=\"1\" tooltips=\"1\" connect=\"1\" arrows=\"1\" fold=\"1\" page=\"1\" pageScale=\"1\" pageWidth=\"827\" pageHeight=\"1169\" math=\"0\" shadow=\"0\">\n      <root>\n        <mxCell id=\"0\" />\n        <mxCell id=\"1\" parent=\"0\" />\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-36\" value=\"\" style=\"shape=cylinder3;whiteSpace=wrap;html=1;boundedLbl=1;backgroundOutline=1;size=15;rotation=-90;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"579.5\" y=\"50.5\" width=\"240\" height=\"443\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-35\" value=\"\" style=\"shape=cylinder3;whiteSpace=wrap;html=1;boundedLbl=1;backgroundOutline=1;size=15;rotation=-90;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"550\" y=\"25\" width=\"240\" height=\"440\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-27\" value=\"\" style=\"shape=cylinder3;whiteSpace=wrap;html=1;boundedLbl=1;backgroundOutline=1;size=15;rotation=-90;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"522.25\" y=\"-1.25\" width=\"240\" height=\"437.5\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-2\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.128;exitY=0.982;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;dashed=1;dashPattern=8 8;\" parent=\"1\" source=\"221HtTC9qgcgybCVfVqK-4\" target=\"gFHnbSyB1-i4nugdEwJ--8\" edge=\"1\">\n          <mxGeometry relative=\"1\" as=\"geometry\">\n            <Array 
as=\"points\">\n              <mxPoint x=\"130\" y=\"130\" />\n              <mxPoint x=\"130\" y=\"200\" />\n            </Array>\n          </mxGeometry>\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--1\" value=\"Developer\" style=\"shape=umlActor;verticalLabelPosition=bottom;verticalAlign=top;html=1;outlineConnect=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"50\" y=\"110\" width=\"30\" height=\"60\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-3\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.75;entryDx=0;entryDy=0;dashed=1;dashPattern=8 8;\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--4\" target=\"gFHnbSyB1-i4nugdEwJ--8\" edge=\"1\">\n          <mxGeometry relative=\"1\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--4\" value=\"\" style=\"shape=cylinder3;whiteSpace=wrap;html=1;boundedLbl=1;backgroundOutline=1;size=11;rotation=90;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"35\" y=\"260\" width=\"60\" height=\"80\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--6\" value=\"automation\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"30\" y=\"285\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-12\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;dashed=1;dashPattern=8 8;\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--8\" edge=\"1\">\n          <mxGeometry relative=\"1\" as=\"geometry\">\n            <mxPoint x=\"420\" y=\"200\" as=\"targetPoint\" />\n          </mxGeometry>\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--8\" value=\"&lt;pre 
style=&quot;font-size: 8px;&quot;&gt;for config in configs:&lt;br&gt;  for index_config in config.index.flatten():&lt;br&gt;    start_pipeline()&lt;/pre&gt;\" style=\"rounded=1;whiteSpace=wrap;html=1;align=left;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"160\" y=\"150\" width=\"210\" height=\"100\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"ka2Pck3t7iw5_ma6AWct-2\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;\" edge=\"1\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--9\" target=\"gFHnbSyB1-i4nugdEwJ--8\">\n          <mxGeometry relative=\"1\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--9\" value=\"\" style=\"shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=16;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"240\" y=\"300\" width=\"50\" height=\"60\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--14\" value=\"Experiment configs\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"235\" y=\"390\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"gFHnbSyB1-i4nugdEwJ--17\" value=\"Dataset\" style=\"shape=cylinder3;whiteSpace=wrap;html=1;boundedLbl=1;backgroundOutline=1;size=15;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"420\" y=\"-29.5\" width=\"140\" height=\"80\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-4\" value=\"triggers\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"90\" y=\"100\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell 
id=\"221HtTC9qgcgybCVfVqK-5\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;\" parent=\"1\" source=\"221HtTC9qgcgybCVfVqK-4\" target=\"221HtTC9qgcgybCVfVqK-4\" edge=\"1\">\n          <mxGeometry relative=\"1\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-6\" value=\"triggers\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"110\" y=\"300\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-8\" value=\"Index\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#dae8fc;strokeColor=#6c8ebf;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"540\" y=\"150\" width=\"90\" height=\"40\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-11\" value=\"&lt;font style=&quot;font-size: 10px;&quot;&gt;Input files&lt;/font&gt;\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"530\" y=\"100\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-19\" value=\"Query\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#dae8fc;strokeColor=#6c8ebf;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"664.5\" y=\"200\" width=\"70\" height=\"60\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-20\" value=\"Eval\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#dae8fc;strokeColor=#6c8ebf;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"762.25\" y=\"200\" width=\"70\" height=\"60\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-31\" value=\"&lt;font style=&quot;font-size: 10px;&quot;&gt;Triggers&lt;/font&gt;\" 
style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"360\" y=\"170\" width=\"60\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"221HtTC9qgcgybCVfVqK-32\" value=\"Azure ML\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"770\" y=\"100\" width=\"80\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-1\" value=\"Index\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#dae8fc;strokeColor=#6c8ebf;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"540\" y=\"210\" width=\"90\" height=\"40\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-2\" value=\"Index\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#dae8fc;strokeColor=#6c8ebf;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"540\" y=\"270\" width=\"90\" height=\"40\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-3\" value=\"Parallel step\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"440\" y=\"190\" width=\"50\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-4\" value=\"\" style=\"endArrow=classic;html=1;rounded=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--17\" target=\"221HtTC9qgcgybCVfVqK-8\" edge=\"1\">\n          <mxGeometry width=\"50\" height=\"50\" relative=\"1\" as=\"geometry\">\n            <mxPoint x=\"800\" y=\"200\" as=\"sourcePoint\" />\n            <mxPoint x=\"850\" y=\"150\" as=\"targetPoint\" />\n            <Array as=\"points\">\n              
<mxPoint x=\"490\" y=\"170\" />\n            </Array>\n          </mxGeometry>\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-5\" value=\"\" style=\"endArrow=classic;html=1;rounded=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--17\" target=\"otIctKdWp1Emk4w7bJRC-1\" edge=\"1\">\n          <mxGeometry width=\"50\" height=\"50\" relative=\"1\" as=\"geometry\">\n            <mxPoint x=\"800\" y=\"200\" as=\"sourcePoint\" />\n            <mxPoint x=\"850\" y=\"150\" as=\"targetPoint\" />\n            <Array as=\"points\">\n              <mxPoint x=\"490\" y=\"230\" />\n            </Array>\n          </mxGeometry>\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-6\" value=\"\" style=\"endArrow=classic;html=1;rounded=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;\" parent=\"1\" source=\"gFHnbSyB1-i4nugdEwJ--17\" target=\"otIctKdWp1Emk4w7bJRC-2\" edge=\"1\">\n          <mxGeometry width=\"50\" height=\"50\" relative=\"1\" as=\"geometry\">\n            <mxPoint x=\"610\" y=\"70\" as=\"sourcePoint\" />\n            <mxPoint x=\"660\" y=\"20\" as=\"targetPoint\" />\n            <Array as=\"points\">\n              <mxPoint x=\"490\" y=\"290\" />\n            </Array>\n          </mxGeometry>\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-7\" value=\"Mini batch\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"490\" y=\"130\" width=\"50\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-8\" value=\"Mini batch\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"490\" y=\"190\" width=\"50\" height=\"30\" 
as=\"geometry\" />\n        </mxCell>\n        <mxCell id=\"otIctKdWp1Emk4w7bJRC-9\" value=\"Mini batch\" style=\"text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;\" parent=\"1\" vertex=\"1\">\n          <mxGeometry x=\"490\" y=\"255\" width=\"50\" height=\"30\" as=\"geometry\" />\n        </mxCell>\n      </root>\n    </mxGraphModel>\n  </diagram>\n</mxfile>\n"
  },
  {
    "path": "infra/abbreviations.json",
    "content": "{\n    \"analysisServicesServers\": \"as\",\n    \"apiManagementService\": \"apim-\",\n    \"appConfigurationConfigurationStores\": \"appcs-\",\n    \"appManagedEnvironments\": \"cae-\",\n    \"appContainerApps\": \"ca-\",\n    \"authorizationPolicyDefinitions\": \"policy-\",\n    \"automationAutomationAccounts\": \"aa-\",\n    \"blueprintBlueprints\": \"bp-\",\n    \"blueprintBlueprintsArtifacts\": \"bpa-\",\n    \"cacheRedis\": \"redis-\",\n    \"cdnProfiles\": \"cdnp-\",\n    \"cdnProfilesEndpoints\": \"cdne-\",\n    \"cognitiveServicesAccounts\": \"cog-\",\n    \"cognitiveServicesFormRecognizer\": \"cog-fr-\",\n    \"cognitiveServicesTextAnalytics\": \"cog-ta-\",\n    \"computeAvailabilitySets\": \"avail-\",\n    \"computeCloudServices\": \"cld-\",\n    \"computeDiskEncryptionSets\": \"des\",\n    \"computeDisks\": \"disk\",\n    \"computeDisksOs\": \"osdisk\",\n    \"computeGalleries\": \"gal\",\n    \"computeSnapshots\": \"snap-\",\n    \"computeVirtualMachines\": \"vm\",\n    \"computeVirtualMachineScaleSets\": \"vmss-\",\n    \"containerInstanceContainerGroups\": \"ci\",\n    \"containerRegistryRegistries\": \"cr\",\n    \"containerServiceManagedClusters\": \"aks-\",\n    \"databricksWorkspaces\": \"dbw-\",\n    \"dataFactoryFactories\": \"adf-\",\n    \"dataLakeAnalyticsAccounts\": \"dla\",\n    \"dataLakeStoreAccounts\": \"dls\",\n    \"dataMigrationServices\": \"dms-\",\n    \"dBforMySQLServers\": \"mysql-\",\n    \"dBforPostgreSQLServers\": \"psql-\",\n    \"devicesIotHubs\": \"iot-\",\n    \"devicesProvisioningServices\": \"provs-\",\n    \"devicesProvisioningServicesCertificates\": \"pcert-\",\n    \"documentDBDatabaseAccounts\": \"cosmos-\",\n    \"eventGridDomains\": \"evgd-\",\n    \"eventGridDomainsTopics\": \"evgt-\",\n    \"eventGridEventSubscriptions\": \"evgs-\",\n    \"eventHubNamespaces\": \"evhns-\",\n    \"eventHubNamespacesEventHubs\": \"evh-\",\n    \"hdInsightClustersHadoop\": \"hadoop-\",\n    
\"hdInsightClustersHbase\": \"hbase-\",\n    \"hdInsightClustersKafka\": \"kafka-\",\n    \"hdInsightClustersMl\": \"mls-\",\n    \"hdInsightClustersSpark\": \"spark-\",\n    \"hdInsightClustersStorm\": \"storm-\",\n    \"hybridComputeMachines\": \"arcs-\",\n    \"insightsActionGroups\": \"ag-\",\n    \"insightsComponents\": \"appi-\",\n    \"keyVaultVaults\": \"kv-\",\n    \"kubernetesConnectedClusters\": \"arck\",\n    \"kustoClusters\": \"dec\",\n    \"kustoClustersDatabases\": \"dedb\",\n    \"logicIntegrationAccounts\": \"ia-\",\n    \"logicWorkflows\": \"logic-\",\n    \"machineLearningServicesWorkspaces\": \"mlw-\",\n    \"managedIdentityUserAssignedIdentities\": \"id-\",\n    \"managementManagementGroups\": \"mg-\",\n    \"migrateAssessmentProjects\": \"migr-\",\n    \"networkApplicationGateways\": \"agw-\",\n    \"networkApplicationSecurityGroups\": \"asg-\",\n    \"networkAzureFirewalls\": \"afw-\",\n    \"networkBastionHosts\": \"bas-\",\n    \"networkConnections\": \"con-\",\n    \"networkDnsZones\": \"dnsz-\",\n    \"networkExpressRouteCircuits\": \"erc-\",\n    \"networkFirewallPolicies\": \"afwp-\",\n    \"networkFirewallPoliciesWebApplication\": \"waf\",\n    \"networkFirewallPoliciesRuleGroups\": \"wafrg\",\n    \"networkFrontDoors\": \"fd-\",\n    \"networkFrontdoorWebApplicationFirewallPolicies\": \"fdfp-\",\n    \"networkLoadBalancersExternal\": \"lbe-\",\n    \"networkLoadBalancersInternal\": \"lbi-\",\n    \"networkLoadBalancersInboundNatRules\": \"rule-\",\n    \"networkLocalNetworkGateways\": \"lgw-\",\n    \"networkNatGateways\": \"ng-\",\n    \"networkNetworkInterfaces\": \"nic-\",\n    \"networkNetworkSecurityGroups\": \"nsg-\",\n    \"networkNetworkSecurityGroupsSecurityRules\": \"nsgsr-\",\n    \"networkNetworkWatchers\": \"nw-\",\n    \"networkPrivateDnsZones\": \"pdnsz-\",\n    \"networkPrivateLinkServices\": \"pl-\",\n    \"networkPublicIPAddresses\": \"pip-\",\n    \"networkPublicIPPrefixes\": \"ippre-\",\n    
\"networkRouteFilters\": \"rf-\",\n    \"networkRouteTables\": \"rt-\",\n    \"networkRouteTablesRoutes\": \"udr-\",\n    \"networkTrafficManagerProfiles\": \"traf-\",\n    \"networkVirtualNetworkGateways\": \"vgw-\",\n    \"networkVirtualNetworks\": \"vnet-\",\n    \"networkVirtualNetworksSubnets\": \"snet-\",\n    \"networkVirtualNetworksVirtualNetworkPeerings\": \"peer-\",\n    \"networkVirtualWans\": \"vwan-\",\n    \"networkVpnGateways\": \"vpng-\",\n    \"networkVpnGatewaysVpnConnections\": \"vcn-\",\n    \"networkVpnGatewaysVpnSites\": \"vst-\",\n    \"notificationHubsNamespaces\": \"ntfns-\",\n    \"notificationHubsNamespacesNotificationHubs\": \"ntf-\",\n    \"operationalInsightsWorkspaces\": \"log-\",\n    \"portalDashboards\": \"dash-\",\n    \"powerBIDedicatedCapacities\": \"pbi-\",\n    \"purviewAccounts\": \"pview-\",\n    \"recoveryServicesVaults\": \"rsv-\",\n    \"resourcesResourceGroups\": \"rg-\",\n    \"searchSearchServices\": \"srch-\",\n    \"serviceBusNamespaces\": \"sb-\",\n    \"serviceBusNamespacesQueues\": \"sbq-\",\n    \"serviceBusNamespacesTopics\": \"sbt-\",\n    \"serviceEndPointPolicies\": \"se-\",\n    \"serviceFabricClusters\": \"sf-\",\n    \"signalRServiceSignalR\": \"sigr\",\n    \"sqlManagedInstances\": \"sqlmi-\",\n    \"sqlServers\": \"sql-\",\n    \"sqlServersDataWarehouse\": \"sqldw-\",\n    \"sqlServersDatabases\": \"sqldb-\",\n    \"sqlServersDatabasesStretch\": \"sqlstrdb-\",\n    \"storageStorageAccounts\": \"st\",\n    \"storageStorageAccountsVm\": \"stvm\",\n    \"storSimpleManagers\": \"ssimp\",\n    \"streamAnalyticsCluster\": \"asa-\",\n    \"synapseWorkspaces\": \"syn\",\n    \"synapseWorkspacesAnalyticsWorkspaces\": \"synw\",\n    \"synapseWorkspacesSqlPoolsDedicated\": \"syndp\",\n    \"synapseWorkspacesSqlPoolsSpark\": \"synsp\",\n    \"timeSeriesInsightsEnvironments\": \"tsi-\",\n    \"webServerFarms\": \"plan-\",\n    \"webSitesAppService\": \"app-\",\n    \"webSitesAppServiceEnvironment\": \"ase-\",\n    
\"webSitesFunctions\": \"func-\",\n    \"webStaticSites\": \"stapp-\"\n}\n"
  },
  {
    "path": "infra/generate_arm_template.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\naz bicep version 2>/dev/null || az bicep install\n\nTEMPLATES=()\nFILES=()\n\n# Quote \"$@\" so arguments containing spaces or glob characters are not re-split\nfor ARG in \"$@\"; do\n    # If the argument is supplied with \"-f=\", then it is a template file that needs to be built\n    if [[ $ARG == -f=* ]]; then\n        TEMPLATES+=(\"${ARG#-f=}\")\n    else\n        # Otherwise, it is a file that has been edited\n        az bicep format --insert-final-newline -f \"$ARG\" &\n        FILES+=(\"$ARG\")\n    fi\ndone\n\n# Wait for all background format jobs to finish before staging\nwait\n\n# Stage the formatted files, skipping the call when none were passed\nif [[ ${#FILES[@]} -gt 0 ]]; then\n    git add \"${FILES[@]}\"\nfi\n\n# Build the templates\nfor TEMPLATE in \"${TEMPLATES[@]}\"; do\n    az bicep build -f \"$TEMPLATE\"\n    git add \"${TEMPLATE%.bicep}.json\" # Change the extension from .bicep to .json\ndone\n"
  },
  {
    "path": "infra/main.bicep",
    "content": "targetScope = 'subscription'\n\n@minLength(1)\n@maxLength(64)\n@description('Name of the environment which is used to generate a short unique hash used in all resources.')\nparam environmentName string\n\nparam resourceToken string = toLower(uniqueString(subscription().id, environmentName, location))\n\n@description('Location for all resources.')\nparam location string\n\n@description('Whether to enable semantic search. If enabled, the service will use the semantic search capability to improve the relevance of search results.')\nparam azureSearchUseSemanticSearch bool = true\n\n@description('Location for the Azure AI Search resource. See https://aka.ms/semanticsearchavailability for the list of available regions.')\n@allowed([\n  'australiaeast'\n  'australiasoutheast'\n  'brazilsouth'\n  'canadacentral'\n  'canadaeast'\n  'centralindia'\n  'centralus'\n  'centraluseuap'\n  'eastasia'\n  'eastus'\n  'eastus2'\n  'eastus2euap'\n  'eastusstg'\n  'francecentral'\n  'japaneast'\n  'japanwest'\n  'koreacentral'\n  'koreasouth'\n  'northcentralus'\n  'northeurope'\n  'qatarcentral'\n  'southcentralus'\n  'southeastasia'\n  'switzerlandnorth'\n  'uksouth'\n  'ukwest'\n  'westcentralus'\n  'westeurope'\n  'westus'\n  'westus2'\n  'westus3'\n])\nparam azureAISearchLocation string = location\n\n@description('Azure AI Search Resource')\nparam azureAISearchName string = 'search-${resourceToken}'\n\n@description('The SKU of the search service you want to create. E.g. 
free or standard')\n@allowed([\n  'free'\n  'basic'\n  'standard'\n  'standard2'\n  'standard3'\n])\nparam azureSearchSku string = 'standard'\n\n@description('Name of Azure OpenAI Resource')\nparam azureOpenAIResourceName string = 'openai-${resourceToken}'\n\n@description('Name of Azure OpenAI Resource SKU')\nparam azureOpenAISkuName string = 'S0'\n\n@description('Azure OpenAI GPT Model Deployment Name')\nparam azureOpenAIModel string = 'gpt-35-turbo'\n\n@description('Azure OpenAI GPT Model Name')\nparam azureOpenAIModelName string = 'gpt-35-turbo'\n\n@description('Azure OpenAI GPT Model Version')\nparam azureOpenAIModelVersion string = '0613'\n\n@description('Whether to deploy Azure Document Intelligence.')\nparam useAzureAIDocumentIntelligence bool = true\n\n@description('Location for Azure AI Doc Intelligence.')\n@allowed([\n  'australiaeast'\n  'brazilsouth'\n  'canadacentral'\n  'centralindia'\n  'centralus'\n  'centraluseuap'\n  'eastasia'\n  'eastus'\n  'eastus2'\n  'eastus2euap'\n  'francecentral'\n  'germanywestcentral'\n  'japaneast'\n  'japanwest'\n  'jioindiawest'\n  'koreacentral'\n  'northcentralus'\n  'northeurope'\n  'norwayeast'\n  'qatarcentral'\n  'southafricanorth'\n  'southcentralus'\n  'southeastasia'\n  'swedencentral'\n  'switzerlandnorth'\n  'switzerlandwest'\n  'uaenorth'\n  'uksouth'\n  'westcentralus'\n  'westeurope'\n  'westus'\n  'westus2'\n  'westus3'\n])\nparam azureAIDocumentIntelligenceLocation string = location\n\n@description('Name of Azure AI Document Intelligence Resource')\nparam azureAIDocumentIntelligenceResourceName string = 'docintel-${resourceToken}'\n\n@description('Name of Azure AI Document Intelligence SKU')\nparam azureAIDocumentIntelligenceSkuName string = 'S0'\n\n@description('Name of Azure Application Insights Resource')\nparam applicationInsightsName string = 'appinsights-${resourceToken}'\n\n@description('Name of Storage Account')\nparam storageAccountName string = 'str${resourceToken}'\n\n@description('Name of 
Azure Machine Learning Workspace')\nparam machineLearningName string = 'aml-${resourceToken}'\n\n@description('Id of the user or app to assign application roles')\nparam principalId string = ''\n\n@description('Address space for the virtual network')\nparam vnetAddressSpace string = ''\n\n@description('Address space for the proxy server subnet')\nparam proxySubnetAddressSpace string = ''\n\n@description('Address space for the other azure resources subnet')\nparam subnetAddressSpace string = ''\n\nvar proxySubnetName = 'AzureBastionSubnet'\nvar virtualNetworkName = 'vnet-${resourceToken}'\nvar subnetName = 'subnet-${resourceToken}'\nvar tags = { 'azd-env-name': environmentName }\nvar rgName = 'rg-${environmentName}'\nvar keyVaultName = 'kv-${resourceToken}'\n\n// Organize resources in a resource group\nresource rg 'Microsoft.Resources/resourceGroups@2021-04-01' = {\n  name: rgName\n  location: location\n  tags: tags\n}\n\n// Store secrets in a keyvault\nmodule keyvault './shared/keyvault.bicep' = {\n  name: 'keyvault'\n  scope: rg\n  params: {\n    name: keyVaultName\n    location: location\n    tags: tags\n    principalId: principalId\n  }\n}\n\nmodule search 'shared/search-services.bicep' = {\n  name: azureAISearchName\n  scope: rg\n  params: {\n    name: azureAISearchName\n    location: azureAISearchLocation\n    sku: {\n      name: azureSearchSku\n    }\n    semanticSearch: azureSearchUseSemanticSearch ? 
'free' : 'disabled'\n    tags: tags\n  }\n}\n\nmodule openai './shared/cognitiveservices.bicep' = {\n  name: azureOpenAIResourceName\n  scope: rg\n  params: {\n    name: azureOpenAIResourceName\n    location: location\n    tags: tags\n    sku: {\n      name: azureOpenAISkuName\n    }\n    deployments: [\n      {\n        name: azureOpenAIModel\n        model: {\n          format: 'OpenAI'\n          name: azureOpenAIModelName\n          version: azureOpenAIModelVersion\n        }\n        sku: {\n          name: 'Standard'\n          capacity: 30\n        }\n      }\n    ]\n  }\n}\n\nmodule documentIntelligence './shared/cognitiveservices.bicep' = if (useAzureAIDocumentIntelligence) {\n  name: azureAIDocumentIntelligenceResourceName\n  scope: rg\n  params: {\n    name: azureAIDocumentIntelligenceResourceName\n    location: azureAIDocumentIntelligenceLocation\n    tags: tags\n    kind: 'FormRecognizer'\n    sku: {\n      name: azureAIDocumentIntelligenceSkuName\n    }\n  }\n}\n\nmodule storage './shared/storage.bicep' = {\n  name: storageAccountName\n  scope: rg\n  params: {\n    name: storageAccountName\n    location: location\n    sku: {\n      name: 'Standard_GRS'\n    }\n  }\n}\n\nmodule monitoring 'shared/monitoring.bicep' = {\n  name: 'monitoring'\n  scope: rg\n  params: {\n    applicationInsightsName: applicationInsightsName\n    location: location\n    logAnalyticsName: '${environmentName}-logAnalytics-${resourceToken}'\n  }\n}\n\nmodule machineLearning './shared/machineLearning.bicep' = {\n  name: machineLearningName\n  scope: rg\n  params: {\n    name: machineLearningName\n    location: location\n    storageAccount: storage.outputs.id\n    keyVault: keyvault.outputs.id\n    applicationInsights: monitoring.outputs.applicationInsightsId\n  }\n}\n\nmodule storekeys './shared/storekeys.bicep' = {\n  name: 'storekeys'\n  scope: rg\n  params: {\n    keyVaultName: keyvault.outputs.name\n    azureOpenAIName: openai.outputs.name\n    azureAISearchName: 
search.outputs.name\n    documentIntelligenceName: documentIntelligence.outputs.name\n    rgName: rgName\n  }\n}\n\n// More resources can be added here to deploy with private endpoints.\n// These resources should be added to the azureResources array in the network_resources module.\n// TODO: Add private endpoints to other required resources.\nmodule network_resources 'network/network_isolation.bicep' = if (vnetAddressSpace != '' && proxySubnetAddressSpace != '' && subnetAddressSpace != '') {\n  name: 'network_isolation_resources'\n  scope: rg\n  params: {\n    vnetName: virtualNetworkName\n    location: location\n    vnetAddressSpace: vnetAddressSpace\n    proxySubnetName: proxySubnetName\n    proxySubnetAddressSpace: proxySubnetAddressSpace\n    azureSubnetName: subnetName\n    azureSubnetAddressSpace: subnetAddressSpace\n    resourcePrefix: environmentName\n    azureResources: [\n      {\n        type: 'blob'\n        name: storage.name\n        resourceId: storage.outputs.id\n      }\n      {\n        type: 'vault'\n        name: keyvault.name\n        resourceId: keyvault.outputs.id\n      }\n      {\n        type: 'amlworkspace'\n        name: machineLearning.name\n        resourceId: machineLearning.outputs.workspaceId\n      }\n    ]\n  }\n}\n\noutput USE_KEY_VAULT string = 'true'\noutput AZURE_KEY_VAULT_ENDPOINT string = keyvault.outputs.endpoint\noutput AZURE_SEARCH_SERVICE_ENDPOINT string = search.outputs.endpoint\noutput AZURE_SEARCH_USE_SEMANTIC_SEARCH bool = azureSearchUseSemanticSearch\noutput OPENAI_API_TYPE string = 'azure'\noutput OPENAI_ENDPOINT string = openai.outputs.endpoint\noutput OPENAI_API_VERSION string = '2023-03-15-preview'\noutput AML_SUBSCRIPTION_ID string = subscription().subscriptionId\noutput AML_WORKSPACE_NAME string = machineLearning.outputs.workspaceName\noutput AML_RESOURCE_GROUP_NAME string = rgName\noutput AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT string = documentIntelligence.outputs.endpoint\n// output 
AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY string =\n// output AZURE_LANGUAGE_SERVICE_ENDPOINT string =\n// output AZURE_LANGUAGE_SERVICE_KEY string =\n"
  },
  {
    "path": "infra/main.bicepparam",
    "content": "using './main.bicep'\n\nparam environmentName = readEnvironmentVariable('AZURE_ENV_NAME', 'env_name')\n\nparam location = readEnvironmentVariable('AZURE_LOCATION', 'location')\n\nparam principalId = readEnvironmentVariable('AZURE_PRINCIPAL_ID', 'principal_id')\n"
  },
  {
    "path": "infra/main.json",
    "content": "{\n  \"$schema\": \"https://schema.management.azure.com/schemas/2018-05-01/subscriptionDeploymentTemplate.json#\",\n  \"contentVersion\": \"1.0.0.0\",\n  \"metadata\": {\n    \"_generator\": {\n      \"name\": \"bicep\",\n      \"version\": \"0.28.1.47646\",\n      \"templateHash\": \"5972590653853251843\"\n    }\n  },\n  \"parameters\": {\n    \"environmentName\": {\n      \"type\": \"string\",\n      \"minLength\": 1,\n      \"maxLength\": 64,\n      \"metadata\": {\n        \"description\": \"Name of the the environment which is used to generate a short unique hash used in all resources.\"\n      }\n    },\n    \"resourceToken\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[toLower(uniqueString(subscription().id, parameters('environmentName'), parameters('location')))]\"\n    },\n    \"location\": {\n      \"type\": \"string\",\n      \"metadata\": {\n        \"description\": \"Location for all resources.\"\n      }\n    },\n    \"azureSearchUseSemanticSearch\": {\n      \"type\": \"bool\",\n      \"defaultValue\": true,\n      \"metadata\": {\n        \"description\": \"Whether to enable semantic search. 
If enabled, the service will use the semantic search capability to improve the relevance of search results.\"\n      }\n    },\n    \"azureAISearchLocation\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[parameters('location')]\",\n      \"allowedValues\": [\n        \"australiaeast\",\n        \"australiasoutheast\",\n        \"brazilsouth\",\n        \"canadacentral\",\n        \"canadaeast\",\n        \"centralindia\",\n        \"centralus\",\n        \"centraluseuap\",\n        \"eastasia\",\n        \"eastus\",\n        \"eastus2\",\n        \"eastus2euap\",\n        \"eastusstg\",\n        \"francecentral\",\n        \"japaneast\",\n        \"japanwest\",\n        \"koreacentral\",\n        \"koreasouth\",\n        \"northcentralus\",\n        \"northeurope\",\n        \"qatarcentral\",\n        \"southcentralus\",\n        \"southeastasia\",\n        \"switzerlandnorth\",\n        \"uksouth\",\n        \"ukwest\",\n        \"westcentralus\",\n        \"westeurope\",\n        \"westus\",\n        \"westus2\",\n        \"westus3\"\n      ],\n      \"metadata\": {\n        \"description\": \"Location for all resources. https://aka.ms/semanticsearchavailability for list of available regions.\"\n      }\n    },\n    \"azureAISearchName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('search-{0}', parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Azure AI Search Resource\"\n      }\n    },\n    \"azureSearchSku\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"standard\",\n      \"allowedValues\": [\n        \"free\",\n        \"basic\",\n        \"standard\",\n        \"standard2\",\n        \"standard3\"\n      ],\n      \"metadata\": {\n        \"description\": \"The SKU of the search service you want to create. E.g. 
free or standard\"\n      }\n    },\n    \"azureOpenAIResourceName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('openai-{0}', parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Name of Azure OpenAI Resource\"\n      }\n    },\n    \"azureOpenAISkuName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"S0\",\n      \"metadata\": {\n        \"description\": \"Name of Azure OpenAI Resource SKU\"\n      }\n    },\n    \"azureOpenAIModel\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"gpt-35-turbo\",\n      \"metadata\": {\n        \"description\": \"Azure OpenAI GPT Model Deployment Name\"\n      }\n    },\n    \"azureOpenAIModelName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"gpt-35-turbo\",\n      \"metadata\": {\n        \"description\": \"Azure OpenAI GPT Model Name\"\n      }\n    },\n    \"azureOpenAIModelVersion\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"0613\",\n      \"metadata\": {\n        \"description\": \"Azure OpenAI GPT Model Version\"\n      }\n    },\n    \"azureAIDocumentIntelligenceResourceName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('docintel-{0}', parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Name of Azure AI Document Intelligence Resource\"\n      }\n    },\n    \"azureAIDocumentIntelligenceSkuName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"S0\",\n      \"metadata\": {\n        \"description\": \"Name of Azure AI Document Intelligence SKU\"\n      }\n    },\n    \"applicationInsightsName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('appinsights-{0}', parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Name of Azure Application Insights Resource\"\n      }\n    },\n    \"storageAccountName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('str{0}', 
parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Name of Storage Account\"\n      }\n    },\n    \"machineLearningName\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"[format('aml-{0}', parameters('resourceToken'))]\",\n      \"metadata\": {\n        \"description\": \"Name of Azure Machine Learning Workspace\"\n      }\n    },\n    \"principalId\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"\",\n      \"metadata\": {\n        \"description\": \"Id of the user or app to assign application roles\"\n      }\n    },\n    \"vnetAddressSpace\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"\",\n      \"metadata\": {\n        \"description\": \"Address space for the virtual network\"\n      }\n    },\n    \"proxySubnetAddressSpace\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"\",\n      \"metadata\": {\n        \"description\": \"Address space for the proxy server subnet\"\n      }\n    },\n    \"subnetAddressSpace\": {\n      \"type\": \"string\",\n      \"defaultValue\": \"\",\n      \"metadata\": {\n        \"description\": \"Address space for the other azure resources subnet\"\n      }\n    }\n  },\n  \"variables\": {\n    \"proxySubnetName\": \"AzureBastionSubnet\",\n    \"virtualNetworkName\": \"[format('vnet-{0}', parameters('resourceToken'))]\",\n    \"subnetName\": \"[format('subnet-{0}', parameters('resourceToken'))]\",\n    \"tags\": {\n      \"azd-env-name\": \"[parameters('environmentName')]\"\n    },\n    \"rgName\": \"[format('rg-{0}', parameters('environmentName'))]\",\n    \"keyVaultName\": \"[format('kv-{0}', parameters('resourceToken'))]\"\n  },\n  \"resources\": [\n    {\n      \"type\": \"Microsoft.Resources/resourceGroups\",\n      \"apiVersion\": \"2021-04-01\",\n      \"name\": \"[variables('rgName')]\",\n      \"location\": \"[parameters('location')]\",\n      \"tags\": \"[variables('tags')]\"\n    },\n    {\n      \"type\": 
\"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"keyvault\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[variables('keyVaultName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"tags\": {\n            \"value\": \"[variables('tags')]\"\n          },\n          \"principalId\": {\n            \"value\": \"[parameters('principalId')]\"\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"11344700457585105755\"\n            }\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"principalId\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"\",\n              \"metadata\": {\n                \"description\": \"Service principal will be granted read access to the KeyVault. 
If unset, no service principal is granted access by default\"\n              }\n            }\n          },\n          \"variables\": {\n            \"defaultAccessPolicies\": \"[if(not(empty(parameters('principalId'))), createArray(createObject('objectId', parameters('principalId'), 'permissions', createObject('secrets', createArray('get', 'set', 'list')), 'tenantId', subscription().tenantId)), createArray())]\"\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.KeyVault/vaults\",\n              \"apiVersion\": \"2022-07-01\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"properties\": {\n                \"tenantId\": \"[subscription().tenantId]\",\n                \"sku\": {\n                  \"family\": \"A\",\n                  \"name\": \"standard\"\n                },\n                \"enabledForTemplateDeployment\": true,\n                \"accessPolicies\": \"[union(variables('defaultAccessPolicies'), createArray())]\"\n              }\n            }\n          ],\n          \"outputs\": {\n            \"id\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.KeyVault/vaults', parameters('name'))]\"\n            },\n            \"endpoint\": {\n              \"type\": \"string\",\n              \"value\": \"[reference(resourceId('Microsoft.KeyVault/vaults', parameters('name')), '2022-07-01').vaultUri]\"\n            },\n            \"name\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": 
\"[parameters('azureAISearchName')]\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[parameters('azureAISearchName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('azureAISearchLocation')]\"\n          },\n          \"sku\": {\n            \"value\": {\n              \"name\": \"[parameters('azureSearchSku')]\"\n            }\n          },\n          \"semanticSearch\": \"[if(parameters('azureSearchUseSemanticSearch'), createObject('value', 'free'), createObject('value', 'disabled'))]\",\n          \"tags\": {\n            \"value\": \"[variables('tags')]\"\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"3835024889022694318\"\n            },\n            \"description\": \"Creates an Azure AI Search instance.\"\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"sku\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"name\": \"standard\"\n              }\n            },\n            \"authOptions\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n         
   \"disableLocalAuth\": {\n              \"type\": \"bool\",\n              \"defaultValue\": false\n            },\n            \"encryptionWithCmk\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"enforcement\": \"Unspecified\"\n              }\n            },\n            \"hostingMode\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"default\",\n              \"allowedValues\": [\n                \"default\",\n                \"highDensity\"\n              ]\n            },\n            \"networkRuleSet\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"bypass\": \"None\",\n                \"ipRules\": []\n              }\n            },\n            \"partitionCount\": {\n              \"type\": \"int\",\n              \"defaultValue\": 1\n            },\n            \"publicNetworkAccess\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"enabled\",\n              \"allowedValues\": [\n                \"enabled\",\n                \"disabled\"\n              ]\n            },\n            \"replicaCount\": {\n              \"type\": \"int\",\n              \"defaultValue\": 1\n            },\n            \"semanticSearch\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"disabled\",\n              \"allowedValues\": [\n                \"disabled\",\n                \"free\",\n                \"standard\"\n              ]\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.Search/searchServices\",\n              \"apiVersion\": \"2023-11-01\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"identity\": {\n                \"type\": \"SystemAssigned\"\n              },\n              \"properties\": {\n          
      \"authOptions\": \"[parameters('authOptions')]\",\n                \"disableLocalAuth\": \"[parameters('disableLocalAuth')]\",\n                \"encryptionWithCmk\": \"[parameters('encryptionWithCmk')]\",\n                \"hostingMode\": \"[parameters('hostingMode')]\",\n                \"networkRuleSet\": \"[parameters('networkRuleSet')]\",\n                \"partitionCount\": \"[parameters('partitionCount')]\",\n                \"publicNetworkAccess\": \"[parameters('publicNetworkAccess')]\",\n                \"replicaCount\": \"[parameters('replicaCount')]\",\n                \"semanticSearch\": \"[parameters('semanticSearch')]\"\n              },\n              \"sku\": \"[parameters('sku')]\"\n            }\n          ],\n          \"outputs\": {\n            \"id\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.Search/searchServices', parameters('name'))]\"\n            },\n            \"endpoint\": {\n              \"type\": \"string\",\n              \"value\": \"[format('https://{0}.search.windows.net/', parameters('name'))]\"\n            },\n            \"name\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"[parameters('azureOpenAIResourceName')]\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[parameters('azureOpenAIResourceName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n   
       },\n          \"tags\": {\n            \"value\": \"[variables('tags')]\"\n          },\n          \"sku\": {\n            \"value\": {\n              \"name\": \"[parameters('azureOpenAISkuName')]\"\n            }\n          },\n          \"deployments\": {\n            \"value\": [\n              {\n                \"name\": \"[parameters('azureOpenAIModel')]\",\n                \"model\": {\n                  \"format\": \"OpenAI\",\n                  \"name\": \"[parameters('azureOpenAIModelName')]\",\n                  \"version\": \"[parameters('azureOpenAIModelVersion')]\"\n                },\n                \"sku\": {\n                  \"name\": \"Standard\",\n                  \"capacity\": 30\n                }\n              }\n            ]\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"9219454040579563054\"\n            },\n            \"description\": \"Creates an Azure Cognitive Services instance.\"\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"customSubDomainName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[parameters('name')]\",\n              \"metadata\": {\n                \"description\": \"The custom subdomain name used to access the API. 
Defaults to the value of the name parameter.\"\n              }\n            },\n            \"deployments\": {\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"kind\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"OpenAI\"\n            },\n            \"publicNetworkAccess\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"Enabled\",\n              \"allowedValues\": [\n                \"Enabled\",\n                \"Disabled\"\n              ]\n            },\n            \"sku\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"name\": \"S0\"\n              }\n            },\n            \"allowedIpRules\": {\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"networkAcls\": {\n              \"type\": \"object\",\n              \"defaultValue\": \"[if(empty(parameters('allowedIpRules')), createObject('defaultAction', 'Allow'), createObject('ipRules', parameters('allowedIpRules'), 'defaultAction', 'Deny'))]\"\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.CognitiveServices/accounts\",\n              \"apiVersion\": \"2023-05-01\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"kind\": \"[parameters('kind')]\",\n              \"properties\": {\n                \"customSubDomainName\": \"[parameters('customSubDomainName')]\",\n                \"publicNetworkAccess\": \"[parameters('publicNetworkAccess')]\",\n                \"networkAcls\": \"[parameters('networkAcls')]\"\n              },\n              \"sku\": \"[parameters('sku')]\"\n            },\n            {\n              \"copy\": {\n                \"name\": \"deployment\",\n                \"count\": 
\"[length(parameters('deployments'))]\",\n                \"mode\": \"serial\",\n                \"batchSize\": 1\n              },\n              \"type\": \"Microsoft.CognitiveServices/accounts/deployments\",\n              \"apiVersion\": \"2023-05-01\",\n              \"name\": \"[format('{0}/{1}', parameters('name'), parameters('deployments')[copyIndex()].name)]\",\n              \"properties\": {\n                \"model\": \"[parameters('deployments')[copyIndex()].model]\",\n                \"raiPolicyName\": \"[if(contains(parameters('deployments')[copyIndex()], 'raiPolicyName'), parameters('deployments')[copyIndex()].raiPolicyName, null())]\"\n              },\n              \"sku\": \"[if(contains(parameters('deployments')[copyIndex()], 'sku'), parameters('deployments')[copyIndex()].sku, createObject('name', 'Standard', 'capacity', 30))]\",\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.CognitiveServices/accounts', parameters('name'))]\"\n              ]\n            }\n          ],\n          \"outputs\": {\n            \"endpoint\": {\n              \"type\": \"string\",\n              \"value\": \"[reference(resourceId('Microsoft.CognitiveServices/accounts', parameters('name')), '2023-05-01').endpoint]\"\n            },\n            \"id\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.CognitiveServices/accounts', parameters('name'))]\"\n            },\n            \"name\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"[parameters('azureAIDocumentIntelligenceResourceName')]\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      
\"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[parameters('azureAIDocumentIntelligenceResourceName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"tags\": {\n            \"value\": \"[variables('tags')]\"\n          },\n          \"kind\": {\n            \"value\": \"FormRecognizer\"\n          },\n          \"sku\": {\n            \"value\": {\n              \"name\": \"[parameters('azureAIDocumentIntelligenceSkuName')]\"\n            }\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"9219454040579563054\"\n            },\n            \"description\": \"Creates an Azure Cognitive Services instance.\"\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"customSubDomainName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[parameters('name')]\",\n              \"metadata\": {\n                \"description\": \"The custom subdomain name used to access the API. 
Defaults to the value of the name parameter.\"\n              }\n            },\n            \"deployments\": {\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"kind\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"OpenAI\"\n            },\n            \"publicNetworkAccess\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"Enabled\",\n              \"allowedValues\": [\n                \"Enabled\",\n                \"Disabled\"\n              ]\n            },\n            \"sku\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"name\": \"S0\"\n              }\n            },\n            \"allowedIpRules\": {\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"networkAcls\": {\n              \"type\": \"object\",\n              \"defaultValue\": \"[if(empty(parameters('allowedIpRules')), createObject('defaultAction', 'Allow'), createObject('ipRules', parameters('allowedIpRules'), 'defaultAction', 'Deny'))]\"\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.CognitiveServices/accounts\",\n              \"apiVersion\": \"2023-05-01\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"kind\": \"[parameters('kind')]\",\n              \"properties\": {\n                \"customSubDomainName\": \"[parameters('customSubDomainName')]\",\n                \"publicNetworkAccess\": \"[parameters('publicNetworkAccess')]\",\n                \"networkAcls\": \"[parameters('networkAcls')]\"\n              },\n              \"sku\": \"[parameters('sku')]\"\n            },\n            {\n              \"copy\": {\n                \"name\": \"deployment\",\n                \"count\": 
\"[length(parameters('deployments'))]\",\n                \"mode\": \"serial\",\n                \"batchSize\": 1\n              },\n              \"type\": \"Microsoft.CognitiveServices/accounts/deployments\",\n              \"apiVersion\": \"2023-05-01\",\n              \"name\": \"[format('{0}/{1}', parameters('name'), parameters('deployments')[copyIndex()].name)]\",\n              \"properties\": {\n                \"model\": \"[parameters('deployments')[copyIndex()].model]\",\n                \"raiPolicyName\": \"[if(contains(parameters('deployments')[copyIndex()], 'raiPolicyName'), parameters('deployments')[copyIndex()].raiPolicyName, null())]\"\n              },\n              \"sku\": \"[if(contains(parameters('deployments')[copyIndex()], 'sku'), parameters('deployments')[copyIndex()].sku, createObject('name', 'Standard', 'capacity', 30))]\",\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.CognitiveServices/accounts', parameters('name'))]\"\n              ]\n            }\n          ],\n          \"outputs\": {\n            \"endpoint\": {\n              \"type\": \"string\",\n              \"value\": \"[reference(resourceId('Microsoft.CognitiveServices/accounts', parameters('name')), '2023-05-01').endpoint]\"\n            },\n            \"id\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.CognitiveServices/accounts', parameters('name'))]\"\n            },\n            \"name\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"[parameters('storageAccountName')]\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n     
   \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[parameters('storageAccountName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"sku\": {\n            \"value\": {\n              \"name\": \"Standard_GRS\"\n            }\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"16657182853649829034\"\n            },\n            \"description\": \"Creates an Azure storage account.\"\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"accessTier\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"Hot\",\n              \"allowedValues\": [\n                \"Cool\",\n                \"Hot\",\n                \"Premium\"\n              ]\n            },\n            \"allowBlobPublicAccess\": {\n              \"type\": \"bool\",\n              \"defaultValue\": true\n            },\n            \"allowCrossTenantReplication\": {\n              \"type\": \"bool\",\n              \"defaultValue\": true\n            },\n            \"allowSharedKeyAccess\": {\n              \"type\": \"bool\",\n              \"defaultValue\": true\n            },\n            \"containers\": 
{\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"defaultToOAuthAuthentication\": {\n              \"type\": \"bool\",\n              \"defaultValue\": false\n            },\n            \"deleteRetentionPolicy\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"dnsEndpointType\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"Standard\",\n              \"allowedValues\": [\n                \"AzureDnsZone\",\n                \"Standard\"\n              ]\n            },\n            \"kind\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"StorageV2\"\n            },\n            \"minimumTlsVersion\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"TLS1_2\"\n            },\n            \"queues\": {\n              \"type\": \"array\",\n              \"defaultValue\": []\n            },\n            \"supportsHttpsTrafficOnly\": {\n              \"type\": \"bool\",\n              \"defaultValue\": true\n            },\n            \"networkAcls\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"bypass\": \"AzureServices\",\n                \"defaultAction\": \"Allow\"\n              }\n            },\n            \"publicNetworkAccess\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"Enabled\",\n              \"allowedValues\": [\n                \"Enabled\",\n                \"Disabled\"\n              ]\n            },\n            \"sku\": {\n              \"type\": \"object\",\n              \"defaultValue\": {\n                \"name\": \"Standard_LRS\"\n              }\n            }\n          },\n          \"resources\": [\n            {\n              \"copy\": {\n                \"name\": \"container\",\n                \"count\": \"[length(parameters('containers'))]\"\n              },\n          
    \"condition\": \"[not(empty(parameters('containers')))]\",\n              \"type\": \"Microsoft.Storage/storageAccounts/blobServices/containers\",\n              \"apiVersion\": \"2022-05-01\",\n              \"name\": \"[format('{0}/{1}/{2}', parameters('name'), 'default', parameters('containers')[copyIndex()].name)]\",\n              \"properties\": {\n                \"publicAccess\": \"[if(contains(parameters('containers')[copyIndex()], 'publicAccess'), parameters('containers')[copyIndex()].publicAccess, 'None')]\"\n              },\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.Storage/storageAccounts/blobServices', parameters('name'), 'default')]\"\n              ]\n            },\n            {\n              \"copy\": {\n                \"name\": \"queue\",\n                \"count\": \"[length(parameters('queues'))]\"\n              },\n              \"condition\": \"[not(empty(parameters('queues')))]\",\n              \"type\": \"Microsoft.Storage/storageAccounts/queueServices/queues\",\n              \"apiVersion\": \"2022-05-01\",\n              \"name\": \"[format('{0}/{1}/{2}', parameters('name'), 'default', parameters('queues')[copyIndex()].name)]\",\n              \"properties\": {\n                \"metadata\": {}\n              },\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('name'), 'default')]\"\n              ]\n            },\n            {\n              \"condition\": \"[not(empty(parameters('containers')))]\",\n              \"type\": \"Microsoft.Storage/storageAccounts/blobServices\",\n              \"apiVersion\": \"2022-05-01\",\n              \"name\": \"[format('{0}/{1}', parameters('name'), 'default')]\",\n              \"properties\": {\n                \"deleteRetentionPolicy\": \"[parameters('deleteRetentionPolicy')]\"\n              },\n              \"dependsOn\": [\n                
\"[resourceId('Microsoft.Storage/storageAccounts', parameters('name'))]\"\n              ]\n            },\n            {\n              \"condition\": \"[not(empty(parameters('queues')))]\",\n              \"type\": \"Microsoft.Storage/storageAccounts/queueServices\",\n              \"apiVersion\": \"2022-05-01\",\n              \"name\": \"[format('{0}/{1}', parameters('name'), 'default')]\",\n              \"properties\": {\n                \"cors\": {\n                  \"corsRules\": []\n                }\n              },\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.Storage/storageAccounts', parameters('name'))]\"\n              ]\n            },\n            {\n              \"type\": \"Microsoft.Storage/storageAccounts\",\n              \"apiVersion\": \"2022-05-01\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"kind\": \"[parameters('kind')]\",\n              \"sku\": \"[parameters('sku')]\",\n              \"properties\": {\n                \"accessTier\": \"[parameters('accessTier')]\",\n                \"allowBlobPublicAccess\": \"[parameters('allowBlobPublicAccess')]\",\n                \"allowCrossTenantReplication\": \"[parameters('allowCrossTenantReplication')]\",\n                \"allowSharedKeyAccess\": \"[parameters('allowSharedKeyAccess')]\",\n                \"defaultToOAuthAuthentication\": \"[parameters('defaultToOAuthAuthentication')]\",\n                \"dnsEndpointType\": \"[parameters('dnsEndpointType')]\",\n                \"minimumTlsVersion\": \"[parameters('minimumTlsVersion')]\",\n                \"networkAcls\": \"[parameters('networkAcls')]\",\n                \"publicNetworkAccess\": \"[parameters('publicNetworkAccess')]\",\n                \"supportsHttpsTrafficOnly\": \"[parameters('supportsHttpsTrafficOnly')]\"\n              }\n            }\n          ],\n          
\"outputs\": {\n            \"name\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            },\n            \"id\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.Storage/storageAccounts', parameters('name'))]\"\n            },\n            \"primaryEndpoints\": {\n              \"type\": \"object\",\n              \"value\": \"[reference(resourceId('Microsoft.Storage/storageAccounts', parameters('name')), '2022-05-01').primaryEndpoints]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"monitoring\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"applicationInsightsName\": {\n            \"value\": \"[parameters('applicationInsightsName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"logAnalyticsName\": {\n            \"value\": \"[format('{0}-logAnalytics-{1}', parameters('environmentName'), parameters('resourceToken'))]\"\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"14422276109150612921\"\n            }\n          },\n          \"parameters\": {\n            \"logAnalyticsName\": {\n              \"type\": \"string\"\n        
    },\n            \"applicationInsightsName\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.OperationalInsights/workspaces\",\n              \"apiVersion\": \"2021-12-01-preview\",\n              \"name\": \"[parameters('logAnalyticsName')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"properties\": {\n                \"retentionInDays\": 30,\n                \"features\": {\n                  \"searchVersion\": 1\n                },\n                \"sku\": {\n                  \"name\": \"PerGB2018\"\n                }\n              }\n            },\n            {\n              \"type\": \"Microsoft.Insights/components\",\n              \"apiVersion\": \"2020-02-02\",\n              \"name\": \"[parameters('applicationInsightsName')]\",\n              \"location\": \"[parameters('location')]\",\n              \"tags\": \"[parameters('tags')]\",\n              \"kind\": \"web\",\n              \"properties\": {\n                \"Application_Type\": \"web\",\n                \"WorkspaceResourceId\": \"[resourceId('Microsoft.OperationalInsights/workspaces', parameters('logAnalyticsName'))]\"\n              },\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.OperationalInsights/workspaces', parameters('logAnalyticsName'))]\"\n              ]\n            }\n          ],\n          \"outputs\": {\n            \"applicationInsightsId\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.Insights/components', parameters('applicationInsightsName'))]\"\n            
},\n            \"applicationInsightsName\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('applicationInsightsName')]\"\n            },\n            \"logAnalyticsWorkspaceId\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.OperationalInsights/workspaces', parameters('logAnalyticsName'))]\"\n            },\n            \"logAnalyticsWorkspaceName\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('logAnalyticsName')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"[parameters('machineLearningName')]\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"name\": {\n            \"value\": \"[parameters('machineLearningName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"storageAccount\": {\n            \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('storageAccountName')), '2022-09-01').outputs.id.value]\"\n          },\n          \"keyVault\": {\n            \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault'), '2022-09-01').outputs.id.value]\"\n          },\n          \"applicationInsights\": {\n            \"value\": 
\"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'monitoring'), '2022-09-01').outputs.applicationInsightsId.value]\"\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"16096020836949906486\"\n            },\n            \"description\": \"Creates an Azure Machine Learning Workspace.\"\n          },\n          \"parameters\": {\n            \"name\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"[resourceGroup().location]\"\n            },\n            \"tags\": {\n              \"type\": \"object\",\n              \"defaultValue\": {}\n            },\n            \"storageAccount\": {\n              \"type\": \"string\"\n            },\n            \"keyVault\": {\n              \"type\": \"string\"\n            },\n            \"applicationInsights\": {\n              \"type\": \"string\"\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.MachineLearningServices/workspaces\",\n              \"apiVersion\": \"2023-06-01-preview\",\n              \"name\": \"[parameters('name')]\",\n              \"location\": \"[parameters('location')]\",\n              \"identity\": {\n                \"type\": \"systemAssigned\"\n              },\n              \"tags\": \"[parameters('tags')]\",\n              \"properties\": {\n                \"storageAccount\": \"[parameters('storageAccount')]\",\n                \"keyVault\": \"[parameters('keyVault')]\",\n                
\"applicationInsights\": \"[parameters('applicationInsights')]\"\n              }\n            }\n          ],\n          \"outputs\": {\n            \"workspaceName\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('name')]\"\n            },\n            \"workspaceId\": {\n              \"type\": \"string\",\n              \"value\": \"[resourceId('Microsoft.MachineLearningServices/workspaces', parameters('name'))]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault')]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'monitoring')]\",\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('storageAccountName'))]\"\n      ]\n    },\n    {\n      \"type\": \"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"storekeys\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"keyVaultName\": {\n            \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault'), '2022-09-01').outputs.name.value]\"\n          },\n          \"azureOpenAIName\": {\n            \"value\": 
\"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureOpenAIResourceName')), '2022-09-01').outputs.name.value]\"\n          },\n          \"azureAISearchName\": {\n            \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAISearchName')), '2022-09-01').outputs.name.value]\"\n          },\n          \"documentIntelligenceName\": {\n            \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAIDocumentIntelligenceResourceName')), '2022-09-01').outputs.name.value]\"\n          },\n          \"rgName\": {\n            \"value\": \"[variables('rgName')]\"\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"971357261098316609\"\n            }\n          },\n          \"parameters\": {\n            \"keyVaultName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"\"\n            },\n            \"azureOpenAIName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"\"\n            },\n            \"documentIntelligenceName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"\"\n            },\n            \"azureAISearchName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"\"\n            },\n            \"rgName\": {\n          
    \"type\": \"string\",\n              \"defaultValue\": \"\"\n            },\n            \"openAIKeyName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"openai-api-key\"\n            },\n            \"documentIntelligenceKeyName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"azure-document-intelligence-admin-key\"\n            },\n            \"searchKeyName\": {\n              \"type\": \"string\",\n              \"defaultValue\": \"azure-search-admin-key\"\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.KeyVault/vaults/secrets\",\n              \"apiVersion\": \"2022-07-01\",\n              \"name\": \"[format('{0}/{1}', parameters('keyVaultName'), parameters('openAIKeyName'))]\",\n              \"properties\": {\n                \"contentType\": \"string\",\n                \"value\": \"[listKeys(resourceId(subscription().subscriptionId, parameters('rgName'), 'Microsoft.CognitiveServices/accounts', parameters('azureOpenAIName')), '2023-05-01').key1]\"\n              }\n            },\n            {\n              \"type\": \"Microsoft.KeyVault/vaults/secrets\",\n              \"apiVersion\": \"2022-07-01\",\n              \"name\": \"[format('{0}/{1}', parameters('keyVaultName'), parameters('documentIntelligenceKeyName'))]\",\n              \"properties\": {\n                \"contentType\": \"string\",\n                \"value\": \"[listKeys(resourceId(subscription().subscriptionId, parameters('rgName'), 'Microsoft.CognitiveServices/accounts', parameters('documentIntelligenceName')), '2023-05-01').key1]\"\n              }\n            },\n            {\n              \"type\": \"Microsoft.KeyVault/vaults/secrets\",\n              \"apiVersion\": \"2022-07-01\",\n              \"name\": \"[format('{0}/{1}', parameters('keyVaultName'), parameters('searchKeyName'))]\",\n              \"properties\": {\n                \"contentType\": 
\"string\",\n                \"value\": \"[listAdminKeys(resourceId(subscription().subscriptionId, parameters('rgName'), 'Microsoft.Search/searchServices', parameters('azureAISearchName')), '2023-11-01').primaryKey]\"\n              }\n            }\n          ],\n          \"outputs\": {\n            \"SEARCH_KEY_NAME\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('searchKeyName')]\"\n            },\n            \"OPENAI_KEY_NAME\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('openAIKeyName')]\"\n            },\n            \"DOCUMENTINTELLIGENCE_KEY_NAME\": {\n              \"type\": \"string\",\n              \"value\": \"[parameters('documentIntelligenceKeyName')]\"\n            }\n          }\n        }\n      },\n      \"dependsOn\": [\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAIDocumentIntelligenceResourceName'))]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault')]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureOpenAIResourceName'))]\",\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAISearchName'))]\"\n      ]\n    },\n    {\n      \"condition\": \"[and(and(not(equals(parameters('vnetAddressSpace'), '')), not(equals(parameters('proxySubnetAddressSpace'), ''))), not(equals(parameters('subnetAddressSpace'), '')))]\",\n      \"type\": 
\"Microsoft.Resources/deployments\",\n      \"apiVersion\": \"2022-09-01\",\n      \"name\": \"network_isolation_resources\",\n      \"resourceGroup\": \"[variables('rgName')]\",\n      \"properties\": {\n        \"expressionEvaluationOptions\": {\n          \"scope\": \"inner\"\n        },\n        \"mode\": \"Incremental\",\n        \"parameters\": {\n          \"vnetName\": {\n            \"value\": \"[variables('virtualNetworkName')]\"\n          },\n          \"location\": {\n            \"value\": \"[parameters('location')]\"\n          },\n          \"vnetAddressSpace\": {\n            \"value\": \"[parameters('vnetAddressSpace')]\"\n          },\n          \"proxySubnetName\": {\n            \"value\": \"[variables('proxySubnetName')]\"\n          },\n          \"proxySubnetAddressSpace\": {\n            \"value\": \"[parameters('proxySubnetAddressSpace')]\"\n          },\n          \"azureSubnetName\": {\n            \"value\": \"[variables('subnetName')]\"\n          },\n          \"azureSubnetAddressSpace\": {\n            \"value\": \"[parameters('subnetAddressSpace')]\"\n          },\n          \"resourcePrefix\": {\n            \"value\": \"[parameters('environmentName')]\"\n          },\n          \"azureResources\": {\n            \"value\": [\n              {\n                \"type\": \"blob\",\n                \"name\": \"[parameters('storageAccountName')]\",\n                \"resourceId\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('storageAccountName')), '2022-09-01').outputs.id.value]\"\n              },\n              {\n                \"type\": \"vault\",\n                \"name\": \"keyvault\",\n                \"resourceId\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 
'keyvault'), '2022-09-01').outputs.id.value]\"\n              },\n              {\n                \"type\": \"amlworkspace\",\n                \"name\": \"[parameters('machineLearningName')]\",\n                \"resourceId\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('machineLearningName')), '2022-09-01').outputs.workspaceId.value]\"\n              }\n            ]\n          }\n        },\n        \"template\": {\n          \"$schema\": \"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#\",\n          \"contentVersion\": \"1.0.0.0\",\n          \"metadata\": {\n            \"_generator\": {\n              \"name\": \"bicep\",\n              \"version\": \"0.28.1.47646\",\n              \"templateHash\": \"284038938591552097\"\n            }\n          },\n          \"parameters\": {\n            \"vnetName\": {\n              \"type\": \"string\"\n            },\n            \"location\": {\n              \"type\": \"string\"\n            },\n            \"vnetAddressSpace\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            },\n            \"proxySubnetName\": {\n              \"type\": \"string\"\n            },\n            \"proxySubnetAddressSpace\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            },\n            \"azureSubnetName\": {\n              \"type\": \"string\"\n            },\n            \"azureSubnetAddressSpace\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            },\n            \"resourcePrefix\": {\n              \"type\": \"string\"\n            },\n            \"azureResources\": {\n              \"type\": \"array\"\n            }\n          },\n          \"resources\": [\n            {\n              \"type\": \"Microsoft.Network/virtualNetworks\",\n              
\"apiVersion\": \"2020-06-01\",\n              \"name\": \"[parameters('vnetName')]\",\n              \"location\": \"[parameters('location')]\",\n              \"properties\": {\n                \"addressSpace\": {\n                  \"addressPrefixes\": [\n                    \"[parameters('vnetAddressSpace')]\"\n                  ]\n                },\n                \"subnets\": [\n                  {\n                    \"name\": \"[parameters('proxySubnetName')]\",\n                    \"properties\": {\n                      \"addressPrefix\": \"[parameters('proxySubnetAddressSpace')]\"\n                    }\n                  },\n                  {\n                    \"name\": \"[parameters('azureSubnetName')]\",\n                    \"properties\": {\n                      \"addressPrefix\": \"[parameters('azureSubnetAddressSpace')]\"\n                    }\n                  }\n                ]\n              }\n            },\n            {\n              \"copy\": {\n                \"name\": \"privateEndpoints\",\n                \"count\": \"[length(parameters('azureResources'))]\"\n              },\n              \"type\": \"Microsoft.Network/privateEndpoints\",\n              \"apiVersion\": \"2020-07-01\",\n              \"name\": \"[format('{0}{1}PrivateEndpoint', parameters('resourcePrefix'), parameters('azureResources')[copyIndex()].type)]\",\n              \"location\": \"[parameters('location')]\",\n              \"properties\": {\n                \"privateLinkServiceConnections\": [\n                  {\n                    \"name\": \"[format('{0}{1}PLSConnection', parameters('resourcePrefix'), parameters('azureResources')[copyIndex()].type)]\",\n                    \"properties\": {\n                      \"privateLinkServiceId\": \"[parameters('azureResources')[copyIndex()].resourceId]\",\n                      \"groupIds\": [\n                        \"[parameters('azureResources')[copyIndex()].type]\"\n                      ]\n    
                }\n                  }\n                ],\n                \"subnet\": {\n                  \"id\": \"[format('{0}/subnets/{1}', resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName')), parameters('azureSubnetName'))]\"\n                }\n              },\n              \"dependsOn\": [\n                \"[resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName'))]\"\n              ]\n            }\n          ]\n        }\n      },\n      \"dependsOn\": [\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault')]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('machineLearningName'))]\",\n        \"[subscriptionResourceId('Microsoft.Resources/resourceGroups', variables('rgName'))]\",\n        \"[extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('storageAccountName'))]\"\n      ]\n    }\n  ],\n  \"outputs\": {\n    \"USE_KEY_VAULT\": {\n      \"type\": \"string\",\n      \"value\": \"true\"\n    },\n    \"AZURE_KEY_VAULT_ENDPOINT\": {\n      \"type\": \"string\",\n      \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', 'keyvault'), '2022-09-01').outputs.endpoint.value]\"\n    },\n    \"AZURE_SEARCH_SERVICE_ENDPOINT\": {\n      \"type\": \"string\",\n      \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAISearchName')), '2022-09-01').outputs.endpoint.value]\"\n    },\n    
\"AZURE_SEARCH_USE_SEMANTIC_SEARCH\": {\n      \"type\": \"bool\",\n      \"value\": \"[parameters('azureSearchUseSemanticSearch')]\"\n    },\n    \"OPENAI_API_TYPE\": {\n      \"type\": \"string\",\n      \"value\": \"azure\"\n    },\n    \"OPENAI_ENDPOINT\": {\n      \"type\": \"string\",\n      \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureOpenAIResourceName')), '2022-09-01').outputs.endpoint.value]\"\n    },\n    \"OPENAI_API_VERSION\": {\n      \"type\": \"string\",\n      \"value\": \"2023-03-15-preview\"\n    },\n    \"AML_SUBSCRIPTION_ID\": {\n      \"type\": \"string\",\n      \"value\": \"[subscription().subscriptionId]\"\n    },\n    \"AML_WORKSPACE_NAME\": {\n      \"type\": \"string\",\n      \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('machineLearningName')), '2022-09-01').outputs.workspaceName.value]\"\n    },\n    \"AML_RESOURCE_GROUP_NAME\": {\n      \"type\": \"string\",\n      \"value\": \"[variables('rgName')]\"\n    },\n    \"AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT\": {\n      \"type\": \"string\",\n      \"value\": \"[reference(extensionResourceId(format('/subscriptions/{0}/resourceGroups/{1}', subscription().subscriptionId, variables('rgName')), 'Microsoft.Resources/deployments', parameters('azureAIDocumentIntelligenceResourceName')), '2022-09-01').outputs.endpoint.value]\"\n    }\n  }\n}"
  },
  {
    "path": "infra/network/azure_bastion.bicep",
    "content": "param vnetName string\nparam bastionName string\nparam bastionSubnetName string\nparam location string\nparam publicIpName string // Name of the existing public IP resource\n\nresource bastion 'Microsoft.Network/bastionHosts@2023-04-01' = {\n  name: bastionName\n  location: location\n  properties: {\n    dnsName: bastionName\n    ipConfigurations: [\n      {\n        name: 'bastionIpConfig'\n        properties: {\n          subnet: {\n            id: resourceId('Microsoft.Network/virtualNetworks/subnets', vnetName, bastionSubnetName)\n          }\n          publicIPAddress: {\n            id: resourceId('Microsoft.Network/publicIPAddresses', publicIpName)\n          }\n        }\n      }\n    ]\n  }\n}\n\noutput bastionFqdn string = bastion.properties.dnsName\n"
  },
  {
    "path": "infra/network/network_isolation.bicep",
    "content": "param vnetName string\nparam location string\n\n@minLength(1)\nparam vnetAddressSpace string\nparam proxySubnetName string\n\n@minLength(1)\nparam proxySubnetAddressSpace string\nparam azureSubnetName string\n\n@minLength(1)\nparam azureSubnetAddressSpace string\nparam resourcePrefix string\nparam azureResources array\n\nresource vnet 'Microsoft.Network/virtualNetworks@2020-06-01' = {\n  name: vnetName\n  location: location\n  properties: {\n    addressSpace: {\n      addressPrefixes: [\n        vnetAddressSpace\n      ]\n    }\n    subnets: [\n      {\n        name: proxySubnetName\n        properties: {\n          addressPrefix: proxySubnetAddressSpace\n        }\n      }\n      {\n        name: azureSubnetName\n        properties: {\n          addressPrefix: azureSubnetAddressSpace\n        }\n      }\n    ]\n  }\n}\n\nresource privateEndpoints 'Microsoft.Network/privateEndpoints@2020-07-01' = [\n  for (resource, i) in azureResources: {\n    name: '${resourcePrefix}${resource.type}PrivateEndpoint'\n    location: location\n    properties: {\n      privateLinkServiceConnections: [\n        {\n          name: '${resourcePrefix}${resource.type}PLSConnection'\n          properties: {\n            privateLinkServiceId: resource.resourceId\n            groupIds: [resource.type]\n          }\n        }\n      ]\n      subnet: {\n        id: '${vnet.id}/subnets/${azureSubnetName}'\n      }\n    }\n  }\n]\n"
  },
  {
    "path": "infra/shared/cognitiveservices.bicep",
    "content": "metadata description = 'Creates an Azure Cognitive Services instance.'\nparam name string\nparam location string = resourceGroup().location\nparam tags object = {}\n@description('The custom subdomain name used to access the API. Defaults to the value of the name parameter.')\nparam customSubDomainName string = name\nparam deployments array = []\nparam kind string = 'OpenAI'\n\n@allowed([ 'Enabled', 'Disabled' ])\nparam publicNetworkAccess string = 'Enabled'\nparam sku object = {\n  name: 'S0'\n}\n\nparam allowedIpRules array = []\nparam networkAcls object = empty(allowedIpRules) ? {\n  defaultAction: 'Allow'\n} : {\n  ipRules: allowedIpRules\n  defaultAction: 'Deny'\n}\n\nresource account 'Microsoft.CognitiveServices/accounts@2023-05-01' = {\n  name: name\n  location: location\n  tags: tags\n  kind: kind\n  properties: {\n    customSubDomainName: customSubDomainName\n    publicNetworkAccess: publicNetworkAccess\n    networkAcls: networkAcls\n  }\n  sku: sku\n}\n\n@batchSize(1)\nresource deployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = [for deployment in deployments: {\n  parent: account\n  name: deployment.name\n  properties: {\n    model: deployment.model\n    raiPolicyName: contains(deployment, 'raiPolicyName') ? deployment.raiPolicyName : null\n  }\n  sku: contains(deployment, 'sku') ? deployment.sku : {\n    name: 'Standard'\n    capacity: 30\n  }\n}]\n\noutput endpoint string = account.properties.endpoint\noutput id string = account.id\noutput name string = account.name\n"
  },
  {
    "path": "infra/shared/keyvault-secret.bicep",
    "content": "metadata description = 'Creates or updates a secret in an Azure Key Vault.'\nparam name string\nparam tags object = {}\nparam keyVaultName string\nparam contentType string = 'string'\n@description('The value of the secret. Provide only derived values like blob storage access, but do not hard code any secrets in your templates')\n@secure()\nparam secretValue string\n\nparam enabled bool = true\nparam exp int = 0\nparam nbf int = 0\n\nresource keyVaultSecret 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = {\n  name: name\n  tags: tags\n  parent: keyVault\n  properties: {\n    attributes: {\n      enabled: enabled\n      exp: exp\n      nbf: nbf\n    }\n    contentType: contentType\n    value: secretValue\n  }\n}\n\nresource keyVault 'Microsoft.KeyVault/vaults@2022-07-01' existing = {\n  name: keyVaultName\n}\n"
  },
  {
    "path": "infra/shared/keyvault.bicep",
    "content": "param name string\nparam location string = resourceGroup().location\nparam tags object = {}\n\n@description('Service principal will be granted read access to the KeyVault. If unset, no service principal is granted access by default')\nparam principalId string = ''\n\nvar defaultAccessPolicies = !empty(principalId) ? [\n  {\n    objectId: principalId\n    permissions: { secrets: [ 'get', 'set', 'list' ] }\n    tenantId: subscription().tenantId\n  }\n] : []\n\nresource keyVault 'Microsoft.KeyVault/vaults@2022-07-01' = {\n  name: name\n  location: location\n  tags: tags\n  properties: {\n    tenantId: subscription().tenantId\n    sku: { family: 'A', name: 'standard' }\n    enabledForTemplateDeployment: true\n    accessPolicies: union(defaultAccessPolicies, [\n      // define access policies here\n    ])\n  }\n}\n\noutput id string = keyVault.id\noutput endpoint string = keyVault.properties.vaultUri\noutput name string = keyVault.name\n"
  },
  {
    "path": "infra/shared/machineLearning.bicep",
    "content": "metadata description = 'Creates an Azure Machine Learning Workspace.'\nparam name string\nparam location string = resourceGroup().location\nparam tags object = {}\nparam storageAccount string\nparam keyVault string\nparam applicationInsights string\n\nresource machineLearningWorkspace 'Microsoft.MachineLearningServices/workspaces@2023-06-01-preview' = {\n  name: name\n  location: location \n  identity: {\n    type: 'systemAssigned'\n  }\n  tags: tags\n  properties: {\n    storageAccount: storageAccount\n    keyVault: keyVault\n    applicationInsights: applicationInsights\n  }\n}\n\noutput workspaceName string = machineLearningWorkspace.name\noutput workspaceId string = machineLearningWorkspace.id\n"
  },
  {
    "path": "infra/shared/monitoring.bicep",
    "content": "param logAnalyticsName string\nparam applicationInsightsName string\nparam location string = resourceGroup().location\nparam tags object = {}\n\nresource logAnalytics 'Microsoft.OperationalInsights/workspaces@2021-12-01-preview' = {\n  name: logAnalyticsName\n  location: location\n  tags: tags\n  properties: any({\n    retentionInDays: 30\n    features: {\n      searchVersion: 1\n    }\n    sku: {\n      name: 'PerGB2018'\n    }\n  })\n}\n\nresource applicationInsights 'Microsoft.Insights/components@2020-02-02' = {\n  name: applicationInsightsName\n  location: location\n  tags: tags\n  kind: 'web'\n  properties: {\n    Application_Type: 'web'\n    WorkspaceResourceId: logAnalytics.id\n  }\n}\n\noutput applicationInsightsId string = applicationInsights.id\noutput applicationInsightsName string = applicationInsights.name\noutput logAnalyticsWorkspaceId string = logAnalytics.id\noutput logAnalyticsWorkspaceName string = logAnalytics.name\n"
  },
  {
    "path": "infra/shared/search-services.bicep",
    "content": "metadata description = 'Creates an Azure AI Search instance.'\nparam name string\nparam location string = resourceGroup().location\nparam tags object = {}\n\nparam sku object = {\n  name: 'standard'\n}\n\nparam authOptions object = {}\nparam disableLocalAuth bool = false\nparam encryptionWithCmk object = {\n  enforcement: 'Unspecified'\n}\n@allowed([\n  'default'\n  'highDensity'\n])\nparam hostingMode string = 'default'\nparam networkRuleSet object = {\n  bypass: 'None'\n  ipRules: []\n}\nparam partitionCount int = 1\n@allowed([\n  'enabled'\n  'disabled'\n])\nparam publicNetworkAccess string = 'enabled'\nparam replicaCount int = 1\n@allowed([\n  'disabled'\n  'free'\n  'standard'\n])\nparam semanticSearch string = 'disabled'\n\nresource search 'Microsoft.Search/searchServices@2023-11-01' = {\n  name: name\n  location: location\n  tags: tags\n  identity: {\n    type: 'SystemAssigned'\n  }\n  properties: {\n    authOptions: authOptions\n    disableLocalAuth: disableLocalAuth\n    encryptionWithCmk: encryptionWithCmk\n    hostingMode: hostingMode\n    networkRuleSet: networkRuleSet\n    partitionCount: partitionCount\n    publicNetworkAccess: publicNetworkAccess\n    replicaCount: replicaCount\n    semanticSearch: semanticSearch\n  }\n  sku: sku\n}\n\noutput id string = search.id\noutput endpoint string = 'https://${name}.search.windows.net/'\noutput name string = search.name\n"
  },
  {
    "path": "infra/shared/storage.bicep",
    "content": "metadata description = 'Creates an Azure storage account.'\nparam name string\nparam location string = resourceGroup().location\nparam tags object = {}\n\n@allowed([\n  'Cool'\n  'Hot'\n  'Premium' ])\nparam accessTier string = 'Hot'\nparam allowBlobPublicAccess bool = true\nparam allowCrossTenantReplication bool = true\nparam allowSharedKeyAccess bool = true\nparam containers array = []\nparam defaultToOAuthAuthentication bool = false\nparam deleteRetentionPolicy object = {}\n@allowed([ 'AzureDnsZone', 'Standard' ])\nparam dnsEndpointType string = 'Standard'\nparam kind string = 'StorageV2'\nparam minimumTlsVersion string = 'TLS1_2'\nparam queues array = []\nparam supportsHttpsTrafficOnly bool = true\nparam networkAcls object = {\n  bypass: 'AzureServices'\n  defaultAction: 'Allow'\n}\n@allowed([ 'Enabled', 'Disabled' ])\nparam publicNetworkAccess string = 'Enabled'\nparam sku object = { name: 'Standard_LRS' }\n\nresource storage 'Microsoft.Storage/storageAccounts@2022-05-01' = {\n  name: name\n  location: location\n  tags: tags\n  kind: kind\n  sku: sku\n  properties: {\n    accessTier: accessTier\n    allowBlobPublicAccess: allowBlobPublicAccess\n    allowCrossTenantReplication: allowCrossTenantReplication\n    allowSharedKeyAccess: allowSharedKeyAccess\n    defaultToOAuthAuthentication: defaultToOAuthAuthentication\n    dnsEndpointType: dnsEndpointType\n    minimumTlsVersion: minimumTlsVersion\n    networkAcls: networkAcls\n    publicNetworkAccess: publicNetworkAccess\n    supportsHttpsTrafficOnly: supportsHttpsTrafficOnly\n  }\n\n  resource blobServices 'blobServices' = if (!empty(containers)) {\n    name: 'default'\n    properties: {\n      deleteRetentionPolicy: deleteRetentionPolicy\n    }\n    resource container 'containers' = [for container in containers: {\n      name: container.name\n      properties: {\n        publicAccess: contains(container, 'publicAccess') ? 
container.publicAccess : 'None'\n      }\n    }]\n  }\n\n  resource queueServices 'queueServices' = if (!empty(queues)) {\n    name: 'default'\n    properties: {\n      cors: {\n        corsRules: []\n      }\n    }\n    resource queue 'queues' = [for queue in queues: {\n      name: queue.name\n      properties: {\n        metadata: {}\n      }\n    }]\n  }\n}\n\noutput name string = storage.name\noutput id string = storage.id\noutput primaryEndpoints object = storage.properties.primaryEndpoints\n"
  },
  {
    "path": "infra/shared/storekeys.bicep",
    "content": "param keyVaultName string = ''\nparam azureOpenAIName string = ''\nparam documentIntelligenceName string = ''\nparam azureAISearchName string = ''\nparam rgName string = ''\n// Do not use _ in the key names as it is not allowed in the key vault secret name\nparam openAIKeyName string = 'openai-api-key'\nparam documentIntelligenceKeyName string = 'azure-document-intelligence-admin-key'\nparam searchKeyName string = 'azure-search-admin-key'\n\nresource openAIKeySecret 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = {\n  parent: keyVault\n  name: openAIKeyName\n  properties: {\n    contentType: 'string'\n    value: listKeys(\n      resourceId(subscription().subscriptionId, rgName, 'Microsoft.CognitiveServices/accounts', azureOpenAIName),\n      '2023-05-01'\n    ).key1\n  }\n}\n\nresource documentIntelligenceKeySecret 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = {\n  parent: keyVault\n  name: documentIntelligenceKeyName\n  properties: {\n    contentType: 'string'\n    value: listKeys(\n      resourceId(\n        subscription().subscriptionId,\n        rgName,\n        'Microsoft.CognitiveServices/accounts',\n        documentIntelligenceName\n      ),\n      '2023-05-01'\n    ).key1\n  }\n}\n\nresource searchKeySecret 'Microsoft.KeyVault/vaults/secrets@2022-07-01' = {\n  parent: keyVault\n  name: searchKeyName\n  properties: {\n    contentType: 'string'\n    value: listAdminKeys(\n      resourceId(subscription().subscriptionId, rgName, 'Microsoft.Search/searchServices', azureAISearchName),\n      '2023-11-01'\n    ).primaryKey\n  }\n}\n\nresource keyVault 'Microsoft.KeyVault/vaults@2022-07-01' existing = {\n  name: keyVaultName\n}\n\noutput SEARCH_KEY_NAME string = searchKeySecret.name\noutput OPENAI_KEY_NAME string = openAIKeySecret.name\noutput DOCUMENTINTELLIGENCE_KEY_NAME string = documentIntelligenceKeySecret.name\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/README.md",
    "content": "# **RAG Experiment Accelerator** with Prompt Flow\n\n## Flow description\nThe RAG Experiment Accelerator is a versatile tool that helps you conduct experiments and evaluations using Azure AI Search and RAG pattern.\n\nThe top-level `flow.dag.yaml` runs the rag experiment end-to-end based on the configuration provided in `config.json`.\n\n![rag-end-to-end-flow-diagram](./images/end_to_end_flow_diagram.png)\n\n### Set Up\nThe `setup` node runs first and loads the required environment variables from a custom connection\n\n### Index\nThe `index` node will:\n- Create indexes based on the parameters set in `config.json`. Each index name will be in the following format: `{index_name_prefix}-{chunk_size}-{overlap_size}-{dimension}-{ef_construction}-{ef_search}`\n- Chunk documents based on the chunking parameters in `config.json`\n- Generate a summary and title for each chunk\n- Create embeddings for each chunk's content, generated title, and generated summary\n- Uploads the embeddings to Azure AI search Service\n\nIf the indexes have been previously created, this node is optional and can be skipped by setting the input `should_index` to `False`.\n\n### QA Generation\nThe `qa_generation` node will chunk each document and generate ground truth questions and answers for each chunk.\n\nOptionally, this node can be skipped by setting the input `should_generate_qa` to `false` and a set of user-provided ground truth questions and answers can be used. User-provided questions and answers should be in the `jsonl` file format and by default, in the location `./artifacts/eval_data.jsonl`. This location can be configured by updating the `eval_data_file` value in `config.json`. 
Each line of the `jsonl` file should contain the keys:\n- `user_prompt` field contains the generated question\n- `output_prompt` field contains the generated answer\n- `context` field contains the document sections from which the question-answer pair was generated\n\n### Querying\nThe `querying` node takes the `user_prompt` values that were generated by the `qa_generation` node and searches Azure AI Search using the `search_types` specified in `config.json`.\n\nFor each `user_prompt` and `search_type`:\n- If the `user_prompt` is complex, it is broken down into multiple prompts and both prompts are used in Azure AI Search\n- The search results are optionally reranked based on the `rerank` setting in `config.json`\n- Search result metrics are calculated\n- The content from the search results is added as context to the `user_prompt` and the LLM is called.\n- The responses from the LLM are uploaded as a data asset and used by the evaluation node.\n\n\n### Evaluation\nThe `evaluation` node takes the results generated by the `querying` node and logs the metrics specified in `config.json` to mlflow. 
The metrics and configuration parameters can be inspected and compared to past experiments in your ML workspace by selecting the `Jobs` tab under `Assets` and clicking on the latest experiment run.\n\n\n## Prerequisites\n- [Azure AI Search Service](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal) (Note: [Semantic Search](https://learn.microsoft.com/en-us/azure/search/search-get-started-semantic?tabs=dotnet) is available in Azure AI Search Service, at Basic tier or higher.)\n- [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#how-do-i-get-access-to-azure-openai)\n- [Azure Machine Learning Resources](https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-azure-ml-in-a-day?view=azureml-api-2)\n\n\n## Getting Started\n\n### VSCode\nTo run the **RAG Experiment Accelerator** end-to-end in VSCode, follow these steps:\n1. Ensure you have installed the promptflow extension and pip installed the promptflow and promptflow-tools packages\n2. Run: `pip install ./custom_environment/rag_experiment_accelerator-0.9-py3-none-any.whl`\n3. Create a custom connection. See [env_setup](./env_setup.md).\n4. Add your own documents to the `./data` folder. (A set of sample documents is provided for testing purposes.)\n5. Modify the `config.json` file with the hyperparameters for your experiment. Full documentation on the configuration elements can be found [here](#description-of-configuration-elements)\n6. Run the flow from the extension UI or from the CLI by running `pf flow test --flow ./flow.dag.yaml`\n7. Inspect the results in your ML workspace by selecting the `Jobs` tab under `Assets`. Click on the latest experiment run to view the metrics and results.\n\n\n### ML Workspace\n\nTo run the **RAG Experiment Accelerator** end-to-end in Prompt Flow, follow these steps:\n\n1. 
Create a custom environment using the provided Dockerfile (this will take several minutes)\n```bash\naz login\n\naz account set --subscription <subscription ID>\n\naz extension add --name ml\n\naz configure --defaults workspace=$MLWorkSpaceName group=$ResourceGroupName\n\ncd ./custom_environment\n\naz ml environment create --file ./environment.yaml -w $MLWorkSpaceName\n```\n2. Create a custom runtime using the newly created environment. See [Create runtime in UI](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/how-to-create-manage-runtime?view=azureml-api-2#create-runtime-in-ui).\n3. Modify the `config.json` file with the hyperparameters for your experiment. Full documentation on the configuration elements can be found [here](#description-of-configuration-elements)\n4. Create a custom connection. See [env_setup](./env_setup.md).\n5. Add your own documents to the `./data` folder. (A set of sample documents is provided for testing purposes.)\n6. Upload the flow to the ML workspace ![how to upload a local flow](./images/upload_local_flow.png)\n7. Ensure you have also uploaded `config.json` and optionally `prompt_config.json` to the ML workspace.\n8. Select the custom runtime in Prompt Flow\n9. Click run in the UI\n10. Inspect the results in your ML workspace by selecting the `Jobs` tab under `Assets`. Click on the latest experiment run to view the metrics and results.\n\n\n## Description of configuration elements\n\n```json\n{\n    \"index_name_prefix\": \"Search index name prefix used for tracking and comparing jobs\",\n    \"chunking\": {\n        \"chunk_size\": \"Size of each chunk e.g. [500, 1000, 2000]\" ,\n        \"overlap_size\": \"Overlap Size for each chunk e.g. [100, 200, 300]\"\n    },\n    \"embedding_dimension\" : \"embedding size for each chunk e.g. [384, 1024]. 
Valid values are 384, 768, 1024\" ,\n    \"ef_construction\" : \"ef_construction value determines the value of Azure AI Search vector configuration.\" ,\n    \"ef_search\":  \"ef_search value determines the value of Azure AI Search vector configuration.\",\n    \"language\": {\n        \"analyzer_name\" : \"name of the analyzer to use for the field. This option can be used only with searchable fields and it can't be set together with either searchAnalyzer or indexAnalyzer.\",\n        \"index_analyzer_name\" : \"name of the analyzer used at indexing time for the field. This option can be used only with searchable fields. It must be set together with searchAnalyzer and it cannot be set together with the analyzer option.\",\n        \"search_analyzer_name\" : \"name of the analyzer used at search time for the field. This option can be used only with searchable fields. It must be set together with indexAnalyzer and it cannot be set together with the analyzer option. This property cannot be set to the name of a language analyzer; use the analyzer property instead if you need a language analyzer.\"\n    },\n    \"experiment_name\": \"name of the experiment\",\n    \"rerank\": \"determines if search results should be re-ranked. Valid values are TRUE or FALSE\" ,\n    \"rerank_type\": \"determines the type of re-ranking. Valid values are llm or cross_encoder\",\n    \"llm_rerank_threshold\": \"determines the threshold when using llm re-ranking. Chunks with rank above this number are selected in range from 1 - 10.\" ,\n    \"cross_encoder_at_k\": \"determines the threshold when using cross-encoding re-ranking. Chunks with given rank value are selected.\" ,\n    \"cross_encoder_model\" :\"determines the model used for cross-encoding re-ranking step. Valid value is cross-encoder/stsb-roberta-base\",\n    \"search_types\" : \"determines the search types used for experimentation. 
Valid values are search_for_match_semantic, search_for_match_Hybrid_multi, search_for_match_Hybrid_cross, search_for_match_text, search_for_match_pure_vector, search_for_match_pure_vector_multi, search_for_match_pure_vector_cross, search_for_manual_hybrid. e.g. ['search_for_manual_hybrid', 'search_for_match_Hybrid_multi','search_for_match_semantic' ]\",\n    \"retrieve_num_of_documents\": \"determines the number of chunks to retrieve from the search index\",\n    \"metric_types\" : \"determines the metrics used for evaluation purposes. Valid values are lcsstr, lcsseq, cosine, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2, llm_context_precision, llm_answer_relevance. e.g. ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens']\",\n    \"azure_oai_chat_deployment_name\":  \"determines the Azure OpenAI chat deployment name\",\n    \"azure_oai_eval_deployment_name\":  \"determines the Azure OpenAI evaluation deployment name\",\n    \"embedding_model_name\": \"embedding model name\",\n    \"temperature\": \"determines the OpenAI temperature. Valid value ranges from 0 to 1.\",\n    \"search_relevancy_threshold\": \"the similarity threshold to determine if a doc is relevant. Valid ranges are from 0.0 to 1.0\",\n    \"eval_data_file\": \"the file path of the ground truth questions and answers. This must be a jsonl file and each line should contain the keys: user_prompt (question), output_prompt (answer), context (the document context that contains the answer)\"\n}\n```\n\n## Folder structure\n\nThe top-level `flow.dag.yaml` runs the RAG experiments end-to-end and each step can be run independently.\n\n- `data` - the directory for the documents. 
A set of sample documents is provided for testing\n- `custom_environment` - contains the `Dockerfile`, `environment.yaml` and the `rag-experiment-accelerator` `.whl` file. Building an image is necessary when running in Prompt Flow from the ML workspace.\n- `images` - contains the images used in this `README.md`\n\nFlows:\n- `setup` - (sets the necessary environment variables)\n- `index` - (contains the `index` flow)\n- `qa_generation` - (contains the `qa_generation` flow)\n- `querying` - (contains the `querying` flow)\n- `evaluation` - (contains the `evaluation` flow)\n\nEach step can also be run independently and the flow is contained in its corresponding folder. When running the flows independently, an initial `setup` will run to ensure the proper environment variables are set.\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/custom_environment/Dockerfile",
    "content": "FROM mcr.microsoft.com/azureml/promptflow/promptflow-runtime:20231207.v2\nCOPY ./rag_experiment_accelerator-0.9-py3-none-any.whl ./rag_experiment_accelerator-0.9-py3-none-any.whl\nRUN pip install rag_experiment_accelerator-0.9-py3-none-any.whl --verbose"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/custom_environment/environment.yaml",
    "content": "$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json\nname: rag-experiment-accelerator-environment\nbuild:\n  path: ./"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/env_setup.md",
    "content": "# Promptflow Secret Setup\n\n## Prerequisites\nInstall the dev-requirements and login to the az cli.\n``` bash\n# Install the dev requirements\npip install -r dev-requirements.txt \n\n# Login to the az cli\naz login\n```\n\n## AzureML Connections\nA Custom Connection is a generic connection type that stores and manages credentials required for interacting with LLMs. It has two dictionaries, `secrets` for secrets to be stored in Key Vault, and `configs` for non-secrets that are stored in the AzureML workspace.\n\n\nYou can create a custom connection in the AzureML workspace by following the instructions [here](https://learn.microsoft.com/en-us/azure/machine-learning/prompt-flow/tools-reference/python-tool?view=azureml-api-2#create-a-custom-connection). The key-value pairs required are listed in the Secrets and Configs sections.\n\nThe following variables are required to be set as secret:\n- AZURE_SEARCH_ADMIN_KEY\n- OPENAI_API_KEY\n- AML_SUBSCRIPTION_ID\n- AML_RESOURCE_GROUP_NAME\n- AML_WORKSPACE_NAME\n\nAnd the remaining variables must not be set as secret:\n- AZURE_SEARCH_SERVICE_ENDPOINT\n- OPENAI_ENDPOINT\n- OPENAI_API_VERSION\n\nThe following variables are optional:\n- AZURE_LANGUAGE_SERVICE_KEY - secret\n- AZURE_LANGUAGE_SERVICE_ENDPOINT - non secret\n- LOGGING_LEVEL - non secret\n\n## Configuring your connection locally \nTo configure promptflow to connect to AzureML, you need to update the top level `.azureml/config.json` file with the `workspace_name`, `resource_group`, and `subscription_id` that your connection is stored in. 
You can find more information about this in the [documentation](https://microsoft.github.io/promptflow/how-to-guides/set-global-configs.html#azureml).\n\nTo update the local promptflow connection provider to look for AzureML connections, you can use the following code:\n``` bash\n# Set your promptflow connection provider to azureml\npf config set connection.provider=azureml\n\n# Verify that the connection appears\npf connection list\n```\nNote: Depending on the context you're running the `pf` commands from, you may need to move the `.azureml` folder into the root of the repository.\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/evaluation/evaluation.py",
    "content": "from promptflow import tool\r\nimport mlflow\r\n\r\nfrom rag_experiment_accelerator.evaluation.eval import get_run_tags\r\nfrom rag_experiment_accelerator.run.evaluation import run, initialise_mlflow_client\r\nfrom rag_experiment_accelerator.config.environment import Environment\r\nfrom rag_experiment_accelerator.config.config import Config\r\nfrom rag_experiment_accelerator.config.paths import (\r\n    mlflow_run_name,\r\n    formatted_datetime_suffix,\r\n)\r\n\r\n\r\n@tool\r\ndef my_python_tool(config_path: str) -> bool:\r\n    environment = Environment.from_env_or_keyvault()\r\n    config = Config.from_path(environment, config_path)\r\n    mlflow_client = initialise_mlflow_client(environment, config)\r\n    name_suffix = formatted_datetime_suffix()\r\n\r\n    mlflow.set_tags(get_run_tags(config))\r\n    with mlflow.start_run(run_name=mlflow_run_name(config.job_name, name_suffix)):\r\n        mlflow.set_tags()\r\n        for index_config in config.index.flatten():\r\n            run(\r\n                environment,\r\n                config,\r\n                index_config,\r\n                mlflow_client,\r\n                name_suffix,\r\n            )\r\n    return True\r\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/evaluation/flow.dag.yaml",
    "content": "inputs:\n  config_dir:\n    type: string\n    default: ../\noutputs: {}\nnodes:\n- name: setup_env\n  type: python\n  source:\n    type: code\n    path: ../setup/setup_env.py\n  inputs:\n    connection: \"\"\n- name: evaluation\n  type: python\n  source:\n    type: code\n    path: evaluation.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${setup_env.output}\n    is: true\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/flow.dag.yaml",
    "content": "inputs:\n  should_index:\n    type: bool\n    default: true\n  config_dir:\n    type: string\n    default: ../..\n  should_generate_qa:\n    type: bool\n    default: true\noutputs: {}\nnodes:\n- name: setup\n  type: python\n  source:\n    type: code\n    path: setup/setup_env.py\n  inputs:\n    connection: \"\"\n- name: index\n  type: python\n  source:\n    type: code\n    path: index/create_index.py\n  inputs:\n    should_index: ${inputs.should_index}\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${setup.output}\n    is: true\n- name: generate_qa\n  type: python\n  source:\n    type: code\n    path: qa_generation/generate_qa.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n    should_generate_qa: ${inputs.should_generate_qa}\n  activate:\n    when: ${index.output}\n    is: true\n- name: querying\n  type: python\n  source:\n    type: code\n    path: querying/querying.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${generate_qa.output}\n    is: true\n- name: evaluation\n  type: python\n  source:\n    type: code\n    path: evaluation/evaluation.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${querying.output}\n    is: true\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/index/create_index.py",
    "content": "from promptflow import tool\r\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\r\nfrom rag_experiment_accelerator.run.index import run\r\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\r\nfrom rag_experiment_accelerator.config.environment import Environment\r\nfrom rag_experiment_accelerator.config.config import Config\r\n\r\n\r\n@tool\r\ndef my_python_tool(should_index: bool, config_path: str) -> bool:\r\n    environment = Environment.from_env_or_keyvault()\r\n    config = Config.from_path(environment, config_path)\r\n    init_checkpoint(config)\r\n\r\n    if should_index:\r\n        file_paths = get_all_file_paths(config.path.data_dir)\r\n        for index_config in config.index.flatten():\r\n            run(environment, config, index_config, file_paths)\r\n    return True\r\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/index/flow.dag.yaml",
    "content": "inputs:\n  config_dir:\n    type: string\n    default: ../\n  should_index:\n    type: bool\n    default: true\noutputs: {}\nnodes:\n- name: setup_env\n  type: python\n  source:\n    type: code\n    path: ../setup/setup_env.py\n  inputs:\n    connection: \"\"\n- name: create_index\n  type: python\n  source:\n    type: code\n    path: create_index.py\n  inputs:\n    should_index: true\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${setup_env.output}\n    is: true\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/qa_generation/flow.dag.yaml",
    "content": "inputs:\n  config_dir:\n    type: string\n    default: ../\noutputs: {}\nnodes:\n- name: setup_env\n  type: python\n  source:\n    type: code\n    path: ../setup/setup_env.py\n  inputs:\n    connection: \"\"\n- name: generate_qa\n  type: python\n  source:\n    type: code\n    path: generate_qa.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${setup_env.output}\n    is: true\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/qa_generation/generate_qa.py",
    "content": "from promptflow import tool\r\nfrom rag_experiment_accelerator.run.qa_generation import run\r\nfrom rag_experiment_accelerator.config.config import Config\r\nfrom rag_experiment_accelerator.config.environment import Environment\r\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\r\n\r\n\r\n@tool\r\ndef my_python_tool(config_path: str, should_generate_qa: bool) -> bool:\r\n    environment = Environment.from_env_or_keyvault()\r\n    config = Config.from_path(environment, config_path)\r\n\r\n    if should_generate_qa:\r\n        run(environment, config, get_all_file_paths(config.path.data_dir))\r\n    return True\r\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/querying/flow.dag.yaml",
    "content": "inputs:\n  config_dir:\n    type: string\n    default: ../\noutputs: {}\nnodes:\n- name: setup_env\n  type: python\n  source:\n    type: code\n    path: ../setup/setup_env.py\n  inputs:\n    connection: \"\"\n- name: querying\n  type: python\n  source:\n    type: code\n    path: querying.py\n  inputs:\n    config_dir: ${inputs.config_dir}\n  activate:\n    when: ${setup_env.output}\n    is: true\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/querying/querying.py",
    "content": "from promptflow import tool\r\nfrom rag_experiment_accelerator.run.querying import run\r\nfrom rag_experiment_accelerator.config.environment import Environment\r\nfrom rag_experiment_accelerator.config.config import Config\r\n\r\n\r\n@tool\r\ndef my_python_tool(config_path: str) -> bool:\r\n    environment = Environment.from_env_or_keyvault()\r\n    config = Config.from_path(environment, config_path)\r\n\r\n    for index_config in config.index.flatten():\r\n        run(environment, config, index_config)\r\n    return True\r\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/setup/flow.dag.yaml",
    "content": "inputs: {}\noutputs: {}\nnodes:\n- name: setup_env\n  type: python\n  source:\n    type: code\n    path: setup_env.py\n"
  },
  {
    "path": "promptflow/rag-experiment-accelerator/setup/setup_env.py",
    "content": "import os\n\nfrom promptflow.connections import CustomConnection\n\nfrom promptflow import tool\n\n\n@tool\ndef my_python_tool(connection: CustomConnection):\n    os.environ[\"AZURE_SEARCH_SERVICE_ENDPOINT\"] = connection.configs[\n        \"AZURE_SEARCH_SERVICE_ENDPOINT\"\n    ]\n    os.environ[\"AZURE_SEARCH_ADMIN_KEY\"] = connection.secrets[\"AZURE_SEARCH_ADMIN_KEY\"]\n    os.environ[\"OPENAI_API_KEY\"] = connection.secrets[\"OPENAI_API_KEY\"]\n    os.environ[\"OPENAI_API_TYPE\"] = \"azure\"\n    os.environ[\"OPENAI_ENDPOINT\"] = connection.configs[\"OPENAI_ENDPOINT\"]\n    os.environ[\"OPENAI_API_VERSION\"] = connection.configs[\"OPENAI_API_VERSION\"]\n    os.environ[\"AML_SUBSCRIPTION_ID\"] = connection.secrets[\"AML_SUBSCRIPTION_ID\"]\n    os.environ[\"AML_RESOURCE_GROUP_NAME\"] = connection.secrets[\n        \"AML_RESOURCE_GROUP_NAME\"\n    ]\n    os.environ[\"AML_WORKSPACE_NAME\"] = connection.secrets[\"AML_WORKSPACE_NAME\"]\n\n    if \"AZURE_LANGUAGE_SERVICE_KEY\" in connection.secrets:\n        os.environ[\"AZURE_LANGUAGE_SERVICE_KEY\"] = connection.secrets[\n            \"AZURE_LANGUAGE_SERVICE_KEY\"\n        ]\n\n    if \"AZURE_LANGUAGE_SERVICE_ENDPOINT\" in connection.configs:\n        os.environ[\"AZURE_LANGUAGE_SERVICE_ENDPOINT\"] = connection.configs[\n            \"AZURE_LANGUAGE_SERVICE_ENDPOINT\"\n        ]\n\n    if \"LOGGING_LEVEL\" in connection.configs:\n        os.environ[\"LOGGING_LEVEL\"] = connection.configs[\"LOGGING_LEVEL\"]\n\n    return True\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.pylic]\nsafe_licenses = [\n    \"Apache Software License\",\n    \"Apache License 2.0\",\n    \"MIT License\",\n    \"Python Software Foundation License\",\n    \"Mozilla Public License 2.0 (MPL 2.0)\",\n]\nunsafe_packages = [\n    \"unlicensedPackage\",\n]\nignore_packages = [\n    \"ignoredPackage\",\n]"
  },
  {
    "path": "rag_experiment_accelerator/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/artifact/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/artifact_handler.py",
    "content": "import time\n\nfrom rag_experiment_accelerator.artifact.handlers.exceptions import LoadException\nfrom rag_experiment_accelerator.artifact.handlers.typing import T, U\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass ArtifactHandler:\n    \"\"\"\n    A class that handles artifacts, including loading, archiving, and saving.\n\n    Args:\n        data_location (str): The location of the data.\n        writer (V): The writer object used for writing data.\n        loader (U): The loader object used for loading data.\n\n    Attributes:\n        data_location (str): The location of the data.\n        archive_location (str): The location where archived artifacts are stored.\n        _writer (T): The writer object used for writing data.\n        _loader (U): The loader object used for loading data.\n    \"\"\"\n\n    def __init__(self, data_location: str, writer: T, loader: U) -> None:\n        self.data_location = data_location\n        self.archive_location = f\"{self.data_location}/archive\"\n        self._writer = writer\n        self._loader = loader\n\n    def load(self, name: str, **kwargs) -> list:\n        \"\"\"\n        Load artifacts from a specific path.\n\n        Args:\n            name (str): The name of the artifact.\n            **kwargs: Additional keyword arguments to be passed to the loader.\n\n        Returns:\n            list: The loaded artifacts.\n\n        Raises:\n            LoaderException: If the loader cannot handle the specified path or if no data is loaded.\n        \"\"\"\n        path = f\"{self.data_location}/{name}\"\n        # ensure loader can handle path\n        if not self._loader.can_handle(path):\n            raise LoadException(path=path)\n\n        # load the data\n        logger.info(f\"Loading artifacts from path: {path}\")\n        loaded_data = self._loader.load(path=path, **kwargs)\n\n        # raise if no data loaded\n        if len(loaded_data) 
== 0:\n            raise LoadException(path=path)\n\n        logger.info(f\"Loaded {len(loaded_data)} artifacts from path: {path}\")\n        return loaded_data\n\n    def handle_archive(self, name: str) -> str | None:\n        \"\"\"\n        Handle archiving of artifacts.\n\n        Args:\n            name (str): The name of the artifact.\n\n        Returns:\n            str | None: The path of the archived artifact, or None if no archiving is performed.\n\n        Raises:\n            None\n        \"\"\"\n        src = f\"{self.data_location}/{name}\"\n\n        # archive if exists, else no-op\n        if self._writer.exists(src):\n            # timestamp filename in archive location\n            timestamp = time.strftime(\"%Y%m%d-%H%M%S\")\n            dest = f\"{self.archive_location}/{timestamp}-{name}\"\n\n            # copy to archive\n            self._writer.copy(src, dest)\n\n            # delete original\n            self._writer.delete(src)\n            logger.info(f\"Archived previous artifacts to {dest}\")\n\n            return dest\n\n        logger.debug(f\"No file to archive at path: {src}\")\n\n    def save_dict(self, data: dict, name: str, **kwargs):\n        \"\"\"\n        Save a dictionary of artifacts to a specific path.\n\n        Args:\n            data (dict): The dictionary of artifacts to be saved.\n            name (str): The name of the artifact.\n            **kwargs: Additional keyword arguments to be passed to the writer.\n\n        Returns:\n            None\n\n        Raises:\n            None\n        \"\"\"\n        path = f\"{self.data_location}/{name}\"\n        logger.info(f\"Saving artifacts to path: {path}\")\n\n        self._writer.write(path, data, **kwargs)\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/exceptions.py",
    "content": "class LoadException(Exception):\n    def __init__(self, path: str):\n        super().__init__(\n            f\"Cannot load at path: {path}. Please ensure it is supported by the loader.\"\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/query_output_handler.py",
    "content": "from rag_experiment_accelerator.artifact.handlers.artifact_handler import (\n    ArtifactHandler,\n)\nfrom rag_experiment_accelerator.artifact.handlers.typing import T, U\nfrom rag_experiment_accelerator.artifact.models.query_output import QueryOutput\nfrom rag_experiment_accelerator.io.local.loaders.jsonl_loader import JsonlLoader\nfrom rag_experiment_accelerator.io.local.writers.jsonl_writer import JsonlWriter\n\n\nclass QueryOutputHandler(ArtifactHandler):\n    \"\"\"\n    A class that handles query outputs for a given index name.\n    \"\"\"\n\n    def __init__(\n        self, data_location: str, writer: T = JsonlWriter(), loader: U = JsonlLoader()\n    ) -> None:\n        \"\"\"\n        Initializes a QueryOutputHandler instance.\n\n        Args:\n            data_location (str): The location where the data is stored.\n            writer (T, optional): The writer to use for saving data. Defaults to JsonlWriter().\n            loader (U, optional): The loader to use for loading data. 
Defaults to JsonlLoader().\n        \"\"\"\n        super().__init__(data_location=data_location, writer=writer, loader=loader)\n\n    def _get_output_name(\n        self, index_name: str, experiment_name: str, job_name: str\n    ) -> str:\n        \"\"\"\n        Returns the output name for a given index name.\n\n        Args:\n            index_name (str): The name of the index.\n\n        Returns:\n            str: The output name.\n        \"\"\"\n        return f\"eval_output_{index_name}_{experiment_name}_{job_name}.jsonl\"\n\n    def get_output_path(\n        self, index_name: str, experiment_name: str, job_name: str\n    ) -> str:\n        \"\"\"\n        Returns the output path for a given index name.\n\n        Args:\n            index_name (str): The name of the index.\n\n        Returns:\n            str: The output path.\n        \"\"\"\n        return f\"{self.data_location}/{self._get_output_name(index_name, experiment_name, job_name)}\"\n\n    def load(\n        self, index_name: str, experiment_name: str, job_name: str\n    ) -> list[QueryOutput]:\n        \"\"\"\n        Loads the query outputs for a given index name.\n\n        Args:\n            index_name (str): The name of the index.\n\n        Returns:\n            list[QueryOutput]: The loaded query outputs.\n        \"\"\"\n        output_name = self._get_output_name(index_name, experiment_name, job_name)\n\n        query_outputs = []\n        data_load = super().load(output_name)\n        for d in data_load:\n            if not isinstance(d, dict):\n                raise TypeError(\n                    f\"Query output data loaded is not of type dict. 
Name: {output_name}\"\n                )\n            query_outputs.append(QueryOutput(**d))\n\n        return query_outputs\n\n    def handle_archive_by_index(\n        self, index_name: str, experiment_name: str, job_name: str\n    ) -> str | None:\n        \"\"\"\n        Handles archiving of query output for a given index name.\n\n        Args:\n            index_name (str): The name of the index.\n\n        Returns:\n            str | None: The output filename if successful, None otherwise.\n        \"\"\"\n        output_filename = self._get_output_name(index_name, experiment_name, job_name)\n        return self.handle_archive(output_filename)\n\n    def save(\n        self, data: QueryOutput, index_name: str, experiment_name: str, job_name: str\n    ):\n        \"\"\"\n        Saves the query output for a given index name.\n\n        Args:\n            data (QueryOutput): The query output to be saved.\n            index_name (str): The name of the index.\n        \"\"\"\n        output_filename = self._get_output_name(index_name, experiment_name, job_name)\n        self.save_dict(data.__dict__, output_filename)\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/tests/test_artifact_handler.py",
    "content": "from unittest.mock import Mock\nimport pytest\n\nfrom rag_experiment_accelerator.artifact.handlers.artifact_handler import (\n    ArtifactHandler,\n)\nfrom rag_experiment_accelerator.artifact.handlers.exceptions import LoadException\n\n\ndef test_loads():\n    data = \"This is test data\"\n    mock_writer = Mock()\n    mock_loader = Mock()\n    mock_loader.can_handle.return_value = True\n    mock_loader.load.return_value = [data]\n\n    handler = ArtifactHandler(\"data_location\", writer=mock_writer, loader=mock_loader)\n\n    name = \"test.jsonl\"\n    loaded_data = handler.load(name)\n\n    assert loaded_data == [data]\n\n\ndef test_save_dict():\n    mock_writer = Mock()\n    mock_loader = Mock()\n\n    handler = ArtifactHandler(\"data_location\", writer=mock_writer, loader=mock_loader)\n\n    dict_to_save = {\"testing\": 123, \"mic\": \"check\"}\n    artifact_name = \"test.jsonl\"\n    handler.save_dict(dict_to_save, \"test.jsonl\")\n    path = f\"{handler.data_location}/{artifact_name}\"\n\n    assert mock_writer.write.call_count == 1\n    assert mock_writer.write.called_with(dict_to_save, path)\n\n\ndef test_loads_raises_no_data_returned():\n    mock_writer = Mock()\n    mock_loader = Mock()\n    mock_loader.can_handle.return_value = True\n    mock_loader.load.return_value = []\n    handler = ArtifactHandler(\"data_location\", writer=mock_writer, loader=mock_loader)\n    name = \"test.jsonl\"\n\n    with pytest.raises(LoadException):\n        handler.load(name)\n\n\ndef test_load_raises_cant_handle():\n    mock_writer = Mock()\n    mock_loader = Mock()\n    handler = ArtifactHandler(\"data_location\", writer=mock_writer, loader=mock_loader)\n\n    mock_loader.can_handle.return_value = False\n\n    with pytest.raises(LoadException):\n        handler.load(\"test.txt\")\n\n\ndef test_handle_archive():\n    mock_writer = Mock()\n    mock_loader = Mock()\n    mock_writer.exists.return_value = True\n    data_location = \"data_location\"\n    handler 
= ArtifactHandler(data_location, writer=mock_writer, loader=mock_loader)\n\n    name = \"test.jsonl\"\n    dest = handler.handle_archive(name)\n\n    src = f\"{data_location}/{name}\"\n    mock_writer.copy.assert_called_once_with(src, dest)\n    mock_writer.delete.assert_called_once_with(src)\n\n\ndef test_handle_archive_no_op():\n    mock_writer = Mock()\n    mock_loader = Mock()\n    # only archive is exists\n    mock_writer.exists.return_value = False\n    handler = ArtifactHandler(\"data_location\", writer=mock_writer, loader=mock_loader)\n\n    dest = handler.handle_archive(\"test.jsonl\")\n\n    mock_writer.copy.assert_not_called()\n    mock_writer.delete.assert_not_called()\n    assert dest is None\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/tests/test_query_output_handler.py",
    "content": "from unittest.mock import patch\n\nimport pytest\n\nfrom rag_experiment_accelerator.artifact.handlers.query_output_handler import (\n    QueryOutputHandler,\n)\nfrom rag_experiment_accelerator.artifact.models.query_output import QueryOutput\n\n\n@patch(\n    \"rag_experiment_accelerator.artifact.handlers.query_output_handler.ArtifactHandler.handle_archive\"\n)\ndef test_handle_archive_by_index(mock_artifact_handler_handle_archive):\n    index_name = \"index_name\"\n    experiment_name = \"experiment_name\"\n    job_name = \"job_name\"\n    data_location = \"data_location\"\n    handler = QueryOutputHandler(data_location=data_location)\n\n    handler.handle_archive_by_index(index_name, experiment_name, job_name)\n\n    output_filename = handler._get_output_name(index_name, experiment_name, job_name)\n    mock_artifact_handler_handle_archive.assert_called_once_with(output_filename)\n\n\ndef test_get_output_path():\n    index_name = \"index_name\"\n    experiment_name = \"experiment_name\"\n    job_name = \"job_name\"\n    dir = \"/tmp\"\n    handler = QueryOutputHandler(dir)\n    dest = handler.get_output_path(index_name, experiment_name, job_name)\n    name = handler._get_output_name(index_name, experiment_name, job_name)\n    assert dest == f\"{dir}/{name}\"\n\n\ndef test__get_output_name():\n    index_name = \"index_name\"\n    experiment_name = \"experiment_name\"\n    job_name = \"job_name\"\n\n    dir = \"/tmp\"\n    handler = QueryOutputHandler(dir)\n    name = handler._get_output_name(index_name, experiment_name, job_name)\n    assert name == f\"eval_output_{index_name}_{experiment_name}_{job_name}.jsonl\"\n\n\n@patch(\n    \"rag_experiment_accelerator.artifact.handlers.query_output_handler.ArtifactHandler.save_dict\"\n)\ndef test_save(mock_artifact_handler_save_dict):\n    index_name = \"index_name\"\n    experiment_name = \"experiment_name\"\n    job_name = \"job_name\"\n\n    test_data = QueryOutput(\n        rerank=\"rerank1\",\n        
rerank_type=\"rerank_type1\",\n        cross_encoder_model=\"cross_encoder_model1\",\n        llm_rerank_threshold=1,\n        retrieve_num_of_documents=1,\n        cross_encoder_at_k=2,\n        question_count=1,\n        actual=\"actual1\",\n        expected=\"expected1\",\n        search_type=\"search_type1\",\n        search_evals=[],\n        context=\"context1\",\n        retrieved_contexts=[\"retrievedcontext1\"],\n        question=\"question1\",\n    )\n\n    handler = QueryOutputHandler(data_location=\"data_location\")\n    handler.save(test_data, index_name, experiment_name, job_name)\n\n    name = handler._get_output_name(index_name, experiment_name, job_name)\n    handler.save_dict.assert_called_once_with(test_data.__dict__, name)\n\n\n@patch(\n    \"rag_experiment_accelerator.artifact.handlers.query_output_handler.ArtifactHandler.load\"\n)\ndef test_load(mock_artifact_handler_load):\n    data = QueryOutput(\n        rerank=\"rerank1\",\n        rerank_type=\"rerank_type1\",\n        cross_encoder_model=\"cross_encoder_model1\",\n        llm_rerank_threshold=1,\n        retrieve_num_of_documents=1,\n        cross_encoder_at_k=1,\n        question_count=1,\n        actual=\"actual1\",\n        expected=\"expected1\",\n        search_type=\"search_type1\",\n        search_evals=[],\n        context=\"context1\",\n        retrieved_contexts=[\"retrievedcontext1\"],\n        question=\"question1\",\n    )\n\n    mock_artifact_handler_load.return_value = [data.__dict__, data.__dict__]\n    index_name = \"index_name\"\n    experiment_name = \"experiment_name\"\n    job_name = \"job_name\"\n\n    handler = QueryOutputHandler(data_location=\"data_location\")\n    loaded_data = handler.load(index_name, experiment_name, job_name)\n\n    assert len(loaded_data) == 2\n    for d in loaded_data:\n        assert d.rerank == data.rerank\n        assert d.rerank_type == data.rerank_type\n        assert d.cross_encoder_model == data.cross_encoder_model\n        assert 
d.llm_rerank_threshold == data.llm_rerank_threshold\n        assert d.retrieve_num_of_documents == data.retrieve_num_of_documents\n        assert d.cross_encoder_at_k == data.cross_encoder_at_k\n        assert d.question_count == data.question_count\n        assert d.actual == data.actual\n        assert d.expected == data.expected\n        assert d.search_type == data.search_type\n        assert d.search_evals == data.search_evals\n        assert d.context == data.context\n        assert d.retrieved_contexts == data.retrieved_contexts\n\n\n@patch(\n    \"rag_experiment_accelerator.artifact.handlers.query_output_handler.ArtifactHandler.load\"\n)\ndef test_load_raises_when_loaded_data_not_dict(mock_artifact_handler_load):\n    mock_artifact_handler_load.return_value = [\"this is not a dict\"]\n    index_name = \"index_name\"\n\n    handler = QueryOutputHandler(data_location=\"data_location\")\n\n    with pytest.raises(TypeError):\n        handler.load(index_name)\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/handlers/typing.py",
    "content": "from typing import TypeVar\nfrom rag_experiment_accelerator.io.loader import Loader\nfrom rag_experiment_accelerator.io.writer import Writer\n\nT = TypeVar(\"T\", bound=Writer)\nU = TypeVar(\"U\", bound=Loader)\n"
  },
  {
    "path": "rag_experiment_accelerator/artifact/models/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/artifact/models/query_output.py",
    "content": "class QueryOutput:\n    \"\"\"\n    Represents the output of a query.\n\n    Attributes:\n        rerank (bool): Indicates whether reranking is enabled.\n        rerank_type (str): The type of reranking.\n        cross_encoder_model (str): The model used for cross-encoding.\n        llm_rerank_threshold (int): The threshold for reranking using LLM.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n        cross_encoder_at_k (int): The value of k for cross-encoder.\n        question_count (int): The count of questions.\n        actual (str): The actual output.\n        expected (str): The expected output.\n        search_type (str): The type of search.\n        search_evals (list): The evaluations for search.\n        context (str): The qna context of the query.\n        retrieved_contexts (list): The list of retrieved contexts of the query.\n        question (str): The question of the query.\n    \"\"\"\n\n    def __init__(\n        self,\n        rerank: bool,\n        rerank_type: str,\n        cross_encoder_model: str,\n        llm_rerank_threshold: int,\n        retrieve_num_of_documents: int,\n        cross_encoder_at_k: int,\n        question_count: int,\n        actual: str,\n        expected: str,\n        search_type: str,\n        search_evals: list,\n        context: str,\n        retrieved_contexts: list,\n        question: str,\n    ):\n        self.rerank = rerank\n        self.rerank_type = rerank_type\n        self.cross_encoder_model = cross_encoder_model\n        self.llm_rerank_threshold = llm_rerank_threshold\n        self.retrieve_num_of_documents = retrieve_num_of_documents\n        self.cross_encoder_at_k = cross_encoder_at_k\n        self.question_count = question_count\n        self.actual = actual\n        self.expected = expected\n        self.search_type = search_type\n        self.search_evals = search_evals\n        self.context = context\n        self.retrieved_contexts = 
retrieved_contexts\n        self.question = question\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/README.md",
    "content": "# Checkpoints\n\n## What is a checkpoint?\nCheckpoints are used to skip the processing of data that has already been processed in previous runs.\nA checkpoint object is used to wrap methods, so when the method is called with an ID that was called before, instead of executing the method, the checkpoint will return the result of the previous execution.\n\n## Usage\n\n### 1. Initialize the checkpoint object:\n```python\ninit_checkpoint(config)\n```\n\n### 2. Wrap the method you want to cache with the checkpoint decorator:\n```python\n@cache_with_checkpoint(id=\"arg2.id\")\ndef method(arg1, arg2):\n    pass\n```\n\nor wrap the method using the checkpoint object:\n```python\n get_checkpoint().load_or_run(method, arg2.id, arg1, arg2)\n```\n\n(arg2.id is the ID that uniquely identifies the call in this example)\n\nThis call will check if the provided method has previously been executed with the given ID, If it has, it returns the cached result, otherwise, it executes the method with the given arguments and caches the result for future calls.\n\n## Checkpoint types\n\n### Checkpoint\nThe base class for all checkpoints. It provides the basic functionality for initializing and retrieving the checkpoint instance.\n\nA Checkpoint object is a singleton, meaning, only one checkpoint instance exists at a time.\nTo create a new checkpoint instance (or to override the existing instance), use the `init_checkpoint` method, this method will create a checkpoint object according to the provided configuration.\n\nTo get the current checkpoint instance, use the `get_checkpoint` method.\n\n### LocalStorageCheckpoint\nCheckpoint implementation for the local executions of the pipeline (i.e. the developer's machine), uses the `pickle` library for serializing and persisting the method results to the local storage.\nThe checkpoint data is saved in the `artifacts/checkpoint` directory.\n\n### NullCheckpoint\nCheckpoint implementation that does not cache any data. 
This is useful when you want to disable the checkpointing mechanism.\n\n## Deleting Checkpoint data\nTo delete the checkpoint data, simply run the following `Make` command:\n```bash\nmake clear_checkpoints\n```"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/__init__.py",
    "content": "# flake8: noqa: F401\n\nfrom rag_experiment_accelerator.checkpoint.checkpoint_factory import (\n    init_checkpoint,\n    get_checkpoint,\n)\nfrom rag_experiment_accelerator.checkpoint.checkpoint_decorator import (\n    cache_with_checkpoint,\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/checkpoint.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Any\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass Checkpoint(ABC):\n    \"\"\"\n    A Checkpoint is used to cache the results of method calls, enabling the reuse of these results if the same method is called again with the same ID.\n    When a method wrapped by a Checkpoint object is called with an ID that was used before,\n    the Checkpoint returns the result of the previous execution instead of executing the method again.\n\n    Initialize the Checkpoint using the `init_checkpoint` method, and use the `get_checkpoint` method to get the Checkpoint object.\n    \"\"\"\n\n    def load_or_run(self, method, id: str, *args, **kwargs) -> Any:\n        \"\"\"\n        Checks if the provided method has previously been executed with the given ID,\n        If it has, it returns the cached result,\n        otherwise, it executes the method with the given arguments and caches the result for future calls.\n\n        Parameters:\n        - id (str): A unique identifier for the data.\n        - method: The method to be executed.\n        - *args: Variable length argument list for the method.\n        - **kwargs: Arbitrary keyword arguments for the method.\n\n        Returns:\n        - Any: The result of the method execution.\n        \"\"\"\n        if self._has_data(id, method):\n            max_id_length = 20\n            trimmed_id = (\n                str(id)[:max_id_length] + \"...\"\n                if len(str(id)) > max_id_length\n                else str(id)\n            )\n            logger.info(\n                f\"Checkpoint data found for '{method.__name__}' and id '{trimmed_id}' - skipping execution and loading from checkpoint.\"\n            )\n            return self._load(id, method)\n        else:\n            method_result = method(*args, **kwargs)\n            self._save(method_result, id, method)\n            return 
method_result\n\n    @abstractmethod\n    def _has_data(self, id: str, method) -> bool:\n        \"\"\"\n        Returns whether the checkpoint has any data for the given method with the given id.\n\n        Args:\n        - id (str): A unique identifier for the data.\n        - method: The method that is wrapped by the checkpoint.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def _load(self, id: str, method) -> Any:\n        \"\"\"\n        Loads data for the given id and method.\n\n        Args:\n            id (str): A unique identifier for the data.\n            method: The method used to load the data.\n\n        Returns:\n            Any: The loaded data.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def _save(self, data: Any, id: str, method):\n        \"\"\"\n        Saves data to the checkpoint.\n\n        Args:\n            data (Any): A unique identifier for the data.\n            id (str): The identifier for the data.\n            method: The method used.\n        \"\"\"\n        pass\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/checkpoint_decorator.py",
    "content": "from rag_experiment_accelerator.checkpoint.checkpoint_factory import get_checkpoint\n\n\ndef cache_with_checkpoint(id=None):\n    \"\"\"\n    A decorator that can be used to cache the results of a method call using the globally initialized Checkpoint object.\n    An id must be provided to the decorator, which is used to identify the cached result.\n    If the method is called with the same id again, the cached result is returned instead of executing the method.\n    \"\"\"\n\n    def decorator(func):\n        def wrapper(*args, **kwargs):\n            if id is None:\n                raise ValueError(\n                    \"'id' must be provided to the cache_with_checkpoint decorator\"\n                )\n\n            eval_context = {**globals(), **locals(), **kwargs}\n            arg_dict = {\n                param: value\n                for param, value in zip(\n                    func.__code__.co_varnames[: func.__code__.co_argcount], args\n                )\n            }\n            eval_context.update(arg_dict)\n\n            try:\n                evaluated_id = eval(id, eval_context)\n            except Exception as e:\n                raise ValueError(\n                    f\"Failed to evaluate the provided expression: {id}\"\n                ) from e\n\n            checkpoint = get_checkpoint()\n            return checkpoint.load_or_run(func, evaluated_id, *args, **kwargs)\n\n        return wrapper\n\n    return decorator\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/checkpoint_factory.py",
    "content": "from rag_experiment_accelerator.config.config import Config, ExecutionEnvironment\n\nglobal _checkpoint_instance\n_checkpoint_instance = None\n\n\ndef get_checkpoint():\n    \"\"\"\n    Returns the current checkpoint instance.\n    \"\"\"\n    global _checkpoint_instance\n    if not _checkpoint_instance:\n        raise Exception(\"Checkpoint not initialized yet. Call init_checkpoint() first.\")\n    return _checkpoint_instance\n\n\ndef init_checkpoint(config: Config):\n    \"\"\"\n    Initializes the checkpoint instance based on the provided configuration.\n    \"\"\"\n    global _checkpoint_instance\n    _checkpoint_instance = _get_checkpoint_base_on_config(config)\n\n\ndef _get_checkpoint_base_on_config(config: Config):\n    # import inside the method to avoid circular dependencies\n    from rag_experiment_accelerator.checkpoint.null_checkpoint import NullCheckpoint\n    from rag_experiment_accelerator.checkpoint.local_storage_checkpoint import (\n        LocalStorageCheckpoint,\n    )\n\n    if not config.use_checkpoints:\n        return NullCheckpoint()\n\n    if config.execution_environment == ExecutionEnvironment.AZURE_ML:\n        # Currently not supported in Azure ML: https://github.com/microsoft/rag-experiment-accelerator/issues/491\n        return NullCheckpoint()\n\n    return LocalStorageCheckpoint(directory=config.path.artifacts_dir)\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/local_storage_checkpoint.py",
    "content": "import os\nimport pickle\nimport hashlib\n\nfrom typing import Any, List, Set\nfrom rag_experiment_accelerator.checkpoint.checkpoint import Checkpoint\n\n\nclass LocalStorageCheckpoint(Checkpoint):\n    \"\"\"\n    A checkpoint implementation that stores the data in the local file system.\n    \"\"\"\n\n    def __init__(self, directory: str = \".\"):\n        self.checkpoint_location = f\"{directory}/checkpoints\"\n        os.makedirs(self.checkpoint_location, exist_ok=True)\n        self.internal_ids: Set[str] = self._get_existing_checkpoint_ids()\n\n    def _has_data(self, id: str, method) -> bool:\n        checkpoint_id = self._build_internal_id(id, method)\n        return checkpoint_id in self.internal_ids\n\n    def _load(self, id: str, method) -> List:\n        file_path = self._get_checkpoint_file_path(id, method)\n        with open(file_path, \"rb\") as file:\n            data = pickle.load(file)\n            return data\n\n    def _save(self, data: Any, id: str, method):\n        file_path = self._get_checkpoint_file_path(id, method)\n        with open(file_path, \"wb\") as file:\n            pickle.dump(data, file)\n        internal_id = self._build_internal_id(id, method)\n        self.internal_ids.add(internal_id)\n\n    def _get_checkpoint_file_path(self, id: str, method):\n        checkpoint_id = self._build_internal_id(id, method)\n        return f\"{self.checkpoint_location}/{checkpoint_id}.pkl\"\n\n    def _build_internal_id(self, id: str, method):\n        hashed_id = hashlib.sha256(id.encode()).hexdigest()\n        return f\"{method.__name__}___{hashed_id}\"\n\n    def _get_existing_checkpoint_ids(self) -> Set[str]:\n        ids = set()\n        file_names = os.listdir(self.checkpoint_location)\n\n        for file_name in file_names:\n            file_name = file_name.replace(\".pkl\", \"\")\n            ids.add(file_name)\n\n        return ids\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/null_checkpoint.py",
    "content": "from typing import Any\nfrom rag_experiment_accelerator.checkpoint.checkpoint import Checkpoint\n\n\nclass NullCheckpoint(Checkpoint):\n    \"\"\"\n    A dummy checkpoint implementation that does not do anything, used in cases where the checkpoints should be ignored.\n    \"\"\"\n\n    def __init__(self):\n        pass\n\n    def _has_data(self, id: str, method) -> bool:\n        return False\n\n    def _load(self, id: str, method) -> Any:\n        pass\n\n    def _save(self, data: Any, id: str, method):\n        pass\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/tests/test_checkpoint.py",
    "content": "from unittest.mock import MagicMock\nimport pytest\nfrom unittest.mock import patch\n\nfrom rag_experiment_accelerator.checkpoint.checkpoint_factory import (\n    get_checkpoint,\n    init_checkpoint,\n)\nfrom rag_experiment_accelerator.checkpoint.local_storage_checkpoint import (\n    LocalStorageCheckpoint,\n)\nfrom rag_experiment_accelerator.checkpoint.null_checkpoint import NullCheckpoint\nfrom rag_experiment_accelerator.config.config import ExecutionEnvironment\n\n\n@pytest.fixture\ndef mock_checkpoints():\n    with patch.object(\n        LocalStorageCheckpoint, \"__init__\", return_value=None\n    ), patch.object(NullCheckpoint, \"__init__\", return_value=None):\n        yield\n\n\ndef test_get_checkpoint_without_init_fails():\n    with pytest.raises(Exception) as e_info:\n        get_checkpoint()\n    assert (\n        str(e_info.value)\n        == \"Checkpoint not initialized yet. Call init_checkpoint() first.\"\n    )\n\n\ndef test_get_checkpoint_for_local_executions(mock_checkpoints):\n    config = MagicMock()\n    config.execution_environment = ExecutionEnvironment.LOCAL\n    config.use_checkpoints = True\n\n    init_checkpoint(config)\n    checkpoint = get_checkpoint()\n    assert isinstance(checkpoint, LocalStorageCheckpoint)\n\n\ndef test_get_checkpoint_for_azure_ml(mock_checkpoints):\n    config = MagicMock()\n    config.execution_environment = ExecutionEnvironment.AZURE_ML\n    config.use_checkpoints = True\n\n    init_checkpoint(config)\n    checkpoint = get_checkpoint()\n    # currently not supposed for Azure ML, so it should return NullCheckpoint\n    assert isinstance(checkpoint, NullCheckpoint)\n\n\ndef test_get_checkpoint_when_should_not_use_checkpoints_locally(mock_checkpoints):\n    config = MagicMock()\n    config.execution_environment = ExecutionEnvironment.LOCAL\n    config.use_checkpoints = False\n\n    init_checkpoint(config)\n    checkpoint = get_checkpoint()\n    assert isinstance(checkpoint, NullCheckpoint)\n\n\ndef 
test_get_checkpoint_when_should_not_use_checkpoints_in_azure_ml(mock_checkpoints):\n    config = MagicMock()\n    config.execution_environment = ExecutionEnvironment.AZURE_ML\n    config.use_checkpoints = False\n\n    init_checkpoint(config)\n    checkpoint = get_checkpoint()\n    assert isinstance(checkpoint, NullCheckpoint)\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/tests/test_local_storage_checkpoint.py",
    "content": "import unittest\nimport os\nimport tempfile\nimport shutil\nfrom unittest.mock import MagicMock\n\nfrom rag_experiment_accelerator.checkpoint.checkpoint_factory import (\n    get_checkpoint,\n    init_checkpoint,\n)\nfrom rag_experiment_accelerator.checkpoint.checkpoint_decorator import (\n    cache_with_checkpoint,\n)\nfrom rag_experiment_accelerator.checkpoint.local_storage_checkpoint import (\n    LocalStorageCheckpoint,\n)\n\n\n@cache_with_checkpoint(id=\"call_identifier\")\ndef dummy(word, call_identifier):\n    return f\"hello {word}\"\n\n\nclass TestLocalStorageCheckpoint(unittest.TestCase):\n    def setUp(self):\n        self.temp_dir = tempfile.mkdtemp()\n\n    def tearDown(self):\n        if os.path.exists(self.temp_dir):\n            shutil.rmtree(self.temp_dir)\n\n    def test_wrapped_method_is_cached(self):\n        config = MagicMock()\n        config.use_checkpoints = True\n        config.artifacts_dir = self.temp_dir\n        init_checkpoint(config)\n        checkpoint = get_checkpoint()\n        assert isinstance(checkpoint, LocalStorageCheckpoint)\n\n        data_id = \"same_id\"\n        result1 = dummy(\"first run\", data_id)\n        result2 = dummy(\"second run\", data_id)\n        self.assertEqual(result1, \"hello first run\")\n        self.assertEqual(result2, \"hello first run\")\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "rag_experiment_accelerator/checkpoint/tests/test_null_checkpoint.py",
    "content": "import unittest\n\nfrom rag_experiment_accelerator.checkpoint.null_checkpoint import NullCheckpoint\n\n\ndef dummy(word):\n    return f\"hello {word}\"\n\n\nclass TestNullCheckpoint(unittest.TestCase):\n    def test_wrapped_method_is_not_cached(self):\n        checkpoint = NullCheckpoint()\n        data_id = \"unique_id\"\n        result1 = checkpoint.load_or_run(dummy, data_id, \"first run\")\n        result2 = checkpoint.load_or_run(dummy, data_id, \"second run\")\n        self.assertEqual(result1, \"hello first run\")\n        self.assertEqual(result2, \"hello second run\")\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "rag_experiment_accelerator/config/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/config/base_config.py",
    "content": "from copy import deepcopy\nfrom typing import get_type_hints\nfrom itertools import product\n\nimport random\n\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass BaseConfig:\n    @classmethod\n    def from_dict(cls, config_dict: dict) -> \"BaseConfig\":\n        \"\"\"\n        Create a config object from a dictionary.\n        \"\"\"\n        config = cls()\n\n        type_hints = get_type_hints(cls)\n\n        for key, value in config_dict.items():\n            if isinstance(value, dict):\n                value = type_hints[key].from_dict(value)\n            elif isinstance(value, list):\n                if len(value) and isinstance(value[0], dict):\n                    value = [type_hints[key].from_dict(item) for item in value]\n\n            setattr(config, key, value)\n\n        return config\n\n    def to_dict(self) -> dict:\n        \"\"\"\n        Convert the config to a dictionary.\n        \"\"\"\n        config_dict = {}\n        for key in get_type_hints(self):\n            attr_value = getattr(self, key)\n            if isinstance(attr_value, BaseConfig):\n                attr_value = attr_value.to_dict()\n            elif isinstance(attr_value, list):\n                if len(attr_value) and isinstance(attr_value[0], BaseConfig):\n                    attr_value = [item.to_dict() for item in attr_value]\n\n            config_dict[key] = attr_value\n\n        return config_dict\n\n    def flatten(self, randomize: bool = False):\n        \"\"\"\n        Flattens the configuration object by generating all possible combinations of attribute values.\n\n        Args:\n            randomize (bool, optional): Flag indicating whether to randomize the order of the generated combinations. 
Defaults to False.\n\n        Yields:\n            BaseConfig: A flattened configuration object with attribute values set to each combination.\n\n        \"\"\"\n        key_values = {key: getattr(self, key) for key in get_type_hints(self)}\n        sampling_key_values = {\n            key: value\n            for key, value in key_values.items()\n            if isinstance(value, BaseConfig) or isinstance(value, list)\n        }\n\n        attribute_variations = {}\n        for key, value in sampling_key_values.items():\n            if isinstance(value, list):\n                if len(value) and isinstance(value[0], BaseConfig):\n                    list_of_lists = [list(value_item.flatten()) for value_item in value]\n                    attribute_variations[key] = [\n                        item for sublist in list_of_lists for item in sublist\n                    ]\n                else:\n                    attribute_variations[key] = value\n            else:\n                attribute_variations[key] = list(value.flatten())\n\n        attribute_names = list(attribute_variations.keys())\n\n        combination_tuples = list(product(*(attribute_variations.values())))\n        if randomize:\n            random.shuffle(combination_tuples)\n\n        for values in combination_tuples:\n            obj_copy = deepcopy(self)\n\n            for idx, attr in enumerate(attribute_names):\n                setattr(obj_copy, attr, values[idx])\n\n            yield obj_copy\n\n    def sample(self) -> list[\"BaseConfig\"]:\n        \"\"\"\n        Returns one randomly selected flattened configuration from the base config.\n\n        Returns:\n            BaseConfig: A randomly selected flattened configuration.\n        \"\"\"\n        return [next(self.flatten(randomize=True))]\n"
  },
  {
    "path": "rag_experiment_accelerator/config/chunking_config.py",
    "content": "from enum import StrEnum\nfrom dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\nclass ChunkingStrategy(StrEnum):\n    BASIC = \"basic\"\n    AZURE_DOCUMENT_INTELLIGENCE = \"azure-document-intelligence\"\n\n    def __repr__(self) -> str:\n        return f'\"{self.value}\"'\n\n\n@dataclass\nclass ChunkingConfig(BaseConfig):\n    preprocess: bool = False\n    chunk_size: int = 512\n    overlap_size: int = 128\n    generate_title: bool = False\n    generate_summary: bool = False\n    override_content_with_summary: bool = False\n    chunking_strategy: ChunkingStrategy = ChunkingStrategy.BASIC\n    # only for azure document intelligence strategy\n    azure_document_intelligence_model: str = \"prebuilt-read\"\n"
  },
  {
    "path": "rag_experiment_accelerator/config/config.py",
    "content": "from enum import StrEnum\nimport json\nimport os\n\nfrom dataclasses import dataclass, field\n\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.config.path_config import PathConfig\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\nfrom rag_experiment_accelerator.config.language_config import LanguageConfig\nfrom rag_experiment_accelerator.config.rerank_config import RerankConfig\nfrom rag_experiment_accelerator.config.search_config import SearchConfig\nfrom rag_experiment_accelerator.config.query_expansion import QueryExpansionConfig\nfrom rag_experiment_accelerator.config.openai_config import OpenAIConfig\nfrom rag_experiment_accelerator.config.eval_config import EvalConfig\n\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\nfrom rag_experiment_accelerator.embedding.factory import create_embedding_model\nfrom rag_experiment_accelerator.llm.prompt.prompt import Prompt\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.config_validator import validate_json_with_schema\nfrom rag_experiment_accelerator.llm.prompt import main_instruction\n\n\nlogger = get_logger(__name__)\n\n\nclass ExecutionEnvironment(StrEnum):\n    LOCAL = \"local\"\n    AZURE_ML = \"azure-ml\"\n\n\n@dataclass\nclass Config(BaseConfig):\n    execution_environment: ExecutionEnvironment = ExecutionEnvironment.LOCAL\n    experiment_name: str = \"\"\n    job_name: str = \"\"\n    job_description: str = \"\"\n    data_formats: list[str] = field(default_factory=lambda: [\"*\"])\n    main_instruction: Prompt = field(init=False)\n    max_worker_threads: int = None\n    use_checkpoints: bool = True\n    path: PathConfig = field(default_factory=PathConfig)\n    index: IndexConfig = field(default_factory=IndexConfig)\n    language: LanguageConfig = 
field(default_factory=LanguageConfig)\n    rerank: RerankConfig = field(default_factory=RerankConfig)\n    search: SearchConfig = field(default_factory=SearchConfig)\n    query_expansion: QueryExpansionConfig = field(default_factory=QueryExpansionConfig)\n    openai: OpenAIConfig = field(default_factory=OpenAIConfig)\n    eval: EvalConfig = field(default_factory=EvalConfig)\n\n    @classmethod\n    def from_path(\n        cls, environment: Environment, config_path: str = None, data_dir: str = None\n    ) -> \"Config\":\n        if not config_path:\n            config_path = os.path.join(os.getcwd(), \"./config.json\")\n        with open(config_path.strip(), \"r\") as json_file:\n            config_json: dict[str, any] = json.load(json_file)\n            is_valid_config, validation_error = validate_json_with_schema(\n                config_json, config_path.strip()\n            )\n            if not is_valid_config:\n                raise ValueError(f\"Config validation error: {validation_error}\")\n\n        config = Config.from_dict(config_json)\n\n        config.path.initialize_paths(config_path, data_dir)\n\n        # todo: currently main_instruction in the prompt file and not possible to override in the config\n        config.main_instruction = main_instruction\n\n        config.validate_inputs(\n            use_semantic_search=environment.azure_search_use_semantic_search.lower()\n            == \"true\"\n        )\n\n        config.initialize_embedding_models(environment)\n\n        config.execution_environment = ExecutionEnvironment.LOCAL\n        # todo: move to Environment class?\n        max_worker_threads = os.environ.get(\"MAX_WORKER_THREADS\", None)\n        if max_worker_threads:\n            config.max_worker_threads = int(max_worker_threads)\n\n        # todo: remove or flatten\n        # # log all the configuration settings in debug mode\n        # for key, value in config.to_dict():\n        #     logger.debug(f\"Configuration setting: {key} = 
{value}\")\n\n        return config\n\n    def validate_inputs(self, use_semantic_search: bool = False):\n        if max(self.index.chunking.overlap_size) > min(self.index.chunking.chunk_size):\n            raise ValueError(\n                \"Config param validation error: overlap_size must be less than chunk_size\"\n            )\n\n        if (\n            \"search_for_match_semantic\" in self.search.search_type\n            or \"search_for_manual_hybrid\" in self.search.search_type\n        ) and not use_semantic_search:\n            raise ValueError(\n                \"Semantic search is required for search types 'search_for_match_semantic' or 'search_for_manual_hybrid', but it's not enabled.\"\n            )\n\n    def initialize_embedding_models(self, environment: Environment):\n        self.__embedding_models_dictionary = {}\n        for model_config in self.index.embedding_model:\n            kwargs = {\"environment\": environment, **model_config.to_dict()}\n            self.__embedding_models_dictionary[\n                model_config.model_name\n            ] = create_embedding_model(model_config.type, **kwargs)\n\n    def get_embedding_model(self, model_name) -> EmbeddingModel:\n        return self.__embedding_models_dictionary.get(model_name)\n"
  },
  {
    "path": "rag_experiment_accelerator/config/config_validator.py",
    "content": "import json\nimport os\nfrom jsonschema import ValidationError, validate\nimport requests\n\nschema_cache = {}\n\n\ndef fetch_json_schema_from_url(schema_url):\n    \"\"\"Fetch the JSON schema from a URL.\"\"\"\n    response = requests.get(schema_url, timeout=5)\n    response.raise_for_status()\n    return response.json()\n\n\ndef fetch_json_schema_from_file(schema_file_path, source_file_path):\n    \"\"\"Fetch the JSON schema from a local file path.\"\"\"\n    normalised_schema_path = get_normalised_schema_path(\n        schema_file_path, source_file_path\n    )\n\n    if not os.path.isfile(normalised_schema_path):\n        raise ValueError(f\"Local schema file not found: {normalised_schema_path}\")\n\n    with open(normalised_schema_path, \"r\", encoding=\"utf8\") as schema_file:\n        return json.load(schema_file)\n\n\ndef get_normalised_schema_path(schema_file_path, source_file_path):\n    source_dir = os.path.dirname(source_file_path)\n    new_schema_file_path = os.path.join(source_dir, schema_file_path)\n    return os.path.normpath(new_schema_file_path)\n\n\ndef fetch_json_schema(schema_reference, source_file_path):\n    \"\"\"Fetch the JSON schema from a URL or local file path, with caching.\"\"\"\n    if schema_reference in schema_cache:\n        return schema_cache[schema_reference]\n\n    schema = (\n        fetch_json_schema_from_url(schema_reference)\n        if schema_reference.startswith((\"http://\", \"https://\"))\n        else fetch_json_schema_from_file(schema_reference, source_file_path)\n    )\n\n    schema_cache[schema_reference] = schema\n    return schema\n\n\ndef validate_json_with_schema(\n    json_data, source_file_path\n) -> tuple[bool, ValidationError | None]:\n    \"\"\"Validate a JSON object using the schema specified in its $schema property.\"\"\"\n    try:\n        schema_reference = json_data.get(\"$schema\")\n        if not schema_reference:\n            return True, None\n\n        schema = 
fetch_json_schema(schema_reference, source_file_path)\n\n        validate(instance=json_data, schema=schema)\n        return True, None\n    except ValidationError as ve:\n        return False, ve\n"
  },
  {
    "path": "rag_experiment_accelerator/config/embedding_model_config.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass EmbeddingModelConfig(BaseConfig):\n    type: str = \"sentence-transformer\"\n    model_name: str = \"all-mpnet-base-v2\"\n    dimension: int = None\n    shorten_dimensions: bool = False\n"
  },
  {
    "path": "rag_experiment_accelerator/config/environment.py",
    "content": "import os\nfrom typing import Optional\nfrom dataclasses import dataclass\n\nfrom dotenv import load_dotenv\nfrom azure.keyvault.secrets import SecretClient\nfrom azure.core.exceptions import ResourceNotFoundError\nfrom typing import Tuple\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.utils.auth import get_default_az_cred\n\nload_dotenv(override=True)\nlogger = get_logger(__name__)\n\n\ndef field_to_env_name(field_name: str) -> str:\n    \"\"\"\n    Convert from the name of a field to an environment variable name.\n    For example, openai_api_key becomes OPENAI_API_KEY.\n    \"\"\"\n    return field_name.upper()\n\n\ndef _get_value_from_env(var_name: str, is_optional: bool = False) -> Optional[str]:\n    \"\"\"\n    Get the value of an environment variable.\n\n    Raises ValueError if not found and is not optional.\n    \"\"\"\n    var = os.getenv(var_name, None)\n    if var is None and not is_optional:\n        logger.critical(f\"{var_name} environment variable not set.\")\n        raise ValueError(f\"{var_name} environment variable not set.\")\n    return var\n\n\ndef init_keyvault(azure_key_vault_endpoint: str) -> SecretClient:\n    \"\"\"\n    Initializes keyvault client using the provided endpoint and default credentials.\n    \"\"\"\n    return SecretClient(\n        azure_key_vault_endpoint,\n        credential=get_default_az_cred(),\n    )\n\n\ndef field_to_keyvault_name(field_name: str) -> str:\n    \"\"\"\n    Convert from the name of a field to a keyvault secret name.\n    For example, openai_api_key becomes openai-api-key.\n    \"\"\"\n    return field_name.replace(\"_\", \"-\")\n\n\ndef _get_value_from_keyvault(\n    keyvault: SecretClient, field_name: str, is_optional: bool = False\n) -> Optional[str]:\n    \"\"\"\n    Retrieves the value from the provided keyvault.\n\n    Raises ResourceNotFoundError if not found and is not optional.\n    \"\"\"\n    try:\n        value = 
keyvault.get_secret(field_to_keyvault_name(field_name)).value\n        # None values are stored as 'None'\n        if value == \"None\":\n            value = None\n        return value\n    except ResourceNotFoundError as e:\n        if is_optional:\n            return None\n        raise e\n\n\n@dataclass\nclass Environment:\n    openai_api_type: Optional[str]\n    openai_api_key: str\n    openai_api_version: str\n    openai_endpoint: str\n    aml_subscription_id: str\n    aml_workspace_name: str\n    aml_resource_group_name: str\n    aml_compute_name: Optional[str]\n    aml_compute_instances_number: Optional[str]\n    azure_search_service_endpoint: str\n    azure_search_admin_key: str\n    azure_search_use_semantic_search: str\n    azure_language_service_endpoint: Optional[str]\n    azure_language_service_key: Optional[str]\n    azure_document_intelligence_endpoint: Optional[str]\n    azure_document_intelligence_admin_key: Optional[str]\n    azure_key_vault_endpoint: Optional[str]\n\n    @classmethod\n    def _field_names(cls) -> list[str]:\n        \"\"\"\n        Returns a list of all field names of this class\n        \"\"\"\n        return list(vars(cls)[\"__dataclass_fields__\"].keys())\n\n    @classmethod\n    def _is_field_optional(cls, field_name: str) -> bool:\n        \"\"\"\n        Returns whether a field is optional based on it's type\n        Fields with type Optional[str] are optional, fields with type str are required\n        \"\"\"\n        return vars(cls)[\"__dataclass_fields__\"][field_name].type == Optional[str]\n\n    def fields(self) -> list[Tuple[str, str]]:\n        \"\"\"\n        Returns a list of tuples containing the field name and value of this class instance\n        \"\"\"\n        return list(vars(self).items())\n\n    @classmethod\n    def _from_env(cls) -> \"Environment\":\n        \"\"\"\n        Initialize the Environment using the environment variables.\n        \"\"\"\n        values_dict = {\n            name: 
_get_value_from_env(\n                field_to_env_name(name), cls._is_field_optional(name)\n            )\n            for name in cls._field_names()\n        }\n        return cls(**values_dict)\n\n    @classmethod\n    def from_keyvault(cls, azure_key_vault_endpoint: str) -> \"Environment\":\n        \"\"\"\n        Initialize the Environment using the keyvault endpoint provided.\n        \"\"\"\n        keyvault = init_keyvault(azure_key_vault_endpoint=azure_key_vault_endpoint)\n        values_dict = {\n            field_name: _get_value_from_keyvault(\n                keyvault, field_name, cls._is_field_optional(field_name)\n            )\n            for field_name in cls._field_names()\n        }\n        return cls(**values_dict)\n\n    @classmethod\n    def from_env_or_keyvault(cls) -> \"Environment\":\n        \"\"\"\n        Initialize the Environment using the environment variables and keyvault.\n\n        If USE_KEY_VAULT is set to True, this will use environment variables for those values that are set there.\n        For those values that are not set in the environment, it will attempt to use the keyvault.\n\n        If USE_KEY_VAULT is not set to True, this will use the environment variables only.\n\n        Note that this method won't work from within AzureML compute, in that case you need to use from_keyvault().\n\n        Raises:\n            ValueError: If a required value is not found in the environment or keyvault.\n        \"\"\"\n        use_key_vault = _get_value_from_env(\"USE_KEY_VAULT\", is_optional=True)\n\n        if use_key_vault and use_key_vault.lower() == \"true\":\n            # Most values will be found in env, but secrets will be found in keyvault\n            azure_key_vault_endpoint = _get_value_from_env(\n                \"AZURE_KEY_VAULT_ENDPOINT\", is_optional=False\n            )\n            keyvault = init_keyvault(azure_key_vault_endpoint)\n            values_dict = {}\n\n            # Try to get values from env first\n  
          for field_name in cls._field_names():\n                is_optional = cls._is_field_optional(field_name)\n                value = _get_value_from_env(\n                    field_to_env_name(field_name), is_optional=True\n                )\n                # If not found in env, try to get from keyvault\n                if not value:\n                    value = _get_value_from_keyvault(\n                        keyvault, field_name, is_optional=True\n                    )\n                    if not value and not is_optional:\n                        raise ValueError(\n                            f\"Value for {field_name} not found in environment or keyvault\"\n                        )\n                values_dict[field_name] = value\n            return cls(**values_dict)\n\n        return cls._from_env()\n\n    def to_keyvault(self, azure_key_vault_endpoint: str = None) -> None:\n        \"\"\"\n        Serializes the environment to keyvault.\n        Note that for the optional fields that are not set, this will create the value 'None' in the keyvault.\n\n        Raises:\n            ValueError if the keyvault endpoint is not provided and not set in the environment.\n        \"\"\"\n        if not azure_key_vault_endpoint:\n            if not self.azure_key_vault_endpoint:\n                raise ValueError(\n                    \"Keyvault endpoint not provided and not set in environment\"\n                )\n            azure_key_vault_endpoint = self.azure_key_vault_endpoint\n        keyvault = init_keyvault(azure_key_vault_endpoint=azure_key_vault_endpoint)\n        for field_name, value in self.fields():\n            keyvault.set_secret(\n                name=field_to_keyvault_name(field_name), value=str(value)\n            )\n"
  },
  {
    "path": "rag_experiment_accelerator/config/eval_config.py",
    "content": "from dataclasses import dataclass, field\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass EvalConfig(BaseConfig):\n    metric_types: list[str] = field(\n        default_factory=lambda: [\n            \"fuzzy_score\",\n            \"bert_all_MiniLM_L6_v2\",\n            \"cosine_ochiai\",\n            \"bert_distilbert_base_nli_stsb_mean_tokens\",\n            \"llm_answer_relevance\",\n            \"llm_context_precision\",\n        ]\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/config/index_config.py",
    "content": "from dataclasses import dataclass, field\nfrom enum import StrEnum\n\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\nfrom rag_experiment_accelerator.config.chunking_config import ChunkingConfig\nfrom rag_experiment_accelerator.config.embedding_model_config import (\n    EmbeddingModelConfig,\n)\nfrom rag_experiment_accelerator.config.sampling_config import SamplingConfig\n\n\nclass IndexKey(StrEnum):\n    PREFIX = \"idx\"\n    EF_CONSTRUCTION = \"efc\"\n    EF_SEARCH = \"efs\"\n    EMBEDDING_MODEL_NAME = \"em\"\n    DIMENSION = \"d\"\n    SAMPLING_PERCENTAGE = \"sp\"\n    PREPROCESS = \"p\"\n    CHUNK_SIZE = \"cs\"\n    CHUNKING_STRATEGY = \"st\"\n    OVERLAP_SIZE = \"o\"\n    GENERATE_TITLE = \"t\"\n    GENERATE_SUMMARY = \"s\"\n    OVERRIDE_CONTENT_WITH_SUMMARY = \"oc\"\n\n\n@dataclass\nclass IndexConfig(BaseConfig):\n    \"\"\"A class to hold parameters for each index configured through Config.\n\n    Attributes:\n        index_name_prefix (str):\n            Prefix to use for the index created in Azure Search.\n        ef_construction (int):\n            Parameter ef_construction for HNSW index.\n        ef_search (int):\n            Parameter ef_search for HNSW index.\n        chunking (ChunkingConfig):\n            Configuration for chunking documents.\n        embedding_model (EmbeddingModelConfig):\n            Configuration for the embedding model.\n        sampling (SamplingConfig):\n            Configuration for sampling documents.\n    \"\"\"\n\n    index_name_prefix: str = \"idx\"\n    ef_construction: int = 400\n    ef_search: int = 400\n    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)\n    embedding_model: EmbeddingModelConfig = field(default_factory=EmbeddingModelConfig)\n    sampling: SamplingConfig = field(default_factory=SamplingConfig)\n\n    def __label_properties(self) -> dict:\n        \"\"\"\n        Returns properties subset used for labeling.\n        \"\"\"\n        properties = 
{\n            IndexKey.PREFIX: self.index_name_prefix,\n            IndexKey.EF_CONSTRUCTION: self.ef_construction,\n            IndexKey.EF_SEARCH: self.ef_search,\n            IndexKey.EMBEDDING_MODEL_NAME: self.embedding_model.model_name.lower(),\n            IndexKey.SAMPLING_PERCENTAGE: self.sampling.percentage,\n            IndexKey.PREPROCESS: int(self.chunking.preprocess),\n            IndexKey.CHUNK_SIZE: self.chunking.chunk_size,\n            IndexKey.CHUNKING_STRATEGY: self.chunking.chunking_strategy,\n            IndexKey.OVERLAP_SIZE: self.chunking.overlap_size,\n            IndexKey.GENERATE_TITLE: int(self.chunking.generate_title),\n            IndexKey.GENERATE_SUMMARY: int(self.chunking.generate_summary),\n            IndexKey.OVERRIDE_CONTENT_WITH_SUMMARY: int(\n                self.chunking.override_content_with_summary\n            ),\n        }\n\n        if self.embedding_model.dimension:\n            properties[IndexKey.DIMENSION] = self.embedding_model.dimension\n\n        return properties\n\n    @classmethod\n    def __from_label_properties(cls, properties: dict) -> \"IndexConfig\":\n        \"\"\"\n        Creates IndexConfig from the dictionary with properties.\n        Reverse of __label_properties().\n        \"\"\"\n\n        return IndexConfig(\n            index_name_prefix=properties[IndexKey.PREFIX],\n            ef_construction=int(properties[IndexKey.EF_CONSTRUCTION]),\n            ef_search=int(properties[IndexKey.EF_SEARCH]),\n            chunking=ChunkingConfig(\n                preprocess=bool(int(properties[IndexKey.PREPROCESS])),\n                chunk_size=int(properties[IndexKey.CHUNK_SIZE]),\n                chunking_strategy=properties[IndexKey.CHUNKING_STRATEGY],\n                overlap_size=int(properties[IndexKey.OVERLAP_SIZE]),\n                generate_title=bool(int(properties[IndexKey.GENERATE_TITLE])),\n                generate_summary=bool(int(properties[IndexKey.GENERATE_SUMMARY])),\n                
override_content_with_summary=bool(\n                    int(properties[IndexKey.OVERRIDE_CONTENT_WITH_SUMMARY])\n                ),\n            ),\n            embedding_model=EmbeddingModelConfig(\n                model_name=properties[IndexKey.EMBEDDING_MODEL_NAME],\n                dimension=int(properties[IndexKey.DIMENSION])\n                if IndexKey.DIMENSION in properties\n                else None,\n            ),\n            sampling=SamplingConfig(\n                percentage=properties[IndexKey.SAMPLING_PERCENTAGE]\n            ),\n        )\n\n    def index_name(self) -> str:\n        \"\"\"\n        Returns index name from the fields.\n        Reverse of IndexConfig.from_index_name().\n        \"\"\"\n        index_name = \"_\".join(\n            [f\"{key}-{value}\" for (key, value) in self.__label_properties().items()]\n        )\n        if index_name.startswith(\"_\") or index_name.startswith(\"-\"):\n            index_name = \"i\" + index_name\n\n        index_name = index_name[:127]\n\n        return index_name\n\n    @classmethod\n    def from_index_name(cls, index_name: str) -> \"IndexConfig\":\n        \"\"\"\n        Creates IndexConfig from the index name.\n        Reverse of index_name().\n        \"\"\"\n\n        key_values = [kv.split(\"-\", 1) for kv in index_name.split(\"_\")]\n        properties = {kv[0]: kv[1].strip() for kv in key_values}\n\n        try:\n            index_config = IndexConfig.__from_label_properties(properties)\n        except Exception as e:\n            raise ValueError(f\"Invalid index name [{index_name}]. {e}\")\n\n        return index_config\n"
  },
  {
    "path": "rag_experiment_accelerator/config/language_config.py",
    "content": "from dataclasses import dataclass, field\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass LanguageAnalyzerConfig(BaseConfig):\n    analyzer_name: str = \"en.microsoft\"\n    index_analyzer_name: str = \"\"\n    search_analyzer_name: str = \"\"\n    char_filters: list[any] = field(default_factory=list)\n    tokenizers: list[any] = field(default_factory=list)\n    token_filters: list[any] = field(default_factory=list)\n\n\n@dataclass\nclass LanguageConfig(BaseConfig):\n    analyzer: LanguageAnalyzerConfig = field(default_factory=LanguageAnalyzerConfig)\n    query_language: str = \"en-us\"\n"
  },
  {
    "path": "rag_experiment_accelerator/config/openai_config.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass OpenAIConfig(BaseConfig):\n    azure_oai_chat_deployment_name: str = \"gpt-35-turbo\"\n    azure_oai_eval_deployment_name: str = \"gpt-35-turbo\"\n    temperature: float = 0.0\n"
  },
  {
    "path": "rag_experiment_accelerator/config/path_config.py",
    "content": "from dataclasses import dataclass\nimport os\nfrom rag_experiment_accelerator.config import paths\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\nclass Paths:\n    ARTIFACTS_DIR = \"artifacts\"\n    DATA_DIR = \"data\"\n    EVAL_DATA_FILE = \"eval_data.jsonl\"\n    GENERATED_INDEX_NAMES_FILE = \"generated_index_names.jsonl\"\n    QUERY_DATA_DIR = \"query_data\"\n    EVAL_DATA_DIR = \"eval_score\"\n    SAMPLING_OUTPUT_DIR = \"sampling\"\n\n\n@dataclass\nclass PathConfig(BaseConfig):\n    artifacts_dir: str = \"\"\n    data_dir: str = \"\"\n    eval_data_file: str = \"\"\n    eval_data_dir: str = \"\"\n    generated_index_names_file: str = \"\"\n    query_data_dir: str = \"\"\n    sampling_output_dir: str = \"\"\n\n    def initialize_paths(self, config_file_path: str, data_dir: str) -> None:\n        self._config_dir = os.path.dirname(config_file_path)\n\n        if not self.artifacts_dir:\n            self.artifacts_dir = os.path.join(self._config_dir, Paths.ARTIFACTS_DIR)\n        paths.try_create_directory(self.artifacts_dir)\n\n        if data_dir:\n            self.data_dir = data_dir\n        elif not self.data_dir:\n            self.data_dir = os.path.join(self._config_dir, Paths.DATA_DIR)\n\n        if not self.eval_data_file:\n            self.eval_data_file = os.path.join(self.artifacts_dir, Paths.EVAL_DATA_FILE)\n\n        if not self.generated_index_names_file:\n            self.generated_index_names_file = os.path.join(\n                self.artifacts_dir, Paths.GENERATED_INDEX_NAMES_FILE\n            )\n\n        if not self.query_data_dir:\n            self.query_data_dir = os.path.join(self.artifacts_dir, Paths.QUERY_DATA_DIR)\n        paths.try_create_directory(self.query_data_dir)\n\n        if not self.eval_data_dir:\n            self.eval_data_dir = os.path.join(self.artifacts_dir, Paths.EVAL_DATA_DIR)\n        paths.try_create_directory(self.eval_data_dir)\n\n        if not self.sampling_output_dir:\n    
        self.sampling_output_dir = os.path.join(\n                self.artifacts_dir, Paths.SAMPLING_OUTPUT_DIR\n            )\n        paths.try_create_directory(self.sampling_output_dir)\n\n    def sampled_cluster_predictions_path(self, optimum_k: int) -> str:\n        return os.path.join(\n            self.sampling_output_dir,\n            f\"sampled_cluster_predictions_cluster_number_{optimum_k}.csv\",\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/config/paths.py",
    "content": "import os\nimport glob\nfrom datetime import datetime\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\ndef get_all_file_paths(directory: str) -> list[str]:\n    \"\"\"\n    Returns a list of all file paths in a directory listed recursively.\n    \"\"\"\n    pattern = os.path.join(directory, \"**\", \"*\")\n    return [file for file in glob.glob(pattern, recursive=True) if os.path.isfile(file)]\n\n\ndef try_create_directory(directory: str) -> None:\n    \"\"\"\n    Tries to create a directory with the given path.\n\n    Args:\n        directory (str): The path of the directory to be created.\n\n    Returns:\n        None\n\n    Raises:\n        OSError: If an error occurs while creating the directory.\n    \"\"\"\n    try:\n        os.makedirs(directory, exist_ok=True)\n    except OSError as e:\n        logger.warn(f\"Failed to create directory {directory}: {e.strerror}\")\n\n\ndef formatted_datetime_suffix():\n    \"\"\"Return a suffix to use when naming the run and its artifacts.\"\"\"\n    return datetime.now().strftime(\"%Y_%m_%d_%H_%M_%S\")\n\n\ndef mlflow_run_name(job_name: str, suffix: str = None):\n    \"\"\"Returns a name to use for the MlFlow experiment run.\"\"\"\n    if not suffix:\n        suffix = formatted_datetime_suffix()\n    return f\"{job_name}_{suffix}\"\n"
  },
  {
    "path": "rag_experiment_accelerator/config/query_expansion.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass QueryExpansionConfig(BaseConfig):\n    # todo: refactor the settings to be more descriptive\n    query_expansion: bool = False\n    expand_to_multiple_questions: bool = False\n    min_query_expansion_related_question_similarity_score: int = 90\n    hyde: str = \"disabled\"\n"
  },
  {
    "path": "rag_experiment_accelerator/config/rerank_config.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass RerankConfig(BaseConfig):\n    enabled: bool = False  # todo: consider moving it as a module switch\n    type: str = \"cross_encoder\"\n    cross_encoder_at_k: int = 3\n    cross_encoder_model: str = \"\"\n    llm_rerank_threshold: int = 3\n"
  },
  {
    "path": "rag_experiment_accelerator/config/sampling_config.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass SamplingConfig(BaseConfig):\n    \"\"\"A class to hold parameters for the sampling.\n\n    Attributes:\n        sample_data (bool):\n            Flag indicating whether to sample the data.\n        percentage (int):\n            Percentage of data to sample.\n        optimum_k (str):\n            Optimum value of k for clustering.\n        min_cluster (int):\n            Minimum number of clusters.\n        max_cluster (int):\n            Maximum number of clusters.\n    \"\"\"\n\n    sample_data: bool = False\n    percentage: int = 5\n    optimum_k: str = \"auto\"\n    min_cluster: int = 2\n    max_cluster: int = 30\n    only_sample: bool = False\n"
  },
  {
    "path": "rag_experiment_accelerator/config/search_config.py",
    "content": "from dataclasses import dataclass\nfrom rag_experiment_accelerator.config.base_config import BaseConfig\n\n\n@dataclass\nclass SearchConfig(BaseConfig):\n    retrieve_num_of_documents: int = 3\n    search_type: str = \"search_for_match_semantic\"\n    search_relevancy_threshold: float = 0.8\n"
  },
  {
    "path": "rag_experiment_accelerator/config/tests/test_config.py",
    "content": "import pytest\nimport json\nimport os\nfrom unittest.mock import MagicMock, patch\n\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.llm.prompt import main_instruction\n\n\ndef init_config():\n    config = Config()\n    config.index.chunking.chunk_size = [512]\n    config.index.chunking.overlap_size = [128]\n    config.index.ef_construction = [400]\n    config.index.ef_search = [400]\n    return config\n\n\ndef get_test_config_dir():\n    return os.path.join(os.path.dirname(__file__), \"data\")\n\n\n@patch(\"rag_experiment_accelerator.config.config.create_embedding_model\")\n@patch(\"rag_experiment_accelerator.config.config.validate_json_with_schema\")\ndef test_config_init(mock_validate_json_with_schema, mock_create_embedding_model):\n    # Load mock config data from a YAML file\n    config_path = f\"{get_test_config_dir()}/config.json\"\n    with open(config_path, \"r\") as file:\n        mock_config = json.load(file)\n\n    environment = MagicMock()\n    embedding_model_1 = MagicMock()\n    embedding_model_1.model_name.return_value = \"all-MiniLM-L6-v2\"\n    embedding_model_1.dimension.return_value = 384\n    embedding_model_1.shorten_dimensions.return_value = False\n    embedding_model_2 = MagicMock()\n    embedding_model_2.model_name.return_value = \"text-embedding-ada-002\"\n    embedding_model_2.dimension.return_value = 1536\n    embedding_model_2.shorten_dimensions.return_value = False\n    embedding_model_3 = MagicMock()\n    embedding_model_3.model_name.return_value = \"text-embedding-3-large\"\n    embedding_model_3.dimension.return_value = 3072\n    embedding_model_3.shorten_dimensions.return_value = False\n    embedding_model_4 = MagicMock()\n    embedding_model_4.model_name.return_value = \"text-embedding-3-small\"\n    embedding_model_4.dimension.return_value = 256\n    embedding_model_4.shorten_dimensions.return_value = True\n    mock_create_embedding_model.side_effect = 
[embedding_model_1, embedding_model_2, embedding_model_3, embedding_model_4]\n    mock_validate_json_with_schema.return_value = (True, None)\n\n    config = Config.from_path(environment, config_path)\n\n    assert config.experiment_name == mock_config[\"experiment_name\"]\n    # execution_environment\n    assert config.job_name == mock_config[\"job_name\"]\n    assert config.job_description == mock_config[\"job_description\"]\n    assert config.data_formats == mock_config[\"data_formats\"]\n    assert config.main_instruction.system_message == main_instruction.system_message\n    assert (config.max_worker_threads is None) and (\n        \"max_worker_threads\" not in mock_config\n    )\n    assert config.use_checkpoints == mock_config[\"use_checkpoints\"]\n\n    index = config.index\n    mock_index = mock_config[\"index\"]\n    assert index.index_name_prefix == mock_index[\"index_name_prefix\"]\n    assert index.ef_construction == mock_index[\"ef_construction\"]\n    assert index.ef_search == mock_index[\"ef_search\"]\n\n    chunking = index.chunking\n    mock_chunking = mock_config[\"index\"][\"chunking\"]\n    assert chunking.preprocess == mock_chunking[\"preprocess\"]\n    assert chunking.chunk_size == mock_chunking[\"chunk_size\"]\n    assert chunking.overlap_size == mock_chunking[\"overlap_size\"]\n    assert chunking.generate_title == mock_chunking[\"generate_title\"]\n    assert chunking.generate_summary == mock_chunking[\"generate_summary\"]\n    assert (\n        chunking.override_content_with_summary\n        == mock_chunking[\"override_content_with_summary\"]\n    )\n    assert chunking.chunking_strategy == mock_chunking[\"chunking_strategy\"]\n    assert (\n        chunking.azure_document_intelligence_model\n        == mock_chunking[\"azure_document_intelligence_model\"]\n    )\n\n    sampling = config.index.sampling\n    assert sampling.sample_data == mock_config[\"index\"][\"sampling\"][\"sample_data\"]\n    assert sampling.percentage == 
mock_config[\"index\"][\"sampling\"][\"percentage\"]\n    assert sampling.optimum_k == mock_config[\"index\"][\"sampling\"][\"optimum_k\"]\n    assert sampling.min_cluster == mock_config[\"index\"][\"sampling\"][\"min_cluster\"]\n    assert sampling.max_cluster == mock_config[\"index\"][\"sampling\"][\"max_cluster\"]\n\n    mock_embedding = mock_config[\"index\"][\"embedding_model\"]\n    assert index.embedding_model[0].type == mock_embedding[0][\"type\"]\n    assert index.embedding_model[0].model_name == mock_embedding[0][\"model_name\"]\n\n    assert index.embedding_model[1].type == mock_embedding[1][\"type\"]\n    assert index.embedding_model[1].model_name == mock_embedding[1][\"model_name\"]\n\n    assert index.embedding_model[2].type == mock_embedding[2][\"type\"]\n    assert index.embedding_model[2].model_name == mock_embedding[2][\"model_name\"]\n    assert index.embedding_model[2].dimension == mock_embedding[2][\"dimension\"]\n\n    assert index.embedding_model[3].type == mock_embedding[3][\"type\"]\n    assert index.embedding_model[3].model_name == mock_embedding[3][\"model_name\"]\n    assert index.embedding_model[3].dimension == mock_embedding[3][\"dimension\"]\n    assert index.embedding_model[3].shorten_dimensions == mock_embedding[3][\"shorten_dimensions\"]\n\n    model1 = config.get_embedding_model(config.index.embedding_model[0].model_name)\n    assert model1.model_name.return_value == \"all-MiniLM-L6-v2\"\n    assert model1.dimension.return_value == 384\n\n    model2 = config.get_embedding_model(config.index.embedding_model[1].model_name)\n    assert model2.model_name.return_value == \"text-embedding-ada-002\"\n    assert model2.dimension.return_value == 1536\n\n    assert config.language.query_language == mock_config[\"language\"][\"query_language\"]\n    analyzer = config.language.analyzer\n    mock_analyzer = mock_config[\"language\"][\"analyzer\"]\n    assert analyzer.analyzer_name == mock_analyzer[\"analyzer_name\"]\n    assert 
analyzer.index_analyzer_name == mock_analyzer[\"index_analyzer_name\"]\n    assert analyzer.search_analyzer_name == mock_analyzer[\"search_analyzer_name\"]\n    assert analyzer.char_filters == mock_analyzer[\"char_filters\"]\n    assert analyzer.tokenizers == mock_analyzer[\"tokenizers\"]\n    assert analyzer.token_filters == mock_analyzer[\"token_filters\"]\n\n    mock_rerank = mock_config[\"rerank\"]\n    assert config.rerank.enabled == mock_rerank[\"enabled\"]\n    assert config.rerank.type == mock_rerank[\"type\"]\n    assert config.rerank.cross_encoder_at_k == mock_rerank[\"cross_encoder_at_k\"]\n    assert config.rerank.cross_encoder_model == mock_rerank[\"cross_encoder_model\"]\n    assert config.rerank.llm_rerank_threshold == mock_rerank[\"llm_rerank_threshold\"]\n\n    mock_search = mock_config[\"search\"]\n    assert (\n        config.search.retrieve_num_of_documents\n        == mock_search[\"retrieve_num_of_documents\"]\n    )\n    assert config.search.search_type == mock_search[\"search_type\"]\n    assert (\n        config.search.search_relevancy_threshold\n        == mock_search[\"search_relevancy_threshold\"]\n    )\n\n    query_expansion = config.query_expansion\n    mock_query_expansion = mock_config[\"query_expansion\"]\n    assert query_expansion.query_expansion == mock_query_expansion[\"query_expansion\"]\n    assert query_expansion.hyde == mock_query_expansion[\"hyde\"]\n    assert (\n        query_expansion.min_query_expansion_related_question_similarity_score\n        == mock_query_expansion[\"min_query_expansion_related_question_similarity_score\"]\n    )\n    assert (\n        query_expansion.expand_to_multiple_questions\n        == mock_query_expansion[\"expand_to_multiple_questions\"]\n    )\n\n    openai = config.openai\n    mock_openai = mock_config[\"openai\"]\n    assert (\n        openai.azure_oai_chat_deployment_name\n        == mock_openai[\"azure_oai_chat_deployment_name\"]\n    )\n    assert (\n        
openai.azure_oai_eval_deployment_name\n        == mock_openai[\"azure_oai_eval_deployment_name\"]\n    )\n    assert openai.temperature == mock_openai[\"temperature\"]\n\n    assert config.eval.metric_types == mock_config[\"eval\"][\"metric_types\"]\n\n    assert config.path.eval_data_file.endswith(\"eval_data.jsonl\") and (\n        \"eval_data_file\" not in mock_config[\"path\"]\n    )\n\n\n@patch(\"rag_experiment_accelerator.config.config.create_embedding_model\")\n@patch(\"rag_experiment_accelerator.config.config.validate_json_with_schema\")\ndef test_config_init_raises_error(\n    mock_validate_json_with_schema, mock_create_embedding_model\n):\n    config_path = f\"{get_test_config_dir()}/config.json\"\n    environment = MagicMock()\n\n    mock_validate_json_with_schema.return_value = (False, ValueError(\"Invalid JSON\"))\n\n    with pytest.raises(ValueError):\n        Config.from_path(environment, config_path)\n\n\ndef test_chunk_size_greater_than_overlap_size():\n    config = init_config()\n    config.index.chunking.chunk_size = [128]\n    config.index.chunking.overlap_size = [512]\n\n    with pytest.raises(ValueError) as info:\n        config.validate_inputs()\n\n    assert (\n        str(info.value)\n        == \"Config param validation error: overlap_size must be less than chunk_size\"\n    )\n\n\ndef test_validate_semantic_search_config():\n    config = init_config()\n\n    # Test case 1: use_semantic_search is False, but semantic search is\n    # required\n    config.search.search_type = [\"search_for_match_semantic\"]\n    use_semantic_search = False\n    with pytest.raises(ValueError) as info:\n        config.validate_inputs(use_semantic_search)\n    assert (\n        str(info.value)\n        == \"Semantic search is required for search types 'search_for_match_semantic' or 'search_for_manual_hybrid', but it's not enabled.\"\n    )\n\n    # Test case 2: use_semantic_search is True, and semantic search is required\n    config.search.search_type = 
[\"search_for_match_semantic\"]\n    use_semantic_search = True\n    # No exception should be raised\n    config.validate_inputs(use_semantic_search)\n\n    # Test case 3: use_semantic_search is False, and semantic search is not\n    # required\n    config.search.search_type = [\"search_for_exact_match\"]\n    use_semantic_search = False\n    # No exception should be raised\n    config.validate_inputs(use_semantic_search)\n\n    # Test case 4: use_semantic_search is True, and semantic search is not\n    # required\n    config.search.search_type = [\"search_for_exact_match\"]\n    use_semantic_search = True\n    # No exception should be raised\n    config.validate_inputs(use_semantic_search)\n"
  },
  {
    "path": "rag_experiment_accelerator/config/tests/test_config_validator.py",
    "content": "from jsonschema import ValidationError\nfrom unittest.mock import MagicMock, mock_open, patch\nimport pytest\nimport os\nimport requests\n\nfrom rag_experiment_accelerator.config.config_validator import (\n    fetch_json_schema,\n    fetch_json_schema_from_url,\n    fetch_json_schema_from_file,\n    get_normalised_schema_path,\n    validate_json_with_schema,\n)\n\n\n@patch(\"requests.get\")\ndef test_fetch_json_schema_from_url_returns_json(mock_get):\n    schema_url = \"http://test.com/schema.json\"\n\n    mock_response = MagicMock()\n    mock_response.status_code = 200\n    mock_response.json.return_value = {\"key\": \"value\"}\n\n    mock_get.return_value = mock_response\n\n    result = fetch_json_schema_from_url(schema_url)\n    assert result == mock_response.json.return_value\n\n\n@patch(\"requests.get\")\ndef test_fetch_json_schema_from_url_raises_error_for_timeout(mock_get):\n    schema_url = \"http://test.com/schema.json\"\n\n    mock_get.side_effect = requests.exceptions.Timeout\n\n    with pytest.raises(requests.exceptions.Timeout):\n        fetch_json_schema_from_url(schema_url)\n\n\n@patch(\"os.path.isfile\")\n@patch(\n    \"builtins.open\",\n    new_callable=mock_open,\n    read_data='{\"$schema\": \"http://test.com/schema.json\"}',\n)\ndef test_fetch_json_schema_from_file_returns_json_from_file(mock_open, mock_isfile):\n    mock_isfile.return_value = True\n\n    cwd = os.getcwd()\n    schema_file_path = os.path.join(cwd, \"example.schema.json\")\n    source_file_path = os.path.join(cwd, \"source_file.json\")\n\n    result = fetch_json_schema_from_file(schema_file_path, source_file_path)\n\n    mock_open.assert_called_once_with(schema_file_path, \"r\", encoding=\"utf8\")\n    assert result == {\"$schema\": \"http://test.com/schema.json\"}\n\n\n@patch(\"os.path.isfile\")\n@patch(\n    \"builtins.open\",\n    new_callable=mock_open,\n    read_data='{\"$schema\": \"http://test.com/schema.json\"}',\n)\ndef 
test_fetch_json_schema_from_file_uses_a_relative_path(mock_open, mock_isfile):\n    mock_isfile.return_value = True\n\n    schema_file_path = \"../../config.schema.json\"\n    source_file_path = \"/home/runner/work/rag-experiment-accelerator/rag-experiment-accelerator/.github/workflows/config.json\"\n\n    fetch_json_schema_from_file(schema_file_path, source_file_path)\n\n    mock_open.assert_called_once_with(\n        \"/home/runner/work/rag-experiment-accelerator/rag-experiment-accelerator/config.schema.json\",\n        \"r\",\n        encoding=\"utf8\",\n    )\n\n\n@patch(\"os.path.isfile\")\ndef test_fetch_json_schema_from_file_raises_error_for_non_file_input(mock_isfile):\n    mock_isfile.return_value = False\n\n    cwd = os.getcwd()\n    schema_file_path = os.path.join(cwd, \"not_a_file\")\n    source_file_path = os.path.join(cwd, \"source_file.json\")\n\n    with pytest.raises(ValueError):\n        fetch_json_schema_from_file(schema_file_path, source_file_path)\n\n\ndef test_get_normalised_schema_path():\n    schema_file_path = \"../../config.schema.json\"\n    source_file_path = \"/home/runner/work/rag-experiment-accelerator/rag-experiment-accelerator/.github/workflows/config.json\"\n\n    result = get_normalised_schema_path(schema_file_path, source_file_path)\n    assert (\n        result\n        == \"/home/runner/work/rag-experiment-accelerator/rag-experiment-accelerator/config.schema.json\"\n    )\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.schema_cache\")\ndef test_fetch_json_schema_returns_from_cache(mock_schema_cache):\n    schema_reference = \"http://test.com/schema.json\"\n    schema_cache = {schema_reference: {\"key\": \"value\"}}\n\n    mock_schema_cache.__contains__.return_value = schema_cache.__contains__\n    mock_schema_cache.__getitem__.side_effect = schema_cache.__getitem__\n\n    result = fetch_json_schema(schema_reference, \"source_file.json\")\n    assert result == 
schema_cache[schema_reference]\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.schema_cache\")\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema_from_url\")\ndef test_fetch_json_schema_updates_cache(\n    mock_fetch_json_schema_from_url, mock_schema_cache\n):\n    schema_reference = \"http://test.com/schema.json\"\n    schema_cache = {}\n\n    mock_schema_cache.__contains__.return_value = False\n    mock_schema_cache.__setitem__.side_effect = schema_cache.__setitem__\n    mock_fetch_json_schema_from_url.return_value = {\n        \"$schema\": \"http://test.com/schema.json\"\n    }\n\n    fetch_json_schema(schema_reference, \"source_file.json\")\n    assert schema_cache == {\n        schema_reference: mock_fetch_json_schema_from_url.return_value\n    }\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.schema_cache\")\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema_from_url\")\ndef test_fetch_json_schema_returns_from_url_when_http(\n    mock_fetch_json_schema_from_url, mock_schema_cache\n):\n    schema_reference = \"http://test.com/schema.json\"\n\n    mock_schema_cache.__contains__.return_value = False\n    mock_fetch_json_schema_from_url.return_value = {\n        \"$schema\": \"http://test.com/schema.json\"\n    }\n\n    fetch_json_schema(schema_reference, \"source_file.json\")\n    mock_fetch_json_schema_from_url.assert_called_once_with(schema_reference)\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.schema_cache\")\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema_from_url\")\ndef test_fetch_json_schema_returns_from_url_when_https(\n    mock_fetch_json_schema_from_url, mock_schema_cache\n):\n    schema_reference = \"https://test.com/schema.json\"\n\n    mock_schema_cache.__contains__.return_value = False\n    mock_fetch_json_schema_from_url.return_value = {\n        \"$schema\": \"http://test.com/schema.json\"\n    }\n\n    
fetch_json_schema(schema_reference, \"source_file.json\")\n    mock_fetch_json_schema_from_url.assert_called_once_with(schema_reference)\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.schema_cache\")\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema_from_file\")\ndef test_fetch_json_schema_returns_from_file(\n    mock_fetch_json_schema_from_file, mock_schema_cache\n):\n    schema_reference = \"./schema.json\"\n\n    mock_schema_cache.__contains__.return_value = False\n    mock_fetch_json_schema_from_file.return_value = {\n        \"$schema\": \"http://test.com/schema.json\"\n    }\n\n    fetch_json_schema(schema_reference, \"source_file.json\")\n    mock_fetch_json_schema_from_file.assert_called_once_with(\n        schema_reference, \"source_file.json\"\n    )\n\n\ndef test_validate_json_with_schema_returns_true_for_no_schema():\n    json_data = {\"key\": \"value\"}\n    source_file_path = \"source_file.json\"\n\n    result = validate_json_with_schema(json_data, source_file_path)\n    assert result == (True, None)\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema\")\n@patch(\"rag_experiment_accelerator.config.config_validator.validate\")\ndef test_validate_json_with_schema_returns_for_valid_json(\n    mock_validate, mock_fetch_json_schema\n):\n    json_schema = {\"$schema\": \"http://test.com/schema.json\"}\n    mock_fetch_json_schema.return_value = json_schema\n\n    mock_validate.return_value = None\n\n    json_data = {\"$schema\": \"./schema.json\"}\n    result = validate_json_with_schema(json_data, \"source_file.json\")\n\n    assert result == (True, None)\n\n\n@patch(\"rag_experiment_accelerator.config.config_validator.fetch_json_schema\")\n@patch(\"rag_experiment_accelerator.config.config_validator.validate\")\ndef test_validate_json_with_schema_returns_for_invalid_json(\n    mock_validate, mock_fetch_json_schema\n):\n    json_schema = {\"$schema\": 
\"http://test.com/schema.json\"}\n    mock_fetch_json_schema.return_value = json_schema\n\n    mock_validate.side_effect = ValidationError(\"Invalid JSON\")\n\n    json_data = {\"$schema\": \"./schema.json\"}\n    result = validate_json_with_schema(json_data, \"source_file.json\")\n\n    assert result == (False, mock_validate.side_effect)\n"
  },
  {
    "path": "rag_experiment_accelerator/config/tests/test_environment.py",
    "content": "from unittest.mock import patch, MagicMock\nfrom typing import Optional\n\nfrom azure.keyvault.secrets import SecretClient\nfrom azure.core.exceptions import ResourceNotFoundError\n\nfrom rag_experiment_accelerator.config.environment import Environment\n\n\ndef mock_get_value_from_env_with_keyvault(\n    var_name: str, is_optional: bool = False\n) -> Optional[str]:\n    mapping = {\n        \"USE_KEY_VAULT\": \"True\",\n        \"AZURE_KEY_VAULT_ENDPOINT\": \"test_keyvault_endpoint\",\n        \"OPENAI_API_TYPE\": \"azure\",\n        \"OPENAI_API_VERSION\": \"test_api_version\",\n        \"OPENAI_ENDPOINT\": \"test_api_endpoint\",\n        \"AML_SUBSCRIPTION_ID\": \"test_subscription_id\",\n        \"AML_WORKSPACE_NAME\": \"test_workspace_name\",\n        \"AML_RESOURCE_GROUP_NAME\": \"test_resource_group_name\",\n        \"AZURE_SEARCH_SERVICE_ENDPOINT\": \"test_search_endpoint\",\n        \"AZURE_SEARCH_USE_SEMANTIC_SEARCH\": \"True\",\n    }\n    return mapping.get(var_name)\n\n\ndef mock_get_secret_value_from_keyvault(\n    keyvault: SecretClient, field_name: str, is_optional: bool = False\n) -> Optional[str]:\n    mapping = {\n        \"openai_api_key\": \"test_openai_api_key\",\n        \"azure_search_admin_key\": \"test_admin_key\",\n    }\n    return mapping.get(field_name)\n\n\ndef mock_get_any_value_from_keyvault(field_name: str) -> Optional[str]:\n    return_value = MagicMock()\n    mapping = {\n        \"openai-api-type\": \"azure\",\n        \"openai-api-key\": \"test_openai_api_key\",\n        \"openai-api-version\": \"test_api_version\",\n        \"openai-endpoint\": \"test_api_endpoint\",\n        \"azure-search-service-endpoint\": \"test_search_endpoint\",\n        \"azure-search-use-semantic-search\": \"True\",\n        \"azure-search-admin-key\": \"test_admin_key\",\n        \"aml-subscription-id\": \"test_subscription_id\",\n        \"aml-workspace-name\": \"test_workspace_name\",\n        \"aml-resource-group-name\": 
\"test_resource_group_name\",\n    }\n    if field_name in mapping:\n        return_value.value = mapping[field_name]\n    else:\n        raise ResourceNotFoundError(f\"Not found secret {field_name}\")\n    return return_value\n\n\n@patch(\"rag_experiment_accelerator.config.environment.init_keyvault\")\n@patch(\n    \"rag_experiment_accelerator.config.environment._get_value_from_env\",\n    side_effect=mock_get_value_from_env_with_keyvault,\n)\n@patch(\n    \"rag_experiment_accelerator.config.environment._get_value_from_keyvault\",\n    side_effect=mock_get_secret_value_from_keyvault,\n)\ndef test_create_environment_from_env_or_keyvault(_, __, mock_init_keyvault):\n    environment = Environment.from_env_or_keyvault()\n    mock_init_keyvault.return_value = MagicMock()\n\n    assert environment.azure_search_service_endpoint == \"test_search_endpoint\"\n    assert environment.aml_subscription_id == \"test_subscription_id\"\n    assert environment.aml_workspace_name == \"test_workspace_name\"\n    assert environment.aml_resource_group_name == \"test_resource_group_name\"\n    assert environment.openai_api_version == \"test_api_version\"\n    assert environment.openai_endpoint == \"test_api_endpoint\"\n    assert environment.openai_api_type == \"azure\"\n\n    assert environment.openai_api_key == \"test_openai_api_key\"\n    assert environment.azure_search_admin_key == \"test_admin_key\"\n\n\n@patch(\"rag_experiment_accelerator.config.environment.init_keyvault\")\ndef test_create_environment_from_keyvault(mock_init_keyvault):\n    mock_keyvault = MagicMock()\n    mock_keyvault.get_secret = mock_get_any_value_from_keyvault\n    mock_init_keyvault.return_value = mock_keyvault\n\n    environment = Environment.from_keyvault(\"test_keyvault_endpoint\")\n\n    assert environment.azure_search_service_endpoint == \"test_search_endpoint\"\n    assert environment.aml_subscription_id == \"test_subscription_id\"\n    assert environment.aml_workspace_name == 
\"test_workspace_name\"\n    assert environment.aml_resource_group_name == \"test_resource_group_name\"\n    assert environment.openai_api_version == \"test_api_version\"\n    assert environment.openai_endpoint == \"test_api_endpoint\"\n    assert environment.openai_api_type == \"azure\"\n\n    assert environment.openai_api_key == \"test_openai_api_key\"\n    assert environment.azure_search_admin_key == \"test_admin_key\"\n\n\n@patch(\"rag_experiment_accelerator.config.environment.init_keyvault\")\ndef test_to_keyvault(mock_init_keyvault):\n    mock_keyvault = MagicMock()\n    mock_keyvault.set_secret = MagicMock()\n    mock_init_keyvault.return_value = mock_keyvault\n\n    environment = Environment(\n        openai_api_type=\"azure\",\n        openai_api_key=\"test_openai_api_key\",\n        openai_api_version=\"test_api_version\",\n        openai_endpoint=\"test_api_endpoint\",\n        aml_subscription_id=\"test_subscription_id\",\n        aml_workspace_name=\"test_workspace_name\",\n        aml_resource_group_name=\"test_resource_group_name\",\n        aml_compute_name=None,\n        aml_compute_instances_number=None,\n        azure_search_service_endpoint=\"test_search_endpoint\",\n        azure_search_admin_key=\"test_admin_key\",\n        azure_document_intelligence_admin_key=None,\n        azure_document_intelligence_endpoint=None,\n        azure_language_service_endpoint=None,\n        azure_language_service_key=None,\n        azure_key_vault_endpoint=\"test_endpoint\",\n        azure_search_use_semantic_search=\"True\",\n    )\n    environment.to_keyvault()\n\n    assert mock_keyvault.set_secret.call_count == 17\n"
  },
  {
    "path": "rag_experiment_accelerator/config/tests/test_index_config.py",
    "content": "from rag_experiment_accelerator.config.chunking_config import ChunkingConfig\nfrom rag_experiment_accelerator.config.embedding_model_config import (\n    EmbeddingModelConfig,\n)\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.sampling_config import SamplingConfig\n\n\ndef test_index_config_to_index_name():\n    index_config = IndexConfig(\n        index_name_prefix=\"prefix\",\n        ef_construction=3,\n        ef_search=4,\n        chunking=ChunkingConfig(\n            preprocess=False,\n            chunk_size=1,\n            chunking_strategy=\"abcd\",\n            overlap_size=2,\n            generate_summary=False,\n            generate_title=False,\n            override_content_with_summary=False,\n        ),\n        embedding_model=EmbeddingModelConfig(\n            type=\"type\", model_name=\"modelname\", dimension=100\n        ),\n        sampling=SamplingConfig(percentage=10),\n    )\n\n    assert (\n        index_config.index_name()\n        == \"idx-prefix_efc-3_efs-4_em-modelname_sp-10_p-0_cs-1_st-abcd_o-2_t-0_s-0_oc-0_d-100\"\n    )\n\n\ndef test_index_name_to_index_config():\n    index_name = \"idx-prefix_efc-3_efs-4_em-modelname_sp-10_p-0_cs-1_st-abcd_o-2_t-0_s-1_oc-0_d-100\"\n\n    index_config = IndexConfig.from_index_name(index_name)\n\n    assert index_config.index_name_prefix == \"prefix\"\n    assert index_config.chunking.chunk_size == 1\n    assert index_config.chunking.chunking_strategy == \"abcd\"\n    assert index_config.chunking.overlap_size == 2\n    assert index_config.chunking.generate_summary is True\n    assert index_config.chunking.generate_title is False\n    assert index_config.embedding_model.model_name == \"modelname\"\n    assert index_config.embedding_model.dimension == 100\n    assert index_config.ef_construction == 3\n    assert index_config.ef_search == 4\n\n\ndef test_index_name_to_index_config_shuffled_order():\n    index_name = 
\"idx-prefix_efc-3_efs-4_em-modelname_p-0_cs-1_st-abcd_o-2_t-0_s-1_oc-0_sp-10_d-100\"\n\n    index_config = IndexConfig.from_index_name(index_name)\n\n    assert index_config.index_name_prefix == \"prefix\"\n    assert index_config.chunking.chunk_size == 1\n    assert index_config.chunking.chunking_strategy == \"abcd\"\n    assert index_config.chunking.overlap_size == 2\n    assert index_config.chunking.generate_summary is True\n    assert index_config.chunking.generate_title is False\n    assert index_config.embedding_model.model_name == \"modelname\"\n    assert index_config.embedding_model.dimension == 100\n    assert index_config.ef_construction == 3\n    assert index_config.ef_search == 4\n\n\ndef test_index_name_to_index_config_missing_property():\n    index_name = (\n        \"idx-prefix_efc-3_efs-4_em-modelname_sp-10_p-0_st-basic_o-2_t-0_s-0_oc-0_d-100\"\n    )\n\n    try:\n        IndexConfig.from_index_name(index_name)\n    except ValueError:\n        assert True\n    else:\n        assert False, \"Expected ValueError to be thrown\"\n\n\ndef test_index_name_to_index_config_hyphens():\n    index_name = (\n        \"idx-prefix_efc-3_efs-4_em-model-name_sp-10_p-0_cs-1_st-ab-cd_o-2_t-0_s-0_oc-0_d-100\"\n    )\n\n    index_config = IndexConfig.from_index_name(index_name)\n\n    assert index_config.index_name_prefix == \"prefix\"\n    assert index_config.chunking.chunk_size == 1\n    assert index_config.chunking.chunking_strategy == \"ab-cd\"\n    assert index_config.chunking.overlap_size == 2\n    assert index_config.embedding_model.model_name == \"model-name\"\n    assert index_config.embedding_model.dimension == 100\n    assert index_config.ef_construction == 3\n    assert index_config.ef_search == 4\n"
  },
  {
    "path": "rag_experiment_accelerator/data_assets/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/data_assets/data_asset.py",
    "content": "from azure.ai.ml import MLClient\nfrom azure.ai.ml.entities import Data\nfrom azure.ai.ml.constants import AssetTypes\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.utils.auth import get_default_az_cred\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef create_data_asset(data_path: str, data_asset_name: str, environment: Environment):\n    \"\"\"\n    Creates a new data asset in an Azure Machine Learning workspace.\n\n    Args:\n        data_path (str): The path to the data file.\n        data_asset_name (str): The name of the data asset.\n        environment (Environment): Class containing the environment configuration.\n\n    Returns:\n        str: The version of the created data asset.\n    \"\"\"\n\n    ml_client = MLClient(\n        get_default_az_cred(),\n        environment.aml_subscription_id,\n        environment.aml_resource_group_name,\n        environment.aml_workspace_name,\n    )\n\n    aml_dataset = Data(\n        path=data_path,\n        type=AssetTypes.URI_FILE,\n        description=\"rag data\",\n        name=data_asset_name,\n    )\n\n    ml_client.data.create_or_update(aml_dataset)\n\n    aml_dataset_unlabeled = ml_client.data.get(name=data_asset_name, label=\"latest\")\n\n    logger.info(f\"Dataset version: {aml_dataset_unlabeled.version}\")\n    logger.info(f\"Dataset ID: {aml_dataset_unlabeled.id}\")\n\n    return aml_dataset_unlabeled.version\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/customJsonLoader.py",
    "content": "import json\nfrom pathlib import Path\nfrom typing import Union\n\nfrom langchain.docstore.document import Document\nfrom langchain.document_loaders.base import BaseLoader\n\n# Replaces langchain.document_loaders.JSONLoader to not use jq for windows compatibility\n# Note: Does not currently support jsonl, which is what the seq_num metadata field tracks\n\n\nclass CustomJSONLoader(BaseLoader):\n    def __init__(\n        self,\n        file_path: Union[str, Path],\n        keys_to_load: list[str] = [\"content\", \"title\"],\n        strict_keys: bool = True,\n    ):\n        self.file_path = Path(file_path).resolve()\n        self._keys_to_load = keys_to_load\n        self._strict_keys = strict_keys\n\n    def _load_schema_from_dict(self, data: dict) -> str:\n        if self._keys_to_load is None:\n            return data\n        else:\n            return_dict = {}\n            for k in self._keys_to_load:\n                value = data.get(k)\n                if value is None and self._strict_keys:\n                    raise ValueError(\n                        f\"JSON file at path {self.file_path} must contain the field '{k}'\"\n                    )\n                return_dict[k] = value\n        return return_dict\n\n    def load(self) -> list[Document]:\n        \"\"\"Load and return documents from the JSON file.\"\"\"\n        docs: list[Document] = []\n        # Load JSON file\n        with self.file_path.open(encoding=\"utf-8\") as f:\n            data = json.load(f)\n            page_content = []\n\n            if not isinstance(data, list):\n                raise ValueError(\n                    f\"JSON file at path: {self.file_path} must be a list of object and expects each object to contain the fields {self._keys_to_load}\"\n                )\n            else:\n                for entry in data:\n                    data_dict = self._load_schema_from_dict(entry)\n                    page_content.append(data_dict)\n\n            metadata 
= {\n                \"source\": str(self.file_path),\n            }\n\n            docs.append(Document(page_content=str(page_content), metadata=metadata))\n        return docs\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py",
    "content": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom contextlib import ExitStack\nimport re\nimport os\nimport uuid\nfrom azure.ai.documentintelligence import DocumentIntelligenceClient\nfrom langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom azure.core.credentials import AzureKeyCredential\nfrom langchain_core.documents import Document\nfrom pathlib import Path\n\nfrom langchain_community.document_loaders.base import BaseLoader\nfrom typing import List, Iterator\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom azure.ai.documentintelligence.models import DocumentParagraph\n\nlogger = get_logger(__name__)\n\n\ndef is_supported_by_document_intelligence(format: str) -> bool:\n    \"\"\"\n    Returns whether a format is supported by Azure Document Intelligence or not.\n\n    Returns:\n        bool: True if the format is supported, False otherwise.\n    \"\"\"\n\n    return format.lower() in [\n        \"pdf\",\n        \"jpeg\",\n        \"jpg\",\n        \"png\",\n        \"bmp\",\n        \"heif\",\n        \"tiff\",\n        \"docx\",\n        \"xlsx\",\n        \"pptx\",\n        \"html\",\n    ]\n\n\ndef load_with_azure_document_intelligence(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: int,\n    overlap_size: int,\n    azure_document_intelligence_model: str,\n    **kwargs: dict,\n) -> list[Document]:\n    \"\"\"\n    Load pdf files from a folder using Azure Document Intelligence.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (int): The size of each text chunk in characters.\n        overlap_size (int): The size of the overlap between text chunks in characters.\n        azure_document_intelligence_model 
(str): The model to use for Azure Document Intelligence.\n        **kwargs (dict): Unused.\n\n    Returns:\n        list[Document]: A list of Document objects.\n    \"\"\"\n    documents = []\n    logger.info(f\"Using model {azure_document_intelligence_model}\")\n    for file_path in file_paths:\n        try:\n            loader = DocumentIntelligenceLoader(\n                file_path,\n                environment.azure_document_intelligence_endpoint,\n                environment.azure_document_intelligence_admin_key,\n                azure_document_intelligence_model,\n                glob_patterns=[\"*\"],\n                excluded_paragraph_roles=[\n                    \"pageHeader\",\n                    \"pageFooter\",\n                    \"footnote\",\n                    \"pageNumber\",\n                ],\n            )\n            documents += loader.load()\n        except Exception as e:\n            logger.warning(f\"Failed to load {file_path}: {e}\")\n\n    logger.debug(f\"Loaded {len(documents)} documents using Azure Document Intelligence\")\n\n    text_splitter = RecursiveCharacterTextSplitter(\n        chunk_size=chunk_size,\n        chunk_overlap=overlap_size,\n        separators=[\"\\n\\n\", \"\\n\"],\n    )\n\n    logger.debug(\n        f\"Splitting extracted documents into chunks of {chunk_size} characters with an overlap of {overlap_size} characters\"\n    )\n\n    docs = text_splitter.split_documents(documents)\n\n    return [{str(uuid.uuid4()): doc.__dict__} for doc in docs]\n\n\nclass DocumentIntelligenceLoader(BaseLoader):\n    \"\"\"\n    Analyzes and loads documents and directories using Azure Document Intelligence.\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str,\n        endpoint: str,\n        key: str,\n        api_model: str,\n        glob_patterns: List[str] = None,\n        split_documents_by_page=False,\n        excluded_paragraph_roles=[],\n        patterns_to_remove: List[str] = [],\n    ):\n        \"\"\"\n  
      Initializes an instance of the DocumentIntelligenceLoader class.\n\n        Parameters:\n            path: path of the document or directory to load from, when a directory path is provided a glob_pattern has to be provided as well\n            end_point: Azure Document Intelligence endpoint\n            key: Azure Document Intelligence key\n            api_model (str): The model to use for Azure Document Intelligence.\n            glob_patterns: when the given path is a directory, glob_patterns is used to match the files that should be loaded\n            split_documents_by_page: if True, each page in the document will be loaded into separate LangChain document, otherwise (default) the entire document will be loaded into a single LangChain document\n            excluded_paragraph_roles: a list of paragraph roles to exclude. The full list of paragraph roles can be viewed here: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0#paragraph-roles\n            patterns_to_remove: a list of specific regex patterns to be removed from the extracted text\n        \"\"\"\n        self.client = DocumentIntelligenceClient(\n            endpoint=endpoint, credential=AzureKeyCredential(key)\n        )\n        self.path = path\n        self.endpoint = endpoint\n        self.key = key\n        self.api_model = api_model\n        self.patterns_to_remove = patterns_to_remove\n        self.glob_patterns = glob_patterns\n        self.split_documents_by_page = split_documents_by_page\n        self.excluded_paragraph_roles = excluded_paragraph_roles\n\n    def load(self) -> List[Document]:\n        documents = []\n        file_paths = self._get_file_paths()\n\n        with ExitStack() as stack:\n            executor = stack.enter_context(ThreadPoolExecutor())\n\n            futures = {\n                executor.submit(self._analyze_document, file_path)\n                for file_path in file_paths\n            }\n\n         
   for future in as_completed(futures):\n                try:\n                    documents += future.result()\n                except Exception as exc:\n                    logger.error(f\"Processing document generated an exception: {exc}\")\n\n            return documents\n\n    def lazy_load(self) -> Iterator[Document]:\n        file_paths = self._get_file_paths()\n        for file_path in file_paths:\n            yield self._analyze_document(file_path)\n\n    def _get_file_paths(self):\n        if not os.path.isdir(self.path):\n            return [self.path]\n\n        directory = Path(self.path)\n        file_paths = []\n        for pattern in self.glob_patterns:\n            file_paths += list(directory.rglob(f\"*.{pattern}\"))\n\n        return [str(path) for path in file_paths]\n\n    def _analyze_document(self, file_path: str):\n        if not self.api_model == \"prebuilt-layout\":\n            return self._load_with_langchain(file_path, self.api_model)\n\n        documents = []\n        try:\n            result = self._call_document_intelligence(file_path)\n\n            if result.tables:\n                paragraphs = self._substitute_table_paragraphs(\n                    result.paragraphs, result.tables\n                )\n            else:\n                paragraphs = result.paragraphs\n\n            relevant_paragraphs = []\n            for paragraph in paragraphs:\n                if \"role\" in paragraph.keys():\n                    if paragraph[\"role\"] not in self.excluded_paragraph_roles:\n                        relevant_paragraphs.append(paragraph)\n                else:\n                    relevant_paragraphs.append(paragraph)\n\n            if self.split_documents_by_page:\n                paragraphs_by_page = self._split_paragraphs_by_page(relevant_paragraphs)\n                for page_number, page_paragraphs in paragraphs_by_page.items():\n                    documents.append(\n                        
self._convert_to_langchain_document(\n                            page_paragraphs, file_path, page_number\n                        )\n                    )\n            else:\n                documents.append(\n                    self._convert_to_langchain_document(\n                        relevant_paragraphs, file_path, 1\n                    )\n                )\n\n            return documents\n        except Exception as exc:\n            logger.warning(\n                f\"Failed to load {file_path} with Azure Document Intelligence using the 'prebuilt-layout' model: {exc}. Attempting to load using the simpler 'prebuilt-read' model...\"\n            )\n            return self._load_with_langchain(file_path, \"prebuilt-read\")\n\n    def _call_document_intelligence(self, file_path):\n        with open(file_path, \"rb\") as file:\n            content = file.read()\n            poller = self.client.begin_analyze_document(\n                \"prebuilt-layout\",\n                content,\n                content_type=\"application/octet-stream\",\n                output_content_format=\"markdown\",\n            )\n\n        result = poller.result()\n        return result\n\n    def _clean_content(self, content: str):\n        # Remove AI doc intelligence traces.\n        pattern = re.compile(r\":selected:|:unselected:\")\n        content = pattern.sub(\"\", content)\n        # Remove specific regex patterns.\n        for regex_pattern in self.patterns_to_remove:\n            pattern = re.compile(regex_pattern)\n            content = pattern.sub(\"\", content)\n\n        return content\n\n    def _convert_to_langchain_document(self, paragraphs, file_path, page_number):\n        content = \"\\n\\n\".join([paragraph.content for paragraph in paragraphs])\n        clean_content = self._clean_content(content)\n        return Document(\n            page_content=clean_content,\n            metadata={\"source\": file_path, \"page\": page_number - 1},\n        )\n\n    def 
_is_intersecting_regions(self, bounding_region1, bounding_region2):\n        \"\"\"\n        Returns whether two bounding regions intersect or not\n        \"\"\"\n        for region1 in bounding_region1:\n            for region2 in bounding_region2:\n                if region1[\"pageNumber\"] == region2[\n                    \"pageNumber\"\n                ] and self._is_intersecting_polygons(region1.polygon, region2.polygon):\n                    return True\n        return False\n\n    def _is_intersecting_polygons(self, polygon1, polygon2):\n        \"\"\"\n        Returns whether two polygons intersect or not\n        \"\"\"\n        x1_1, y1_1, x2_1, y2_1, x3_1, y3_1, x4_1, y4_1 = polygon1\n        x1_2, y1_2, x2_2, y2_2, x3_2, y3_2, x4_2, y4_2 = polygon2\n\n        # Check for overlap along the x-axis\n        if max(x1_1, x2_1, x3_1, x4_1) < min(x1_2, x2_2, x3_2, x4_2) or min(\n            x1_1, x2_1, x3_1, x4_1\n        ) > max(x1_2, x2_2, x3_2, x4_2):\n            return False\n\n        # Check for overlap along the y-axis\n        if max(y1_1, y2_1, y3_1, y4_1) < min(y1_2, y2_2, y3_2, y4_2) or min(\n            y1_1, y2_1, y3_1, y4_1\n        ) > max(y1_2, y2_2, y3_2, y4_2):\n            return False\n\n        # If the boxes overlap along both axes, they intersect\n        return True\n\n    def _assign_tables_to_paragraphs(self, paragraphs, tables):\n        \"\"\"\n        Returns a list that maps paragraph indexes to their tables indexes.\n        Indexes in the returned list match the indexes of the `paragraphs` list and the value at that index contains the index of the table in the `tables` list that the paragraph belongs to.\n        If the paragraph is not intersecting with any table, the index will be -1.\n\n        For example, this assignments: [-1, 0, 0, 1, -1, -1, -1, 2, 2, 2, -1] means:\n        The paragraph at index 0 does not belong to any table.\n        The paragraphs at indexes 1 and 2 belong to table at index 0.\n        The 
paragraph at index 3 belongs to table 1\n        The rest of the paragraphs in the example belong to the table at index 2, or do not belong to any table\n        \"\"\"\n        paragraph_to_table = [-1] * len(paragraphs)\n\n        for paragraph_index, paragraph in enumerate(paragraphs):\n            for table_index, table in enumerate(tables):\n                if self._is_intersecting_regions(\n                    paragraph.bounding_regions, table.bounding_regions\n                ):\n                    paragraph_to_table[paragraph_index] = table_index\n                else:\n                    continue\n\n        return paragraph_to_table\n\n    def _convert_to_paragraph(self, table):\n        content = self._format_table(table)\n        return DocumentParagraph(\n            content=content, bounding_regions=table.bounding_regions, role=\"table\"\n        )\n\n    def _format_table(self, table):\n        \"\"\"\n        Formats Azure Document Intelligence's tables to the following format:\n        <Table Captions>\n        <Header1>: <ValueRow1>, <Header2>: <ValueRow1>, <Header3>: <ValueRow1>, ...\n        <Header1>: <ValueRow2>, <Header2>: <ValueRow2>, <Header3>: <ValueRow2>, ...\n        <Header1>: <ValueRow3>, <Header2>: <ValueRow3>, <Header3>: <ValueRow3>, ...\n        ...\n        \"\"\"\n        table_headers = []\n        for cell in table[\"cells\"]:\n            if cell.get(\"kind\") == \"columnHeader\":\n                table_headers.append(cell[\"content\"])\n\n        content = table.get(\"caption\", {}).get(\"content\", \"\")\n\n        previous_row_index = -1\n        for cell in table.cells:\n            if cell.get(\"kind\") == \"columnHeader\":\n                continue\n\n            header = (\n                f\"{table_headers[cell['columnIndex']]}: \"\n                if cell[\"columnIndex\"] < len(table_headers)\n                else \"\"\n            )\n\n            is_new_row = previous_row_index != cell[\"rowIndex\"]\n            if 
is_new_row:\n                content += \"\\n\" if content else \"\"\n                previous_row_index = cell[\"rowIndex\"]\n\n            content += f\"{header}{cell.content}\"\n            content += \", \" if cell[\"columnIndex\"] < len(table_headers) - 1 else \" \"\n        return content\n\n    def _substitute_table_paragraphs(self, paragraphs, tables):\n        \"\"\"\n        Returns a modified version of the `paragraphs` list, where paragraphs that are part of a table are combined and replaced with a formatted table.\n        \"\"\"\n        result = []\n        paragraphs_to_tables = self._assign_tables_to_paragraphs(paragraphs, tables)\n\n        last_table_index = None\n        for paragraph_index, table_index in enumerate(paragraphs_to_tables):\n            is_table = table_index != -1\n            if not is_table:\n                result.append(paragraphs[paragraph_index])\n                continue\n\n            is_new_table = table_index != last_table_index\n            if is_new_table:\n                table = tables[table_index]\n                formatted_table = self._convert_to_paragraph(table)\n                result.append(formatted_table)\n                last_table_index = table_index\n\n        return result\n\n    def _split_paragraphs_by_page(self, paragraphs):\n        paragraphs_by_page = {}\n        for paragraph in paragraphs:\n            page_number = paragraph.bounding_regions[0][\"pageNumber\"]\n            is_new_page = page_number not in paragraphs_by_page\n            if is_new_page:\n                paragraphs_by_page[page_number] = []\n            paragraphs_by_page[page_number].append(paragraph)\n        return paragraphs_by_page\n\n    def _load_with_langchain(self, file_path, api_model):\n        \"\"\"\n        Loads a file with LangChain's simpler implementation which returns the raw response from Document Intelligence.\n        \"\"\"\n\n        documents = []\n        try:\n            loader = 
AzureAIDocumentIntelligenceLoader(\n                file_path=file_path,\n                api_key=self.key,\n                api_endpoint=self.endpoint,\n                api_model=api_model,\n            )\n            doc = loader.load()[0]\n            doc.metadata = {\n                \"source\": file_path,\n                \"page\": 0,  # Azure Document Intelligence always returns a single page so we set it to 0\n            }\n            documents.append(doc)\n        except Exception as e:\n            logger.error(\n                f\"Failed to load {file_path} with Azure Document Intelligence using the 'prebuilt-read' model: {e}\"\n            )\n            raise e\n\n        logger.info(\n            f'Successfully loaded {file_path} with Azure Document Intelligence using the \"prebuilt-read\" model.'\n        )\n        return documents\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/documentLoader.py",
    "content": "from rag_experiment_accelerator.doc_loader.docxLoader import load_docx_files\nfrom rag_experiment_accelerator.doc_loader.htmlLoader import load_html_files\nfrom rag_experiment_accelerator.doc_loader.jsonLoader import load_json_files\nfrom rag_experiment_accelerator.doc_loader.markdownLoader import (\n    load_markdown_files,\n)\nfrom rag_experiment_accelerator.doc_loader.pdfLoader import load_pdf_files\nfrom rag_experiment_accelerator.doc_loader.textLoader import load_text_files\nfrom rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import (\n    is_supported_by_document_intelligence,\n    load_with_azure_document_intelligence,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.config.chunking_config import ChunkingStrategy\n\nlogger = get_logger(__name__)\n\n_FORMAT_VERSIONS = {\n    \"pdf\": [\"pdf\", \"pdfa\", \"pdfa-1\", \"pdfl\"],\n    \"html\": [\"html\", \"htm\", \"xhtml\", \"html5\"],\n    \"markdown\": [\"md\", \"markdown\"],\n    \"json\": [\"json\"],\n    \"text\": [\"txt\", \"rtf\"],\n    \"docx\": [\"docx\"],\n}\n_FORMAT_PROCESSORS = {\n    \"pdf\": load_pdf_files,\n    \"html\": load_html_files,\n    \"markdown\": load_markdown_files,\n    \"json\": load_json_files,\n    \"text\": load_text_files,\n    \"docx\": load_docx_files,\n}\n\n\ndef determine_processor(chunking_strategy: ChunkingStrategy, format: str) -> callable:\n    \"\"\"\n    Determine and return document processor based on chunking strategy and format.\n    \"\"\"\n    if (\n        chunking_strategy == ChunkingStrategy.AZURE_DOCUMENT_INTELLIGENCE\n        and is_supported_by_document_intelligence(format)\n    ):\n        return load_with_azure_document_intelligence\n    else:\n        return _FORMAT_PROCESSORS[format]\n\n\ndef load_documents(\n    environment: Environment,\n    chunking_strategy: ChunkingStrategy,\n    allowed_formats: list[str],\n    file_paths: list[str],\n    chunk_size: int,\n    overlap_size: int,\n    azure_document_intelligence_model: str = None,\n):\n    \"\"\"\n    Load documents from the given file paths and process them into chunks.\n\n    Args:\n        environment (Environment): The environment class\n        chunking_strategy (str): The chunking strategy to use between \"azure-document-intelligence\" and \"basic\".\n        allowed_formats (list[str]): List of formats, ['*'] - to allow any supported format.\n        file_paths (list[str]): Paths of the files to load.\n        chunk_size (int): Size of each chunk.\n        overlap_size (int): Size of overlap between adjacent chunks.\n        azure_document_intelligence_model (str): The model to use for Azure Document Intelligence.\n\n    Returns:\n        list: A list of dictionaries containing the processed chunks.\n\n    Raises:\n        FileNotFoundError: When the specified folder does not exist.\n    \"\"\"\n\n    if \"*\" in allowed_formats:\n        allowed_formats = _FORMAT_VERSIONS.keys()\n\n    logger.debug(f\"Loading documents with allowed formats {', '.join(allowed_formats)}\")\n\n    documents = {}\n\n    for format in allowed_formats:\n        if format not in _FORMAT_VERSIONS:\n            logger.error(f\"Format {format} is not supported\")\n            continue\n        matching_files = [\n            path\n            for path in file_paths\n            if any(path.endswith(pattern) for pattern in _FORMAT_VERSIONS[format])\n        ]\n\n        processor = determine_processor(\n            chunking_strategy=chunking_strategy, format=format\n        )\n        documents[format] = processor(\n            environment=environment,\n            file_paths=matching_files,\n            chunk_size=chunk_size,\n            overlap_size=overlap_size,\n            azure_document_intelligence_model=azure_document_intelligence_model,\n        )\n\n    all_documents = []\n    for inner_dict in documents.keys():\n        for value in documents[inner_dict]:\n            all_documents.append(value)\n\n    logger.info(f\"Loaded {len(all_documents)} chunks\")\n    return all_documents\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/docxLoader.py",
    "content": "from langchain_community.document_loaders import Docx2txtLoader\n\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef load_docx_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load and process docx files from a given folder path.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (int): The size of each text chunk in characters.\n        overlap_size (int): The size of the overlap between text chunks in characters.\n        **kwargs (dict): Unused.\n\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.debug(\"Loading docx files\")\n\n    return load_structured_files(\n        file_format=\"DOCX\",\n        language=None,\n        loader=Docx2txtLoader,\n        file_paths=file_paths,\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/htmlLoader.py",
    "content": "from langchain_community.document_loaders import BSHTMLLoader\n\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef load_html_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load and process HTML files from the given file paths.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (str): The size of the chunks to split the documents into.\n        overlap_size (str): The size of the overlapping parts between chunks.\n        **kwargs (dict): Unused.\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.debug(\"Loading html files\")\n\n    return load_structured_files(\n        file_format=\"HTML\",\n        language=\"html\",\n        loader=BSHTMLLoader,\n        file_paths=file_paths,\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n        loader_kwargs={\"open_encoding\": \"utf-8\"},\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/jsonLoader.py",
    "content": "from rag_experiment_accelerator.doc_loader.customJsonLoader import (\n    CustomJSONLoader,\n)\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef load_json_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load and process Json files from a given folder path.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (int): The size of each text chunk in characters.\n        overlap_size (int): The size of the overlap between text chunks in characters.\n        **kwargs (dict): Unused.\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.debug(\"Loading json files\")\n\n    keys_to_load = [\"content\", \"title\"]\n    return load_structured_files(\n        file_format=\"JSON\",\n        language=None,\n        loader=CustomJSONLoader,\n        file_paths=file_paths,\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n        loader_kwargs={\n            \"keys_to_load\": keys_to_load,\n        },\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/markdownLoader.py",
    "content": "from langchain_community.document_loaders import UnstructuredMarkdownLoader\n\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef load_markdown_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load and process Markdown files from a given folder path.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (str): The size of the chunks to split the documents into.\n        overlap_size (str): The size of the overlapping parts between chunks.\n        **kwargs (dict): Unused.\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.debug(\"Loading markdown files\")\n\n    return load_structured_files(\n        file_format=\"MARKDOWN\",\n        language=\"markdown\",\n        loader=UnstructuredMarkdownLoader,\n        file_paths=file_paths,\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/pdfLoader.py",
    "content": "import uuid\nimport re\n\nfrom langchain_community.document_loaders import PyPDFLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef preprocess_pdf_content(content: str):\n    \"\"\"\n    Preprocess the content extracted from a PDF file.\n    This function performs the following preprocessing steps on the input content:\n    1. Replaces multiple consecutive newline characters ('\\\\n') with a single newline character.\n    2. Removes all remaining newline characters.\n    3. Removes Unicode escape sequences in the format '\\\\uXXXX' where X is a hexadecimal digit.\n    4. Converts the content to lowercase.\n    Args:\n        content (str): The content extracted from the PDF file.\n    Returns:\n        str: The preprocessed content.\n    Example:\n        content = \"Hello\\\\n\\\\nWorld\\\\n\\\\u1234 OpenAI\"\n        preprocessed_content = preprocess_pdf_content(content)\n        print(preprocessed_content)\n        # Output: \"hello world openai\"\n    \"\"\"\n\n    content = re.sub(r\"\\n{2,}\", \"\\n\", content)\n    content = re.sub(r\"\\n{1,}\", \"\", content)\n    content = re.sub(r\"\\\\u[0-9a-fA-F]{4}\", \"\", content)\n    content = content.lower()\n\n    return content\n\n\ndef load_pdf_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: int,\n    overlap_size: int,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load PDF files from a folder and split them into chunks of text.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (int): The size of each text chunk in characters.\n        overlap_size (int): The size of the overlap between text chunks in characters.\n        **kwargs (dict): Unused.\n\n    Returns:\n    
    list[Document]: A list of Document objects, each representing a chunk of text from a PDF file.\n    \"\"\"\n\n    logger.info(\"Loading PDF files\")\n    documents = []\n    for file_path in file_paths:\n        loader = PyPDFLoader(file_path=file_path)\n        documents += loader.load()\n\n    logger.debug(f\"Loaded {len(documents)} pages from PDF files\")\n    text_splitter = RecursiveCharacterTextSplitter(\n        chunk_size=chunk_size,\n        chunk_overlap=overlap_size,\n    )\n\n    logger.debug(\n        f\"Splitting PDF pages into chunks of {chunk_size} characters with an overlap of {overlap_size} characters\"\n    )\n    docs = text_splitter.split_documents(documents)\n    docsList = []\n    for doc in docs:\n        docsList.append(\n            {\n                str(uuid.uuid4()): {\n                    \"content\": preprocess_pdf_content(doc.page_content),\n                    \"metadata\": doc.metadata,\n                }\n            }\n        )\n\n    logger.info(f\"Split {len(documents)} PDF pages into {len(docs)} chunks\")\n\n    return docsList\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/structuredLoader.py",
    "content": "import uuid\n\nfrom langchain.document_loaders.base import BaseLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef load_structured_files(\n    file_format: str,\n    language: str,\n    loader: BaseLoader,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    loader_kwargs: dict[any] = None,\n):\n    \"\"\"\n    Load and process structured files.\n\n    Args:\n        file_format (str): The file_format of the documents to be loaded.\n        language (str): The language of the documents to be loaded.\n        loader (BaseLoader): The document loader object that reads the files.\n        file_paths (list[str]): The paths to the files to load.\n        chunk_size (str): The size of the chunks to split the documents into.\n        overlap_size (str): The size of the overlapping parts between chunks.\n        loader_kwargs (dict[any]): Extra arguments to loader.\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.info(f\"Loading {file_format} files\")\n\n    documents = []\n    if loader_kwargs is None:\n        loader_kwargs = {}\n\n    for file in file_paths:\n        documents += loader(file, **loader_kwargs).load()\n\n    logger.debug(f\"Loaded {len(documents)} {file_format} files\")\n    if language is None:\n        text_splitter = RecursiveCharacterTextSplitter(\n            chunk_size=chunk_size,\n            chunk_overlap=overlap_size,\n            length_function=len,\n        )\n    else:\n        text_splitter = RecursiveCharacterTextSplitter().from_language(\n            language=language,\n            chunk_size=chunk_size,\n            chunk_overlap=overlap_size,\n        )\n\n    logger.debug(\n        f\"Splitting {file_format} files into chunks of {chunk_size} characters with an overlap of {overlap_size} characters\"\n    )\n\n    docs = text_splitter.split_documents(documents)\n    docsList = []\n    for doc in docs:\n        docsList.append(\n            {str(uuid.uuid4()): {\"content\": doc.page_content, \"metadata\": doc.metadata}}\n        )\n\n    logger.info(f\"Split {len(documents)} {file_format} files into {len(docs)} chunks\")\n\n    return docsList\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_custom_html_loader.py",
    "content": "from unittest.mock import Mock\n\nfrom rag_experiment_accelerator.doc_loader.htmlLoader import load_html_files\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\n\n\ndef test_load_html_files():\n    chunks = load_html_files(\n        environment=Mock(),\n        file_paths=get_all_file_paths(\"./data/html\"),\n        chunk_size=1000,\n        overlap_size=200,\n    )\n\n    assert len(chunks) == 20\n\n    assert (\n        \"Deep Neural Nets: 33 years ago and 33 years from now\"\n        in list(chunks[0].values())[0][\"content\"]\n    )\n    assert (\n        \"Deep Neural Nets: 33 years ago and 33 years from now\"\n        not in list(chunks[5].values())[0][\"content\"]\n    )\n    assert \"Musings of a Computer Scientist.\" in list(chunks[19].values())[0][\"content\"]\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_custom_json_loader.py",
    "content": "import os\nimport pytest\n\nfrom rag_experiment_accelerator.doc_loader.customJsonLoader import (\n    CustomJSONLoader,\n)\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\n\n\ndef test_load_json_files():\n    keys_to_load = [\"content\", \"title\"]\n    expected_content = [\n        {\"content\": \"This is the content for item 1.\", \"title\": \"Title TEST 1\"},\n        {\"content\": \"This is the content for item 2.\", \"title\": \"Title 2\"},\n        {\"content\": \"This is the content for item 3.\", \"title\": \"Title 3\"},\n        {\"content\": \"This is the content for item 4.\", \"title\": \"Title 4\"},\n        {\"content\": \"This is the content for item 5.\", \"title\": \"Title 5\"},\n        {\"content\": \"This is the content for item 6.\", \"title\": \"Title 6\"},\n    ]\n\n    docs = load_structured_files(\n        file_format=\"JSON\",\n        language=None,\n        loader=CustomJSONLoader,\n        file_paths=[\n            \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.valid.json\"\n        ],\n        chunk_size=1000,\n        overlap_size=200,\n        loader_kwargs={\n            \"keys_to_load\": keys_to_load,\n        },\n    )\n    doc = list(docs[0].values())[0]\n    assert doc[\"content\"] == str(expected_content)\n    assert doc[\"metadata\"][\"source\"].endswith(\n        \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.valid.json\"\n    )\n\n\ndef test_load_json_files_raises_invalid_keys():\n    keys_to_load = [\"content\", \"title\"]\n    with pytest.raises(ValueError) as exec_info:\n        load_structured_files(\n            file_format=\"JSON\",\n            language=None,\n            loader=CustomJSONLoader,\n            file_paths=[\n                os.path.abspath(\n                    \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.invalid_keys.json\"\n                )\n            ],\n            
chunk_size=1000,\n            overlap_size=200,\n            loader_kwargs={\n                \"keys_to_load\": keys_to_load,\n            },\n        )\n\n    file_path = os.path.abspath(\n        \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.invalid_keys.json\"\n    )\n    assert (\n        str(exec_info.value)\n        == f\"JSON file at path {file_path} must contain the field 'content'\"\n    )\n\n\ndef test_load_json_files_raises_not_a_list():\n    keys_to_load = [\"content\", \"title\"]\n    loader = CustomJSONLoader\n    with pytest.raises(ValueError) as exec_info:\n        load_structured_files(\n            file_format=\"JSON\",\n            language=None,\n            loader=loader,\n            file_paths=[\n                os.path.abspath(\n                    \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.not_a_list.json\"\n                )\n            ],\n            chunk_size=1000,\n            overlap_size=200,\n            loader_kwargs={\n                \"keys_to_load\": keys_to_load,\n            },\n        )\n\n    file_path = os.path.abspath(\n        \"rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.not_a_list.json\"\n    )\n    assert (\n        str(exec_info.value)\n        == f\"JSON file at path: {file_path} must be a list of object and expects each object to contain the fields ['content', 'title']\"\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json",
    "content": "{\n    \"apiVersion\": \"2023-10-31-preview\",\n    \"modelId\": \"prebuilt-layout\",\n    \"content\": \"Title for page number one Some text for the first page\\n\\\\n# Title for page number two\\\\n\\\\nSome text for the 2nd page. Here we also have a table:\\\\n\\\\n| Name | Age |\\\\n| - | - |\\\\n| Alice | 25 |\\\\n| Bob | 32 |\\\\n\\\\nTitle for page number three This is the end - at page 3.\\\\n===\\\\n\",\n    \"pages\": [\n        {\n            \"pageNumber\": 1,\n            \"angle\": 0.30557239055633545,\n            \"width\": 8.5,\n            \"height\": 11,\n            \"unit\": \"inch\",\n            \"words\": [\n                {\n                    \"content\": \"Title\",\n                    \"polygon\": [\n                        0.9798,\n                        1.0215,\n                        1.2837,\n                        1.0215,\n                        1.2837,\n                        1.1807,\n                        0.9798,\n                        1.1762\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 0,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"for\",\n                    \"polygon\": [\n                        1.3152,\n                        1.0215,\n                        1.5143,\n                        1.0215,\n                        1.5143,\n                        1.1831,\n                        1.3152,\n                        1.181\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 6,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"page\",\n                    \"polygon\": [\n                        1.5457,\n                        1.0215,\n          
              1.8811,\n                        1.0217,\n                        1.8811,\n                        1.184,\n                        1.5457,\n                        1.1833\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 10,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"number\",\n                    \"polygon\": [\n                        1.9125,\n                        1.0217,\n                        2.4679,\n                        1.0222,\n                        2.4679,\n                        1.1824,\n                        1.9125,\n                        1.184\n                    ],\n                    \"confidence\": 0.998,\n                    \"span\": {\n                        \"offset\": 15,\n                        \"length\": 6\n                    }\n                },\n                {\n                    \"content\": \"one\",\n                    \"polygon\": [\n                        2.4994,\n                        1.0222,\n                        2.7792,\n                        1.0225,\n                        2.7792,\n                        1.1786,\n                        2.4994,\n                        1.1821\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 22,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"Some\",\n                    \"polygon\": [\n                        0.9828,\n                        1.3283,\n                        1.3779,\n                        1.3283,\n                        1.3755,\n                        1.4882,\n                        0.9798,\n                        1.4837\n                    ],\n                    
\"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 26,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"text\",\n                    \"polygon\": [\n                        1.4119,\n                        1.3284,\n                        1.6579,\n                        1.3292,\n                        1.656,\n                        1.4909,\n                        1.4096,\n                        1.4886\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 31,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"for\",\n                    \"polygon\": [\n                        1.6893,\n                        1.3293,\n                        1.8619,\n                        1.3303,\n                        1.8604,\n                        1.4925,\n                        1.6874,\n                        1.4912\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 36,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.8933,\n                        1.3305,\n                        2.1105,\n                        1.3321,\n                        2.1094,\n                        1.494,\n                        1.8918,\n                        1.4927\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 40,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"first\",\n  
                  \"polygon\": [\n                        2.1471,\n                        1.3324,\n                        2.4035,\n                        1.335,\n                        2.4029,\n                        1.4944,\n                        2.1461,\n                        1.4942\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 44,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"page\",\n                    \"polygon\": [\n                        2.4428,\n                        1.3354,\n                        2.7744,\n                        1.3397,\n                        2.7744,\n                        1.4944,\n                        2.4422,\n                        1.4944\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 50,\n                        \"length\": 4\n                    }\n                }\n            ],\n            \"lines\": [\n                {\n                    \"content\": \"Title for page number one\",\n                    \"polygon\": [\n                        0.9789,\n                        1.0169,\n                        2.7744,\n                        1.0217,\n                        2.7744,\n                        1.1793,\n                        0.9789,\n                        1.1793\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 0,\n                            \"length\": 25\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Some text for the first page\",\n                    \"polygon\": [\n                        0.9789,\n                        1.3273,\n                        2.7697,\n     
                   1.332,\n                        2.7697,\n                        1.4944,\n                        0.9789,\n                        1.48\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 26,\n                            \"length\": 28\n                        }\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 55\n                }\n            ]\n        },\n        {\n            \"pageNumber\": 2,\n            \"angle\": 0.07827287167310715,\n            \"width\": 8.5,\n            \"height\": 11,\n            \"unit\": \"inch\",\n            \"words\": [\n                {\n                    \"content\": \"Title\",\n                    \"polygon\": [\n                        0.9828,\n                        1.0245,\n                        1.2902,\n                        1.0232,\n                        1.2902,\n                        1.1789,\n                        0.9828,\n                        1.1735\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 58,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"for\",\n                    \"polygon\": [\n                        1.3238,\n                        1.0231,\n                        1.5201,\n                        1.0224,\n                        1.5201,\n                        1.1817,\n                        1.3238,\n                        1.1794\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 64,\n                        \"length\": 3\n                    }\n                },\n                {\n                    
\"content\": \"page\",\n                    \"polygon\": [\n                        1.5511,\n                        1.0223,\n                        1.8894,\n                        1.0215,\n                        1.8894,\n                        1.1834,\n                        1.5511,\n                        1.182\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 68,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"number\",\n                    \"polygon\": [\n                        1.923,\n                        1.0214,\n                        2.4706,\n                        1.0207,\n                        2.4706,\n                        1.1797,\n                        1.923,\n                        1.1834\n                    ],\n                    \"confidence\": 0.998,\n                    \"span\": {\n                        \"offset\": 73,\n                        \"length\": 6\n                    }\n                },\n                {\n                    \"content\": \"two\",\n                    \"polygon\": [\n                        2.5016,\n                        1.0206,\n                        2.7744,\n                        1.0204,\n                        2.7744,\n                        1.1746,\n                        2.5016,\n                        1.1794\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 80,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"Some\",\n                    \"polygon\": [\n                        0.9898,\n                        1.3293,\n                        1.3766,\n                        1.3298,\n                        1.3735,\n                   
     1.4844,\n                        0.9868,\n                        1.4803\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 85,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"text\",\n                    \"polygon\": [\n                        1.4127,\n                        1.3298,\n                        1.6498,\n                        1.3301,\n                        1.6468,\n                        1.4869,\n                        1.4096,\n                        1.4848\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 90,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"for\",\n                    \"polygon\": [\n                        1.6808,\n                        1.3301,\n                        1.8716,\n                        1.3304,\n                        1.8685,\n                        1.4886,\n                        1.6777,\n                        1.4871\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 95,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.9025,\n                        1.3304,\n                        2.1113,\n                        1.3306,\n                        2.1083,\n                        1.49,\n                        1.8994,\n                        1.4888\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 99,\n                        
\"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2nd\",\n                    \"polygon\": [\n                        2.1526,\n                        1.3307,\n                        2.3279,\n                        1.3309,\n                        2.3248,\n                        1.491,\n                        2.1495,\n                        1.4902\n                    ],\n                    \"confidence\": 0.938,\n                    \"span\": {\n                        \"offset\": 103,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"page.\",\n                    \"polygon\": [\n                        2.3691,\n                        1.3309,\n                        2.7172,\n                        1.3312,\n                        2.7141,\n                        1.4921,\n                        2.3661,\n                        1.4912\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 107,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"Here\",\n                    \"polygon\": [\n                        2.7687,\n                        1.3313,\n                        3.0755,\n                        1.3316,\n                        3.0725,\n                        1.4924,\n                        2.7657,\n                        1.4922\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 113,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"we\",\n                    \"polygon\": [\n                        3.1116,\n                        1.3316,\n                   
     3.2972,\n                        1.3317,\n                        3.2942,\n                        1.4921,\n                        3.1086,\n                        1.4924\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 118,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"also\",\n                    \"polygon\": [\n                        3.3385,\n                        1.3318,\n                        3.6066,\n                        1.332,\n                        3.6036,\n                        1.4913,\n                        3.3354,\n                        1.4921\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 121,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"have\",\n                    \"polygon\": [\n                        3.6504,\n                        1.332,\n                        3.9521,\n                        1.3322,\n                        3.949,\n                        1.4897,\n                        3.6474,\n                        1.4911\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 126,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"a\",\n                    \"polygon\": [\n                        3.9933,\n                        1.3322,\n                        4.0603,\n                        1.3323,\n                        4.0573,\n                        1.4889,\n                        3.9903,\n                        1.4894\n                    ],\n                    \"confidence\": 
0.996,\n                    \"span\": {\n                        \"offset\": 131,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"table:\",\n                    \"polygon\": [\n                        4.099,\n                        1.3323,\n                        4.484,\n                        1.3324,\n                        4.484,\n                        1.4857,\n                        4.096,\n                        1.4887\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 133,\n                        \"length\": 6\n                    }\n                },\n                {\n                    \"content\": \"Name\",\n                    \"polygon\": [\n                        1.0687,\n                        1.657,\n                        1.5042,\n                        1.6553,\n                        1.5042,\n                        1.7996,\n                        1.0627,\n                        1.7933\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 143,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"Age\",\n                    \"polygon\": [\n                        3.2185,\n                        1.6615,\n                        3.5146,\n                        1.6615,\n                        3.5134,\n                        1.8094,\n                        3.2185,\n                        1.807\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 150,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"Alice\",\n               
     \"polygon\": [\n                        1.0553,\n                        1.8429,\n                        1.4135,\n                        1.8429,\n                        1.4135,\n                        1.9813,\n                        1.0553,\n                        1.9789\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 168,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"25\",\n                    \"polygon\": [\n                        3.2281,\n                        1.8524,\n                        3.4185,\n                        1.8524,\n                        3.4185,\n                        1.9766,\n                        3.2281,\n                        1.9766\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 176,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"Bob\",\n                    \"polygon\": [\n                        1.0693,\n                        2.0386,\n                        1.3463,\n                        2.0386,\n                        1.3451,\n                        2.1771,\n                        1.0682,\n                        2.1747\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 183,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"32\",\n                    \"polygon\": [\n                        3.2281,\n                        2.0434,\n                        3.4067,\n                        2.0434,\n                        3.4085,\n                        2.1699,\n                        
3.2281,\n                        2.1723\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 189,\n                        \"length\": 2\n                    }\n                }\n            ],\n            \"lines\": [\n                {\n                    \"content\": \"# Title for page number two\",\n                    \"polygon\": [\n                        0.9789,\n                        1.0217,\n                        2.7697,\n                        1.0169,\n                        2.7697,\n                        1.1793,\n                        0.9789,\n                        1.1793\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 56,\n                            \"length\": 27\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Some text for the 2nd page. 
Here we also have a table:\",\n                    \"polygon\": [\n                        0.9837,\n                        1.3273,\n                        4.4792,\n                        1.332,\n                        4.4792,\n                        1.4896,\n                        0.9837,\n                        1.4848\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 85,\n                            \"length\": 54\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Name\",\n                    \"polygon\": [\n                        1.0601,\n                        1.6519,\n                        1.4994,\n                        1.6519,\n                        1.4994,\n                        1.7951,\n                        1.0601,\n                        1.7951\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 143,\n                            \"length\": 4\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Age\",\n                    \"polygon\": [\n                        3.2185,\n                        1.6615,\n                        3.5098,\n                        1.6615,\n                        3.5051,\n                        1.8095,\n                        3.2185,\n                        1.8047\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 150,\n                            \"length\": 3\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Alice\",\n                    \"polygon\": [\n                        1.0601,\n                        1.8429,\n                        1.4087,\n                 
       1.8477,\n                        1.4039,\n                        1.9813,\n                        1.0553,\n                        1.9813\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 168,\n                            \"length\": 5\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"25\",\n                    \"polygon\": [\n                        3.2281,\n                        1.8524,\n                        3.4143,\n                        1.8524,\n                        3.4096,\n                        1.9766,\n                        3.2281,\n                        1.9766\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 176,\n                            \"length\": 2\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Bob\",\n                    \"polygon\": [\n                        1.0649,\n                        2.0386,\n                        1.3419,\n                        2.0386,\n                        1.3371,\n                        2.1771,\n                        1.0601,\n                        2.1723\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 183,\n                            \"length\": 3\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"32\",\n                    \"polygon\": [\n                        3.2281,\n                        2.0482,\n                        3.4048,\n                        2.0434,\n                        3.4048,\n                        2.1723,\n                        3.2281,\n                        2.1723\n                    ],\n                
    \"spans\": [\n                        {\n                            \"offset\": 189,\n                            \"length\": 2\n                        }\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 56,\n                    \"length\": 138\n                }\n            ]\n        },\n        {\n            \"pageNumber\": 3,\n            \"angle\": 0.07891959697008133,\n            \"width\": 8.5,\n            \"height\": 11,\n            \"unit\": \"inch\",\n            \"words\": [\n                {\n                    \"content\": \"Title\",\n                    \"polygon\": [\n                        0.9798,\n                        1.0234,\n                        1.2894,\n                        1.0226,\n                        1.2899,\n                        1.181,\n                        0.9798,\n                        1.1762\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 195,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"for\",\n                    \"polygon\": [\n                        1.3232,\n                        1.0226,\n                        1.5105,\n                        1.0221,\n                        1.5114,\n                        1.1833,\n                        1.3238,\n                        1.1814\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 201,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"page\",\n                    \"polygon\": [\n                        1.5418,\n                        1.0221,\n                        1.8826,\n                        1.0213,\n       
                 1.884,\n                        1.184,\n                        1.5427,\n                        1.1835\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 205,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"number\",\n                    \"polygon\": [\n                        1.9164,\n                        1.0212,\n                        2.4575,\n                        1.02,\n                        2.4599,\n                        1.1797,\n                        1.9179,\n                        1.184\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 210,\n                        \"length\": 6\n                    }\n                },\n                {\n                    \"content\": \"three\",\n                    \"polygon\": [\n                        2.4887,\n                        1.0199,\n                        2.8841,\n                        1.019,\n                        2.8843,\n                        1.1717,\n                        2.4911,\n                        1.1793\n                    ],\n                    \"confidence\": 0.998,\n                    \"span\": {\n                        \"offset\": 217,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"This\",\n                    \"polygon\": [\n                        0.9701,\n                        1.321,\n                        1.2405,\n                        1.3233,\n                        1.2369,\n                        1.4938,\n                        0.9658,\n                        1.4923\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                  
      \"offset\": 223,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"is\",\n                    \"polygon\": [\n                        1.2743,\n                        1.3235,\n                        1.4067,\n                        1.3242,\n                        1.4035,\n                        1.4944,\n                        1.2708,\n                        1.494\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 228,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.4405,\n                        1.3244,\n                        1.649,\n                        1.3251,\n                        1.6464,\n                        1.4944,\n                        1.4374,\n                        1.4944\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 231,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"end\",\n                    \"polygon\": [\n                        1.6913,\n                        1.3253,\n                        1.9279,\n                        1.3254,\n                        1.926,\n                        1.4944,\n                        1.6888,\n                        1.4944\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 235,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"-\",\n                    \"polygon\": [\n                        1.9702,\n           
             1.3254,\n                        2.0152,\n                        1.3253,\n                        2.0136,\n                        1.4944,\n                        1.9684,\n                        1.4944\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 239,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"at\",\n                    \"polygon\": [\n                        2.0575,\n                        1.3252,\n                        2.1645,\n                        1.3249,\n                        2.1632,\n                        1.4944,\n                        2.0559,\n                        1.4944\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 241,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"page\",\n                    \"polygon\": [\n                        2.1983,\n                        1.3249,\n                        2.528,\n                        1.323,\n                        2.5275,\n                        1.4944,\n                        2.1971,\n                        1.4944\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 244,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"3.\",\n                    \"polygon\": [\n                        2.5702,\n                        1.3227,\n                        2.7028,\n                        1.3216,\n                        2.7028,\n                        1.4944,\n                        2.5699,\n                        1.4944\n                    ],\n  
                  \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 249,\n                        \"length\": 2\n                    }\n                }\n            ],\n            \"lines\": [\n                {\n                    \"content\": \"Title for page number three\",\n                    \"polygon\": [\n                        0.9789,\n                        1.0217,\n                        2.8795,\n                        1.0169,\n                        2.8795,\n                        1.1793,\n                        0.9789,\n                        1.1793\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 195,\n                            \"length\": 27\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"This is the end - at page 3.\\n===\",\n                    \"polygon\": [\n                        0.9646,\n                        1.3177,\n                        2.698,\n                        1.3177,\n                        2.698,\n                        1.4944,\n                        0.9646,\n                        1.4896\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 223,\n                            \"length\": 32\n                        }\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 195,\n                    \"length\": 61\n                }\n            ]\n        }\n    ],\n    \"tables\": [\n        {\n            \"rowCount\": 3,\n            \"columnCount\": 2,\n            \"cells\": [\n                {\n                    \"kind\": \"columnHeader\",\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 0,\n                    \"content\": 
\"Name\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                1.0028,\n                                1.6185,\n                                3.1612,\n                                1.6233,\n                                3.1612,\n                                1.819,\n                                1.0028,\n                                1.819\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 143,\n                            \"length\": 4\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/3\"\n                    ]\n                },\n                {\n                    \"kind\": \"columnHeader\",\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 1,\n                    \"content\": \"Age\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                3.1612,\n                                1.6233,\n                                5.3244,\n                                1.6233,\n                                5.3292,\n                                1.819,\n                                3.1612,\n                                1.819\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 150,\n                            \"length\": 3\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/4\"\n                    ]\n                },\n         
       {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 0,\n                    \"content\": \"Alice\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                1.0028,\n                                1.819,\n                                3.1612,\n                                1.819,\n                                3.1612,\n                                2.01,\n                                1.0028,\n                                2.01\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 168,\n                            \"length\": 5\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/5\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 1,\n                    \"content\": \"25\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                3.1612,\n                                1.819,\n                                5.3292,\n                                1.819,\n                                5.334,\n                                2.01,\n                                3.1612,\n                                2.01\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 176,\n                            \"length\": 2\n                        }\n                    ],\n                    \"elements\": [\n                        
\"/paragraphs/6\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 0,\n                    \"content\": \"Bob\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                1.0028,\n                                2.01,\n                                3.1612,\n                                2.01,\n                                3.1612,\n                                2.2057,\n                                1.0028,\n                                2.2057\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 183,\n                            \"length\": 3\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/7\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 1,\n                    \"content\": \"32\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 2,\n                            \"polygon\": [\n                                3.1612,\n                                2.01,\n                                5.334,\n                                2.01,\n                                5.334,\n                                2.2057,\n                                3.1612,\n                                2.2057\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 189,\n                            \"length\": 2\n                        }\n                    
],\n                    \"elements\": [\n                        \"/paragraphs/8\"\n                    ]\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        0.9968,\n                        1.6109,\n                        5.3383,\n                        1.6132,\n                        5.3382,\n                        2.2191,\n                        0.9976,\n                        2.2177\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 141,\n                    \"length\": 52\n                }\n            ]\n        }\n    ],\n    \"paragraphs\": [\n        {\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 54\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9789,\n                        1.0169,\n                        2.7749,\n                        1.0217,\n                        2.7736,\n                        1.4944,\n                        0.9777,\n                        1.4896\n                    ]\n                }\n            ],\n            \"content\": \"Title for page number one Some text for the first page\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 56,\n                    \"length\": 27\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        0.9789,\n                        1.0169,\n                        2.7697,\n                        1.0169,\n                        2.7697,\n                        1.1793,\n   
                     0.9789,\n                        1.1793\n                    ]\n                }\n            ],\n            \"role\": \"sectionHeading\",\n            \"content\": \"# Title for page number two\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 85,\n                    \"length\": 54\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        0.9837,\n                        1.3273,\n                        4.4794,\n                        1.332,\n                        4.4792,\n                        1.4896,\n                        0.9835,\n                        1.4848\n                    ]\n                }\n            ],\n            \"content\": \"Some text for the 2nd page. Here we also have a table:\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 143,\n                    \"length\": 4\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        1.0028,\n                        1.6185,\n                        3.1612,\n                        1.6233,\n                        3.1612,\n                        1.819,\n                        1.0028,\n                        1.819\n                    ]\n                }\n            ],\n            \"content\": \"Name\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 150,\n                    \"length\": 3\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        3.1612,\n                        1.6233,\n                        
5.3244,\n                        1.6233,\n                        5.3292,\n                        1.819,\n                        3.1612,\n                        1.819\n                    ]\n                }\n            ],\n            \"content\": \"Age\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 168,\n                    \"length\": 5\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        1.0028,\n                        1.819,\n                        3.1612,\n                        1.819,\n                        3.1612,\n                        2.01,\n                        1.0028,\n                        2.01\n                    ]\n                }\n            ],\n            \"content\": \"Alice\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 176,\n                    \"length\": 2\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        3.1612,\n                        1.819,\n                        5.3292,\n                        1.819,\n                        5.334,\n                        2.01,\n                        3.1612,\n                        2.01\n                    ]\n                }\n            ],\n            \"content\": \"25\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 183,\n                    \"length\": 3\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        1.0028,\n                        2.01,\n                        3.1612,\n                  
      2.01,\n                        3.1612,\n                        2.2057,\n                        1.0028,\n                        2.2057\n                    ]\n                }\n            ],\n            \"content\": \"Bob\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 189,\n                    \"length\": 2\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 2,\n                    \"polygon\": [\n                        3.1612,\n                        2.01,\n                        5.334,\n                        2.01,\n                        5.334,\n                        2.2057,\n                        3.1612,\n                        2.2057\n                    ]\n                }\n            ],\n            \"content\": \"32\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 195,\n                    \"length\": 60\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 3,\n                    \"polygon\": [\n                        0.9634,\n                        1.0217,\n                        2.8795,\n                        1.0169,\n                        2.8807,\n                        1.4939,\n                        0.9646,\n                        1.4987\n                    ]\n                }\n            ],\n            \"role\": \"title\",\n            \"content\": \"Title for page number three This is the end - at page 3.\\n===\"\n        }\n    ]\n}"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json",
    "content": "{\n    \"apiVersion\": \"2023-10-31-preview\",\n    \"modelId\": \"prebuilt-layout\",\n    \"content\": \"This is the Title\\n\\\\nSome text\\\\n\\\\n| Col 1 | Col 2 | Col 3 |\\\\n| - | - | - |\\\\n| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 |\\\\n| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 |\\\\n\\\\nThis is the end.\\\\n\",\n    \"pages\": [\n        {\n            \"pageNumber\": 1,\n            \"angle\": 0,\n            \"width\": 8.5,\n            \"height\": 11,\n            \"unit\": \"inch\",\n            \"words\": [\n                {\n                    \"content\": \"This\",\n                    \"polygon\": [\n                        0.9869,\n                        1.0203,\n                        1.2815,\n                        1.0214,\n                        1.2784,\n                        1.1667,\n                        0.9838,\n                        1.1739\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 0,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"is\",\n                    \"polygon\": [\n                        1.3222,\n                        1.0215,\n                        1.4323,\n                        1.022,\n                        1.4293,\n                        1.1648,\n                        1.3191,\n                        1.1661\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 5,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.4659,\n                        1.0221,\n                        1.691,\n                        1.0232,\n                        1.688,\n                        
1.1655,\n                        1.4628,\n                        1.1647\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 8,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"Title\",\n                    \"polygon\": [\n                        1.7221,\n                        1.0234,\n                        2.0438,\n                        1.0253,\n                        2.0438,\n                        1.1731,\n                        1.7191,\n                        1.1659\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 12,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"Some\",\n                    \"polygon\": [\n                        0.9898,\n                        1.3361,\n                        1.3741,\n                        1.3411,\n                        1.3723,\n                        1.4728,\n                        0.9898,\n                        1.4753\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 19,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"text\",\n                    \"polygon\": [\n                        1.4114,\n                        1.3414,\n                        1.6761,\n                        1.3426,\n                        1.6761,\n                        1.4727,\n                        1.4095,\n                        1.4726\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 24,\n                        
\"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        1.0697,\n                        1.6519,\n                        1.3192,\n                        1.6519,\n                        1.3184,\n                        1.7848,\n                        1.0697,\n                        1.7831\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 32,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        1.3594,\n                        1.6519,\n                        1.4351,\n                        1.6519,\n                        1.4342,\n                        1.7856,\n                        1.3585,\n                        1.7851\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 36,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        3.2281,\n                        1.6519,\n                        3.4843,\n                        1.6519,\n                        3.4834,\n                        1.7896,\n                        3.2281,\n                        1.7879\n                    ],\n                    \"confidence\": 0.998,\n                    \"span\": {\n                        \"offset\": 40,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        3.5166,\n                        1.6519,\n                        
3.6066,\n                        1.6519,\n                        3.6057,\n                        1.7903,\n                        3.5157,\n                        1.7898\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 44,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        5.3961,\n                        1.6567,\n                        5.6483,\n                        1.6534,\n                        5.65,\n                        1.7871,\n                        5.3961,\n                        1.7903\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 48,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"3\",\n                    \"polygon\": [\n                        5.6817,\n                        1.653,\n                        5.7775,\n                        1.6519,\n                        5.7781,\n                        1.7855,\n                        5.6834,\n                        1.7867\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 52,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n                        1.0698,\n                        1.848,\n                        1.3652,\n                        1.8458,\n                        1.366,\n                        1.9852,\n                        1.0728,\n                        1.9854\n                    ],\n                    \"confidence\": 0.999,\n           
         \"span\": {\n                        \"offset\": 72,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        1.4061,\n                        1.8457,\n                        1.4516,\n                        1.8455,\n                        1.4517,\n                        1.9852,\n                        1.4066,\n                        1.9852\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 76,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        1.513,\n                        1.8457,\n                        1.7402,\n                        1.847,\n                        1.7381,\n                        1.9856,\n                        1.5125,\n                        1.9853\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 78,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        1.7834,\n                        1.8475,\n                        1.8576,\n                        1.8482,\n                        1.8576,\n                        1.9859,\n                        1.7809,\n                        1.9857\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 82,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n        
                3.2327,\n                        1.8463,\n                        3.5305,\n                        1.8466,\n                        3.5294,\n                        1.9861,\n                        3.2327,\n                        1.9849\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 86,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        3.5741,\n                        1.8467,\n                        3.6199,\n                        1.8467,\n                        3.6184,\n                        1.9861,\n                        3.5727,\n                        1.9861\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 90,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        3.6794,\n                        1.8469,\n                        3.9039,\n                        1.8475,\n                        3.9013,\n                        1.9861,\n                        3.6777,\n                        1.9861\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 92,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        3.9406,\n                        1.8476,\n                        4.0256,\n                        1.8479,\n                        4.0256,\n                        1.9861,\n                        3.9379,\n                        
1.9861\n                    ],\n                    \"confidence\": 0.975,\n                    \"span\": {\n                        \"offset\": 96,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n                        5.3987,\n                        1.8475,\n                        5.6963,\n                        1.8469,\n                        5.697,\n                        1.9878,\n                        5.4017,\n                        1.9837\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 100,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        5.7378,\n                        1.8468,\n                        5.7839,\n                        1.8467,\n                        5.784,\n                        1.9885,\n                        5.7382,\n                        1.9881\n                    ],\n                    \"confidence\": 0.973,\n                    \"span\": {\n                        \"offset\": 104,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        5.8416,\n                        1.8464,\n                        6.07,\n                        1.8451,\n                        6.0678,\n                        1.9881,\n                        5.8412,\n                        1.9886\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 106,\n                        \"length\": 3\n                    }\n                },\n                
{\n                    \"content\": \"3\",\n                    \"polygon\": [\n                        6.1046,\n                        1.8449,\n                        6.1888,\n                        1.8443,\n                        6.1888,\n                        1.9872,\n                        6.1022,\n                        1.9878\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 110,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n                        1.0657,\n                        2.0421,\n                        1.3659,\n                        2.0438,\n                        1.3629,\n                        2.1768,\n                        1.0627,\n                        2.1771\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 116,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        1.4032,\n                        2.0439,\n                        1.4776,\n                        2.0441,\n                        1.4746,\n                        2.1768,\n                        1.4001,\n                        2.1768\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 120,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        1.5149,\n                        2.0441,\n                        1.7647,\n                        2.0439,\n                        
1.7616,\n                        2.1771,\n                        1.5119,\n                        2.1769\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 122,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"1\",\n                    \"polygon\": [\n                        1.7932,\n                        2.0439,\n                        1.8576,\n                        2.0438,\n                        1.8576,\n                        2.1771,\n                        1.7901,\n                        2.1771\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 126,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n                        3.2317,\n                        2.0408,\n                        3.5306,\n                        2.0421,\n                        3.5283,\n                        2.1808,\n                        3.2316,\n                        2.1796\n                    ],\n                    \"confidence\": 0.998,\n                    \"span\": {\n                        \"offset\": 130,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        3.5671,\n                        2.0422,\n                        3.6401,\n                        2.0422,\n                        3.6369,\n                        2.181,\n                        3.5645,\n                        2.1809\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 
134,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        3.6812,\n                        2.042,\n                        3.907,\n                        2.0407,\n                        3.9019,\n                        2.1804,\n                        3.6777,\n                        2.181\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 136,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        3.9413,\n                        2.0404,\n                        4.0303,\n                        2.0396,\n                        4.0287,\n                        2.1799,\n                        3.9358,\n                        2.1803\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 140,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Row\",\n                    \"polygon\": [\n                        5.3976,\n                        2.042,\n                        5.6964,\n                        2.0422,\n                        5.6964,\n                        2.1815,\n                        5.3976,\n                        2.1789\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 144,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"2\",\n                    \"polygon\": [\n                        5.7286,\n                        
2.0422,\n                        5.8021,\n                        2.042,\n                        5.8021,\n                        2.1819,\n                        5.7286,\n                        2.1817\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 148,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"Col\",\n                    \"polygon\": [\n                        5.8435,\n                        2.0417,\n                        6.0664,\n                        2.04,\n                        6.0664,\n                        2.1819,\n                        5.8435,\n                        2.1819\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 150,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"3\",\n                    \"polygon\": [\n                        6.1032,\n                        2.0396,\n                        6.1935,\n                        2.0387,\n                        6.1935,\n                        2.1819,\n                        6.1032,\n                        2.1819\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 154,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"This\",\n                    \"polygon\": [\n                        0.9809,\n                        2.5439,\n                        1.2621,\n                        2.5463,\n                        1.2612,\n                        2.6912,\n                        0.9809,\n                        2.6958\n                    ],\n                
    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 159,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"is\",\n                    \"polygon\": [\n                        1.305,\n                        2.5466,\n                        1.4051,\n                        2.547,\n                        1.4038,\n                        2.6897,\n                        1.304,\n                        2.6907\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 164,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.4408,\n                        2.547,\n                        1.6481,\n                        2.5469,\n                        1.6461,\n                        2.689,\n                        1.4394,\n                        2.6895\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 167,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"end.\",\n                    \"polygon\": [\n                        1.691,\n                        2.5468,\n                        1.9961,\n                        2.5447,\n                        1.9961,\n                        2.6912,\n                        1.6889,\n                        2.6891\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 171,\n                        \"length\": 4\n                    }\n                }\n            ],\n            \"lines\": [\n                {\n   
                 \"content\": \"This is the Title\",\n                    \"polygon\": [\n                        0.9837,\n                        1.0169,\n                        2.039,\n                        1.0169,\n                        2.039,\n                        1.1697,\n                        0.9837,\n                        1.1697\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 0,\n                            \"length\": 17\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Some text\",\n                    \"polygon\": [\n                        0.9885,\n                        1.332,\n                        1.6713,\n                        1.3368,\n                        1.6713,\n                        1.4753,\n                        0.9885,\n                        1.4705\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 19,\n                            \"length\": 9\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Col 1\",\n                    \"polygon\": [\n                        1.0744,\n                        1.6519,\n                        1.4326,\n                        1.6567,\n                        1.4326,\n                        1.7856,\n                        1.0697,\n                        1.7856\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 32,\n                            \"length\": 5\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Col 2\",\n                    \"polygon\": [\n                        3.2281,\n                        1.6519,\n                        
3.6006,\n                        1.6519,\n                        3.6053,\n                        1.7904,\n                        3.2281,\n                        1.7856\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 40,\n                            \"length\": 5\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Col 3\",\n                    \"polygon\": [\n                        5.3961,\n                        1.6567,\n                        5.7733,\n                        1.6519,\n                        5.7685,\n                        1.7856,\n                        5.3961,\n                        1.7904\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 48,\n                            \"length\": 5\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 1 Col 1\",\n                    \"polygon\": [\n                        1.0697,\n                        1.8429,\n                        1.8528,\n                        1.8429,\n                        1.8528,\n                        1.9813,\n                        1.0697,\n                        1.9813\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 72,\n                            \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 1 Col 2\",\n                    \"polygon\": [\n                        3.2281,\n                        1.8429,\n                        4.0208,\n                        1.8429,\n                        4.0208,\n                        1.9813,\n                        3.2281,\n                        
1.9813\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 86,\n                            \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 1 Col 3\",\n                    \"polygon\": [\n                        5.3961,\n                        1.8429,\n                        6.184,\n                        1.8429,\n                        6.184,\n                        1.9861,\n                        5.3961,\n                        1.9861\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 100,\n                            \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 2 Col 1\",\n                    \"polygon\": [\n                        1.0601,\n                        2.0386,\n                        1.8528,\n                        2.0386,\n                        1.8528,\n                        2.1723,\n                        1.0601,\n                        2.1723\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 116,\n                            \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 2 Col 2\",\n                    \"polygon\": [\n                        3.2281,\n                        2.0386,\n                        4.0256,\n                        2.0386,\n                        4.0256,\n                        2.1771,\n                        3.2281,\n                        2.1771\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 130,\n                      
      \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"Row 2 Col 3\",\n                    \"polygon\": [\n                        5.3961,\n                        2.0386,\n                        6.1888,\n                        2.0386,\n                        6.1888,\n                        2.1771,\n                        5.3961,\n                        2.1771\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 144,\n                            \"length\": 11\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"This is the end.\",\n                    \"polygon\": [\n                        0.9789,\n                        2.5399,\n                        1.9913,\n                        2.5399,\n                        1.9913,\n                        2.6879,\n                        0.9789,\n                        2.6927\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 159,\n                            \"length\": 16\n                        }\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 176\n                }\n            ]\n        }\n    ],\n    \"tables\": [\n        {\n            \"rowCount\": 3,\n            \"columnCount\": 3,\n            \"cells\": [\n                {\n                    \"kind\": \"columnHeader\",\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 0,\n                    \"content\": \"Col 1\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n              
                  0.9964,\n                                1.6192,\n                                3.1622,\n                                1.6192,\n                                3.1622,\n                                1.8195,\n                                0.9964,\n                                1.8195\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 32,\n                            \"length\": 5\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/2\"\n                    ]\n                },\n                {\n                    \"kind\": \"columnHeader\",\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 1,\n                    \"content\": \"Col 2\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.1622,\n                                1.6192,\n                                5.328,\n                                1.6192,\n                                5.3346,\n                                1.8195,\n                                3.1622,\n                                1.8195\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 40,\n                            \"length\": 5\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/3\"\n                    ]\n                },\n                {\n                    \"kind\": \"columnHeader\",\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 2,\n                    \"content\": \"Col 3\",\n   
                 \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                5.328,\n                                1.6192,\n                                7.4871,\n                                1.6192,\n                                7.4871,\n                                1.8195,\n                                5.3346,\n                                1.8195\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 48,\n                            \"length\": 5\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/4\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 0,\n                    \"content\": \"Row 1 Col 1\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9964,\n                                1.8195,\n                                3.1622,\n                                1.8195,\n                                3.1622,\n                                2.0065,\n                                0.9964,\n                                2.0065\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 72,\n                            \"length\": 11\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/5\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n     
               \"columnIndex\": 1,\n                    \"content\": \"Row 1 Col 2\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.1622,\n                                1.8195,\n                                5.3346,\n                                1.8195,\n                                5.3346,\n                                2.0065,\n                                3.1622,\n                                2.0065\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 86,\n                            \"length\": 11\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/6\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 2,\n                    \"content\": \"Row 1 Col 3\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                5.3346,\n                                1.8195,\n                                7.4871,\n                                1.8195,\n                                7.4937,\n                                2.0065,\n                                5.3346,\n                                2.0065\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 100,\n                            \"length\": 11\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/7\"\n            
        ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 0,\n                    \"content\": \"Row 2 Col 1\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9964,\n                                2.0065,\n                                3.1622,\n                                2.0065,\n                                3.1555,\n                                2.2068,\n                                0.9964,\n                                2.2068\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 116,\n                            \"length\": 11\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/8\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 1,\n                    \"content\": \"Row 2 Col 2\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.1622,\n                                2.0065,\n                                5.3346,\n                                2.0065,\n                                5.3346,\n                                2.2068,\n                                3.1555,\n                                2.2068\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 130,\n                            \"length\": 11\n                        }\n                    
],\n                    \"elements\": [\n                        \"/paragraphs/9\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 2,\n                    \"content\": \"Row 2 Col 3\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                5.3346,\n                                2.0065,\n                                7.4937,\n                                2.0065,\n                                7.4937,\n                                2.2135,\n                                5.3346,\n                                2.2068\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 144,\n                            \"length\": 11\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/10\"\n                    ]\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.999,\n                        1.6078,\n                        7.4993,\n                        1.61,\n                        7.5003,\n                        2.2221,\n                        1.0011,\n                        2.2207\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 30,\n                    \"length\": 127\n                }\n            ]\n        }\n    ],\n    \"paragraphs\": [\n        {\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 17\n                }\n            ],\n  
          \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9837,\n                        1.0169,\n                        2.039,\n                        1.0169,\n                        2.039,\n                        1.1697,\n                        0.9837,\n                        1.1697\n                    ]\n                }\n            ],\n            \"content\": \"This is the Title\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 19,\n                    \"length\": 9\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9885,\n                        1.332,\n                        1.6723,\n                        1.3368,\n                        1.6713,\n                        1.4753,\n                        0.9875,\n                        1.4705\n                    ]\n                }\n            ],\n            \"content\": \"Some text\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 32,\n                    \"length\": 5\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9964,\n                        1.6192,\n                        3.1622,\n                        1.6192,\n                        3.1622,\n                        1.8195,\n                        0.9964,\n                        1.8195\n                    ]\n                }\n            ],\n            \"content\": \"Col 1\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 40,\n                    \"length\": 5\n                }\n            
],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.1622,\n                        1.6192,\n                        5.328,\n                        1.6192,\n                        5.3346,\n                        1.8195,\n                        3.1622,\n                        1.8195\n                    ]\n                }\n            ],\n            \"content\": \"Col 2\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 48,\n                    \"length\": 5\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        5.328,\n                        1.6192,\n                        7.4871,\n                        1.6192,\n                        7.4871,\n                        1.8195,\n                        5.3346,\n                        1.8195\n                    ]\n                }\n            ],\n            \"content\": \"Col 3\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 72,\n                    \"length\": 11\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9964,\n                        1.8195,\n                        3.1622,\n                        1.8195,\n                        3.1622,\n                        2.0065,\n                        0.9964,\n                        2.0065\n                    ]\n                }\n            ],\n            \"content\": \"Row 1 Col 1\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 86,\n                    \"length\": 11\n                }\n            
],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.1622,\n                        1.8195,\n                        5.3346,\n                        1.8195,\n                        5.3346,\n                        2.0065,\n                        3.1622,\n                        2.0065\n                    ]\n                }\n            ],\n            \"content\": \"Row 1 Col 2\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 100,\n                    \"length\": 11\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        5.3346,\n                        1.8195,\n                        7.4871,\n                        1.8195,\n                        7.4937,\n                        2.0065,\n                        5.3346,\n                        2.0065\n                    ]\n                }\n            ],\n            \"content\": \"Row 1 Col 3\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 116,\n                    \"length\": 11\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9964,\n                        2.0065,\n                        3.1622,\n                        2.0065,\n                        3.1555,\n                        2.2068,\n                        0.9964,\n                        2.2068\n                    ]\n                }\n            ],\n            \"content\": \"Row 2 Col 1\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 130,\n                    \"length\": 11\n               
 }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.1622,\n                        2.0065,\n                        5.3346,\n                        2.0065,\n                        5.3346,\n                        2.2068,\n                        3.1555,\n                        2.2068\n                    ]\n                }\n            ],\n            \"content\": \"Row 2 Col 2\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 144,\n                    \"length\": 11\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        5.3346,\n                        2.0065,\n                        7.4937,\n                        2.0065,\n                        7.4937,\n                        2.2135,\n                        5.3346,\n                        2.2068\n                    ]\n                }\n            ],\n            \"content\": \"Row 2 Col 3\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 159,\n                    \"length\": 16\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9789,\n                        2.5399,\n                        1.9913,\n                        2.5399,\n                        1.9913,\n                        2.6927,\n                        0.9789,\n                        2.6927\n                    ]\n                }\n            ],\n            \"content\": \"This is the end.\"\n        }\n    ]\n}"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json",
    "content": "{\n    \"apiVersion\": \"2023-10-31-preview\",\n    \"modelId\": \"prebuilt-layout\",\n    \"content\": \"Table without Headers\\n===\\\\n\\\\nTesting a table that has no headers\\\\n\\\\n|||\\\\n| - | - |\\\\n| A | B |\\\\n| C | D |\\\\n| E | F |\\\\n| G | H |\\\\n\\\\nThis is the end.\\\\n\",\n    \"pages\": [\n        {\n            \"pageNumber\": 1,\n            \"angle\": 0,\n            \"width\": 8.5,\n            \"height\": 11,\n            \"unit\": \"inch\",\n            \"words\": [\n                {\n                    \"content\": \"Table\",\n                    \"polygon\": [\n                        0.9839,\n                        1.026,\n                        1.3776,\n                        1.0228,\n                        1.3768,\n                        1.1729,\n                        0.9839,\n                        1.1712\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 0,\n                        \"length\": 5\n                    }\n                },\n                {\n                    \"content\": \"without\",\n                    \"polygon\": [\n                        1.4098,\n                        1.0226,\n                        1.9396,\n                        1.0214,\n                        1.9378,\n                        1.1745,\n                        1.4089,\n                        1.173\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 6,\n                        \"length\": 7\n                    }\n                },\n                {\n                    \"content\": \"Headers\",\n                    \"polygon\": [\n                        1.9842,\n                        1.0215,\n                        2.5739,\n                        1.0245,\n                        2.5739,\n                        1.1745,\n          
              1.9823,\n                        1.1745\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 14,\n                        \"length\": 7\n                    }\n                },\n                {\n                    \"content\": \"Testing\",\n                    \"polygon\": [\n                        0.9898,\n                        1.3413,\n                        1.4657,\n                        1.3387,\n                        1.4626,\n                        1.4834,\n                        0.9868,\n                        1.4877\n                    ],\n                    \"confidence\": 0.996,\n                    \"span\": {\n                        \"offset\": 27,\n                        \"length\": 7\n                    }\n                },\n                {\n                    \"content\": \"a\",\n                    \"polygon\": [\n                        1.4956,\n                        1.3386,\n                        1.5668,\n                        1.3383,\n                        1.5638,\n                        1.4826,\n                        1.4925,\n                        1.4832\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 35,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"table\",\n                    \"polygon\": [\n                        1.6059,\n                        1.3381,\n                        1.9346,\n                        1.3375,\n                        1.9316,\n                        1.4799,\n                        1.6029,\n                        1.4823\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 37,\n                        \"length\": 5\n     
               }\n                },\n                {\n                    \"content\": \"that\",\n                    \"polygon\": [\n                        1.9737,\n                        1.3374,\n                        2.2404,\n                        1.3376,\n                        2.2373,\n                        1.4781,\n                        1.9707,\n                        1.4796\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 43,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"has\",\n                    \"polygon\": [\n                        2.268,\n                        1.3377,\n                        2.507,\n                        1.3383,\n                        2.504,\n                        1.4768,\n                        2.2649,\n                        1.4779\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 48,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"no\",\n                    \"polygon\": [\n                        2.5346,\n                        1.3384,\n                        2.6978,\n                        1.339,\n                        2.6948,\n                        1.476,\n                        2.5316,\n                        1.4767\n                    ],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 52,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"headers\",\n                    \"polygon\": [\n                        2.7369,\n                        1.3392,\n                        3.2949,\n           
             1.343,\n                        3.2948,\n                        1.4744,\n                        2.7339,\n                        1.4759\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 55,\n                        \"length\": 7\n                    }\n                },\n                {\n                    \"content\": \"A\",\n                    \"polygon\": [\n                        1.0697,\n                        1.6615,\n                        1.1843,\n                        1.6615,\n                        1.1843,\n                        1.7856,\n                        1.0697,\n                        1.7856\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 80,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"B\",\n                    \"polygon\": [\n                        3.2376,\n                        1.6615,\n                        3.3379,\n                        1.6615,\n                        3.3379,\n                        1.7808,\n                        3.2376,\n                        1.7808\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 84,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"C\",\n                    \"polygon\": [\n                        1.084,\n                        1.8524,\n                        1.189,\n                        1.8524,\n                        1.189,\n                        1.9766,\n                        1.084,\n                        1.9766\n                    ],\n                    \"confidence\": 0.975,\n                    \"span\": {\n  
                      \"offset\": 90,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"D\",\n                    \"polygon\": [\n                        3.2281,\n                        1.8524,\n                        3.3522,\n                        1.8524,\n                        3.3496,\n                        1.9765,\n                        3.2281,\n                        1.9739\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 94,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"E\",\n                    \"polygon\": [\n                        1.0744,\n                        2.0482,\n                        1.1699,\n                        2.0482,\n                        1.1666,\n                        2.1674,\n                        1.0744,\n                        2.1648\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 100,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"F\",\n                    \"polygon\": [\n                        3.2329,\n                        2.053,\n                        3.3246,\n                        2.053,\n                        3.3246,\n                        2.1675,\n                        3.2329,\n                        2.1675\n                    ],\n                    \"confidence\": 0.994,\n                    \"span\": {\n                        \"offset\": 104,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"G\",\n                    \"polygon\": [\n                        1.0792,\n 
                       2.2439,\n                        1.189,\n                        2.2439,\n                        1.1864,\n                        2.3536,\n                        1.0792,\n                        2.351\n                    ],\n                    \"confidence\": 0.995,\n                    \"span\": {\n                        \"offset\": 110,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"H\",\n                    \"polygon\": [\n                        3.236,\n                        2.2439,\n                        3.3522,\n                        2.2439,\n                        3.3522,\n                        2.3633,\n                        3.236,\n                        2.3633\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 114,\n                        \"length\": 1\n                    }\n                },\n                {\n                    \"content\": \"This\",\n                    \"polygon\": [\n                        0.9839,\n                        2.7348,\n                        1.2519,\n                        2.737,\n                        1.2511,\n                        2.8816,\n                        0.9839,\n                        2.887\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 119,\n                        \"length\": 4\n                    }\n                },\n                {\n                    \"content\": \"is\",\n                    \"polygon\": [\n                        1.2806,\n                        2.7371,\n                        1.4051,\n                        2.7376,\n                        1.4038,\n                        2.8798,\n                        1.2797,\n                        2.8812\n                    
],\n                    \"confidence\": 0.997,\n                    \"span\": {\n                        \"offset\": 124,\n                        \"length\": 2\n                    }\n                },\n                {\n                    \"content\": \"the\",\n                    \"polygon\": [\n                        1.441,\n                        2.7375,\n                        1.6492,\n                        2.7372,\n                        1.6471,\n                        2.8798,\n                        1.4396,\n                        2.8797\n                    ],\n                    \"confidence\": 0.999,\n                    \"span\": {\n                        \"offset\": 127,\n                        \"length\": 3\n                    }\n                },\n                {\n                    \"content\": \"end.\",\n                    \"polygon\": [\n                        1.6898,\n                        2.737,\n                        1.9865,\n                        2.7346,\n                        1.9859,\n                        2.8841,\n                        1.6877,\n                        2.88\n                    ],\n                    \"confidence\": 0.993,\n                    \"span\": {\n                        \"offset\": 131,\n                        \"length\": 4\n                    }\n                }\n            ],\n            \"lines\": [\n                {\n                    \"content\": \"Table without Headers\\n===\",\n                    \"polygon\": [\n                        0.9837,\n                        1.0169,\n                        2.5691,\n                        1.0217,\n                        2.5691,\n                        1.1697,\n                        0.9837,\n                        1.1697\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 0,\n                            \"length\": 25\n                        
}\n                    ]\n                },\n                {\n                    \"content\": \"Testing a table that has no headers\",\n                    \"polygon\": [\n                        0.9837,\n                        1.3368,\n                        3.2902,\n                        1.3368,\n                        3.2902,\n                        1.4753,\n                        0.9837,\n                        1.4848\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 27,\n                            \"length\": 35\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"A\",\n                    \"polygon\": [\n                        1.0744,\n                        1.6615,\n                        1.1795,\n                        1.6615,\n                        1.1747,\n                        1.7856,\n                        1.0697,\n                        1.7856\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 80,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"B\",\n                    \"polygon\": [\n                        3.2424,\n                        1.6615,\n                        3.3331,\n                        1.6615,\n                        3.3331,\n                        1.7808,\n                        3.2376,\n                        1.7808\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 84,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"C\",\n                    \"polygon\": [\n                 
       1.0888,\n                        1.8524,\n                        1.1843,\n                        1.8524,\n                        1.1843,\n                        1.9766,\n                        1.084,\n                        1.9766\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 90,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"D\",\n                    \"polygon\": [\n                        3.2329,\n                        1.8524,\n                        3.3475,\n                        1.8524,\n                        3.3427,\n                        1.9766,\n                        3.2281,\n                        1.9718\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 94,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"E\",\n                    \"polygon\": [\n                        1.0792,\n                        2.0482,\n                        1.1652,\n                        2.053,\n                        1.1652,\n                        2.1675,\n                        1.0744,\n                        2.1675\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 100,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"F\",\n                    \"polygon\": [\n                        3.2329,\n                        2.053,\n                        3.3236,\n                        2.053,\n                        3.3236,\n                        2.1675,\n                
        3.2329,\n                        2.1675\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 104,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"G\",\n                    \"polygon\": [\n                        1.084,\n                        2.2439,\n                        1.1843,\n                        2.2439,\n                        1.1795,\n                        2.3537,\n                        1.0792,\n                        2.349\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 110,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"H\",\n                    \"polygon\": [\n                        3.2281,\n                        2.2439,\n                        3.3475,\n                        2.2439,\n                        3.3475,\n                        2.3633,\n                        3.2281,\n                        2.3633\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 114,\n                            \"length\": 1\n                        }\n                    ]\n                },\n                {\n                    \"content\": \"This is the end.\",\n                    \"polygon\": [\n                        0.9837,\n                        2.7309,\n                        1.9817,\n                        2.7309,\n                        1.9817,\n                        2.8789,\n                        0.9837,\n                        2.8837\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 
119,\n                            \"length\": 16\n                        }\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 136\n                }\n            ]\n        }\n    ],\n    \"tables\": [\n        {\n            \"rowCount\": 4,\n            \"columnCount\": 2,\n            \"cells\": [\n                {\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 0,\n                    \"content\": \"A\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9933,\n                                1.6233,\n                                3.166,\n                                1.628,\n                                3.166,\n                                1.819,\n                                0.9933,\n                                1.8142\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 80,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/2\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 0,\n                    \"columnIndex\": 1,\n                    \"content\": \"B\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.166,\n                                1.628,\n                                5.3197,\n                                1.628,\n                                5.3244,\n                                
1.8142,\n                                3.166,\n                                1.819\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 84,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/3\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 0,\n                    \"content\": \"C\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9933,\n                                1.8142,\n                                3.166,\n                                1.819,\n                                3.1612,\n                                2.01,\n                                0.9933,\n                                2.01\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 90,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/4\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 1,\n                    \"columnIndex\": 1,\n                    \"content\": \"D\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.166,\n                                1.819,\n                                5.3244,\n                                1.8142,\n             
                   5.3292,\n                                2.01,\n                                3.1612,\n                                2.01\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 94,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/5\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 0,\n                    \"content\": \"E\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9933,\n                                2.01,\n                                3.1612,\n                                2.01,\n                                3.1612,\n                                2.2057,\n                                0.9933,\n                                2.2057\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 100,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/6\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 2,\n                    \"columnIndex\": 1,\n                    \"content\": \"F\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.1612,\n                                2.01,\n                                
5.3292,\n                                2.01,\n                                5.3292,\n                                2.2057,\n                                3.1612,\n                                2.2057\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 104,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/7\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 3,\n                    \"columnIndex\": 0,\n                    \"content\": \"G\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                0.9933,\n                                2.2057,\n                                3.1612,\n                                2.2057,\n                                3.1612,\n                                2.3967,\n                                0.998,\n                                2.3967\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 110,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/8\"\n                    ]\n                },\n                {\n                    \"rowIndex\": 3,\n                    \"columnIndex\": 1,\n                    \"content\": \"H\",\n                    \"bounding_regions\": [\n                        {\n                            \"pageNumber\": 1,\n                            \"polygon\": [\n                                3.1612,\n    
                            2.2057,\n                                5.3292,\n                                2.2057,\n                                5.334,\n                                2.3967,\n                                3.1612,\n                                2.3967\n                            ]\n                        }\n                    ],\n                    \"spans\": [\n                        {\n                            \"offset\": 114,\n                            \"length\": 1\n                        }\n                    ],\n                    \"elements\": [\n                        \"/paragraphs/9\"\n                    ]\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        1.0016,\n                        1.6182,\n                        5.3415,\n                        1.6188,\n                        5.3409,\n                        2.3975,\n                        1.0018,\n                        2.3976\n                    ]\n                }\n            ],\n            \"spans\": [\n                {\n                    \"offset\": 64,\n                    \"length\": 53\n                }\n            ]\n        }\n    ],\n    \"paragraphs\": [\n        {\n            \"spans\": [\n                {\n                    \"offset\": 0,\n                    \"length\": 25\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9837,\n                        1.0169,\n                        2.5691,\n                        1.0169,\n                        2.5691,\n                        1.1697,\n                        0.9837,\n                        1.1697\n                    ]\n                }\n            ],\n            \"role\": \"title\",\n 
           \"content\": \"Table without Headers\\n===\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 27,\n                    \"length\": 35\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9837,\n                        1.3368,\n                        3.2902,\n                        1.3368,\n                        3.2902,\n                        1.4848,\n                        0.9837,\n                        1.4848\n                    ]\n                }\n            ],\n            \"content\": \"Testing a table that has no headers\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 80,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9933,\n                        1.6233,\n                        3.166,\n                        1.628,\n                        3.166,\n                        1.819,\n                        0.9933,\n                        1.8142\n                    ]\n                }\n            ],\n            \"content\": \"A\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 84,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.166,\n                        1.628,\n                        5.3197,\n                        1.628,\n                        5.3244,\n                        1.8142,\n                        3.166,\n                        1.819\n                    ]\n    
            }\n            ],\n            \"content\": \"B\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 90,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9933,\n                        1.8142,\n                        3.166,\n                        1.819,\n                        3.1612,\n                        2.01,\n                        0.9933,\n                        2.01\n                    ]\n                }\n            ],\n            \"content\": \"C\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 94,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.166,\n                        1.819,\n                        5.3244,\n                        1.8142,\n                        5.3292,\n                        2.01,\n                        3.1612,\n                        2.01\n                    ]\n                }\n            ],\n            \"content\": \"D\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 100,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9933,\n                        2.01,\n                        3.1612,\n                        2.01,\n                        3.1612,\n                        2.2057,\n                        0.9933,\n                        2.2057\n                    ]\n                }\n            ],\n    
        \"content\": \"E\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 104,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.1612,\n                        2.01,\n                        5.3292,\n                        2.01,\n                        5.3292,\n                        2.2057,\n                        3.1612,\n                        2.2057\n                    ]\n                }\n            ],\n            \"content\": \"F\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 110,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9933,\n                        2.2057,\n                        3.1612,\n                        2.2057,\n                        3.1612,\n                        2.3967,\n                        0.998,\n                        2.3967\n                    ]\n                }\n            ],\n            \"content\": \"G\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 114,\n                    \"length\": 1\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        3.1612,\n                        2.2057,\n                        5.3292,\n                        2.2057,\n                        5.334,\n                        2.3967,\n                        3.1612,\n                        2.3967\n                    ]\n                }\n            ],\n            \"content\": 
\"H\"\n        },\n        {\n            \"spans\": [\n                {\n                    \"offset\": 119,\n                    \"length\": 16\n                }\n            ],\n            \"bounding_regions\": [\n                {\n                    \"pageNumber\": 1,\n                    \"polygon\": [\n                        0.9837,\n                        2.7309,\n                        1.9817,\n                        2.7309,\n                        1.9817,\n                        2.8837,\n                        0.9837,\n                        2.8837\n                    ]\n                }\n            ],\n            \"content\": \"This is the end.\"\n        }\n    ],\n    \"contentFormat\": \"markdown\"\n}"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.invalid_keys.json",
    "content": "[\n    {\n        \"title\": \"Contains 'contents' key instead of 'content'\",\n        \"contents\": \"This is the content for item 1.\"\n    }\n]"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/json/data.bad.not_a_list.json",
    "content": "{\n    \"title\": \"Data should be a list\",\n    \"content\": \"This is the content for item 1.\"\n}"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_data/json/data.valid.json",
    "content": "[\n  {\n    \"item_id\": \"1\",\n    \"title\": \"Title TEST 1\",\n    \"content\": \"This is the content for item 1.\",\n    \"summary\": \"Summary for item 1.\",\n    \"description\": \"Description for item 1.\",\n    \"created_date\": \"2023-01-01T00:00:00Z\",\n    \"modified_date\": \"2023-01-02T00:00:00Z\"\n  },\n  {\n    \"item_id\": \"2\",\n    \"title\": \"Title 2\",\n    \"content\": \"This is the content for item 2.\",\n    \"summary\": \"Summary for item 2.\",\n    \"description\": \"Description for item 2.\",\n    \"created_date\": \"2023-01-02T00:00:00Z\",\n    \"modified_date\": \"2023-01-03T00:00:00Z\"\n  },\n  {\n    \"item_id\": \"3\",\n    \"title\": \"Title 3\",\n    \"content\": \"This is the content for item 3.\",\n    \"summary\": \"Summary for item 3.\",\n    \"description\": \"Description for item 3.\",\n    \"created_date\": \"2023-01-03T00:00:00Z\",\n    \"modified_date\": \"2023-01-04T00:00:00Z\"\n  },\n  {\n    \"item_id\": \"4\",\n    \"title\": \"Title 4\",\n    \"content\": \"This is the content for item 4.\",\n    \"summary\": \"Summary for item 4.\",\n    \"description\": \"Description for item 4.\",\n    \"created_date\": \"2023-01-04T00:00:00Z\",\n    \"modified_date\": \"2023-01-05T00:00:00Z\"\n  },\n  {\n    \"item_id\": \"5\",\n    \"title\": \"Title 5\",\n    \"content\": \"This is the content for item 5.\",\n    \"summary\": \"Summary for item 5.\",\n    \"description\": \"Description for item 5.\",\n    \"created_date\": \"2023-01-05T00:00:00Z\",\n    \"modified_date\": \"2023-01-06T00:00:00Z\"\n  },\n  {\n    \"item_id\": \"6\",\n    \"title\": \"Title 6\",\n    \"content\": \"This is the content for item 6.\",\n    \"summary\": \"Summary for item 6.\",\n    \"description\": \"Description for item 6.\",\n    \"created_date\": \"2023-01-05T00:00:00Z\",\n    \"modified_date\": \"2023-01-06T00:00:00Z\"\n  }\n]"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py",
    "content": "import json\nfrom rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import (\n    DocumentIntelligenceLoader,\n)\nfrom unittest.mock import patch\n\n\nclass SimplePythonObject:\n    def __init__(self, **kwargs):\n        self.__dict__.update(kwargs)\n\n    def __getitem__(self, key):\n        return getattr(self, key, None)\n\n    def get(self, key, default=None):\n        return getattr(self, key, default)\n\n    def keys(self):\n        return self.__dict__.keys()\n\n\ndef mock_simple_response(file_name):\n    with open(\n        f\"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/{file_name}\",\n        \"r\",\n    ) as f:\n        return json.load(f, object_hook=lambda d: SimplePythonObject(**d))\n\n\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test__load(mock_document_intelligence, _):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"simple_response.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n    )\n\n    documents = loader.load()\n\n    assert len(documents) == 1, \"No documents were loaded\"\n    assert (\n        documents[0].page_content\n        == \"This is the Title\\n\\nSome text\\n\\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3 \\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3 \\n\\nThis is the end.\"\n    )\n    assert documents[0].metadata[\"source\"] == \"path/to/some/file\"\n    assert documents[0].metadata[\"page\"] == 0\n\n\n@patch(\n    
\"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\",\n    side_effect=Exception(\"Error\"),\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._load_with_langchain\"\n)\ndef test_load_with_langchain_is_used_as_fallback(mock_load_with_langchain, _, __):\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n    )\n\n    loader.load()\n\n    mock_load_with_langchain.assert_called_once()\n    mock_load_with_langchain.assert_called_with(\"path/to/some/file\", \"prebuilt-read\")\n\n\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test_content_cleaning(mock_document_intelligence, _):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"simple_response.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n        patterns_to_remove=[\"Ti.*e\"],\n    )\n\n    documents = loader.load()\n\n    assert (\n        documents[0].page_content\n        == \"This is the \\n\\nSome text\\n\\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3 \\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3 \\n\\nThis is the end.\"\n    )\n\n\n@patch(\n    
\"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test_table_without_headers(mock_document_intelligence, _):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"table_without_headers.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n    )\n\n    documents = loader.load()\n\n    assert (\n        documents[0].page_content\n        == \"Table without Headers\\n===\\n\\nTesting a table that has no headers\\n\\nA B \\nC D \\nE F \\nG H \\n\\nThis is the end.\"\n    )\n\n\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test_document_with_multiple_pages_without_splitting_documents_by_page(\n    mock_document_intelligence, _\n):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"multiple_pages.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n        split_documents_by_page=False,\n    )\n\n    documents = loader.load()\n\n    assert (\n        documents[0].page_content\n        == \"Title for page number one Some text for the first page\\n\\n# Title for page number two\\n\\nSome text for the 2nd page. 
Here we also have a table:\\n\\nName: Alice, Age: 25 \\nName: Bob, Age: 32 \\n\\nTitle for page number three This is the end - at page 3.\\n===\"\n    )\n    assert len(documents) == 1\n\n\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test_document_with_multiple_pages_with_split_documents_by_page(\n    mock_document_intelligence, _\n):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"multiple_pages.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n        split_documents_by_page=True,\n    )\n\n    documents = loader.load()\n\n    assert len(documents) == 3\n    assert (\n        documents[0].page_content\n        == \"Title for page number one Some text for the first page\"\n    )\n    assert (\n        documents[1].page_content\n        == \"# Title for page number two\\n\\nSome text for the 2nd page. 
Here we also have a table:\\n\\nName: Alice, Age: 25 \\nName: Bob, Age: 32 \"\n    )\n    assert (\n        documents[2].page_content\n        == \"Title for page number three This is the end - at page 3.\\n===\"\n    )\n\n\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths\",\n    return_value=[\"path/to/some/file\"],\n)\n@patch(\n    \"rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence\"\n)\ndef test_excluding_paragraphs(mock_document_intelligence, _):\n    mock_document_intelligence.return_value = mock_simple_response(\n        \"multiple_pages.json\"\n    )\n\n    loader = DocumentIntelligenceLoader(\n        path=\"path\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n        excluded_paragraph_roles=[\"sectionHeading\"],\n    )\n\n    documents = loader.load()\n\n    assert (\n        documents[0].page_content\n        == \"Title for page number one Some text for the first page\\n\\nSome text for the 2nd page. 
Here we also have a table:\\n\\nName: Alice, Age: 25 \\nName: Bob, Age: 32 \\n\\nTitle for page number three This is the end - at page 3.\\n===\"\n    )\n\n\ndef test_get_file_paths():\n    loader = DocumentIntelligenceLoader(\n        path=\"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"json\"],\n    )\n\n    assert set(loader._get_file_paths()) == set(\n        [\n            \"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json\",\n            \"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json\",\n            \"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json\",\n        ]\n    )\n\n\ndef test_get_file_paths_returns_according_to_glob():\n    loader = DocumentIntelligenceLoader(\n        path=\"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response\",\n        endpoint=\"endpoint\",\n        key=\"key\",\n        api_model=\"prebuilt-layout\",\n        glob_patterns=[\"pdf\"],\n    )\n\n    assert loader._get_file_paths() == []\n\n\ndef test_get_file_paths_works_for_single_files():\n    loader = DocumentIntelligenceLoader(\n        path=\"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json\",\n        endpoint=\"endpoint\",\n        api_model=\"prebuilt-layout\",\n        key=\"key\",\n    )\n\n    assert loader._get_file_paths() == [\n        \"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json\"\n    ]\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/tests/test_docx_loader.py",
    "content": "from unittest.mock import Mock\n\nfrom rag_experiment_accelerator.doc_loader.docxLoader import load_docx_files\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\n\n\ndef test_load_docx_files():\n    folder_path = \"./data/docx\"\n    chunk_size = 1000\n    overlap_size = 400\n\n    original_doc = load_docx_files(\n        environment=Mock(),\n        file_paths=get_all_file_paths(folder_path),\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n    )\n\n    assert len(original_doc) == 3\n\n    assert \"We recently commissioned\" in list(original_doc[0].values())[0][\"content\"]\n    assert \"We recently commissioned\" in list(original_doc[1].values())[0][\"content\"]\n    assert (\n        \"We recently commissioned\" not in list(original_doc[2].values())[0][\"content\"]\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/doc_loader/textLoader.py",
    "content": "from langchain_community.document_loaders import TextLoader\n\nfrom rag_experiment_accelerator.doc_loader.structuredLoader import (\n    load_structured_files,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef load_text_files(\n    environment: Environment,\n    file_paths: list[str],\n    chunk_size: str,\n    overlap_size: str,\n    **kwargs: dict,\n):\n    \"\"\"\n    Load and process text files from the given file paths.\n\n    Args:\n        environment (Environment): The environment class\n        file_paths (list[str]): Sequence of paths to load.\n        chunk_size (int): The size of each text chunk in characters.\n        overlap_size (int): The size of the overlap between text chunks in characters.\n        **kwargs (dict): Unused. Extra keyword arguments (such as a \"chunking_strategy\") are accepted and ignored.\n\n    Returns:\n        list[Document]: A list of processed and split document chunks.\n    \"\"\"\n\n    logger.debug(\"Loading text files\")\n\n    return load_structured_files(\n        file_format=\"TEXT\",\n        language=None,\n        loader=TextLoader,\n        file_paths=file_paths,\n        chunk_size=chunk_size,\n        overlap_size=overlap_size,\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/embedding/aoai_embedding_model.py",
    "content": "from openai import AzureOpenAI\n\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\n\n\nclass AOAIEmbeddingModel(EmbeddingModel):\n    \"\"\"\n    A class representing an AOAI Embedding Model.\n\n    Args:\n        model_name (str): The name of the deployment.\n        environment (Environment): The initialized environment.\n        dimension (int, optional): The dimension of the embedding. Defaults to 1536 which is the dimension of text-embedding-ada-002.\n        **kwargs: Additional keyword arguments.\n\n    Attributes:\n        model_name (str): The name of the deployment.\n        _client (AzureOpenAI): The initialized AzureOpenAI client.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        model_name: str,\n        environment: Environment,\n        dimension: int = 1536,\n        shorten_dimensions: bool = False,\n        **kwargs\n    ) -> None:\n        super().__init__(name=model_name, dimension=dimension, **kwargs)\n        self.model_name = model_name\n        self.shorten_dimensions = shorten_dimensions\n        self._client: AzureOpenAI = self._initialize_client(environment=environment)\n\n    def _initialize_client(self, environment: Environment) -> AzureOpenAI:\n        \"\"\"\n        Initializes the AzureOpenAIClient.\n\n        Args:\n            environment (Environment): The initialized environment.\n\n        Returns:\n            AzureOpenAI: The initialized AzureOpenAI client.\n\n        \"\"\"\n        return AzureOpenAI(\n            azure_endpoint=environment.openai_endpoint,\n            api_key=environment.openai_api_key,\n            api_version=environment.openai_api_version,\n        )\n\n    def generate_embedding(self, chunk: str) -> list[float]:\n        \"\"\"\n        Generates the embedding for a given chunk of text.\n\n        Args:\n            chunk (str): The input text.\n\n        Returns:\n           
 list[float]: The generated embedding.\n\n        \"\"\"\n\n        kwargs = {}\n        if self.shorten_dimensions:\n            kwargs[\"dimensions\"] = self.dimension\n\n        response = self._client.embeddings.create(\n            input=chunk, model=self.model_name, **kwargs\n        )\n\n        return response.data[0].embedding\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/embedding_model.py",
    "content": "from abc import abstractmethod, ABC\n\n\nclass EmbeddingModel(ABC):\n    \"\"\"\n    Base class for embedding models.\n\n    Args:\n        name (str): The name of the embedding model.\n        dimension (int): The dimension of the embeddings.\n\n    Attributes:\n        name (str): The name of the embedding model.\n        dimension (int): The dimension of the embeddings.\n\n    Methods:\n        generate_embedding(chunk: str) -> list: Abstract method to generate embeddings for a given chunk of text.\n    \"\"\"\n\n    def __init__(self, name: str, dimension: int, **kwargs) -> None:\n        self.name = name\n        self.dimension = dimension\n\n    @abstractmethod\n    def generate_embedding(self, chunk: str) -> list[float]:\n        \"\"\"\n        Abstract method to generate embeddings for a given chunk of text.\n\n        Args:\n            chunk (str): The input text chunk for which the embedding needs to be generated.\n\n        Returns:\n            list[float]: The generated embedding as a list of floats.\n        \"\"\"\n        pass\n\n    def to_dict(self) -> dict:\n        return {\n            \"dimension\": self.dimension,\n            \"name\": self.name,\n        }\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/factory.py",
    "content": "from rag_experiment_accelerator.embedding.aoai_embedding_model import AOAIEmbeddingModel\nfrom rag_experiment_accelerator.embedding.st_embedding_model import STEmbeddingModel\n\n\ndef create_embedding_model(model_type: str, **kwargs):\n    match model_type:\n        case \"azure\":\n            return AOAIEmbeddingModel(**kwargs)\n        case \"sentence-transformer\":\n            return STEmbeddingModel(**kwargs)\n        case _:\n            raise ValueError(\n                f\"Invalid embedding type: {model_type}. Must be one of ['azure', 'sentence-transformer']\"\n            )\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/st_embedding_model.py",
    "content": "from sentence_transformers import SentenceTransformer\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\nfrom typing import Optional\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass STEmbeddingModel(EmbeddingModel):\n    \"\"\"\n    STEmbeddingModel is a class that represents a sentence transformer embedding model.\n\n    Args:\n        model_name (str): The name of the pre-trained model to use for embedding.\n        dimension (int, optional): The dimension of the embedding. If not provided, it will be determined based on the model name.\n        **kwargs: Additional keyword arguments to be passed to the base class constructor.\n\n    Attributes:\n        _size_model_mapping (dict): A mapping of supported model names to their corresponding dimensions.\n\n    Raises:\n        ValueError: If the dimension is not provided and the model name is not found in the mapping.\n\n    Methods:\n        generate_embedding(chunk: str) -> list: Generates the embedding for a given chunk of text.\n\n    \"\"\"\n\n    _size_model_mapping = {\n        \"all-MiniLM-L6-v2\": 384,\n        \"all-mpnet-base-v2\": 768,\n        \"bert-large-nli-mean-tokens\": 1024,\n    }\n\n    def __init__(\n        self, model_name: str, dimension: Optional[int] = None, **kwargs\n    ) -> None:\n        \"\"\"\n        Initializes an instance of the STEmbeddingModel class.\n\n        Args:\n            model_name (str): The name of the pre-trained model to use for embedding.\n            dimension (int, optional): The dimension of the embedding. 
If not provided, it will be determined based on the model name.\n            **kwargs: Additional keyword arguments to be passed to the base class constructor.\n\n        Raises:\n            ValueError: If the dimension is not provided and the model name is not found in the mapping.\n\n        \"\"\"\n        if dimension is None:\n            dimension = self._size_model_mapping.get(model_name)\n            if dimension is None:\n                raise ValueError(\n                    f\"Dimension not provided and model name {model_name} not found in mapping. Please provide a dimension or specify a supported model name in {self._size_model_mapping.keys()}\"\n                )\n        super().__init__(name=model_name, dimension=dimension, **kwargs)\n        try:\n            self._model = SentenceTransformer(self.name)\n        except OSError as e:\n            logger.error(\n                f\"Error retrieving model: {self.name}. Please check that the model name is correct and that you have an internet connection.\"\n            )\n            raise e\n\n    def generate_embedding(self, chunk: str) -> list[float]:\n        \"\"\"\n        Generates the embedding for a given chunk of text.\n\n        Args:\n            chunk (str): The text to generate the embedding for.\n\n        Returns:\n            list: The generated embedding as a list.\n\n        \"\"\"\n        return self._model.encode([str(chunk)]).tolist()[0]\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/tests/test_aoai_embedding_model.py",
    "content": "from unittest.mock import patch, MagicMock\n\nfrom openai.types.create_embedding_response import CreateEmbeddingResponse, Usage\nfrom openai.types.embedding import Embedding\n\nfrom rag_experiment_accelerator.embedding.aoai_embedding_model import AOAIEmbeddingModel\n\n\n@patch(\n    \"rag_experiment_accelerator.embedding.aoai_embedding_model.AOAIEmbeddingModel._initialize_client\"\n)\ndef test_generate_embedding(mock_client):\n    expected_embeddings = Embedding(\n        embedding=[0.1, 0.2, 0.3], index=0, object=\"embedding\"\n    )\n    mock_embeddings = CreateEmbeddingResponse(\n        data=[expected_embeddings],\n        model=\"model_name\",\n        object=\"list\",\n        usage=Usage(prompt_tokens=0, total_tokens=0),\n    )\n\n    mock_client().embeddings.create.return_value = mock_embeddings\n\n    environment = MagicMock()\n    model = AOAIEmbeddingModel(\"text-embedding-ada-002\", environment=environment)\n    embeddings = model.generate_embedding(\"Hello world\")\n    assert embeddings == mock_embeddings.data[0].embedding\n\n\ndef test_emebdding_dimension_has_default():\n    environment = MagicMock()\n    model = AOAIEmbeddingModel(\"text-embedding-ada-002\", environment)\n    assert model.dimension == 1536\n\n\ndef test_can_set_embedding_dimension():\n    environment = MagicMock()\n    model = AOAIEmbeddingModel(\"text-embedding-ada-002\", environment, 123)\n    assert model.dimension == 123\n\n\n@patch(\n    \"rag_experiment_accelerator.embedding.aoai_embedding_model.AOAIEmbeddingModel._initialize_client\"\n)\ndef test_generate_embeddings_no_shortening(mock_client):\n    mock_client().embeddings.create.return_value = MagicMock()\n    environment = MagicMock()\n\n    model = AOAIEmbeddingModel(\n        \"text-embedding-3-large\", environment=environment, dimension=3072\n    )\n    model.generate_embedding(\"Hello world\")\n\n    mock_client().embeddings.create.assert_called_with(\n        input=\"Hello world\", 
model=\"text-embedding-3-large\"\n    )\n\n\n@patch(\n    \"rag_experiment_accelerator.embedding.aoai_embedding_model.AOAIEmbeddingModel._initialize_client\"\n)\ndef test_generate_embeddings_with_shortening(mock_client):\n    mock_client().embeddings.create.return_value = MagicMock()\n    environment = MagicMock()\n\n    model = AOAIEmbeddingModel(\n        \"text-embedding-3-large\",\n        environment=environment,\n        dimension=256,\n        shorten_dimensions=True,\n    )\n    model.generate_embedding(\"Hello world\")\n\n    mock_client().embeddings.create.assert_called_with(\n        input=\"Hello world\", model=\"text-embedding-3-large\", dimensions=256\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/tests/test_factory.py",
    "content": "from unittest.mock import patch, MagicMock\nimport pytest\n\nfrom rag_experiment_accelerator.embedding.aoai_embedding_model import AOAIEmbeddingModel\nfrom rag_experiment_accelerator.embedding.st_embedding_model import STEmbeddingModel\nfrom rag_experiment_accelerator.embedding.factory import create_embedding_model\n\n\ndef test_create_aoai_embedding_model():\n    embedding_type = \"azure\"\n    model_name = \"test_model\"\n    dimension = 768\n    environment = MagicMock()\n    model = create_embedding_model(\n        model_type=embedding_type,\n        model_name=model_name,\n        dimension=dimension,\n        environment=environment,\n    )\n    assert isinstance(model, AOAIEmbeddingModel)\n\n\n@patch(\"rag_experiment_accelerator.embedding.st_embedding_model.SentenceTransformer\")\ndef test_create_st_embedding_model(mock_sentence_transformer):\n    embedding_type = \"sentence-transformer\"\n    model_name = \"all-mpnet-base-v2\"\n    dimension = 768\n    environment = MagicMock()\n    model = create_embedding_model(\n        model_type=embedding_type,\n        model_name=model_name,\n        dimension=dimension,\n        environment=environment,\n    )\n    assert isinstance(model, STEmbeddingModel)\n\n\ndef test_create_raises_invalid_embedding_type():\n    embedding_type = \"not-valid\"\n    model_name = \"test_model\"\n    dimension = 768\n    environment = MagicMock()\n    with pytest.raises(ValueError):\n        create_embedding_model(\n            model_type=embedding_type,\n            model_name=model_name,\n            dimension=dimension,\n            environment=environment,\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/embedding/tests/test_st_embedding_model.py",
    "content": "from unittest.mock import patch\nimport pytest\nimport numpy as np\nfrom rag_experiment_accelerator.embedding.st_embedding_model import STEmbeddingModel\n\n\n@patch(\"rag_experiment_accelerator.embedding.st_embedding_model.SentenceTransformer\")\ndef test_generate_embedding(mock_sentence_transformer):\n    expected_embeddings = [0.1, 0.2, 0.3]\n    mock_embeddings = np.array([expected_embeddings])\n    mock_sentence_transformer.return_value.encode.return_value = mock_embeddings\n\n    model = STEmbeddingModel(\"all-mpnet-base-v2\")\n    embeddings = model.generate_embedding(\"Hello world\")\n\n    assert expected_embeddings == embeddings\n\n\ndef test_sentence_transformer_embedding_model_raises_non_existing_model():\n    with pytest.raises(OSError):\n        STEmbeddingModel(\"non-existing-model\", 123)\n\n\ndef test_sentence_transformer_embedding_model_raises_unsupported_model():\n    with pytest.raises(ValueError):\n        STEmbeddingModel(\"non-existing-model\")\n\n\n@patch(\"rag_experiment_accelerator.embedding.st_embedding_model.SentenceTransformer\")\ndef test_sentence_transformer_embedding_model_succeeds(mock_sentence_transformer):\n    try:\n        STEmbeddingModel(\"all-mpnet-base-v2\")\n    except BaseException:\n        assert False, \"Should not have thrown an exception\"\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/LICENSE.txt",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [2023] [Exploding Gradients]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/evaluation/eval.py",
    "content": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom contextlib import ExitStack\nimport os\nimport warnings\n\nimport mlflow\nimport pandas as pd\n\nfrom dotenv import load_dotenv\nfrom numpy import mean\n\nfrom rag_experiment_accelerator.artifact.handlers.query_output_handler import (\n    QueryOutputHandler,\n)\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.evaluation import plain_metrics\nfrom rag_experiment_accelerator.evaluation.llm_based_metrics import (\n    compute_llm_based_score,\n    lower_and_strip,\n)\nfrom rag_experiment_accelerator.evaluation.plot_metrics import (\n    draw_hist_df,\n    draw_search_chart,\n    generate_metrics,\n    plot_apk_scores,\n    plot_map_scores,\n    plot_mapk_scores,\n)\nfrom rag_experiment_accelerator.evaluation.transformer_based_metrics import (\n    compute_transformer_based_score,\n)\n\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\nload_dotenv()\nwarnings.filterwarnings(\"ignore\")\n\n\ndef compute_metrics(\n    metric_type,\n    question,\n    actual,\n    expected,\n    response_generator: ResponseGenerator,\n    retrieved_contexts,\n):\n    \"\"\"\n    Computes a score for the similarity between two strings using a specified metric.\n\n    Args:\n        metric_type (str): The type of metric to use for comparison. 
Valid options are:\n            - \"lcsstr\": Longest common substring\n            - \"lcsseq\": Longest common subsequence\n            - \"jaro_winkler\": Jaro-Winkler distance\n            - \"hamming\": Hamming distance\n            - \"jaccard\": Jaccard similarity\n            - \"levenshtein\": Levenshtein distance\n            - \"fuzzy_score\": RapidFuzz similarity. This is faster than the associated function in FuzzyWuzzy.\n                             Default match type is \"token_set_ratio\".\n            - \"cosine_ochiai\": Cosine similarity (Ochiai coefficient)\n            - \"rouge1_precision\": The ROUGE-1 precision score. This is the number of overlapping unigrams\n                                  between the actual and expected strings divided by the number of unigrams\n                                  in the expected string.\n            - \"rouge1_recall\": The ROUGE-1 recall score. This is the number of overlapping unigrams between\n                               the actual and expected strings divided by the number of unigrams in the actual string.\n            - \"rouge1_fmeasure\": ROUGE-1 F1 score. This is the harmonic mean of the ROUGE-1 precision and recall scores.\n            - \"rouge2_precision\": The ROUGE-2 precision score. This is the number of overlapping bigrams between\n                                    the actual and expected strings divided by the number of bigrams in the expected string.\n            - \"rouge2_recall\": The ROUGE-2 recall score. This is the number of overlapping bigrams between the actual\n                               and expected strings divided by the number of bigrams in the actual string.\n            - \"rouge2_fmeasure\": ROUGE-2 F1 score. 
This is the harmonic mean of the ROUGE-2 precision and recall scores.\n            - \"rougeL_precision\": The ROUGE-L precision score is the length of overlapping longest common subsequence\n                                  between the actual and expected strings divided by the number of unigrams\n                                  in the predicted string.\n            - \"rougeL_recall\": The ROUGE-L recall score is the length of overlapping longest common subsequence\n                               between the actual and expected strings divided by the number of unigrams in the\n                               actual string.\n            - \"rougeL_fmeasure\": ROUGE-L F1 score. This is the harmonic mean of the ROUGE-L precision and recall scores.\n            - \"bert_all_MiniLM_L6_v2\": BERT-based semantic similarity (MiniLM L6 v2 model)\n            - \"bert_base_nli_mean_tokens\": BERT-based semantic similarity (base model, mean tokens)\n            - \"bert_large_nli_mean_tokens\": BERT-based semantic similarity (large model, mean tokens)\n            - \"bert_large_nli_stsb_mean_tokens\": BERT-based semantic similarity (large model, STS-B, mean tokens)\n            - \"bert_distilbert_base_nli_stsb_mean_tokens\": BERT-based semantic similarity (DistilBERT base model, STS-B, mean tokens)\n            - \"bert_paraphrase_multilingual_MiniLM_L12_v2\": BERT-based semantic similarity (multilingual paraphrase model, MiniLM L12 v2)\n            - \"llm_context_precision\": Verifies whether or not a given context is useful for answering a question.\n            - \"llm_answer_relevance\": Scores the relevancy of the answer according to the given question.\n            - \"llm_context_recall\": Scores context recall by estimating TP and FN using annotated answer (ground truth) and retrieved context.\n        question (str): question text\n        actual (str): The first string to compare.\n        expected (str): The second string to compare.\n        
response_generator (ResponseGenerator): The response generator to use for generating responses.\n        retrieved_contexts (list[str]): The list of retrieved contexts for the query.\n\n\n    Returns:\n        float: The similarity score between the two strings, as determined by the specified metric.\n    \"\"\"\n\n    if metric_type.startswith(\"rouge\"):\n        return plain_metrics.rouge_score(ground_truth=expected, prediction=actual, rouge_metric_name=metric_type)\n    else:\n        plain_metric_func = getattr(plain_metrics, metric_type, None)\n        if plain_metric_func:\n            return plain_metric_func(actual, expected)\n\n    try:\n        score = compute_transformer_based_score(actual, expected, metric_type)\n    except KeyError:\n        try:\n            score = compute_llm_based_score(\n                metric_type,\n                question,\n                actual,\n                expected,\n                response_generator,\n                retrieved_contexts,\n            )\n        except KeyError:\n            logger.error(f\"Unsupported metric type: {metric_type}\")\n\n    return score\n\n\ndef evaluate_single_prompt(\n    data,\n    response_generator,\n    metric_types,\n    data_list,\n    total_precision_scores_by_search_type,\n    map_scores_by_search_type,\n    average_precision_for_search_type,\n):\n    actual = lower_and_strip(data.actual)\n    expected = lower_and_strip(data.expected)\n\n    metric_dic = {}\n\n    for metric_type in metric_types:\n        score = compute_metrics(\n            metric_type,\n            data.question,\n            actual,\n            expected,\n            response_generator,\n            data.retrieved_contexts,\n        )\n        metric_dic[metric_type] = score\n\n    metric_dic[\"question\"] = data.question\n    metric_dic[\"retrieved_contexts\"] = data.retrieved_contexts\n    metric_dic[\"actual\"] = actual\n    metric_dic[\"expected\"] = expected\n    metric_dic[\"search_type\"] = 
data.search_type\n    data_list.append(metric_dic)\n\n    if not total_precision_scores_by_search_type.get(data.search_type):\n        total_precision_scores_by_search_type[data.search_type] = {}\n        map_scores_by_search_type[data.search_type] = []\n        average_precision_for_search_type[data.search_type] = []\n    for eval in data.search_evals:\n        scores = eval.get(\"precision_scores\")\n        if scores:\n            average_precision_for_search_type[data.search_type].append(mean(scores))\n        for i, score in enumerate(scores):\n            if total_precision_scores_by_search_type[data.search_type].get(i + 1):\n                total_precision_scores_by_search_type[data.search_type][i + 1].append(\n                    score\n                )\n            else:\n                total_precision_scores_by_search_type[data.search_type][i + 1] = [score]\n\n\ndef evaluate_prompts(\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    mlflow_client: mlflow.MlflowClient,\n    name_suffix: str,\n):\n    \"\"\"\n    Evaluates prompts using various metrics and logs the results to MLflow.\n\n    Args:\n        environment (Environment): Initialized Environment class containing environment configuration\n        config (Config): The configuration settings to use for evaluation.\n        index_config (IndexConfig): Parameters of the index such as chunking and embedding model.\n        mlflow_client (mlflow.MlflowClient): The MLflow client to use for logging the results.\n        name_suffix (str): Name suffix to use for all outputs created.\n\n    Returns:\n        None\n    \"\"\"\n    metric_types = config.eval.metric_types\n    num_search_type = config.search.search_type\n    data_list = []\n\n    pd.set_option(\"display.max_columns\", None)\n\n    total_precision_scores_by_search_type = {}\n    map_scores_by_search_type = {}\n    average_precision_for_search_type = {}\n\n    handler = 
QueryOutputHandler(config.path.query_data_dir)\n\n    response_generator = ResponseGenerator(\n        environment, config, config.openai.azure_oai_eval_deployment_name\n    )\n\n    query_data_load = handler.load(\n        index_config.index_name(), config.experiment_name, config.job_name\n    )\n    question_count = query_data_load[0].question_count\n\n    with ExitStack() as stack:\n        executor = stack.enter_context(ThreadPoolExecutor(config.max_worker_threads))\n        futures = {\n            executor.submit(\n                evaluate_single_prompt,\n                data,\n                response_generator,\n                metric_types,\n                data_list,\n                total_precision_scores_by_search_type,\n                map_scores_by_search_type,\n                average_precision_for_search_type,\n            ): data\n            for data in query_data_load\n        }\n\n        for future in as_completed(futures):\n            try:\n                future.result()\n            except Exception as exc:\n                logger.error(f\"Evaluate query line generated an exception: {exc}\")\n\n    eval_scores_df = {\"search_type\": [], \"k\": [], \"score\": [], \"map_at_k\": []}\n\n    for (\n        search_type,\n        scores_at_k,\n    ) in total_precision_scores_by_search_type.items():\n        for k, scores in scores_at_k.items():\n            avg_at_k = mean(scores)\n            # not sure if this would be problematic or not.\n            eval_scores_df[\"search_type\"].append(search_type)\n            eval_scores_df[\"k\"].append(k)\n            eval_scores_df[\"score\"].append(avg_at_k)\n            mean_at_k = mean(eval_scores_df[\"score\"][:k])\n            eval_scores_df[\"map_at_k\"].append(mean_at_k)\n\n    mean_scores = {\"search_type\": [], \"mean\": []}\n\n    for search_type, scores in average_precision_for_search_type.items():\n        mean_scores[\"search_type\"].append(search_type)\n        
mean_scores[\"mean\"].append(mean(scores))\n\n    run_id = mlflow.active_run().info.run_id\n    columns_to_remove = [\"question\", \"retrieved_contexts\", \"actual\", \"expected\"]\n    additional_columns_to_remove = [\"search_type\"]\n    df = pd.DataFrame(data_list)\n    df.to_csv(\n        os.path.join(config.path.eval_data_dir, f\"{name_suffix}.csv\"), index=False\n    )\n    logger.debug(f\"Eval scores: {df.head()}\")\n\n    temp_df = df.drop(columns=columns_to_remove)\n    draw_search_chart(temp_df, run_id, mlflow_client)\n\n    temp_df = temp_df.drop(columns=additional_columns_to_remove)\n\n    if isinstance(num_search_type, str):\n        num_search_type = [num_search_type]\n    sum_all_columns = temp_df.sum() / (question_count * len(num_search_type))\n    sum_df = pd.DataFrame([sum_all_columns], columns=temp_df.columns)\n\n    sum_dict = {}\n    for col_name in sum_df.columns:\n        sum_dict[col_name] = float(sum_df[col_name].values)\n\n    sum_df.to_csv(\n        os.path.join(config.path.eval_data_dir, f\"sum_{name_suffix}.csv\"), index=False\n    )\n\n    ap_scores_df = pd.DataFrame(eval_scores_df)\n    ap_scores_df.to_csv(\n        os.path.join(\n            config.path.eval_data_dir, f\"{name_suffix}_ap_scores_at_k_test.csv\"\n        ),\n        index=False,\n    )\n    plot_apk_scores(ap_scores_df, run_id, mlflow_client)\n    plot_mapk_scores(ap_scores_df, run_id, mlflow_client)\n\n    map_scores_df = pd.DataFrame(mean_scores)\n    map_scores_df.to_csv(\n        os.path.join(config.path.eval_data_dir, f\"{name_suffix}_map_scores_test.csv\"),\n        index=False,\n    )\n    plot_map_scores(map_scores_df, run_id, mlflow_client)\n\n    common_data = query_data_load[0]\n    mlflow.log_param(\"question_count\", common_data.question_count)\n    mlflow.log_param(\"retrieve_num_of_documents\", common_data.retrieve_num_of_documents)\n    mlflow.log_param(\"cross_encoder_at_k\", common_data.cross_encoder_at_k)\n    mlflow.log_param(\"chunk_overlap\", 
index_config.chunking.overlap_size)\n    mlflow.log_param(\n        \"embedding_dimension\",\n        config.get_embedding_model(index_config.embedding_model.model_name).dimension,\n    )\n    mlflow.log_param(\"embedding_model_name\", index_config.embedding_model.model_name)\n    mlflow.log_param(\"ef_construction\", index_config.ef_construction)\n    mlflow.log_param(\"ef_search\", index_config.ef_search)\n    mlflow.log_param(\"run_metrics\", sum_dict)\n    mlflow.log_metrics(sum_dict)\n    mlflow.log_artifact(os.path.join(config.path.eval_data_dir, f\"{name_suffix}.csv\"))\n    mlflow.log_artifact(\n        os.path.join(config.path.eval_data_dir, f\"sum_{name_suffix}.csv\")\n    )\n    draw_hist_df(sum_df, run_id, mlflow_client)\n    generate_metrics(config.experiment_name, run_id, mlflow_client)\n    mlflow.end_run()\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/llm_based_metrics.py",
    "content": "from sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nfrom rag_experiment_accelerator.llm.prompt import (\n    llm_answer_relevance_instruction,\n    llm_context_recall_instruction,\n    llm_context_precision_instruction,\n)\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef lower_and_strip(text):\n    \"\"\"\n    Converts the input to lowercase without spaces or empty string if None.\n\n    Args:\n        text (str): The string to format.\n\n    Returns:\n        str: The formatted input string.\n    \"\"\"\n    if text is None:\n        return \"\"\n    else:\n        return text.lower().strip()\n\n\ndef llm_answer_relevance(\n    response_generator: ResponseGenerator, question, answer\n) -> float:\n    \"\"\"\n    Scores the relevancy of the answer according to the given question.\n    Answers with incomplete, redundant or unnecessary information is penalized.\n    Score can range from 0 to 1 with 1 being the best.\n\n    Args:\n        question (str): The question being asked.\n        answer (str): The generated answer.\n\n    Returns:\n        double: The relevancy score generated between the question and answer.\n\n    \"\"\"\n    result = response_generator.generate_response(\n        llm_answer_relevance_instruction, text=answer\n    )\n    if result is None:\n        logger.warning(\"Unable to generate answer relevance score\")\n        return 0.0\n\n    model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")\n\n    embedding1 = model.encode([str(question)])\n    embedding2 = model.encode([str(result)])\n    similarity_score = cosine_similarity(embedding1, embedding2)\n\n    return float(similarity_score[0][0] * 100)\n\n\ndef llm_context_precision(\n    response_generator: ResponseGenerator, question, retrieved_contexts\n) -> 
float:\n    \"\"\"\n    Computes precision by assessing whether each retrieved context is useful for answering a question.\n    Only considers the presence of relevant chunks in the retrieved contexts, but doesn't take into\n    account their ranking order.\n\n    Args:\n        question (str): The question being asked.\n        retrieved_contexts (list[str]): The list of retrieved contexts for the query.\n\n    Returns:\n        double: proportion of relevant chunks retrieved for the question\n    \"\"\"\n    relevancy_scores = []\n\n    for context in retrieved_contexts:\n        result: str | None = response_generator.generate_response(\n            llm_context_precision_instruction,\n            context=context,\n            question=question,\n        )\n        llm_judge_response = lower_and_strip(result)\n        # Since we're only asking for one response, the result is always a boolean 1 or 0\n        if llm_judge_response == \"yes\":\n            relevancy_scores.append(1)\n        elif llm_judge_response == \"no\":\n            relevancy_scores.append(0)\n        else:\n            logger.warning(\"Unable to generate context precision score\")\n\n    logger.debug(relevancy_scores)\n\n    if not relevancy_scores:\n        logger.warning(\"Unable to compute average context precision\")\n        return -1\n    else:\n        return (sum(relevancy_scores) / len(relevancy_scores)) * 100\n\n\ndef llm_context_recall(\n    response_generator: ResponseGenerator,\n    question,\n    groundtruth_answer,\n    retrieved_contexts,\n):\n    \"\"\"\n    Estimates context recall by estimating TP and FN using annotated answer (ground truth) and retrieved context.\n    Context_recall values range between 0 and 1, with higher values indicating better performance.\n    To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine\n    whether it can be attributed to the retrieved context or not. 
In an ideal scenario, all sentences in the ground truth answer\n    should be attributable to the retrieved context. The formula for calculating context recall is as follows:\n    context_recall = GT sentences that can be attributed to context / nr sentences in GT\n\n    Code adapted from https://github.com/explodinggradients/ragas\n    Copyright [2023] [Exploding Gradients]\n    under the Apache License (see evaluation folder)\n\n    Args:\n        question (str): The question being asked\n        groundtruth_answer (str): The ground truth (\"output_prompt\")\n        retrieved_contexts (list[str]): The list of retrieved contexts for the query\n\n    Returns:\n        double: The context recall score generated between the ground truth (expected) and context.\n    \"\"\"\n    context = \"\\n\".join(retrieved_contexts)\n\n    result: list | None = response_generator.generate_response(\n        llm_context_recall_instruction,\n        context=context,\n        question=question,\n        answer=groundtruth_answer,\n    )\n\n    good_responses = 0\n\n    for response in result:\n        try:\n            score = response.get(\"attributed\", 0)\n            good_responses += int(score)\n        except ValueError:\n            logger.warning(f\"Unable to parse {score} as int.\")\n    if not result:\n        return -1\n    else:\n        return (good_responses / len(result)) * 100\n\n\ndef compute_llm_based_score(\n    metric_type,\n    question,\n    actual,\n    expected,\n    response_generator: ResponseGenerator,\n    retrieved_contexts,\n):\n    match metric_type:\n        case \"llm_answer_relevance\":\n            score = llm_answer_relevance(response_generator, question, actual)\n        case \"llm_context_precision\":\n            score = llm_context_precision(\n                response_generator, question, retrieved_contexts\n            )\n        case \"llm_context_recall\":\n            score = llm_context_recall(\n                response_generator, 
question, expected, retrieved_contexts\n            )\n        case _:\n            raise KeyError(f\"Invalid metric type: {metric_type}\")\n\n    return score\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/plain_metrics.py",
    "content": "import evaluate\nfrom rapidfuzz import fuzz\nfrom rapidfuzz import distance\nfrom textdistance import algorithms\nfrom rouge_score import rouge_scorer\n\n\n# https://huggingface.co/spaces/evaluate-metric/bleu\ndef bleu(predictions: list[str], references: list[str]) -> float:\n    \"\"\"\n    Computes the BLEU score between a list of candidate translations and a list of reference translations.\n\n    Args:\n        predictions (list): A list of candidate translations.\n        references (list): A list of reference translations.\n\n    Returns:\n        float: The BLEU score between the candidate and reference translations.\n    \"\"\"\n    bleu = evaluate.load(\"bleu\")\n\n    # Match length of predictions and references and check they are both lists of strings\n    if len(predictions) != len(references) or not all(isinstance(x, str) for x in predictions) or not all(\n        isinstance(x, list) for x in references\n    ):\n        raise ValueError(\"Predictions and references must be lists of strings with the same length.\")\n\n    results = bleu.compute(predictions=predictions, references=references, max_order=2)\n    # multiplying by 100 to maintain consistency with previous implementation\n    return results[\"bleu\"] * 100\n\n\ndef fuzzy_score(str1: str, str2: str, match_type: str = \"token_set_ratio\") -> float:\n    \"\"\"\n    Compares two strings using fuzzy string matching and returns a similarity score.\n\n    Args:\n        str1 (str): The first string to compare.\n        str2 (str): The second string to compare.\n        match_type (str): The type of fuzzy string matching to use. 
Options include:\n            - 'ratio'\n            - 'token_set_ratio'\n            - 'token_sort_ratio'\n            - 'partial_ratio'\n            - 'partial_token_sort_ratio'\n            - 'partial_token_set_ratio'\n            - 'WRatio'\n            - 'QRatio'\n\n    Returns:\n        A similarity score.\n\n    Raises:\n        ValueError: If the match type is not recognized.\n    \"\"\"\n    # validate match_type to be one of the supported fuzzy matching functions\n    supported_match_types = {\"ratio\",\n                             \"token_set_ratio\",\n                             \"token_sort_ratio\",\n                             \"partial_ratio\",\n                             \"partial_token_sort_ratio\",\n                             \"partial_token_set_ratio\",\n                             \"WRatio\",\n                             \"QRatio\"}\n    if match_type not in supported_match_types:\n        raise ValueError(f\"Match type '{match_type}' is not recognized.\")\n\n    # get the fuzzy matching function based on the match_type\n    fuzzy_match_fn = getattr(fuzz, match_type)\n    similarity_score = fuzzy_match_fn(str1, str2)\n    return similarity_score\n\n\ndef rouge_score(ground_truth: str, prediction: str, rouge_metric_name: str) -> float:\n    \"\"\"\n    Calculates the ROUGE scores (rouge1, rouge2, rougeL) between two strings - ground truth and prediction.\n\n    Args:\n        ground_truth: reference string to compare\n        prediction: string that is an output of a model, a system or a generating process\n        rouge_metric_name: list of rouge metrics to use for evaluation. 
Options include:\n            - 'rouge1_precision'\n            - 'rouge1_recall'\n            - 'rouge1_fmeasure'\n            - 'rouge2_precision'\n            - 'rouge2_recall'\n            - 'rouge2_fmeasure'\n            - 'rougeL_precision'\n            - 'rougeL_recall'\n            - 'rougeL_fmeasure'\n    Returns:\n        score: ROUGE score.\n    \"\"\"\n    # validate rouge_types to be one of the supported rouge metrics\n    supported_rouge_types = {\"rouge1\", \"rouge2\", \"rougeL\"}\n    rouge_type, metric_type = rouge_metric_name.split(\"_\")\n    if rouge_type not in supported_rouge_types:\n        raise ValueError(f\"Rouge type '{rouge_type}' is not recognized. \"\n                         \"Supported types are {supported_rouge_types}.\")\n\n    if metric_type not in {\"precision\", \"recall\", \"fmeasure\"}:\n        raise ValueError(f\"Rouge metric type '{rouge_type}' is not recognized. \"\n                         \"Supported metric types are {'precision', 'recall', 'fmeasure'}.\")\n\n    scorer = rouge_scorer.RougeScorer(rouge_types=[rouge_type], use_stemmer=True)\n    scores = scorer.score(target=ground_truth, prediction=prediction)\n    return getattr(scores[rouge_type], metric_type) * 100\n\n\ndef levenshtein(str1: str, str2: str) -> int:\n    \"\"\"\n    Calculates the Levenshtein distance between two strings and returns the normalized similarity score as a percentage.\n\n    Args:\n        str1 (str): The first string to compare.\n        str2 (str): The second string to compare.\n\n    Returns:\n        int: The normalized similarity score as a percentage.\n    \"\"\"\n    score = distance.Levenshtein.normalized_similarity(str1, str2) * 100\n    return score\n\n\ndef jaccard(str1: str, str2: str) -> int:\n    \"\"\"\n    Calculates the Jaccard similarity score between two sets of values.\n\n    Args:\n        str1 (set): The first set of values.\n        str2 (set): The second set of values.\n\n    Returns:\n        int: The Jaccard 
similarity score between the two sets of values, as a percentage.\n    \"\"\"\n    score = int(algorithms.jaccard.normalized_similarity(str1, str2) * 100)\n    return score\n\n\ndef hamming(str1: str, str2: str) -> int:\n    \"\"\"\n    Calculates the Hamming similarity score between two strings.\n\n    Args:\n        str1 (str): The first string to compare.\n        str2 (str): The second string to compare.\n\n    Returns:\n        int: The Hamming similarity score between the two values, as a percentage.\n    \"\"\"\n    score = int(distance.Hamming.normalized_similarity(str1, str2, pad=True) * 100)\n    return score\n\n\ndef jaro_winkler(str1: str, str2: str) -> int:\n    \"\"\"\n    Calculates the Jaro-Winkler similarity score between two strings.\n\n    Args:\n        str1 (str): The first string to compare.\n        str2 (str): The second string to compare.\n\n    Returns:\n        int: The Jaro-Winkler similarity score between the two strings, as an integer between 0 and 100.\n    \"\"\"\n    score = int(distance.JaroWinkler.normalized_similarity(str1, str2) * 100)\n    return score\n\n\ndef cosine_ochiai(str1: str, str2: str) -> float:\n    \"\"\"\n    Calculates the cosine similarity (Ochiai coefficient) between two strings\n    using token-frequency vectors\n    https://en.wikipedia.org/wiki/Cosine_similarity.\n    Args:\n        str1 (str): The first string.\n        str2 (str): The second string.\n    Returns:\n        float: The cosine similarity score between the two strings, as a percentage.\n    \"\"\"\n    score = int(algorithms.cosine.normalized_similarity(str1, str2) * 100)\n    return score\n\n\ndef lcsseq(str1: str, str2: str) -> int:\n    \"\"\"\n    Computes the longest common subsequence (LCS) similarity score between two input strings.\n\n    Args:\n        str1 (str): The first input string.\n        str2 (str): The second input string.\n\n    Returns:\n        int: The LCS similarity score between the two input strings, as a percentage 
(0-100).\n    \"\"\"\n    score = int(distance.LCSseq.normalized_similarity(str1, str2) * 100)\n    return score\n\n\ndef lcsstr(str1: str, str2: str) -> int:\n    \"\"\"\n    Calculates the longest common substring (LCS) similarity score between two strings.\n\n    Args:\n        str1 (str): The first string to compare.\n        str2 (str): The second string to compare.\n\n    Returns:\n        int: The LCS similarity score between the two strings, as a percentage (0-100).\n    \"\"\"\n    score = int(algorithms.lcsstr.normalized_similarity(str1, str2) * 100)\n    return score\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/plot_metrics.py",
    "content": "import ast\n\nimport plotly.express as px\nimport plotly.graph_objects as go\nimport plotly.subplots as sp\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef generate_metrics(experiment_name, run_id, mlflow_client):\n    \"\"\"\n    Generates metrics for a given experiment and run ID.\n\n    Args:\n        experiment_name (str): The name of the experiment.\n        run_id (int): The ID of the run.\n        mlflow_client (mlflow.MlflowClient): The MLflow client to use for logging the metrics.\n\n    Returns:\n        None\n    \"\"\"\n    experiment = dict(mlflow_client.get_experiment_by_name(experiment_name))\n    runs_list = mlflow_client.search_runs([experiment[\"experiment_id\"]])\n\n    models_metrics = {}\n    metrics_to_plot = []\n    runs_id_to_plot = []\n\n    if len(runs_list) > 0:\n        for run in runs_list:\n            run_dict = run.to_dictionary()\n            single_run_id = run_dict[\"info\"][\"run_id\"]\n            runs_id_to_plot.append(single_run_id)\n            if run.data.params.get(\"run_metrics\", {}) != {}:\n                metrics = ast.literal_eval(run.data.params[\"run_metrics\"])\n\n                for metric_type, metric_value in metrics.items():\n                    if models_metrics.get(metric_type, {}) == {}:\n                        metrics_to_plot.append(metric_type)\n                        models_metrics[metric_type] = {}\n\n                    models_metrics[metric_type][single_run_id] = metric_value\n                logger.debug(models_metrics)\n    else:\n        current_run = mlflow_client.get_run(run_id)\n        if current_run.data.params.get(\"run_metrics\", {}) != {}:\n            metrics = ast.literal_eval(current_run.data.params[\"run_metrics\"])\n            for metric_type, metric_value in metrics.items():\n                if models_metrics.get(metric_type, {}) == {}:\n                    metrics_to_plot.append(metric_type)\n                  
  models_metrics[metric_type] = {}\n\n                models_metrics[metric_type][run_id] = metric_value\n\n    x_axis = []\n    y_axis = []\n\n    fig = go.Figure()\n\n    for metric in metrics_to_plot:\n        for key, value in models_metrics[metric].items():\n            x_axis.append(key)\n            y_axis.append(value)\n\n        label = key\n        px.line(x_axis, y_axis)\n        fig.add_trace(go.Scatter(x=x_axis, y=y_axis, mode=\"lines+markers\", name=label))\n\n        fig.update_layout(\n            xaxis_title=\"run name\", yaxis_title=metric, font=dict(size=15)\n        )\n\n        plot_name = metric + \".html\"\n        mlflow_client.log_figure(run_id, fig, plot_name)\n\n        fig.data = []\n        fig.layout = {}\n        x_axis = []\n        y_axis = []\n\n\ndef draw_hist_df(df, run_id, mlflow_client):\n    \"\"\"\n    Draw a histogram of the given dataframe and log it to the specified run ID.\n\n    Args:\n        df (pandas.DataFrame): The dataframe to draw the histogram from.\n        run_id (str): The ID of the run to log the histogram to.\n        mlflow_client (mlflow.MlflowClient): The MLflow client to use for logging the histogram.\n\n    Returns:\n        None\n    \"\"\"\n    fig = px.bar(\n        x=df.columns,\n        y=df.values.tolist(),\n        title=\"metric comparison\",\n        color=df.columns,\n        labels=dict(x=\"Metric Type\", y=\"Score\", color=\"Metric Type\"),\n    )\n    plot_name = \"all_metrics_current_run.html\"\n    mlflow_client.log_figure(run_id, fig, plot_name)\n\n\ndef plot_apk_scores(df, run_id, mlflow_client):\n    fig = px.line(df, x=\"k\", y=\"score\", title=\"AP@k scores\", color=\"search_type\")\n    plot_name = \"average_precision_at_k.html\"\n    mlflow_client.log_figure(run_id, fig, plot_name)\n\n\n# maybe pull these 2 above and below functions into a single one\ndef plot_mapk_scores(df, run_id, mlflow_client):\n    fig = px.line(df, x=\"k\", y=\"map_at_k\", title=\"MAP@k scores\", 
color=\"search_type\")\n    plot_name = \"mean_average_precision_at_k.html\"\n    mlflow_client.log_figure(run_id, fig, plot_name)\n\n\ndef plot_map_scores(df, run_id, mlflow_client):\n    fig = px.bar(df, x=\"search_type\", y=\"mean\", title=\"MAP scores\", color=\"search_type\")\n    plot_name = \"mean_average_precision_scores.html\"\n    mlflow_client.log_figure(run_id, fig, plot_name)\n\n\ndef draw_search_chart(temp_df, run_id, mlflow_client):\n    \"\"\"\n    Draws a comparison chart of search types across metric types.\n\n    Args:\n        temp_df (pandas.DataFrame): The dataframe containing the data to be plotted.\n        run_id (int): The ID of the current run.\n        mlflow_client (mlflow.MlflowClient): The MLflow client to use for logging the chart.\n\n    Returns:\n        None\n    \"\"\"\n    grouped = temp_df.groupby(\"search_type\")\n    summed_column = grouped.sum().reset_index()\n    fig = sp.make_subplots(rows=len(summed_column.search_type), cols=1)\n    for index, row_data in summed_column.iterrows():\n        search_type = row_data[0]\n        row_data = row_data[1:]\n        df = row_data.reset_index(name=\"metric_value\")\n        df = df.rename(columns={\"index\": \"metric_type\"})\n        fig.add_trace(\n            go.Bar(\n                x=df[\"metric_type\"],\n                y=df[\"metric_value\"],\n                name=search_type,\n                offsetgroup=index,\n            ),\n            row=1,\n            col=1,\n        )\n\n        fig.update_xaxes(title_text=\"Metric type\", row=index + 1, col=1)\n        fig.update_yaxes(title_text=\"score\", row=index + 1, col=1)\n    fig.update_layout(\n        font=dict(size=15),\n        title_text=\"Search type comparison across metric types\",\n        height=4000,\n        width=800,\n    )\n    plot_name = \"search_type_current_run.html\"\n    mlflow_client.log_figure(run_id, fig, plot_name)\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/search_eval.py",
    "content": "from sklearn import metrics\n\nfrom rag_experiment_accelerator.evaluation.spacy_evaluator import (\n    SpacyEvaluator,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef evaluate_search_result(\n    search_response: list, evaluation_content: str, evaluator: SpacyEvaluator\n):\n    content = []\n\n    # create list of all docs with their is_relevant result to calculate recall and precision\n    is_relevant_results = []\n    for doc in search_response:\n        is_relevant = evaluator.is_relevant(doc[\"content\"], evaluation_content)\n        is_relevant_results.append(is_relevant)\n\n    recall_scores = []\n    precision_scores = []\n    recall_predictions = [False for _ in range(len(search_response))]\n    precision_predictions = [True for _ in range(len(search_response))]\n    for i, doc in enumerate(search_response):\n        k = i + 1\n        logger.info(\"++++++++++++++++++++++++++++++++++\")\n        logger.info(f\"Content: {doc['content']}\")\n        logger.info(f\"Search Score: {doc['@search.score']}\")\n\n        precision_score = round(\n            metrics.precision_score(\n                is_relevant_results[:k], precision_predictions[:k]\n            ),\n            2,\n        )\n        precision_scores.append(precision_score)\n        logger.info(f\"Precision Score: {precision_score}@{k}\")\n\n        recall_predictions[i] = is_relevant_results[i]\n        recall_score = round(\n            metrics.recall_score(is_relevant_results, recall_predictions), 2\n        )\n        recall_scores.append(recall_score)\n        logger.info(f\"Recall Score: {recall_score}@{k}\")\n\n        # TODO: should we only append content when it is relevant?\n        content.append(doc[\"content\"])\n\n    eval_metrics = {\n        \"recall_scores\": recall_scores,\n        \"precision_scores\": precision_scores,\n    }\n\n    return content, eval_metrics\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/spacy_evaluator.py",
    "content": "from spacy import load\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass SpacyEvaluator:\n    \"\"\"\n    A class for evaluating the similarity between two documents using spaCy.\n\n    Args:\n        similarity_threshold (float): The minimum similarity score required for two documents to be considered relevant.\n        model (str): The name of the spaCy model to use for processing the documents.\n\n    Attributes:\n        nlp (spacy.Language): The spaCy language model used for processing the documents.\n        similarity_threshold (float): The minimum similarity score required for two documents to be considered relevant.\n\n    Methods:\n        similarity(doc1: str, doc2: str) -> float: Calculates the similarity score between two documents.\n        is_relevant(doc1: str, doc2: str) -> bool: Determines whether two documents are relevant based on their similarity score.\n    \"\"\"\n\n    def __init__(self, similarity_threshold=0.8, model=\"en_core_web_lg\") -> None:\n        try:\n            self.nlp = load(model)\n        except OSError:\n            logger.info(f\"Downloading spacy language model: {model}\")\n            from spacy.cli import download\n\n            download(model)\n            self.nlp = load(model)\n        self.similarity_threshold = similarity_threshold\n\n    def similarity(self, doc1: str, doc2: str):\n        nlp_doc1 = self.nlp(doc1)\n        nlp_doc2 = self.nlp(doc2)\n        return nlp_doc1.similarity(nlp_doc2)\n\n    def is_relevant(self, doc1: str, doc2: str):\n        similarity = self.similarity(doc1, doc2)\n        logger.info(f\"Similarity Score: {similarity}\")\n\n        return similarity > self.similarity_threshold\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/tests/test_llm_based_metrics.py",
    "content": "from unittest.mock import patch\n\nfrom rag_experiment_accelerator.evaluation.llm_based_metrics import (\n    llm_answer_relevance,\n    llm_context_precision,\n    llm_context_recall,\n)\n\n\n@patch(\"rag_experiment_accelerator.evaluation.eval.ResponseGenerator\")\n@patch(\"rag_experiment_accelerator.evaluation.llm_based_metrics.SentenceTransformer\")\ndef test_llm_answer_relevance(mock_st, mock_generate_response):\n    mock_generate_response.return_value.generate_response.return_value = (\n        \"What is the name of the largest bone in the human body?\"\n    )\n    mock_st().encode.side_effect = [[[0.1, 0.2, 0.3]], [[0.1, 0.2, 0.3]]]\n\n    question = \"What is the name of the largest bone in the human body?\"\n    answer = (\n        (\n            \"The largest bone in the human body is the femur, also known as\"\n            \" the thigh bone. It is about 19.4 inches (49.5 cm) long on\"\n            \" average and can support up to 30 times the weight of a person’s\"\n            \" body.\"\n        ),\n    )\n    score = llm_answer_relevance(mock_generate_response, question, answer)\n    assert round(score) == 100\n\n\n@patch(\"rag_experiment_accelerator.evaluation.eval.ResponseGenerator\")\ndef test_llm_context_precision(mock_generate_response):\n    question = \"What is the name of the largest bone in the human body?\"\n    retrieved_contexts = [\"Retrieved context 1\", \"Retrieved context 2\"]\n    mock_generate_response.generate_response.side_effect = [\"Yes\", \"No\", \"Yes\", \"No\"]\n\n    score = llm_context_precision(mock_generate_response, question, retrieved_contexts)\n\n    expected_relevancy_scores = [1, 0, 1, 0]\n    expected_precision = (\n        sum(expected_relevancy_scores) / len(expected_relevancy_scores)\n    ) * 100  # 50.0\n\n    assert score == expected_precision\n\n\n@patch(\"rag_experiment_accelerator.evaluation.eval.ResponseGenerator\")\ndef test_llm_context_recall(mock_generate_response):\n    
mock_generate_response.generate_response.return_value = [\n        {\n            \"statement_1\": \"Test statement 1\",\n            \"reason\": \"The statement is in the context\",\n            \"attributed\": \"1\",\n        },\n        {\n            \"statement_2\": \"Test statement 2\",\n            \"reason\": \"The statement is in the context\",\n            \"attributed\": \"1\",\n        },\n        {\n            \"statement_3\": \"Test statement 3\",\n            \"reason\": \"The statement is in the context\",\n            \"attributed\": \"0\",\n        },\n        {\n            \"statement_4\": \"Test statement 4\",\n            \"reason\": \"The statement is in the context\",\n            \"attributed\": \"1\",\n        },\n    ]\n    question = \"What is the name of the largest bone in the human body?\"\n    context = 'According to the Cleveland Clinic, \"The femur is the largest and strongest bone in the human body. It can support as much as 30 times the weight of your body. The average adult male femur is 48 cm (18.9 in) in length and 2.34 cm (0.92 in) in diameter. The average weight among adult males in the United States is 196 lbs (872 N). Therefore, the adult male femur can support roughly 6,000 lbs of compressive force.\"'\n    answer = \"The largest bone in the human body is the femur, also known as the thigh bone. It is about 19.4 inches (49.5 cm) long on average and can support up to 30 times the weight of a person’s body.\"\n\n    score = llm_context_recall(mock_generate_response, question, answer, context)\n    assert score == 75\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/tests/test_plain_metrics.py",
    "content": "from unittest.mock import patch\nfrom pytest import approx\n\nfrom rag_experiment_accelerator.evaluation.plain_metrics import (\n    bleu,\n    fuzzy_score,\n    levenshtein,\n    jaccard,\n    hamming,\n    jaro_winkler,\n    cosine_ochiai,\n    rouge_score,\n    lcsseq,\n    lcsstr,\n)\n\n\ndef test_fuzzy_score():\n    value1 = \"Room, 2 Double Beds (19th to 25th Floors)\"\n    value2 = \"Two Double Beds - Location Room (19th to 25th Floors)\"\n\n    assert fuzzy_score(str1=value1, str2=value2) == approx(89, rel=0.5)\n    assert fuzzy_score(str1=value1, str2=value2, match_type=\"partial_token_set_ratio\") == approx(100, 0.5)\n\n\ndef test_levenshtein():\n    value1 = \"party\"\n    value2 = \"park\"\n\n    assert levenshtein(value1, value2) == 60\n\n\ndef test_jaccard():\n    value1 = [\"cat\", \"dog\", \"hippo\", \"monkey\"]\n    value2 = [\"monkey\", \"rhino\", \"ostrich\", \"salmon\"]\n\n    assert jaccard(value1, value2) == 14\n\n\ndef test_hamming():\n    value1 = \"1011101\"\n    value2 = \"1011011\"\n\n    assert hamming(value1, value2) == 71\n\n\ndef test_jaro_winkler():\n    value1 = \"crate\"\n    value2 = \"trace\"\n\n    assert jaro_winkler(value1, value2) == 73\n\n\ndef test_cosine_ochiai():\n    str1 = \"The fox jumped over the high fence\"\n    str2 = \"The quick brown fox jumped over the fence\"\n\n    assert cosine_ochiai(str1, str2) == 83\n\n\ndef test_rouge_score():\n    str1 = \"The fox jumped over the high fence\"\n    str2 = \"The quick brown fox jumped over the fence\"\n\n    metrics_to_test = {\n        \"rouge1_precision\": 75,\n        \"rouge1_recall\": 86,\n        \"rouge1_fmeasure\": 80,\n        \"rouge2_precision\": 43,\n        \"rouge2_recall\": 50,\n        \"rouge2_fmeasure\": 46,\n        \"rougeL_precision\": 75,\n        \"rougeL_recall\": 86,\n        \"rougeL_fmeasure\": 80,\n    }\n\n    for rouge_metric_name, expected_value in metrics_to_test.items():\n        assert round(rouge_score(str1, str2, 
rouge_metric_name)) == expected_value\n\n\ndef test_lcsseq():\n    value1 = \"The fox jumped over the high fence\"\n    value2 = \"The quick brown fox jumped over the fence.\"\n\n    assert lcsseq(value1, value2) == 69\n\n\ndef test_lcsstr():\n    value1 = \"The fox jumped over the high fence\"\n    value2 = \"The quick brown fox jumped over the fence.\"\n\n    assert lcsstr(value1, value2) == 50\n\n\n@patch(\"rag_experiment_accelerator.evaluation.plain_metrics.evaluate.load\")\ndef test_bleu(mock_evaluate_load):\n    mock_evaluate_load.return_value.compute.return_value = {\"bleu\": 0.5}\n    predictions = [\n        \"Transformers Transformers are fast plus efficient\",\n        \"Good Morning\",\n        \"I am waiting for new Transformers\",\n    ]\n    references = [\n        [\n            \"HuggingFace Transformers are quick, efficient and awesome\",\n            \"Transformers are awesome because they are fast to execute\",\n        ],\n        [\"Good Morning Transformers\", \"Morning Transformers\"],\n        [\n            \"People are eagerly waiting for new Transformer models\",\n            \"People are very excited about new Transformers\",\n        ],\n    ]\n    score = bleu(predictions, references)\n    assert round(score) == 50\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/tests/test_search_eval.py",
    "content": "from unittest.mock import patch\n\nfrom rag_experiment_accelerator.evaluation.search_eval import (\n    evaluate_search_result,\n)\n\nevaluation_content = \"my content to evaluate\"\nsearch_response = [\n    {\n        \"@search.score\": 0.03755760192871094,\n        \"content\": \"this is the first chunk\",\n    },\n    {\n        \"@search.score\": 0.029906954616308212,\n        \"content\": \"this is the second chunk\",\n    },\n    {\n        \"@search.score\": 0.028612013906240463,\n        \"content\": \"this is the third chunk\",\n    },\n]\n\n\ndef test_evaluate_search_result_calulates_precision_score():\n    with patch(\n        \"rag_experiment_accelerator.evaluation.spacy_evaluator.SpacyEvaluator\"\n    ) as evaluator:\n        evaluator.is_relevant.side_effect = [True, False, True]\n\n        _, evaluation = evaluate_search_result(\n            search_response, evaluation_content, evaluator\n        )\n\n        expected_precision = [1.0, 0.5, 0.67]\n        for i, precision in enumerate(evaluation.get(\"precision_scores\")):\n            assert precision == expected_precision[i]\n\n\ndef test_evaluate_search_result_calulates_recall_score():\n    with patch(\n        \"rag_experiment_accelerator.evaluation.spacy_evaluator.SpacyEvaluator\"\n    ) as evaluator:\n        evaluator.is_relevant.side_effect = [True, False, True]\n\n        _, evaluation = evaluate_search_result(\n            search_response, evaluation_content, evaluator\n        )\n\n        expected_recall = [0.5, 0.5, 1.0]\n        for i, recall in enumerate(evaluation.get(\"recall_scores\")):\n            assert recall == expected_recall[i]\n\n\ndef test_evaluate_search_result_returns_all_search_content():\n    with patch(\n        \"rag_experiment_accelerator.evaluation.spacy_evaluator.SpacyEvaluator\"\n    ) as evaluator:\n        evaluator.is_relevant.side_effect = [True, False, True]\n\n        content, _ = evaluate_search_result(\n            search_response, 
evaluation_content, evaluator\n        )\n\n        for i, doc in enumerate(search_response):\n            assert doc[\"content\"] == content[i]\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/tests/test_spacy_evaluator.py",
    "content": "from unittest.mock import MagicMock, call, patch\nfrom rag_experiment_accelerator.evaluation.spacy_evaluator import (\n    SpacyEvaluator,\n)\n\n\n@patch(\"rag_experiment_accelerator.evaluation.spacy_evaluator.load\")\ndef test_evaluator_init(mock_nlp):\n    similarity_threshold = 0.4\n    evaluator = SpacyEvaluator(similarity_threshold=similarity_threshold)\n    assert similarity_threshold == evaluator.similarity_threshold\n\n\n@patch(\"rag_experiment_accelerator.evaluation.spacy_evaluator.load\")\ndef test_similarity_returns_similar(mock_nlp):\n    mock_doc_1 = MagicMock()\n    mock_doc_1.similarity.return_value = 1\n    mock_doc_2 = MagicMock()\n    mock_nlp().side_effect = [mock_doc_1, mock_doc_2]\n\n    evaluator = SpacyEvaluator()\n    actual = evaluator.similarity(\"test word\", \"test word\")\n\n    mock_doc_1.similarity.assert_called_once_with(mock_doc_2)\n    assert actual == 1\n\n\n@patch(\n    \"rag_experiment_accelerator.evaluation.spacy_evaluator.SpacyEvaluator.similarity\"\n)\n@patch(\"rag_experiment_accelerator.evaluation.spacy_evaluator.load\")\ndef test_is_relevant_returns_valid(mock_nlp, mock_similarity):\n    mock_similarity.side_effect = [1, 0.05]\n\n    evaluator = SpacyEvaluator()\n    actual_true = evaluator.is_relevant(\"test phrase\", \"test phrase\")\n    actual_false = evaluator.is_relevant(\"phrase\", \"different\")\n\n    mock_similarity.assert_has_calls(\n        [call(\"test phrase\", \"test phrase\"), call(\"phrase\", \"different\")]\n    )\n    assert actual_true is True\n    assert actual_false is False\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/tests/test_transformer_based_metrics.py",
    "content": "from unittest.mock import MagicMock\n\nimport numpy as np\n\n\nfrom rag_experiment_accelerator.evaluation.transformer_based_metrics import (\n    compare_semantic_document_values,\n)\n\n\ndef test_compare_semantic_document_values():\n    mock_sentence_transformer = MagicMock()\n    embeddings1 = np.array([[0.1, 0.2, 0.3, 0.4, 0.7]])\n    embeddings2 = np.array([[0.1, 0.3, 0.4, 0.5, 0.6]])\n\n    mock_sentence_transformer.encode.side_effect = [embeddings1, embeddings2]\n\n    value1 = \"value1\"\n    value2 = \"value2\"\n\n    assert (\n        compare_semantic_document_values(value1, value2, mock_sentence_transformer)\n        == 97\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/evaluation/transformer_based_metrics.py",
    "content": "from sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n\n# todo: can we remove this hardcoding and name the model in the config file?\nmetric_type_model_mapping = {\n    \"bert_all_MiniLM_L6_v2\": \"all-MiniLM-L6-v2\",\n    \"bert_base_nli_mean_tokens\": \"bert-base-nli-mean-tokens\",\n    \"bert_large_nli_mean_tokens\": \"bert-large-nli-mean-tokens\",\n    \"bert_large_nli_stsb_mean_tokens\": \"bert-large-nli-stsb-mean-tokens\",\n    \"bert_distilbert_base_nli_stsb_mean_tokens\": \"distilbert-base-nli-stsb-mean-tokens\",\n    \"bert_paraphrase_multilingual_MiniLM_L12_v2\": \"paraphrase-multilingual-MiniLM-L12-v2\",\n}\n\n\ndef compare_semantic_document_values(doc1, doc2, model_type):\n    \"\"\"\n    Compares the semantic values of two documents and returns the percentage of differences.\n\n    Args:\n        doc1 (str): The first document to compare.\n        doc2 (str): The second document to compare.\n        model_type (SentenceTransformer): The SentenceTransformer model to use for comparison.\n\n    Returns:\n        int: The percentage of differences between the two documents.\n    \"\"\"\n    differences = semantic_compare_values(doc1, doc2, model_type)\n\n    return int(sum(differences) / len(differences))\n\n\ndef semantic_compare_values(\n    value1: str,\n    value2: str,\n    model_type: SentenceTransformer,\n) -> list[float]:\n    \"\"\"\n    Computes the semantic similarity between two values using a pre-trained SentenceTransformer model.\n\n    Args:\n        value1 (str): The first value to compare.\n        value2 (str): The second value to compare.\n        model_type (SentenceTransformer): The pre-trained SentenceTransformer model to use for encoding the values.\n\n    Returns:\n        A list of the similarity scores.\n    \"\"\"\n    embedding1 = model_type.encode([str(value1)])\n    embedding2 = model_type.encode([str(value2)])\n    similarity_score = 
cosine_similarity(embedding1, embedding2)\n\n    return [similarity_score * 100]\n\n\ndef compute_transformer_based_score(\n    actual,\n    expected,\n    metric_type,\n):\n    if metric_type not in metric_type_model_mapping:\n        raise KeyError(f\"Invalid metric type: {metric_type}\")\n\n    transformer = SentenceTransformer(\n        f\"sentence-transformers/{metric_type_model_mapping[metric_type]}\"\n    )\n    return compare_semantic_document_values(actual, expected, transformer)\n"
  },
  {
    "path": "rag_experiment_accelerator/ingest_data/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/ingest_data/acs_ingest.py",
    "content": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom contextlib import ExitStack\nimport hashlib\n\nimport pandas as pd\nfrom azure.core.credentials import AzureKeyCredential\nfrom azure.search.documents import SearchClient\nfrom rag_experiment_accelerator.checkpoint import cache_with_checkpoint\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.llm.prompt import (\n    do_need_multiple_prompt_instruction,\n    multiple_prompt_instruction,\n    qna_generation_prompt,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.utils.timetook import TimeTook\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\ndef my_hash(s):\n    \"\"\"\n    Returns the MD5 hash of the input string.\n\n    Args:\n        s (str): The string to be hashed.\n\n    Returns:\n        str: The MD5 hash of the input string.\n    \"\"\"\n    return hashlib.md5(s.encode()).hexdigest()\n\n\ndef upload_data(\n    environment: Environment,\n    config: Config,\n    chunks: list,\n    index_name: str,\n):\n    \"\"\"\n    Uploads data to an Azure AI Search index.\n\n    This function uploads chunks of data to a specified index in Azure AI Search.\n    It uses the provided service endpoint, index name, and search key to connect to the service.\n    The function also converts the chunks into index documents before uploading them.\n    The upload process is done in parallel using a ThreadPoolExecutor.\n\n    Args:\n        environment (Environment): The environment configuration.\n        config (Config): The configuration object.\n        chunks (list): A list of dictionaries, each containing a chunk of content to be uploaded.\n        index_name (str): The name of the index to upload the data to.\n\n    Returns:\n        None\n    \"\"\"\n    
credential = AzureKeyCredential(environment.azure_search_admin_key)\n    search_client = SearchClient(\n        endpoint=environment.azure_search_service_endpoint,\n        index_name=index_name,\n        credential=credential,\n    )\n\n    logger.info(f\"Preparing data for upload, {len(chunks)} documents to upload\")\n    documents = chunks_to_index_documents(chunks)\n\n    with ExitStack() as stack:\n        with TimeTook(\"uploading data to Azure AI Search\", logger=logger):\n            executor = stack.enter_context(\n                ThreadPoolExecutor(config.max_worker_threads)\n            )\n\n            futures = {\n                executor.submit(search_client.upload_documents, [document]): document\n                for document in documents\n            }\n\n            for future in as_completed(futures):\n                document = futures[future]\n                try:\n                    future.result()\n                except Exception as ex:\n                    logger.error(f\"Failed to upload document {document}, error: {ex}\")\n\n    logger.info(\n        f\"Uploaded {len(documents)} documents out of {len(chunks)} documents to Azure Search Index\"\n    )\n\n\ndef generate_qna(environment, config, docs, azure_oai_deployment_name):\n    \"\"\"\n    Generates a set of questions and answers from a list of documents using a language model.\n\n    Args:\n        docs (list): A list of documents to generate questions and answers from.\n        azure_oai_deployment_name (str): The name of the Azure Opan AI deployment\n\n    Returns:\n        pandas.DataFrame: A DataFrame containing the generated questions, answers, and context for each document.\n    \"\"\"\n    column_names = [\"user_prompt\", \"output_prompt\", \"context\"]\n\n    new_df = pd.DataFrame(columns=column_names)\n    response_generator = ResponseGenerator(\n        environment, config, azure_oai_deployment_name\n    )\n\n    for doc in docs:\n        chunk = list(doc.values())[0]\n       
 if len(chunk[\"content\"]) > 50:\n            response = response_generator.generate_response(\n                qna_generation_prompt,\n                context=chunk[\"content\"],\n            )\n            if response is None:\n                continue\n\n            data = {\n                \"user_prompt\": response[\"question\"],\n                \"output_prompt\": response[\"answer\"],\n                \"context\": chunk[\"content\"],\n            }\n            new_df = new_df._append(data, ignore_index=True)\n        else:\n            logger.info(\n                f\"Skipping chunk with less than 50 characters: {chunk['filename']}\"\n            )\n\n    if new_df.empty:\n        logger.error(\"No questions generated\")\n        raise ValueError(\"No questions generated\")\n\n    return new_df\n\n\n@cache_with_checkpoint(id=\"chunk['content']\")\ndef generate_qna_for_chunk(chunk, response_generator):\n    qna = []\n\n    response = response_generator.generate_response(\n        qna_generation_prompt,\n        context=chunk[\"content\"],\n    )\n\n    for item in response:\n        data = {\n            \"user_prompt\": item[\"question\"],\n            \"output_prompt\": item[\"answer\"],\n            \"context\": chunk[\"content\"],\n        }\n        qna.append(data)\n\n    return qna\n\n\ndef generate_multiple_questions(question, response_generator: ResponseGenerator):\n    \"\"\"\n    Generates a response to a given question using a language model with multiple prompts.\n\n    Args:\n        question (str): The question to generate a response for.\n        response_generator (ResponseGenerator): Initialized ResponseGenerator to use\n\n    Returns:\n        str: The list of sub-questions.\n    \"\"\"\n    response = response_generator.generate_response(\n        multiple_prompt_instruction,\n        text=question,\n    )\n    return response\n\n\ndef do_we_need_multiple_questions(\n    question, response_generator: ResponseGenerator, config: 
Config\n):\n    \"\"\"\n    Determines if we need to ask multiple questions based on the response generated by the model.\n\n    Args:\n        question (str): The question to ask.\n        response_generator (ResponseGenerator): Initialized ResponseGenerator to use\n\n    Returns:\n        bool: True if we need to ask multiple questions, False otherwise.\n    \"\"\"\n    response: str | None = response_generator.generate_response(\n        do_need_multiple_prompt_instruction,\n        text=question,\n    )\n\n    result = response is not None and response.lower().strip() == \"complex\"\n    return result\n\n\ndef chunks_to_index_documents(chunks):\n    \"\"\"\n    Converts chunks of content into index documents for Azure AI Search.\n\n    This function takes a list of chunks, where each chunk is a dictionary containing various pieces of content.\n    It then converts each chunk into a dictionary that's suitable for use as an index document in Azure AI Search.\n    The resulting list of index documents is then returned.\n\n    Args:\n        chunks (list): A list of dictionaries, each containing a chunk of\n        content to be converted.\n\n    Returns:\n        list: A list of dictionaries, each representing an index document.\n    \"\"\"\n    return [\n        {\n            \"id\": str(my_hash(chunk[\"content\"])),\n            \"title\": chunk[\"title\"] if \"title\" in chunk else \"\",\n            \"summary\": chunk[\"summary\"] if \"summary\" in chunk else \"\",\n            \"content\": str(chunk[\"content\"]),\n            \"filename\": chunk[\"filename\"],\n            \"sourceDisplayName\": chunk[\"source_display_name\"],\n            \"contentVector\": (\n                chunk[\"content_vector\"] if \"content_vector\" in chunk else []\n            ),\n            \"summaryVector\": (\n                chunk[\"summary_vector\"] if \"summary_vector\" in chunk else []\n            ),\n            \"titleVector\": chunk[\"title_vector\"] if 
\"title_vector\" in chunk else [],\n        }\n        for chunk in chunks\n    ]\n"
  },
  {
    "path": "rag_experiment_accelerator/ingest_data/tests/test_acs_ingest.py",
    "content": "from unittest.mock import patch, Mock, ANY\n\nfrom rag_experiment_accelerator.ingest_data.acs_ingest import (\n    my_hash,\n    upload_data,\n    generate_multiple_questions,\n    do_we_need_multiple_questions,\n)\n\nfrom rag_experiment_accelerator.llm.prompt import (\n    multiple_prompt_instruction,\n    prompt_instruction_title,\n    prompt_instruction_summary,\n)\n\nfrom rag_experiment_accelerator.run.index import generate_summary, generate_title\n\n\ndef test_my_hash_with_string():\n    # Arrange\n    test_string = \"Hello, World!\"\n    expected_hash = (\n        \"65a8e27d8879283831b664bd8b7f0ad4\"  # Precomputed MD5 hash of \"Hello, World!\"\n    )\n\n    # Act\n    result = my_hash(test_string)\n\n    # Assert\n    assert result == expected_hash\n\n\ndef test_my_hash_with_empty_string():\n    # Arrange\n    test_string = \"\"\n    expected_hash = (\n        \"d41d8cd98f00b204e9800998ecf8427e\"  # Precomputed MD5 hash of an empty string\n    )\n\n    # Act\n    result = my_hash(test_string)\n\n    # Assert\n    assert result == expected_hash\n\n\ndef test_my_hash_with_numbers():\n    # Arrange\n    test_string = \"1234567890\"\n    expected_hash = (\n        \"e807f1fcf82d132f9bb018ca6738a19f\"  # Precomputed MD5 hash of \"1234567890\"\n    )\n\n    # Act\n    result = my_hash(test_string)\n\n    # Assert\n    assert result == expected_hash\n\n\n@patch(\"rag_experiment_accelerator.run.index.ResponseGenerator\")\ndef test_generate_title(mock_response_generator):\n    # Arrange\n    mock_response = \"Test Title\"\n    mock_chunk = \"This is a test chunk of text.\"\n    mock_deployment_name = \"TestDeployment\"\n    mock_response_generator().generate_response.return_value = mock_response\n    mock_config = Mock()\n    mock_environment = Mock()\n\n    # Act\n    result = generate_title(\n        mock_chunk, mock_deployment_name, mock_environment, mock_config\n    )\n\n    # Assert\n    
mock_response_generator().generate_response.assert_called_once_with(\n        prompt_instruction_title, text=mock_chunk\n    )\n    assert result == mock_response\n\n\n@patch(\"rag_experiment_accelerator.run.index.ResponseGenerator\")\ndef test_generate_summary(mock_response_generator):\n    # Arrange\n    mock_summary = \"Test Summary\"\n    mock_chunk = \"This is a test chunk of text.\"\n    mock_deployment_name = \"TestDeployment\"\n    mock_response_generator().generate_response.return_value = mock_summary\n    mock_config = Mock()\n    mock_environment = Mock()\n\n    # Act\n    result = generate_summary(\n        mock_chunk, mock_deployment_name, mock_environment, mock_config\n    )\n\n    # Assert\n    mock_response_generator().generate_response.assert_called_once_with(\n        prompt_instruction_summary, text=mock_chunk\n    )\n    assert result == mock_summary\n\n\n@patch(\"rag_experiment_accelerator.ingest_data.acs_ingest.SearchClient\")\n@patch(\"rag_experiment_accelerator.ingest_data.acs_ingest.AzureKeyCredential\")\n@patch(\"rag_experiment_accelerator.ingest_data.acs_ingest.my_hash\")\ndef test_upload_data(\n    mock_my_hash,\n    mock_azure_key_credential,\n    mock_SearchClient,\n):\n    # Arrange\n    mock_chunks = [\n        {\n            \"content\": \"test content\",\n            \"content_vector\": \"test_vector\",\n            \"filename\": \"test_file_name\",\n            \"source_display_name\": \"test_source_name\",\n        }\n    ]\n    mock_search_key = \"test_key\"\n    mock_my_hash.return_value = \"test_hash\"\n    mock_environment = Mock()\n    mock_environment.azure_search_service_endpoint = \"test_endpoint\"\n    mock_environment.azure_search_admin_key = \"test_key\"\n    mock_config = Mock()\n    mock_config.max_worker_threads = None\n\n    # Act\n    upload_data(\n        mock_environment,\n        mock_config,\n        mock_chunks,\n        \"test_index\",\n    )\n\n    # Assert\n    
mock_azure_key_credential.assert_called_once_with(mock_search_key)\n    mock_SearchClient.assert_called_once_with(\n        endpoint=\"test_endpoint\",\n        index_name=\"test_index\",\n        credential=ANY,\n    )\n    mock_my_hash.assert_called_once_with(mock_chunks[0][\"content\"])\n    mock_SearchClient().upload_documents.assert_called_once()\n\n\n@patch(\n    \"rag_experiment_accelerator.ingest_data.acs_ingest.ResponseGenerator\",\n    return_value=Mock(),\n)\ndef test_we_need_multiple_questions(mock_response_generator):\n    # Arrange\n    question = \"What is the meaning of life?\"\n    mock_response = \"The meaning of life is 42.\"\n    mock_response_generator.generate_response.return_value = mock_response\n\n    # Act\n    result = generate_multiple_questions(question, mock_response_generator)\n\n    # Assert\n    mock_response_generator.generate_response.assert_called_once_with(\n        multiple_prompt_instruction, text=question\n    )\n    assert result == mock_response\n\n\n@patch(\n    \"rag_experiment_accelerator.ingest_data.acs_ingest.ResponseGenerator\",\n    return_value=Mock(),\n)\ndef test_do_we_need_multiple_questions_true(mock_response_generator):\n    # Arrange\n    question = \"What is the meaning of life?\"\n    mock_response_generator.generate_response.return_value = \"complex\"\n    mock_config = Mock()\n\n    # Act\n    result = do_we_need_multiple_questions(\n        question, mock_response_generator, mock_config\n    )\n\n    # Assert\n    mock_response_generator.generate_response.assert_called_once()\n    assert result is True\n\n\n@patch(\n    \"rag_experiment_accelerator.ingest_data.acs_ingest.ResponseGenerator\",\n    return_value=Mock(),\n)\ndef test_do_we_need_multiple_questions_false(mock_response_generator):\n    # Arrange\n    question = \"What is the meaning of life?\"\n    mock_response_generator.generate_response.return_value = '{\"category\": \"\"}'\n    mock_config = Mock()\n\n    # Act\n    result = 
do_we_need_multiple_questions(\n        question, mock_response_generator, mock_config\n    )\n\n    # Assert\n    mock_response_generator.generate_response.assert_called_once()\n    assert result is False\n"
  },
  {
    "path": "rag_experiment_accelerator/init_Index/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/init_Index/create_index.py",
    "content": "from azure.core.credentials import AzureKeyCredential\nfrom azure.search.documents.indexes import SearchIndexClient\nfrom azure.search.documents.indexes.models import (\n    CharFilter,\n    CorsOptions,\n    HnswParameters,\n    HnswVectorSearchAlgorithmConfiguration,\n    LexicalTokenizer,\n    PrioritizedFields,\n    SearchableField,\n    SearchField,\n    SearchFieldDataType,\n    SearchIndex,\n    SemanticConfiguration,\n    SemanticField,\n    SemanticSettings,\n    SimpleField,\n    TokenFilter,\n    VectorSearch,\n    VectorSearchProfile,\n)\n\nfrom rag_experiment_accelerator.config.language_config import LanguageAnalyzerConfig\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef create_acs_index(\n    service_endpoint,\n    index_name,\n    key,\n    dimension,\n    ef_construction,\n    ef_search,\n    analyzer: LanguageAnalyzerConfig,\n):\n    try:\n        credential = AzureKeyCredential(key)\n\n        # Apply checks on analyzer settings. 
Search analyzer and index analyzer must be set together\n        if bool(analyzer.index_analyzer_name) != bool(analyzer.search_analyzer_name):\n            raise ValueError(\n                \"Both 'index_analyzer_name' and 'search_analyzer_name' must be set together\"\n            )\n\n        # Analyzer can only be used if neither search analyzer or index analyzer are set\n        if analyzer.analyzer_name and (\n            analyzer.search_analyzer_name or analyzer.index_analyzer_name\n        ):\n            raise ValueError(\n                \"analyzer_name should be empty if either search_analyzer_name or index_analyzer_name is not empty\"\n            )\n\n        # Create a search index\n        index_client = SearchIndexClient(\n            endpoint=service_endpoint, credential=credential\n        )\n        fields = [\n            SimpleField(name=\"id\", type=SearchFieldDataType.String, key=True),\n            SimpleField(name=\"sourceDisplayName\", type=SearchFieldDataType.String),\n            SearchableField(\n                name=\"content\",\n                type=SearchFieldDataType.String,\n                searchable=True,\n                retrievable=True,\n            ),\n            SearchableField(\n                name=\"title\",\n                type=SearchFieldDataType.String,\n                searchable=True,\n                retrievable=True,\n            ),\n            SearchableField(\n                name=\"summary\",\n                type=SearchFieldDataType.String,\n                searchable=True,\n                retrievable=True,\n            ),\n            SearchableField(\n                name=\"filename\",\n                type=SearchFieldDataType.String,\n                filterable=True,\n                searchable=False,\n                retrievable=True,\n            ),\n            SearchField(\n                name=\"contentVector\",\n                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n       
         searchable=True,\n                vector_search_dimensions=int(dimension),\n                vector_search_profile=\"my-vector-search-profile\",\n            ),\n            SearchField(\n                name=\"titleVector\",\n                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n                searchable=True,\n                vector_search_dimensions=int(dimension),\n                vector_search_profile=\"my-vector-search-profile\",\n            ),\n            SearchField(\n                name=\"summaryVector\",\n                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n                searchable=True,\n                vector_search_dimensions=int(dimension),\n                vector_search_profile=\"my-vector-search-profile\",\n            ),\n        ]\n\n        vector_search = VectorSearch(\n            algorithms=[\n                HnswVectorSearchAlgorithmConfiguration(\n                    name=\"my-vector-config\",\n                    parameters=HnswParameters(\n                        m=4,\n                        ef_construction=int(ef_construction),\n                        ef_search=int(ef_search),\n                        metric=\"cosine\",\n                    ),\n                )\n            ],\n            profiles=[\n                VectorSearchProfile(\n                    name=\"my-vector-search-profile\", algorithm=\"my-vector-config\"\n                )\n            ],\n        )\n\n        semantic_config = SemanticConfiguration(\n            name=\"my-semantic-config\",\n            prioritized_fields=PrioritizedFields(\n                prioritized_content_fields=[SemanticField(field_name=\"content\")]\n            ),\n        )\n\n        # Create the semantic settings with the configuration\n        semantic_settings = SemanticSettings(configurations=[semantic_config])\n\n        # Define a custom tokenizer, token filter and char filter\n        tokenizers = []\n        
token_filters = []\n        char_filters = []\n        if analyzer.tokenizers:\n            tokenizers = [\n                LexicalTokenizer(\n                    name=tokenizer[\"name\"],\n                    token_chars=tokenizer[\"token_chars\"],\n                )\n                for tokenizer in analyzer.tokenizers\n            ]\n        if analyzer.token_filters:\n            # token_filters = [LexicalTokenFilter(name=analyzer[\"token_filters\"][\"name\"], odatatype=\"#Microsoft.Azure.Search.AsciiFoldingTokenFilter\")]\n            token_filters = [\n                TokenFilter(name=\"lowercase\"),\n                TokenFilter(name=\"asciifolding\"),\n            ]\n        if analyzer.char_filters:\n            char_filters = [\n                CharFilter(\n                    name=char_filter[\"name\"],\n                    odatatype=\"#Microsoft.Azure.Search.MappingCharFilter\",\n                    mappings=char_filter[\"mappings\"],\n                )\n                for char_filter in analyzer.char_filters\n            ]\n\n        cors_options = CorsOptions(allowed_origins=[\"*\"], max_age_in_seconds=60)\n        scoring_profiles = []\n\n        # Create the search index with the semantic, tokenizer, and filter settings\n        index = SearchIndex(\n            name=index_name,\n            fields=fields,\n            vector_search=vector_search,\n            semantic_settings=semantic_settings,\n            scoring_profiles=scoring_profiles,\n            cors_options=cors_options,\n            tokenizers=tokenizers,\n            token_filters=token_filters,\n            char_filters=char_filters,\n        )\n        result = index_client.create_or_update_index(index)\n        logger.info(f\"{result.name} created\")\n\n    except Exception as e:\n        raise ValueError(f\"An error occurred while creating index [{index_name}]: {e}\")\n"
  },
  {
    "path": "rag_experiment_accelerator/init_Index/tests/test_create_index.py",
    "content": "import unittest\nfrom unittest.mock import patch, Mock\n\nfrom rag_experiment_accelerator.config.language_config import LanguageAnalyzerConfig\nfrom rag_experiment_accelerator.init_Index.create_index import create_acs_index\n\nfrom azure.core.credentials import AzureKeyCredential\nfrom azure.search.documents.indexes import SearchIndexClient\nfrom azure.search.documents.indexes.models import (\n    HnswParameters,\n    HnswVectorSearchAlgorithmConfiguration,\n    SearchField,\n    SearchIndex,\n)\n\n\nclass TestCreateIndex(unittest.TestCase):\n    # Create a mock object for the result of the 'create_or_update_index'\n    # method\n    mock_result = Mock()\n    mock_result.name = \"test_index\"\n\n    # Test that create_acs_index calls create_or_update_index using key\n    # credentials\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_create_acs_index(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        # Index test parameters\n        service_endpoint = \"test_endpoint\"\n        index_name = \"test_index\"\n        key = \"test_key\"\n        dimension = 128\n        ef_construction = 100\n        ef_search = 100\n        analyzer = LanguageAnalyzerConfig(\n            analyzer_name=None,\n            index_analyzer_name=None,\n            search_analyzer_name=None,\n            char_filters=None,\n            tokenizers=None,\n            token_filters=None,\n        )\n\n        # Call the function with the test parameters\n        create_acs_index(\n            service_endpoint,\n            index_name,\n            key,\n            dimension,\n            ef_construction,\n            ef_search,\n            analyzer,\n        )\n\n        # Assert that the 'create_or_update_index' method was called on the\n        # SearchIndexClient\n        
mock_create_or_update_index.assert_called_once()\n\n        # Assert that the AzureKeyCredential was initialized with the correct\n        # key\n        mock_azure_key_credential.assert_called_once_with(key)\n\n    # Test that create_acs_index works correctly when analyzer option is set\n    # alone\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchField, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_analyzer_name_alone(\n        self, mock_create_or_update_index, mock_search_field, mock_azure_key_credential\n    ):\n        service_endpoint = \"test_endpoint\"\n        index_name = \"test_index\"\n        key = \"test_key\"\n        dimension = 128\n        ef_construction = 100\n        ef_search = 100\n        analyzer = LanguageAnalyzerConfig(\n            analyzer_name=\"test_analyzer\",\n            index_analyzer_name=None,\n            search_analyzer_name=None,\n        )\n        create_acs_index(\n            service_endpoint,\n            index_name,\n            key,\n            dimension,\n            ef_construction,\n            ef_search,\n            analyzer,\n        )\n        mock_create_or_update_index.assert_called()\n\n    # Test that create_acs_index works correctly when indexAnalyzer is set\n    # together with searchAnalyzer and not analyzer option\n    def test_analyzer_with_index_and_search_analyzer(self):\n        with self.assertRaises(ValueError):\n            service_endpoint = \"test_endpoint\"\n            index_name = \"test_index\"\n            key = \"test_key\"\n            dimension = 128\n            ef_construction = 100\n            ef_search = 100\n            analyzer = LanguageAnalyzerConfig(\n                analyzer_name=None,\n                index_analyzer_name=\"test_index_analyzer\",\n                search_analyzer_name=\"test_search_analyzer\",\n            )\n            
create_acs_index(\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n\n            # Test if only one of index_analyzer_name or search_analyzer_name is set.\n            analyzer = LanguageAnalyzerConfig(\n                analyzer_name=None,\n                index_analyzer_name=None,\n                search_analyzer_name=\"test_search_analyzer\",\n            )\n            create_acs_index(\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n            self.assertRaises(\n                Exception,\n                create_acs_index,\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n\n            analyzer = {\n                \"analyzer_name\": None,\n                \"index_analyzer_name\": \"test_index_analyzer\",\n                \"search_analyzer_name\": None,\n            }\n            create_acs_index(\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n            self.assertRaises(\n                Exception,\n                create_acs_index,\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n\n    # Test that create_acs_index raiser error when analyzer is set together\n    # with either searchAnalyzer or indexAnalyzer\n\n    
@patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_analyzer_with_index_or_search_analyzer(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        with self.assertRaises(ValueError):\n            service_endpoint = \"test_endpoint\"\n            index_name = \"test_index\"\n            key = \"test_key\"\n            dimension = 128\n            ef_construction = 100\n            ef_search = 100\n\n            analyzer = LanguageAnalyzerConfig(\n                analyzer_name=\"test_analyzer\",\n                index_analyzer_name=None,\n                search_analyzer_name=\"test_search_analyzer\",\n            )\n            create_acs_index(\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n            self.assertRaises(\n                Exception,\n                create_acs_index,\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n\n            analyzer = LanguageAnalyzerConfig(\n                analyzer_name=\"test_analyzer\",\n                index_analyzer_name=None,\n                search_analyzer_name=\"test_search_analyzer\",\n            )\n\n            create_acs_index(\n                service_endpoint,\n                index_name,\n                key,\n                dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n            self.assertRaises(\n                Exception,\n                create_acs_index,\n                service_endpoint,\n                index_name,\n                key,\n         
       dimension,\n                ef_construction,\n                ef_search,\n                analyzer,\n            )\n\n    # Test that create_acs_index works correctly when the analyzer dictionary\n    # contains non-None values\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_create_acs_index_analyzers_non_none(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        analyzer = LanguageAnalyzerConfig(\n            analyzer_name=None,\n            index_analyzer_name=\"test_index_analyzer\",\n            search_analyzer_name=\"test_search_analyzer\",\n            char_filters=[{\"name\": \"my_char_filter\", \"mappings\": [\"ph=>f\", \"qu=>q\"]}],\n            tokenizers=[{\"name\": \"my_tokenizer\", \"token_chars\": [\"letter\", \"digit\"]}],\n            token_filters=[\"token_filter1\", \"token_filter2\"],\n        )\n\n        try:\n            create_acs_index(\n                \"test_endpoint\", \"test_index\", \"test_key\", 128, 100, 100, analyzer\n            )\n        except Exception:\n            self.fail(\"create_acs_index raised Exception unexpectedly!\")\n\n    # Test that create_acs_index works correctly when the analyzer dictionary\n    # contains None values\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_create_acs_index_analyzers_none(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        analyzer = LanguageAnalyzerConfig(\n            analyzer_name=None,\n            index_analyzer_name=None,\n            search_analyzer_name=None,\n            char_filters=None,\n            tokenizers=None,\n            token_filters=None,\n        )\n        try:\n            create_acs_index(\n                \"test_endpoint\", 
\"test_index\", \"test_key\", 128, 100, 100, analyzer\n            )\n        except Exception:\n            self.fail(\"create_acs_index raised Exception unexpectedly!\")\n\n    # Test that create_acs_index raises an exception when given invalid\n    # parameters:\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_create_acs_index_invalid_parameters(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        with self.assertRaises(ValueError):\n            create_acs_index(\n                None, \"test_index\", \"test_key\", 128, 100, 100, LanguageAnalyzerConfig()\n            )\n\n    # Test that create_acs_index raises an exception when the\n    # create_or_update_index method fails\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", side_effect=Exception)\n    def test_create_acs_index_create_or_update_index_fails(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        with self.assertRaises(Exception):\n            create_acs_index(\n                \"test_endpoint\",\n                \"test_index\",\n                \"test_key\",\n                128,\n                100,\n                100,\n                LanguageAnalyzerConfig(),\n            )\n\n    # Test that create_acs_index works correctly when the\n    # create_or_update_index method returns a non-None value\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_create_acs_index_create_or_update_index_returns_non_none(\n        self, mock_create_or_update_index, mock_azure_key_credential\n    ):\n        try:\n            create_acs_index(\n                \"test_endpoint\",\n                
\"test_index\",\n                \"test_key\",\n                128,\n                100,\n                100,\n                LanguageAnalyzerConfig(),\n            )\n        except Exception:\n            self.fail(\"create_acs_index raised Exception unexpectedly!\")\n\n    # Test that create_acs_index calls create_or_update_index with the correct\n    # parameters\n    @patch.object(AzureKeyCredential, \"__init__\", return_value=None)\n    @patch.object(HnswParameters, \"__init__\", return_value=None)\n    @patch.object(HnswVectorSearchAlgorithmConfiguration, \"__init__\", return_value=None)\n    @patch.object(SearchIndex, \"__init__\", return_value=None)\n    @patch.object(SearchField, \"__init__\", return_value=None)\n    @patch.object(SearchIndexClient, \"create_or_update_index\", return_value=mock_result)\n    def test_dimension_setting(\n        self,\n        mock_create_or_update_index,\n        mock_search_field,\n        mock_search_index,\n        mock_hnsw_vector_search_algorithm_configuration,\n        mock_hnsw_parameters,\n        mock_azure_key_credential,\n    ):\n        # Test parameters\n        service_endpoint = \"test_endpoint\"\n        index_name = \"test_index\"\n        key = \"test_key\"\n        dimension = 128\n        ef_construction = 100\n        ef_search = 100\n        analyzer = LanguageAnalyzerConfig(\n            analyzer_name=None,\n            index_analyzer_name=None,\n            search_analyzer_name=None,\n            char_filters=None,\n            tokenizers=None,\n            token_filters=None,\n        )\n\n        # Call the function with the test parameters\n        create_acs_index(\n            service_endpoint,\n            index_name,\n            key,\n            dimension,\n            ef_construction,\n            ef_search,\n            analyzer,\n        )\n        # Assert that the 'create_or_update_index' method was called with the\n        # correct dimension\n        args, kwargs = 
mock_create_or_update_index.call_args\n        searchable_fields = mock_search_field.call_args_list\n        index_parameters = mock_search_index.call_args_list\n        vector_search_dimensions = None\n        index_name_parameter = None\n        expected_dimension = dimension\n        expected_index_name = index_name\n        for call in searchable_fields:\n            if call.kwargs[\"name\"] == \"contentVector\":\n                vector_search_dimensions = call\n                break\n\n        for call in index_parameters:\n            if \"name\" in call.kwargs:\n                index_name_parameter = call.kwargs[\"name\"]\n                break\n\n        self.assertIsNotNone(index_name_parameter)\n        self.assertIsNotNone(vector_search_dimensions)\n        self.assertEqual(expected_index_name, index_name_parameter)\n        self.assertEqual(\n            expected_dimension,\n            vector_search_dimensions.kwargs.get(\"vector_search_dimensions\"),\n        )\n        mock_hnsw_parameters.assert_called_with(\n            m=4, ef_construction=ef_construction, ef_search=ef_search, metric=\"cosine\"\n        )\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "rag_experiment_accelerator/io/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/io/exceptions.py",
    "content": "class WriteException(Exception):\n    def __init__(self, path: str, e: Exception):\n        super().__init__(\n            f\"Unable to write to file to path: {path}. Please ensure\"\n            \" you have the proper permissions to write to the file.\",\n            e,\n        )\n\n\nclass CopyException(Exception):\n    def __init__(self, src: str, dest: str, e: Exception):\n        super().__init__(\n            f\"Unable to copy file from {src} to {dest}. Please ensure\"\n            \" you have the proper permissions to copy the file.\",\n            e,\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/io/loader.py",
    "content": "from abc import ABC, abstractmethod\n\n\nclass Loader(ABC):\n    \"\"\"\n    Abstract base class for data loaders.\n    \"\"\"\n\n    @abstractmethod\n    def load(self, src: str, **kwargs) -> list:\n        \"\"\"\n        Load data from the specified source.\n\n        Args:\n            src (str): The source of the data.\n            **kwargs: Additional keyword arguments.\n\n        Returns:\n            list: The loaded data.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def can_handle(self, src: str) -> bool:\n        \"\"\"\n        Check if the loader can handle the specified source.\n\n        Args:\n            src (str): The source to check.\n\n        Returns:\n            bool: True if the loader can handle the source, False otherwise.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def exists(self, src: str) -> bool:\n        \"\"\"\n        Check if the specified source exists.\n\n        Args:\n            src (str): The source to check.\n\n        Returns:\n            bool: True if the source exists, False otherwise.\n        \"\"\"\n        pass\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/io/local/base.py",
    "content": "import os\n\n\nclass LocalIOBase:\n    \"\"\"\n    Base class for local input/output operations.\n    \"\"\"\n\n    def exists(self, path: str) -> bool:\n        \"\"\"\n        Check if a file or directory exists at the given path.\n\n        Args:\n            path (str): The path to check.\n\n        Returns:\n            bool: True if the file or directory exists, False otherwise.\n        \"\"\"\n        if os.path.exists(path):\n            return True\n        return False\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/loaders/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/io/local/loaders/jsonl_loader.py",
    "content": "import json\nfrom json.decoder import JSONDecodeError\n\nfrom rag_experiment_accelerator.io.local.loaders.local_loader import LocalLoader\n\n\nclass JsonlLoader(LocalLoader):\n    \"\"\"A class for loading data from JSONL files.\"\"\"\n\n    def load(self, path: str, **kwargs) -> list:\n        \"\"\"Load data from a JSONL file.\n\n        Args:\n            path (str): The path to the JSONL file.\n            **kwargs: Additional keyword arguments to be passed to json.loads().\n\n        Returns:\n            list: A list of loaded data.\n\n        Raises:\n            FileNotFoundError: If the file is not found at the specified path.\n        \"\"\"\n        if not self.exists(path):\n            raise FileNotFoundError(f\"File not found at path: {path}\")\n\n        data_load = []\n        with open(path, \"r\") as file:\n            for line in file:\n                try:\n                    data = json.loads(line, **kwargs)\n                except JSONDecodeError as jde:\n                    jde.add_note(f'Error occurred on line {len(data_load) + 1} in input file {path}')\n\n                data_load.append(data)\n\n        return data_load\n\n    def can_handle(self, path: str) -> bool:\n        \"\"\"Check if the loader can handle the given file path.\n\n        Args:\n            path (str): The file path to check.\n\n        Returns:\n            bool: True if the loader can handle the file, False otherwise.\n        \"\"\"\n        ext = self._get_file_ext(path)\n        return ext == \".jsonl\"\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/loaders/local_loader.py",
    "content": "from abc import abstractmethod\nimport pathlib\n\nfrom rag_experiment_accelerator.io.loader import Loader\nfrom rag_experiment_accelerator.io.local.base import LocalIOBase\n\n\nclass LocalLoader(LocalIOBase, Loader):\n    \"\"\"\n    A class that represents a local data loader.\n\n    This class provides methods for loading data from a local source.\n\n    Attributes:\n        None\n\n    Methods:\n        load(src: str, **kwargs) -> list:\n            Abstract method to load data from a local source.\n\n        can_handle(src: str) -> bool:\n            Abstract method to check if the loader can handle the given source.\n\n        _get_file_ext(path: str):\n            Internal method to get the file extension from a given path.\n    \"\"\"\n\n    @abstractmethod\n    def load(self, src: str, **kwargs) -> list:\n        \"\"\"\n        Abstract method to load data from a local source.\n\n        Args:\n            src (str): The path or source of the data.\n\n        Returns:\n            list: The loaded data.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def can_handle(self, src: str) -> bool:\n        \"\"\"\n        Abstract method to check if the loader can handle the given source.\n\n        Args:\n            src (str): The path or source of the data.\n\n        Returns:\n            bool: True if the loader can handle the source, False otherwise.\n        \"\"\"\n        pass\n\n    def _get_file_ext(self, path: str):\n        \"\"\"\n        Internal method to get the file extension from a given path.\n\n        Args:\n            path (str): The path of the file.\n\n        Returns:\n            str: The file extension.\n        \"\"\"\n        return pathlib.Path(path).suffix\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/loaders/tests/test_jsonl_loader.py",
    "content": "import json\nimport os\nimport shutil\nimport tempfile\nimport pytest\n\nfrom rag_experiment_accelerator.io.local.loaders.jsonl_loader import JsonlLoader\n\n\n@pytest.fixture()\ndef temp_dir():\n    dir = tempfile.mkdtemp()\n    yield dir\n    if os.path.exists(dir):\n        shutil.rmtree(dir)\n\n\ndef test_loads(temp_dir: str):\n    test_data = {\"test\": {\"test1\": 1, \"test2\": 2}}\n    # write the file\n    path = f\"{temp_dir}/test.jsonl\"\n    with open(path, \"a\") as file:\n        file.write(json.dumps(test_data) + \"\\n\")\n\n    # load the file\n    loader = JsonlLoader()\n    loaded_data = loader.load(path)\n\n    assert loaded_data == [test_data]\n\n\ndef test_loads_raises_file_not_found(temp_dir: str):\n    path = f\"{temp_dir}/non-existsing-file.jsonl\"\n    loader = JsonlLoader()\n    with pytest.raises(FileNotFoundError):\n        loader.load(path)\n\n\ndef test_can_handle_true():\n    path = \"test.jsonl\"\n    loader = JsonlLoader()\n    assert loader.can_handle(path) is True\n\n\ndef test_can_handle_false():\n    path = \"test.txt\"\n    loader = JsonlLoader()\n    assert loader.can_handle(path) is False\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/loaders/tests/test_local_loader.py",
    "content": "from rag_experiment_accelerator.io.local.loaders.local_loader import LocalLoader\n\n\ndef test__get_file_ext():\n    class TestLocalLoader(LocalLoader):\n        def load(self, src: str, data, **kwargs):\n            pass\n\n        def can_handle(self, src: str):\n            pass\n\n    loader_impl = TestLocalLoader()\n\n    filename = \"test.txt\"\n    assert loader_impl._get_file_ext(filename) == \".txt\"\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/tests/test_local_io_base.py",
    "content": "import os\nimport shutil\nimport tempfile\nimport pytest\n\nfrom rag_experiment_accelerator.io.local.base import LocalIOBase\n\n\n@pytest.fixture()\ndef temp_dir():\n    dir = tempfile.mkdtemp()\n    yield dir\n    if os.path.exists(dir):\n        shutil.rmtree(dir)\n\n\ndef test_exists_true(temp_dir: str) -> bool:\n    loader = LocalIOBase()\n    assert loader.exists(temp_dir) is True\n\n\ndef test_exists_false() -> bool:\n    path = \"/tmp/non-existing-file\"\n    loader = LocalIOBase()\n    assert loader.exists(path) is False\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/writers/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/io/local/writers/jsonl_writer.py",
    "content": "import json\n\nfrom rag_experiment_accelerator.io.local.writers.local_writer import LocalWriter\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass JsonlWriter(LocalWriter):\n    \"\"\"\n    A class for writing data to a JSONL file.\n\n    Inherits from the LocalWriter class.\n\n    Attributes:\n        None\n\n    Methods:\n        _write_file: Appends data as a single line to a JSONL file.\n\n    \"\"\"\n\n    def _write_file(self, path: str, data, **kwargs):\n        \"\"\"\n        Appends the given data as a single JSON line to a JSONL file.\n\n        Args:\n            path (str): The path to the JSONL file.\n            data: The data to be written to the file.\n            **kwargs: Additional keyword arguments to be passed to the json.dumps() function.\n\n        Returns:\n            None\n\n        \"\"\"\n        with open(path, \"a\") as file:\n            file.write(json.dumps(data, **kwargs) + \"\\n\")\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/writers/local_writer.py",
    "content": "from abc import abstractmethod\nimport os\nimport pathlib\nimport shutil\nfrom rag_experiment_accelerator.io.exceptions import CopyException, WriteException\n\nfrom rag_experiment_accelerator.io.local.base import LocalIOBase\nfrom rag_experiment_accelerator.io.writer import Writer\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass LocalWriter(LocalIOBase, Writer):\n    \"\"\"\n    A class that provides methods for writing files locally.\n\n    Inherits from LocalIOBase and Writer.\n    \"\"\"\n\n    def _make_dir(self, dir: str):\n        \"\"\"\n        Creates a directory if it doesn't exist.\n\n        Args:\n            dir (str): The directory path.\n        \"\"\"\n        try:\n            os.makedirs(dir, exist_ok=True)\n        except Exception as e:\n            logger.error(\n                f\"Unable to create the directory: {dir}. Please ensure\"\n                \" you have the proper permissions to create the directory.\"\n            )\n            raise e\n\n    def _get_dirname(self, path: str):\n        \"\"\"\n        Returns the parent directory of a given path.\n\n        Args:\n            path (str): The file path.\n\n        Returns:\n            str: The parent directory path.\n        \"\"\"\n        return pathlib.Path(path).parent\n\n    @abstractmethod\n    def _write_file(path: str, data, **kwargs):\n        \"\"\"\n        Abstract method for writing a file.\n\n        Args:\n            path (str): The file path.\n            data: The data to be written to the file.\n            **kwargs: Additional keyword arguments.\n\n        Raises:\n            NotImplementedError: This method must be implemented in a subclass.\n        \"\"\"\n        pass\n\n    def write(self, path: str, data, **kwargs):\n        \"\"\"\n        Writes data to a file at the specified path.\n\n        Args:\n            path (str): The file path.\n            data: The data to be 
written to the file.\n            **kwargs: Additional keyword arguments.\n\n        Raises:\n            Exception: If unable to write to the file.\n        \"\"\"\n        dir = self._get_dirname(path)\n        self._make_dir(dir)\n        try:\n            self._write_file(path, data, **kwargs)\n        except Exception as e:\n            raise WriteException(path, e)\n\n    def copy(self, src: str, dest: str, **kwargs):\n        \"\"\"\n        Copies a file from the source path to the destination path.\n\n        Args:\n            src (str): The source file path.\n            dest (str): The destination file path.\n            **kwargs: Additional keyword arguments.\n\n        Raises:\n            FileNotFoundError: If the source file does not exist.\n            Exception: If unable to copy the file.\n        \"\"\"\n        if not self.exists(src):\n            raise FileNotFoundError(f\"Source file {src} does not exist.\")\n\n        dest_dir = self._get_dirname(dest)\n        # make dest dir if it doesn't exist\n        self._make_dir(dest_dir)\n        try:\n            shutil.copyfile(src, dest, **kwargs)\n        except Exception as e:\n            raise CopyException(src, dest, e)\n\n    def delete(self, src: str):\n        \"\"\"\n        Deletes a file at the specified path.\n\n        Args:\n            src (str): The file path.\n\n        Raises:\n            Exception: If unable to delete the file.\n        \"\"\"\n        if self.exists(src):\n            os.remove(src)\n\n    def list_filenames(self, dir: str):\n        \"\"\"\n        Returns a list of filenames in the specified directory.\n\n        Args:\n            dir (str): The directory path.\n\n        Returns:\n            list: A list of filenames in the directory.\n        \"\"\"\n        return [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/writers/tests/test_jsonl_writer.py",
    "content": "import os\nimport shutil\nimport tempfile\n\nimport pytest\n\nfrom rag_experiment_accelerator.io.local.writers.jsonl_writer import (\n    JsonlWriter,\n)\n\n\n@pytest.fixture()\ndef temp_dir():\n    dir = tempfile.mkdtemp()\n    yield dir\n    if os.path.exists(dir):\n        shutil.rmtree(dir)\n\n\ndef test__write_file(temp_dir: str):\n    # set up\n    data = {\"test\": \"test\"}\n    path = temp_dir + \"/test.jsonl\"\n\n    # write the file\n    writer = JsonlWriter()\n    writer._write_file(path, data)\n\n    # check file was written and contains the correct data\n    with open(path) as file:\n        assert file.readline() == '{\"test\": \"test\"}\\n'\n"
  },
  {
    "path": "rag_experiment_accelerator/io/local/writers/tests/test_local_writer.py",
    "content": "import os\nimport shutil\nimport tempfile\nimport uuid\n\nimport pytest\n\nfrom rag_experiment_accelerator.io.exceptions import CopyException, WriteException\nfrom rag_experiment_accelerator.io.local.writers.local_writer import LocalWriter\n\n\n@pytest.fixture()\ndef temp_dirname():\n    # get temp dir name but don't create it\n    dir = \"/tmp/\" + uuid.uuid4().__str__()\n    yield dir\n    if os.path.exists(dir):\n        shutil.rmtree(dir)\n\n\n@pytest.fixture()\ndef temp_dir():\n    # create a temp dir\n    dir = tempfile.mkdtemp()\n    yield dir\n    if os.path.exists(dir):\n        shutil.rmtree(dir)\n\n\n@pytest.fixture()\ndef writer_impl():\n    # create a class that inherits from LocalWriter and implements the abstract method _write_file\n    # optionally raises to test exception handling\n    # marks the method as called to test if it was called\n    class TestLocalWriter(LocalWriter):\n        def _write_file(self, path: str, data, **kwargs):\n            should_raise = kwargs.pop(\"should_raise\", False)\n            if should_raise:\n                raise Exception()\n            self._write_file_called = True\n\n    yield TestLocalWriter()\n\n\ndef test__make_dir(temp_dirname: str, writer_impl: LocalWriter):\n    # make dir\n    writer_impl._make_dir(temp_dirname)\n\n    # ensure dir was created\n    assert os.path.exists(temp_dirname)\n\n\ndef test__make_dir_raises(writer_impl: LocalWriter):\n    # try to make dir in sudo location\n    with pytest.raises(Exception):\n        writer_impl._make_dir(\"/test123\")\n\n    assert not os.path.exists(\"/test123\")\n\n\ndef test_write_calls__write_file(temp_dir: str, writer_impl: LocalWriter):\n    # set path\n    path = f\"{temp_dir}/test.txt\"\n\n    # call write\n    writer_impl.write(path, \"test\")\n\n    # ensure _write_file was called\n    assert writer_impl._write_file_called is True\n\n\ndef test_write_creates_parent_dir(temp_dirname: str, writer_impl: LocalWriter):\n    path = 
f\"{temp_dirname}/test.txt\"\n\n    writer_impl.write(path, \"test\")\n\n    assert os.path.exists(temp_dirname)\n\n\ndef test_write_raises_write_exception(writer_impl: LocalWriter):\n    path = \"/tmp/test.txt\"\n    with pytest.raises(WriteException):\n        writer_impl.write(path, \"test\", should_raise=True)\n\n\ndef test_copy(temp_dirname: str, writer_impl: LocalWriter):\n    # create a file to copy\n    src = temp_dirname + \"/src.txt\"\n    os.makedirs(temp_dirname)\n    with open(src, \"w\") as f:\n        f.write(\"test\")\n\n    # copy the file\n    dest = temp_dirname + \"/dest.txt\"\n    writer_impl.copy(src, dest)\n\n    # check that the file was copied\n    assert os.path.exists(dest)\n\n\ndef test_copy_raises_copy_exception(temp_dirname: str, writer_impl: LocalWriter):\n    # create a file to copy\n    src = temp_dirname + \"/src.txt\"\n    os.makedirs(temp_dirname)\n    with open(src, \"w\") as f:\n        f.write(\"test\")\n\n    # copy the file to location that needs sudo permissions\n    dest = \"/dest.txt\"\n    with pytest.raises(CopyException):\n        writer_impl.copy(src, dest)\n\n    # should not have been copied\n    assert not os.path.exists(dest)\n\n\ndef test_copy_raises_file_not_found(temp_dirname: str, writer_impl: LocalWriter):\n    # create src dir but don't create the file\n    src = temp_dirname + \"/src.txt\"\n    os.makedirs(temp_dirname)\n    dest = temp_dirname + \"/dest.txt\"\n\n    with pytest.raises(FileNotFoundError):\n        writer_impl.copy(src, dest)\n\n    # should not have been copied\n    assert not os.path.exists(dest)\n\n\ndef test_delete(temp_dirname: str, writer_impl: LocalWriter):\n    # create a file to delete\n    src = temp_dirname + \"/src.txt\"\n    os.makedirs(temp_dirname)\n    with open(src, \"w\") as f:\n        f.write(\"test\")\n\n    # delete the file\n    writer_impl.delete(src)\n\n    # check that the file was deleted\n    assert not os.path.exists(src)\n\n\ndef 
test_list_filenames(temp_dirname: str, writer_impl: LocalWriter):\n    # create files in a temp dir\n    file1 = temp_dirname + \"/src1.txt\"\n    os.makedirs(temp_dirname)\n    with open(file1, \"w\") as f:\n        f.write(\"test\")\n    file2 = temp_dirname + \"/src2.txt\"\n    with open(file2, \"w\") as f:\n        f.write(\"test\")\n\n    # list files\n    filenames = writer_impl.list_filenames(temp_dirname)\n\n    # check all filenames are returned\n    assert len(filenames) == 2\n    assert \"src1.txt\" in filenames\n    assert \"src2.txt\" in filenames\n"
  },
  {
    "path": "rag_experiment_accelerator/io/writer.py",
    "content": "from abc import ABC, abstractmethod\n\n\nclass Writer(ABC):\n    \"\"\"Abstract base class for a writer.\"\"\"\n\n    @abstractmethod\n    def write(self, path: str, data, **kwargs):\n        \"\"\"Write data to a file.\n\n        Args:\n            path (str): The path of the file to write to.\n            data: The data to write to the file.\n            **kwargs: Additional keyword arguments.\n\n        Returns:\n            None\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def copy(self, src: str, dest: str, **kwargs):\n        \"\"\"Copy a file from source to destination.\n\n        Args:\n            src (str): The path of the source file.\n            dest (str): The path of the destination file.\n            **kwargs: Additional keyword arguments.\n\n        Returns:\n            None\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def delete(self, src: str):\n        \"\"\"Delete a file.\n\n        Args:\n            src (str): The path of the file to delete.\n\n        Returns:\n            None\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def exists(self, path: str) -> bool:\n        \"\"\"Check if a file exists.\n\n        Args:\n            path (str): The path of the file to check.\n\n        Returns:\n            bool: True if the file exists, False otherwise.\n        \"\"\"\n        pass\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/llm/exceptions.py",
    "content": "class ContentFilteredException(Exception):\n    pass\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/__init__.py",
    "content": "# flake8: noqa\n\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    Prompt,\n    StructuredPrompt,\n    StructuredWithCoTPrompt,\n    CoTPrompt,\n    PromptTag,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.hyde_prompts import (\n    prompt_generate_hypothetical_answer,\n    prompt_generate_hypothetical_document,\n    prompt_generate_hypothetical_questions,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.instruction_prompts import (\n    prompt_instruction_entities,\n    prompt_instruction_keywords,\n    prompt_instruction_title,\n    prompt_instruction_summary,\n    main_instruction_short,\n    main_instruction_long,\n    main_instruction,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.multiprompts import (\n    do_need_multiple_prompt_instruction,\n    multiple_prompt_instruction,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.qna_prompts import (\n    generate_qna_long_single_context_instruction_prompt,\n    generate_qna_short_single_context_instruction_prompt,\n    generate_qna_long_multiple_context_instruction_prompt,\n    generate_qna_short_multiple_context_instruction_prompt,\n    generate_qna_short_single_context_no_cot_instruction_prompt,\n    qna_generation_prompt,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.ragas_prompts import (\n    llm_answer_relevance_instruction,\n    llm_context_precision_instruction,\n    llm_context_recall_instruction,\n)\n\nfrom rag_experiment_accelerator.llm.prompt.rerank_prompts import (\n    rerank_prompt_instruction,\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/hyde_prompts.py",
    "content": "import json\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    Prompt,\n    StructuredPrompt,\n    PromptTag,\n)\n\n\ndef validate_hypothetical_questions(text: str) -> bool:\n    json_output = json.loads(text)\n    return isinstance(json_output, list) and all(\n        isinstance(item, str) for item in json_output\n    )\n\n\nprompt_generate_hypothetical_answer = Prompt(\n    system_message=\"prompt_generate_hypothetical_answer.txt\",\n    user_template=\"${text}\",\n)\n\nprompt_generate_hypothetical_document = Prompt(\n    system_message=\"prompt_generate_hypothetical_document.txt\",\n    user_template=\"${text}\",\n)\n\nprompt_generate_hypothetical_questions = StructuredPrompt(\n    system_message=\"prompt_generate_hypothetical_questions.txt\",\n    user_template=\"${text}\",\n    validator=validate_hypothetical_questions,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/instruction_prompts.py",
    "content": "import json\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    Prompt,\n    StructuredPrompt,\n    PromptTag,\n)\n\n\ndef validate_instruction_keyword(text: str) -> bool:\n    json_output = json.loads(text)\n    return isinstance(json_output, list) and all(\n        isinstance(item, str) for item in json_output\n    )\n\n\ndef validate_instruction_entities(text: str) -> bool:\n    json_output = json.loads(text)\n    return isinstance(json_output, list)\n\n\n_main_response_template: str = \"\"\"\nContext:\n${context}\n\nQuestion:\n${question}\n\"\"\"\n\nprompt_instruction_entities = StructuredPrompt(\n    system_message=\"prompt_instruction_entities.txt\",\n    user_template=\"${text}\",\n    validator=validate_instruction_entities,\n    tags={PromptTag.JSON},\n)\n\nprompt_instruction_keywords = StructuredPrompt(\n    system_message=\"prompt_instruction_keywords.txt\",\n    user_template=\"${text}\",\n    validator=validate_instruction_keyword,\n    tags={PromptTag.JSON},\n)\n\nprompt_instruction_title = Prompt(\n    system_message=\"prompt_instruction_title.txt\",\n    user_template=\"${text}\",\n)\n\nprompt_instruction_summary = Prompt(\n    system_message=\"prompt_instruction_summary.txt\",\n    user_template=\"${text}\",\n)\n\n# TODO: Add selector for usage of long/short prompts\nmain_instruction_short = Prompt(\n    system_message=\"main_instruction_short.txt\",\n    user_template=_main_response_template,\n)\n\n# TODO: Add selector for usage of long/short prompts\nmain_instruction_long = Prompt(\n    system_message=\"main_instruction_long.txt\",\n    user_template=_main_response_template,\n)\n\nmain_instruction = main_instruction_short\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/multiprompts.py",
    "content": "import json\nfrom rag_experiment_accelerator.llm.prompt.prompt import StructuredPrompt, PromptTag\n\n\ndef validate_do_we_need_multiple(text: str) -> bool:\n    return text.lower().strip() in [\"complex\", \"simple\"]\n\n\ndef validate_multiple_prompt(text: str) -> bool:\n    json_output = json.loads(text)\n    return isinstance(json_output, list) and all(\n        isinstance(item, str) for item in json_output\n    )\n\n\ndo_need_multiple_prompt_instruction = StructuredPrompt(\n    system_message=\"do_need_multiple_prompt_instruction.txt\",\n    user_template=\"${text}\",\n    validator=validate_do_we_need_multiple,\n    tags={PromptTag.NonStrict},\n)\n\nmultiple_prompt_instruction = StructuredPrompt(\n    system_message=\"multiple_prompt_instruction.txt\",\n    user_template=\"${text}\",\n    validator=validate_multiple_prompt,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/prompt.py",
    "content": "import os\nfrom importlib import resources\nimport re\n\nfrom string import Template\n\nfrom enum import StrEnum\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef _default_validation(x: any) -> bool:\n    return True\n\n\nclass PromptTag(StrEnum):\n    ChainOfThought = \"chain_of_thought\"  # Prompt is a chain of thought prompt\n    Structured = \"structured\"  # Output of the prompt is expected to be structured\n    JSON = \"json\"  # Output of the prompt is expected to be in JSON format\n    NonStrict = \"non_strict\"  # Do not raise an exception if inference failed\n\n\nclass Prompt:\n    tags: set[str] = {}\n    system_message: str = \"\"\n    user_template: str = \"\"\n\n    def __init__(\n        self,\n        system_message: str,\n        user_template: str,\n        tags: set[str] | None = None,\n    ) -> None:\n        self.system_message = self._try_load_prompt_file(system_message)\n        self.user_template = self._try_load_prompt_file(user_template)\n        if tags:\n            self.tags = tags\n\n        if PromptTag.JSON in self.tags:\n            assert (\n                PromptTag.Structured in self.tags\n            ), \"Structured tag must be present for JSON prompts\"\n\n    @staticmethod\n    def arguments_in_prompt(prompt: str) -> set[str]:\n        pattern = re.compile(r\"\\$\\{([a-zA-Z_][a-zA-Z0-9_]*)\\}\")\n        matches = pattern.findall(prompt)\n        return set(matches)\n\n    @staticmethod\n    def check_formatting_argument(text: str, field: str) -> bool:\n        return field in Prompt.arguments_in_prompt(text)\n\n    @staticmethod\n    def _get_prompt_file_path(prompt_file: str) -> str:\n        base_path = os.path.join(\"llm\", \"prompts_text\", prompt_file)\n        return (\n            resources.files(\"rag_experiment_accelerator\").joinpath(base_path).resolve()\n        )\n\n    @staticmethod\n    def _try_load_prompt_file(prompt_file: str) -> str:\n  
      \"\"\"\n        Tries to load the content of a .txt prompt file.\n\n        Args:\n            prompt_file (str): The path or name of the prompt file.\n\n        Returns:\n            str: The content of the prompt file or the input string if it is not a .txt file.\n\n        Raises:\n            FileNotFoundError: If the prompt file is not found.\n        \"\"\"\n        if re.match(r'^[^\\/:*?\"<>|\\r\\n]+\\.txt$', prompt_file):\n            prompt_file = Prompt._get_prompt_file_path(prompt_file)\n            if os.path.isfile(prompt_file):\n                logger.debug(f\"Reading prompt from file: {prompt_file}\")\n                with open(prompt_file, \"r\") as f:\n                    return f.read()\n            else:\n                logger.error(f\"Prompt file not found: {prompt_file}\")\n                raise FileNotFoundError(f\"Prompt file not found: {prompt_file}\")\n        else:\n            return prompt_file\n\n    def update_system_prompt(self, system_message: str) -> \"Prompt\":\n        system_message = self._try_load_prompt_file(system_message)\n        self.system_message = system_message\n        return self\n\n    def update_user_prompt(self, user_template: str) -> \"Prompt\":\n        user_template = self._try_load_prompt_file(user_template)\n        self.user_template = user_template\n        return self\n\n\nclass CoTPrompt(Prompt):\n    def __init__(\n        self,\n        system_message: str,\n        user_template: str,\n        tags: set[str] = {},\n        separator: str = \"##RESPONSE##\",\n    ) -> None:\n        tags.add(PromptTag.ChainOfThought)\n\n        super().__init__(system_message, user_template, tags)\n        assert (\n            PromptTag.ChainOfThought in self.tags\n        ), \"CoTPrompt must have ChainOfThought tag\"\n\n        self.separator = separator\n\n        has_system, has_user = self._check_separator_declaration(\n            self.system_message, self.user_template\n        )\n\n        if 
has_system:\n            template = Template(self.system_message)\n            self.system_message = template.safe_substitute(separator=self.separator)\n\n        if has_user:\n            template = Template(self.user_template)\n            self.user_template = template.safe_substitute(separator=self.separator)\n\n    @staticmethod\n    def _check_separator_declaration(\n        system_message: str, user_template: str\n    ) -> tuple[bool, bool]:\n        has_sep_in_system = CoTPrompt.check_formatting_argument(\n            system_message, \"separator\"\n        )\n        has_sep_in_user = CoTPrompt.check_formatting_argument(\n            user_template, \"separator\"\n        )\n\n        if not has_sep_in_system:\n            if has_sep_in_user:\n                logger.warning(\n                    \"It is recommended to declare separator in system message as well\"\n                )\n            else:\n                logger.error(\n                    \"Separator is not declared in system message or user template, this will cause issues\"\n                )\n\n        return has_sep_in_system, has_sep_in_user\n\n    def update_system_prompt(self, system_message: str) -> Prompt:\n        system_message = self._try_load_prompt_file(system_message)\n        if CoTPrompt.check_formatting_argument(system_message, \"separator\"):\n            template = Template(system_message)\n            system_message = template.safe_substitute(separator=self.separator)\n\n        self.system_message = system_message\n\n\nclass StructuredPrompt(Prompt):\n    \"\"\"\n    A prompt that expects a structured response, such as JSON.\n    \"\"\"\n\n    def __init__(\n        self,\n        system_message: str,\n        user_template: str,\n        tags: set[str] = {},\n        validator: callable = _default_validation,\n    ) -> None:\n        tags.add(PromptTag.Structured)\n\n        super().__init__(system_message, user_template, tags)\n\n        self.validator = 
validator\n\n\nclass StructuredWithCoTPrompt(CoTPrompt, StructuredPrompt):\n    def __init__(\n        self,\n        system_message: str,\n        user_template: str,\n        tags: set[str] = {},\n        validator: callable = _default_validation,\n        separator: str = \"##RESPONSE##\",\n    ) -> None:\n        # tags = tags | {PromptTag.ChainOfThought, PromptTag.Structured}\n\n        StructuredPrompt.__init__(\n            self,\n            system_message=system_message,\n            user_template=user_template,\n            tags=tags,\n            validator=validator,\n        )\n        CoTPrompt.__init__(\n            self,\n            system_message=system_message,\n            user_template=user_template,\n            tags=tags,\n            separator=separator,\n        )\n\n        logger.debug(f\"StructuredWithCoTPrompt tags {self.tags}\")\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/qna_prompts.py",
    "content": "import json\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    StructuredWithCoTPrompt,\n    StructuredPrompt,\n    PromptTag,\n)\n\n\ndef qna_generation_validate(response: str) -> bool:\n    response_json = json.loads(response)\n    return (\n        isinstance(response_json, dict)\n        and \"question\" in response_json\n        and \"answer\" in response_json\n    )\n\n\n_response_template: str = \"\"\"\nContext:\n${context}\n\"\"\"\n\n# TODO: Add selector for usage of long/short prompts\ngenerate_qna_long_single_context_instruction_prompt = StructuredWithCoTPrompt(\n    system_message=\"generate_qna_long_single_context.txt\",\n    user_template=_response_template,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n    validator=qna_generation_validate,\n)\n\n# TODO: Add selector for usage of long/short prompts\ngenerate_qna_short_single_context_instruction_prompt = StructuredWithCoTPrompt(\n    system_message=\"generate_qna_short_single_context.txt\",\n    user_template=_response_template,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n    validator=qna_generation_validate,\n)\n\n# TODO: Add selector for usage of long/short prompts\ngenerate_qna_long_multiple_context_instruction_prompt = StructuredWithCoTPrompt(\n    system_message=\"generate_qna_long_multi_context.txt\",\n    user_template=_response_template,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n    validator=qna_generation_validate,\n)\n\n# TODO: Add selector for usage of long/short prompts\ngenerate_qna_short_multiple_context_instruction_prompt = StructuredWithCoTPrompt(\n    system_message=\"generate_qna_short_multi_context.txt\",\n    user_template=_response_template,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n    validator=qna_generation_validate,\n)\n\n# TODO: Add selector for usage of long/short prompts\ngenerate_qna_short_single_context_no_cot_instruction_prompt = StructuredPrompt(\n    
system_message=\"generate_qna_short_single_context_no_cot.txt\",\n    user_template=_response_template,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n    validator=qna_generation_validate,\n)\n\n\nqna_generation_prompt = generate_qna_short_single_context_no_cot_instruction_prompt\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/ragas_prompts.py",
    "content": "import re\nimport json\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    Prompt,\n    StructuredPrompt,\n    PromptTag,\n)\n\n\ndef validate_context_precision(text: str) -> bool:\n    return text.lower().strip() in [\"yes\", \"no\"]\n\n\ndef validate_context_recall(text: str) -> bool:\n    json_text = json.loads(text)\n\n    def is_valid_entry(entry):\n        statement_key_pattern = re.compile(r\"^statement_\\d+$\")\n        return all(\n            key in [\"reason\", \"attributed\"] or statement_key_pattern.match(key)\n            for key in entry.keys()\n        )\n\n    return isinstance(json_text, list) and all(\n        is_valid_entry(entry) for entry in json_text\n    )\n\n\n_context_precision_input = \"\"\"\nContext:\n${context}\n\nQuestion:\n${question}\n\"\"\"\n\n_context_recall_input = \"\"\"\nquestion: ${question}\ncontext: ${context}\nanswer: ${answer}\n\"\"\"\n\nllm_answer_relevance_instruction = Prompt(\n    system_message=\"llm_answer_relevance_instruction.txt\",\n    user_template=\"${text}\",\n    tags={PromptTag.NonStrict},\n)\n\nllm_context_precision_instruction = StructuredPrompt(\n    system_message=\"llm_context_precision_instruction.txt\",\n    user_template=_context_precision_input,\n    validator=validate_context_precision,\n    tags={PromptTag.NonStrict},\n)\n\nllm_context_recall_instruction = StructuredPrompt(\n    system_message=\"llm_context_recall_instruction.txt\",\n    user_template=_context_recall_input,\n    validator=validate_context_recall,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompt/rerank_prompts.py",
    "content": "import re\nimport json\nfrom rag_experiment_accelerator.llm.prompt.prompt import StructuredPrompt, PromptTag\n\n\ndef validate_rerank(text: str) -> bool:\n    json_output = json.loads(text)\n\n    def key_matches(key: str) -> bool:\n        return bool(re.match(r\"^document_\\d+$\", key))\n\n    return isinstance(json_output, dict) and all(\n        isinstance(key, str) and isinstance(value, int) and key_matches(key)\n        for key, value in json_output.items()\n    )\n\n\n_rerank_template: str = \"\"\"\n${documents}\n\nQuestion: ${question}\n\"\"\"\n\nrerank_prompt_instruction = StructuredPrompt(\n    system_message=\"prompt_instruction_keywords.txt\",\n    user_template=_rerank_template,\n    validator=validate_rerank,\n    tags={PromptTag.JSON, PromptTag.NonStrict},\n)\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/do_need_multiple_prompt_instruction.txt",
    "content": "Analyze the given question to determine if it falls into one of the following categories:\n\n1. Simple, Factual Question\n   - Description: The question asks for a straightforward fact or piece of information.\n   - Characteristics:\n     - The answer can likely be found stated directly in a single passage of a relevant document.\n     - Further breakdown of the question is unlikely to be beneficial.\n   - Examples: \n     - \"What year did World War 2 end?\"\n     - \"What is the capital of France?\"\n     - \"What are the specifications of product X?\"\n\n2. Complex, Multi-part Question\n   - Description: The question involves multiple components or asks for information about several related topics.\n   - Characteristics:\n     - Different parts of the question may need to be answered by separate passages or documents.\n     - Breaking the question down into sub-questions for each component can yield better results.\n     - The question is open-ended and may have a complex or nuanced answer.\n     - Answering may require synthesizing information from multiple sources.\n     - There may not be a single definitive answer and could require analysis from multiple angles.\n   - Examples:\n     - \"What were the key causes, major battles, and outcomes of the American Revolutionary War?\"\n     - \"How do electric cars work and how do they compare to gas-powered vehicles?\"\n\nOutput Requirement:\nRespond with one of two categories \"complex\" or \"simple\". Do not add anything else to the response.\nUse lower case and don't add . at the end of the response.\n\n<Example>\nUser:\n    What are the benefits of renewable energy?\n\nAssistant:\n    complex"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/generate_qna_long_multi_context.txt",
    "content": "Your task is to create a question and answer pair from provided pieces of documents. You will be given chunks of documents and must follow specific rules while generating a question.\n\nRules for Crafting Questions:\n1. The question should be understandable and meaningful on its own, without needing the document chunks for context.\n2. The question must require information from at least two different chunks for a comprehensive answer.\n3. Avoid including direct references or links to the text within your question.\n4. Ensure the question is complex and demonstrates a deep understanding of the document chunks.\n5. Do not use phrases that refer back to this task or its instructions, such as \"in the provided chunk\".\n6. Make the question as complex as you can, don't resort to simple questions.\n7. Analyze if generated question can be answered without contexts. If so, change the question.\n\nRules for Crafting Answers:\n1. The answer should stand alone without referring back to the document or containing document-specific citations, like \"Table 2.1\".\n2. 
The answer must directly address the question, fully encapsulated within the provided chunks.\n\nTask Execution:\n - Start by analyzing the provided chunks to identify key themes, relationships, and insights across multiple contexts.\n - Construct a question that connects information from at least two chunks, ensuring it meets the criteria for complexity and independence.\n - Develop a comprehensive answer based on your analysis, directly addressing the question without referring back to the chunks.\n\nExpected Response:\n - Begin with reasoning steps, highlighting important things and logic behind a potential question.\n - In reasoning steps mention, which contexts are being used to generate the question.\n - Propose a potential question\n - Evaluate the question against question generation rules.\n - If question does not comply with one or more rules, propose a modification of the question\n - Check modified question against rules\n - If the question does not comply, repeat the previous two steps\n - If the question complies, generate an answer and check it against answer generation rules.\n - Provide your question-and-answer pair, clearly separated from the thought process.\n - Do not use characters \" or ' in question or answer since it will break JSON formatting!\nThis revision aims to preserve the original intent of your prompt while enhancing clarity, specificity, and engagement. 
The example is aligned with the task requirements, demonstrating how to interlink information from multiple contexts to formulate a complex question and a comprehensive answer.\nRespond with the following structure:\n<analytical process>\n${separator}\n{\"question\": <question>, \"answer\": <answer>}\n\nIn this structure <analytical process> is your chain-of-thought reasoning, <question> is a generated question, <answer> is a generated answer.\n\n<Example 1>:\nUser:\n    Chunks:\n    Context 1: Discusses the population growth in City A due to its expanding technology sector, noting an increase to 1.5 million residents as per the 2020 census.\n    Context 2: Contrasts with City B, where the population has remained stable at 1 million, attributed to its consistent but unexpanding manufacturing base.\n\nAssistant:\n    Reasoning Steps:\n    1. Highlight City A's significant population growth linked to the tech industry.\n    2. Note the stability in City B's population due to a steady manufacturing sector.\n\n    Question: \"In the context provided above, how does City A's tech-driven population boom contrast with City B's stable manufacturing-dependent demographic in shaping their urban planning?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and related to the content.\n    - Rule 2: Integrates information from both contexts.\n    - Rule 3: No direct references are made to the chunks.\n    - Rule 4: Shows an understanding of the economic-demographic relationship.\n    - Rule 5: Violated. 
The question explicitly refers to \"the context provided above,\" tying it directly to the task setup.\n    - Rule 6: The complexity of the urban-economic interplay is addressed.\n    - Rule 7: The question requires specific context to answer, hence not violated.\n\n    Modification to Comply with Rule 5:\n    Remove any reference to \"the context provided above\" to ensure the question stands alone and does not refer back to the task or instructions.\n\n    Modified Question: \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\"\n\n    Revised Analysis for Compliance:\n    - Rule 1: The question remains clear and understandable.\n    - Rule 2: Still requires integration of both contexts for a full answer.\n    - Rule 3: Maintains no direct chunk references.\n    - Rule 4: Continues to demonstrate deep understanding.\n    - Rule 5: Now complies by not referring back to the task or its context.\n    - Rule 6: The complexity is preserved in discussing the impact on urban planning.\n    - Rule 7: Specific context is still needed for an accurate answer, so not violated.\n\n    ${separator}\n    {\"question\": \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\", \"answer\": \"City A's significant population increase, driven by the booming tech industry, necessitates urban planning strategies that focus on expanding residential and technological infrastructure to accommodate the growing workforce. 
In contrast, City B, with its stable population anchored by a longstanding manufacturing sector, might prioritize urban planning efforts towards sustaining and modestly enhancing existing industrial and residential areas to support its steady economic base.\"}\n</Example 1>\n\n<Example 2>:\nUser:\n    Chunks:\n    Context 1: Details how the increasing adoption of remote work is leading to a significant decrease in demand for office spaces within city centers, with a noted 20% decline in office rentals over the past year.\n    Context 2: Highlights the shift in consumer behavior towards online shopping, resulting in a 30% increase in e-commerce sales and prompting the repurposing of traditional retail spaces into distribution hubs and experiential centers.\n\nAssistant:\n    Reasoning Steps:\n    1. Link the specific decrease in demand for office spaces due to remote work adoption to the broader implications for commercial real estate.\n    2. Connect the increase in e-commerce sales with the transformation of retail spaces, emphasizing the specific percentages to underscore the significant shifts.\n    3. 
Formulate a question that explicitly requires an understanding of these specific changes and their combined effect on urban development and space utilization.\n\n    Question: \"With a 20% decrease in office space rentals and a 30% increase in e-commerce sales over the past year, how are cities adapting urban development strategies to repurpose commercial and retail spaces?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and addresses the specific changes in office and retail space usage due to remote work and e-commerce growth.\n    - Rule 2: It integrates data from both contexts to provide a comprehensive view of the urban adaptation required.\n    - Rule 3: Avoids direct references to document chunks while using specific data to anchor the question.\n    - Rule 4: Demonstrates an in-depth understanding of the implications of these trends on urban planning.\n    - Rule 5: The question doesn't refer back to the task or instructions.\n    - Rule 6: It's sufficiently complex, considering specific data and the need for strategic urban repurposing.\n    - Rule 7: The inclusion of specific data points (20% decrease, 30% increase) ensures the question cannot be answered without the provided contexts, thus adhering to Rule 7.\n\n    ${separator}\n    {\"question\": \"With a 20% decrease in office space rentals and a 30% increase in e-commerce sales over the past year, how are cities adapting urban development strategies to repurpose commercial and retail spaces?\", \"answer\": \"In response to the significant shifts in workplace and shopping behaviors, cities are reevaluating their urban development strategies to accommodate the new landscape. The marked decrease in office space demand has prompted a reimagining of city centers, with a focus on converting underutilized office buildings into residential units, co-working spaces, or community centers. 
Simultaneously, the surge in e-commerce has transformed traditional retail locations into distribution hubs or experiential centers, catering to the new consumer preferences. These adaptations reflect a broader move towards flexible, mixed-use urban environments that can respond dynamically to changing economic and social trends.\"}\n</Example 2>\n\n<Example 3>:\nUser:\n    Chunks:\n    Context 1: Describes the innovative approaches to waste management in City X, focusing on recycling programs and the conversion of waste to energy.\n    Context 2: Highlights the advancements in public transportation in City Y, including the introduction of electric buses and expanded bike-sharing systems.\n    Context 3: Discusses the challenges faced by City Z's educational system, particularly the lack of resources and overcrowded classrooms.\n    Context 4: Details the implementation of smart city technologies in City W, such as IoT sensors for monitoring traffic and air quality.\n    Context 5: Explores the revival of local agriculture in City V, emphasizing community gardens and urban farms' role in enhancing food security.\n\nAssistant:\n    Reasoning Steps:\n    1. Connect City X's waste management innovations and City Y's public transportation advancements as key contributors to environmental sustainability.\n    2. Evaluate the role of smart city technologies in City W in complementing these sustainability efforts, particularly in terms of traffic and air quality management.\n    3. 
Deliberately exclude the educational challenges in City Z and the agricultural revival in City V to focus on urban sustainability and technology.\n\n    Potential Question: \"How do the waste management and public transportation initiatives in Cities X and Y, along with the smart city technologies in City W, contribute to establishing a new paradigm for urban living quality?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is independently meaningful, addressing urban sustainability and technology's impact on living quality.\n    - Rule 2: Requires integration of information from contexts about waste management, public transportation, and smart city technologies.\n    - Rule 3: Does not directly reference the document chunks or specific data points.\n    - Rule 4: Demonstrates deep understanding by linking sustainability efforts with technological advancements in urban settings.\n    - Rule 5: Avoids mentioning the task or instructions.\n    - Rule 6: The question's complexity arises from connecting multiple urban initiatives across different cities.\n    - Rule 7: Without specifics from the contexts, the question might still be answerable through general knowledge on urban sustainability.\n\n    Given the potential compliance issue with Rule 7, the question needs refinement to explicitly necessitate the provided contexts.\n\n    Modified Question: \"Considering City X's waste-to-energy programs, City Y's adoption of electric buses, and City W's IoT-based traffic and air quality monitoring, how are these specific initiatives redefining standards for urban living quality?\"\n\n    Revised Analysis for Rule 7:\n    - By specifying the initiatives, the question now directly ties to the unique context provided, making it less likely to be answered accurately without this specific information.\n\n    ${separator}\n    {\"question\": \"Considering City X's waste-to-energy programs, City Y's adoption of electric buses, and City W's IoT-based 
traffic and air quality monitoring, how are these specific initiatives redefining standards for urban living quality?\", \"answer\": \"The integration of City X's waste-to-energy programs and City Y's electric buses represents a significant step towards reducing urban pollution and carbon footprint, thereby enhancing environmental sustainability. Coupled with City W's implementation of IoT technologies for real-time traffic and air quality management, these initiatives collectively contribute to a substantial improvement in urban living standards. They not only ensure a cleaner and more efficient urban environment but also demonstrate the potential of combining green technologies and smart city solutions to create more livable, sustainable cities for future generations.\"}\n</Example 3>\n\n<Example 4>:\nUser:\n    Chunks:\n    Context 1: Explores the correlation between urban population density and the usage of public transportation systems.\n    Context 2: Presents data on the average daily water consumption per capita in various urban areas, with specific figures for cities like Metroville (350 liters), Aquacity (300 liters), Rapidsville (250 liters), Streamtown (400 liters), and Lakecity (450 liters).\n    Context 3: Reviews the impact of water conservation campaigns on reducing average water usage in cities, with some cities reporting up to a 20% reduction in per capita water consumption following targeted campaigns.\n\nAssistant:\n    Reasoning Steps:\n    1. Correlate the specific figures of water consumption with urban population density to understand the underlying patterns.\n    2. 
Assess the influence of targeted water conservation campaigns on these consumption patterns, especially considering the reported reductions.\n\n    Potential Question: \"How does the interplay between urban population density and targeted water conservation campaigns affect per capita water consumption in cities like Metroville and Lakecity?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and addresses the relationship between population density, conservation efforts, and water consumption.\n    - Rule 2: Requires synthesizing data on water consumption and the impact of conservation campaigns.\n    - Rule 3: Avoids mentioning \"tabular data\" or other document-specific references, making it compliant.\n    - Rule 4: Demonstrates a complex understanding of environmental and urban planning issues.\n    - Rule 5: Does not refer back to the task or instructions.\n    - Rule 6: It's complex, requiring an analysis of data and conservation impact.\n    - Rule 7: The specific mention of cities and the effect of conservation campaigns ensure the question can't be answered without the provided contexts.\n\n    Since the question now complies with all the rules, we can proceed to generate the answer.\n\n    ${separator}\n    {\"question\": \"How does the interplay between urban population density and targeted water conservation campaigns affect per capita water consumption in cities like Metroville and Lakecity?\", \"answer\": \"In cities with high urban population densities like Metroville and Lakecity, per capita water consumption tends to be higher due to the concentrated demand. However, the implementation of targeted water conservation campaigns has been shown to significantly reduce these consumption rates. For instance, cities that have introduced such campaigns report up to a 20% reduction in water usage per capita. 
This suggests that effective conservation efforts, tailored to the specific needs and challenges of densely populated urban areas, can mitigate the higher water demands typically associated with such locales.\"}\n</Example 4>\n\nThe following example shows how to modify the question in case it doesn't comply with the rules.\n<Example 5>:\nUser:\n    Chunks:\n    Context 1: Analyzes a recent study which found that in urban areas with populations over 500,000, the introduction of bike-sharing programs led to a 15% decrease in car usage during peak traffic hours.\n    Context 2: Reviews data from a national health survey indicating that cities with extensive bike-sharing programs saw a 10% increase in reported physical activity levels among residents.\n    Context 3: Discusses a policy initiative in several large cities aiming to expand green spaces and cycling infrastructure by 30% over the next five years, in response to the positive outcomes of bike-sharing programs.\n\nAssistant:\n    Reasoning Steps:\n    1. Correlate the reduction in car usage due to bike-sharing programs with the increase in physical activity among urban residents.\n    2. 
Consider the policy implications and future initiatives inspired by the success of bike-sharing programs.\n\n    Potential Question: \"How does the reduction in car usage attributed to bike-sharing programs correlate with the increase in physical activity, and what future urban planning initiatives are being considered in response?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear, addressing the relationship between bike-sharing programs, car usage, and physical activity.\n    - Rule 2: Integrates specific data from the study on car usage, health survey data on physical activity, and policy initiatives.\n    - Rule 3: Avoids direct references to the document chunks.\n    - Rule 4: Demonstrates an in-depth understanding of the interconnected impacts of bike-sharing.\n    - Rule 5: No reference to the task or instructions.\n    - Rule 6: Complex, linking data-driven insights with urban planning futures.\n    - Rule 7: Analysis indicates that the question might still be answerable with general knowledge on bike-sharing benefits and urban policies.\n\n    Given the potential compliance issue with Rule 7, the question needs more specificity to necessitate the provided contexts.\n\n    Modified Question: \"Given the 15% decrease in car usage during peak hours in cities with bike-sharing and a 10% increase in physical activity levels, what specific urban planning strategies are being formulated to capitalize on these trends, particularly the 30% expansion in green spaces and cycling infrastructure?\"\n\n    Revised Analysis for Rule 7:\n    - This question now specifically requires the contextual data provided, making it unlikely to be answered accurately without the detailed contexts.\n\n    ${separator}\n    {\"question\": \"Given the 15% decrease in car usage during peak hours in cities with bike-sharing and a 10% increase in physical activity levels, what specific urban planning strategies are being formulated to capitalize on 
these trends, particularly the 30% expansion in green spaces and cycling infrastructure?\", \"answer\": \"The observed 15% reduction in car usage and the corresponding 10% increase in physical activity in cities with bike-sharing programs underscore the programs' effectiveness in promoting sustainable transportation and healthier lifestyles. In response, urban planners are formulating strategies to further encourage these trends, including a significant 30% expansion in green spaces and cycling infrastructure over the next five years. These initiatives aim not only to enhance the urban environment and resident well-being but also to sustain the momentum towards more eco-friendly and active urban lifestyles.\"}\n</Example 5>\n\nUSE ${separator} TO SEPARATE ANALYTICAL PROCESS FROM THE RESULTING JSON"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/generate_qna_long_single_context.txt",
    "content": "Your task is to create a question and answer pair from provided pieces of documents. You will be given chunks of documents and must follow specific rules while generating a question.\n\nRules for Crafting Questions:\n1. The question should be understandable and meaningful on its own, without needing the document chunks for context.\n2. Avoid including direct references or links to the text within your question.\n3. Ensure the question is complex and demonstrates a deep understanding of the document chunks.\n4. Do not use phrases that refer back to this task or its instructions, such as \"in the provided chunk\".\n5. Make the question as complex as you can, don't resort to simple questions.\n6. Analyze if generated question can be answered without contexts. If so, change the question.\n\nRules for Crafting Answers:\n1. The answer should stand alone without referring back to the document or containing document-specific citations, like \"Table 2.1\".\n2. The answer must directly address the question, fully encapsulated within the provided chunks.\n\nTask Execution:\n - Start by analyzing the provided chunks to identify key themes, relationships, and insights across multiple contexts.\n - Construct a question that connects information from at least two chunks, ensuring it meets the criteria for complexity and independence.\n - Develop a comprehensive answer based on your analysis, directly addressing the question without referring back to the chunks.\n\nExpected Response:\n - Begin with reasoning steps, highlighting important things and logic behind a potential question.\n - In reasoning steps mention, which contexts are being used to generate the question.\n - Propose a potential question\n - Evaluate the question against question generation rules.\n - If question does not comply with one or more rules, propose a modification of the question\n - Check modified question against rules\n - If the question does not comply, repeat the previous two steps\n - If 
the question complies, generate an answer and check it against answer generation rules.\n - Provide your question-and-answer pair, clearly separated from the thought process.\n - Do not use characters \" or ' in question or answer since it will break JSON formatting!\nThis revision aims to preserve the original intent of your prompt while enhancing clarity, specificity, and engagement. The example is aligned with the task requirements, demonstrating how to interlink information from multiple contexts to formulate a complex question and a comprehensive answer.\n\nRespond with the following structure:\n<analytical process>\n${separator}\n{\"question\": <question>, \"answer\": <answer>}\n\nIn this structure <analytical process> is your chain-of-thought reasoning, <question> is a generated question, <answer> is a generated answer.\n\n<Example 1>:\nUser:\n    Context: \n    Discusses the population growth in City A due to its expanding technology sector, noting an increase to 1.5 million residents as per the 2020 census.\n    Contrasts with City B, where the population has remained stable at 1 million, attributed to its consistent but unexpanding manufacturing base.\n\nAssistant:\n    Reasoning Steps:\n    1. Highlight City A's significant population growth linked to the tech industry.\n    2. Note the stability in City B's population due to a steady manufacturing sector.\n\n    Question: \"In the context provided above, how does City A's tech-driven population boom contrast with City B's stable manufacturing-dependent demographic in shaping their urban planning?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and related to the content.\n    - Rule 2: No direct references are made to the chunks.\n    - Rule 3: Shows an understanding of the economic-demographic relationship.\n    - Rule 4: Violated. 
The question explicitly refers to \"the context provided above,\" tying it directly to the task setup.\n    - Rule 5: The complexity of the urban-economic interplay is addressed.\n    - Rule 6: The question requires specific context to answer, hence not violated.\n\n    Modification to Comply with Rule 4:\n    Remove any reference to \"the context provided above\" to ensure the question stands alone and does not refer back to the task or instructions.\n\n    Modified Question: \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\"\n\n    Revised Analysis for Compliance:\n    - Rule 1: The question remains clear and understandable.\n    - Rule 2: Maintains no direct chunk references.\n    - Rule 3: Continues to demonstrate deep understanding.\n    - Rule 4: Now complies by not referring back to the task or its context.\n    - Rule 5: The complexity is preserved in discussing the impact on urban planning.\n    - Rule 6: Specific context is still needed for an accurate answer, so not violated.\n\n    ${separator}\n    {\"question\": \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\", \"answer\": \"City A's significant population increase, driven by the booming tech industry, necessitates urban planning strategies that focus on expanding residential and technological infrastructure to accommodate the growing workforce. 
In contrast, City B, with its stable population anchored by a longstanding manufacturing sector, might prioritize urban planning efforts towards sustaining and modestly enhancing existing industrial and residential areas to support its steady economic base.\"}\n</Example 1>\n\n<Example 2>:\nUser:\n    Context:\n    Details how the increasing adoption of remote work is leading to a significant decrease in demand for office spaces within city centers, with a noted 20% decline in office rentals over the past year.\n    Highlights the shift in consumer behavior towards online shopping, resulting in a 30% increase in e-commerce sales and prompting the repurposing of traditional retail spaces into distribution hubs and experiential centers.\n\nAssistant:\n    Reasoning Steps:\n    1. Link the specific decrease in demand for office spaces due to remote work adoption to the broader implications for commercial real estate.\n    2. Connect the increase in e-commerce sales with the transformation of retail spaces, emphasizing the specific percentages to underscore the significant shifts.\n    3. 
Formulate a question that explicitly requires an understanding of these specific changes and their combined effect on urban development and space utilization.\n\n    Question: \"With a 20% decrease in office space rentals and a 30% increase in e-commerce sales over the past year, how are cities adapting urban development strategies to repurpose commercial and retail spaces?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and addresses the specific changes in office and retail space usage due to remote work and e-commerce growth.\n    - Rule 2: Avoids direct references to document chunks while using specific data to anchor the question.\n    - Rule 3: Demonstrates an in-depth understanding of the implications of these trends on urban planning.\n    - Rule 4: The question doesn't refer back to the task or instructions.\n    - Rule 5: It's sufficiently complex, considering specific data and the need for strategic urban repurposing.\n    - Rule 6: The inclusion of specific data points (20% decrease, 30% increase) ensures the question cannot be answered without the provided contexts, thus adhering to Rule 6.\n\n    ${separator}\n    {\"question\": \"With a 20% decrease in office space rentals and a 30% increase in e-commerce sales over the past year, how are cities adapting urban development strategies to repurpose commercial and retail spaces?\", \"answer\": \"In response to the significant shifts in workplace and shopping behaviors, cities are reevaluating their urban development strategies to accommodate the new landscape. The marked decrease in office space demand has prompted a reimagining of city centers, with a focus on converting underutilized office buildings into residential units, co-working spaces, or community centers. Simultaneously, the surge in e-commerce has transformed traditional retail locations into distribution hubs or experiential centers, catering to the new consumer preferences. 
These adaptations reflect a broader move towards flexible, mixed-use urban environments that can respond dynamically to changing economic and social trends.\"}\n</Example 2>\n\n<Example 3>:\nUser:\n    Context:\n    Describes the innovative approaches to waste management in City X, focusing on recycling programs and the conversion of waste to energy.\n    Highlights the advancements in public transportation in City Y, including the introduction of electric buses and expanded bike-sharing systems.\n    Discusses the challenges faced by City Z's educational system, particularly the lack of resources and overcrowded classrooms.\n    Details the implementation of smart city technologies in City W, such as IoT sensors for monitoring traffic and air quality.\n    Explores the revival of local agriculture in City V, emphasizing community gardens and urban farms' role in enhancing food security.\n\nAssistant:\n    Reasoning Steps:\n    1. Connect City X's waste management innovations and City Y's public transportation advancements as key contributors to environmental sustainability.\n    2. Evaluate the role of smart city technologies in City W in complementing these sustainability efforts, particularly in terms of traffic and air quality management.\n    3. 
Deliberately exclude the educational challenges in City Z and the agricultural revival in City V to focus on urban sustainability and technology.\n\n    Potential Question: \"How do the waste management and public transportation initiatives in Cities X and Y, along with the smart city technologies in City W, contribute to establishing a new paradigm for urban living quality?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is independently meaningful, addressing urban sustainability and technology's impact on living quality.\n    - Rule 2: Does not directly reference the document chunks or specific data points.\n    - Rule 3: Demonstrates deep understanding by linking sustainability efforts with technological advancements in urban settings.\n    - Rule 4: Avoids mentioning the task or instructions.\n    - Rule 5: The question's complexity arises from connecting multiple urban initiatives across different cities.\n    - Rule 6: Without specifics from the contexts, the question might still be answerable through general knowledge on urban sustainability.\n\n    Given the potential compliance issue with Rule 6, the question needs refinement to explicitly necessitate the provided contexts.\n\n    Modified Question: \"Considering City X's waste-to-energy programs, City Y's adoption of electric buses, and City W's IoT-based traffic and air quality monitoring, how are these specific initiatives redefining standards for urban living quality?\"\n\n    Revised Analysis for Rule 6:\n    - By specifying the initiatives, the question now directly ties to the unique context provided, making it less likely to be answered accurately without this specific information.\n\n    ${separator}\n    {\"question\": \"Considering City X's waste-to-energy programs, City Y's adoption of electric buses, and City W's IoT-based traffic and air quality monitoring, how are these specific initiatives redefining standards for urban living quality?\", \"answer\": \"The 
integration of City X's waste-to-energy programs and City Y's electric buses represents a significant step towards reducing urban pollution and carbon footprint, thereby enhancing environmental sustainability. Coupled with City W's implementation of IoT technologies for real-time traffic and air quality management, these initiatives collectively contribute to a substantial improvement in urban living standards. They not only ensure a cleaner and more efficient urban environment but also demonstrate the potential of combining green technologies and smart city solutions to create more livable, sustainable cities for future generations.\"}\n</Example 3>\n\n<Example 4>:\nUser:\n    Context:\n    Explores the correlation between urban population density and the usage of public transportation systems.\n    Presents data on the average daily water consumption per capita in various urban areas, with specific figures for cities like Metroville (350 liters), Aquacity (300 liters), Rapidsville (250 liters), Streamtown (400 liters), and Lakecity (450 liters).\n    Reviews the impact of water conservation campaigns on reducing average water usage in cities, with some cities reporting up to a 20% reduction in per capita water consumption following targeted campaigns.\n\nAssistant:\n    Reasoning Steps:\n    1. Correlate the specific figures of water consumption with urban population density to understand the underlying patterns.\n    2. 
Assess the influence of targeted water conservation campaigns on these consumption patterns, especially considering the reported reductions.\n\n    Potential Question: \"How does the interplay between urban population density and targeted water conservation campaigns affect per capita water consumption in cities like Metroville and Lakecity?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and addresses the relationship between population density, conservation efforts, and water consumption.\n    - Rule 2: Avoids mentioning \"tabular data\" or other document-specific references, making it compliant.\n    - Rule 3: Demonstrates a complex understanding of environmental and urban planning issues.\n    - Rule 4: Does not refer back to the task or instructions.\n    - Rule 5: It's complex, requiring an analysis of data and conservation impact.\n    - Rule 6: The specific mention of cities and the effect of conservation campaigns ensure the question can't be answered without the provided contexts.\n\n    Since the question now complies with all the rules, we can proceed to generate the answer.\n\n    ${separator}\n    {\"question\": \"How does the interplay between urban population density and targeted water conservation campaigns affect per capita water consumption in cities like Metroville and Lakecity?\", \"answer\": \"In cities with high urban population densities like Metroville and Lakecity, per capita water consumption tends to be higher due to the concentrated demand. However, the implementation of targeted water conservation campaigns has been shown to significantly reduce these consumption rates. For instance, cities that have introduced such campaigns report up to a 20% reduction in water usage per capita. 
This suggests that effective conservation efforts, tailored to the specific needs and challenges of densely populated urban areas, can mitigate the higher water demands typically associated with such locales.\"}\n</Example 4>\n\nThe following example shows how to modify the question in case it doesn't comply with the rules.\n<Example 5>:\nUser:\n    Context:\n    Analyzes a recent study which found that in urban areas with populations over 500,000, the introduction of bike-sharing programs led to a 15% decrease in car usage during peak traffic hours.\n    Reviews data from a national health survey indicating that cities with extensive bike-sharing programs saw a 10% increase in reported physical activity levels among residents.\n    Discusses a policy initiative in several large cities aiming to expand green spaces and cycling infrastructure by 30% over the next five years, in response to the positive outcomes of bike-sharing programs.\n\nAssistant:\n    Reasoning Steps:\n    1. Correlate the reduction in car usage due to bike-sharing programs with the increase in physical activity among urban residents.\n    2. 
Consider the policy implications and future initiatives inspired by the success of bike-sharing programs.\n\n    Potential Question: \"How does the reduction in car usage attributed to bike-sharing programs correlate with the increase in physical activity, and what future urban planning initiatives are being considered in response?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear, addressing the relationship between bike-sharing programs, car usage, and physical activity.\n    - Rule 2: Avoids direct references to the document chunks.\n    - Rule 3: Demonstrates an in-depth understanding of the interconnected impacts of bike-sharing.\n    - Rule 4: No reference to the task or instructions.\n    - Rule 5: Complex, linking data-driven insights with urban planning futures.\n    - Rule 6: Analysis indicates that the question might still be answerable with general knowledge on bike-sharing benefits and urban policies.\n\n    Given the potential compliance issue with Rule 6, the question needs more specificity to necessitate the provided contexts.\n\n    Modified Question: \"Given the 15% decrease in car usage during peak hours in cities with bike-sharing and a 10% increase in physical activity levels, what specific urban planning strategies are being formulated to capitalize on these trends, particularly the 30% expansion in green spaces and cycling infrastructure?\"\n\n    Revised Analysis for Rule 6:\n    - This question now specifically requires the contextual data provided, making it unlikely to be answered accurately without the detailed contexts.\n\n    ${separator}\n    {\"question\": \"Given the 15% decrease in car usage during peak hours in cities with bike-sharing and a 10% increase in physical activity levels, what specific urban planning strategies are being formulated to capitalize on these trends, particularly the 30% expansion in green spaces and cycling infrastructure?\", \"answer\": \"The observed 15% reduction in 
car usage and the corresponding 10% increase in physical activity in cities with bike-sharing programs underscore the programs' effectiveness in promoting sustainable transportation and healthier lifestyles. In response, urban planners are formulating strategies to further encourage these trends, including a significant 30% expansion in green spaces and cycling infrastructure over the next five years. These initiatives aim not only to enhance the urban environment and resident well-being but also to sustain the momentum towards more eco-friendly and active urban lifestyles.\"}\n</Example 5>\n\nUSE ${separator} TO SEPARATE ANALYTICAL PROCESS FROM THE RESULTING JSON"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/generate_qna_short_multi_context.txt",
    "content": "Your task is to create a question and answer pair from provided pieces of documents. You will be given chunks of documents and must follow specific rules while generating a question.\n\nRules for Crafting Questions:\n1. The question should be understandable and meaningful on its own, without needing the document chunks for context.\n2. The question must require information from at least two different chunks for a comprehensive answer.\n3. Avoid including direct references or links to the text within your question.\n4. Ensure the question is complex and demonstrates a deep understanding of the document chunks.\n5. Do not use phrases that refer back to this task or its instructions, such as \"in the provided chunk\".\n6. Make the question as complex as you can, don't resort to simple questions.\n7. Analyze if generated question can be answered without contexts. If so, change the question.\n\nRules for Crafting Answers:\n1. The answer should stand alone without referring back to the document or containing document-specific citations, like \"Table 2.1\".\n2. 
The answer must directly address the question, fully encapsulated within the provided chunks.\n\nTask Execution:\n - Start by analyzing the provided chunks to identify key themes, relationships, and insights across multiple contexts.\n - Construct a question that connects information from at least two chunks, ensuring it meets the criteria for complexity and independence.\n - Develop a comprehensive answer based on your analysis, directly addressing the question without referring back to the chunks.\n\nExpected Response:\n - Begin with reasoning steps, highlighting important things and logic behind a potential question.\n - In reasoning steps, mention which contexts are being used to generate the question.\n - Propose a potential question\n - Evaluate the question against question generation rules.\n - If question does not comply with one or more rules, propose a modification of the question\n - Check modified question against rules\n - If the question does not comply, repeat previous two steps\n - If the question complies, generate an answer and check it against answer generation rules.\n - Provide your question-and-answer pair, clearly separated from the thought process.\n - Do not use characters \" or ' in question or answer since it will break JSON formatting!\nThis revision aims to preserve the original intent of your prompt while enhancing clarity, specificity, and engagement. 
The example is aligned with the task requirements, demonstrating how to interlink information from multiple contexts to formulate a complex question and a comprehensive answer.\nRespond with the following structure:\n<analytical process>\n${separator}\n{\"question\": <question>, \"answer\": <answer>}\n\nIn this structure <analytical process> is your chain-of-thought reasoning, <question> is a generated question, <answer> is a generated answer.\n\n<Example>:\nUser:\n    Chunks:\n    Context 1: Discusses the population growth in City A due to its expanding technology sector, noting an increase to 1.5 million residents as per the 2020 census.\n    Context 2: Contrasts with City B, where the population has remained stable at 1 million, attributed to its consistent but unexpanding manufacturing base.\n\nAssistant:\n    Reasoning Steps:\n    1. Highlight City A's significant population growth linked to the tech industry.\n    2. Note the stability in City B's population due to a steady manufacturing sector.\n\n    Question: \"In the context provided above, how does City A's tech-driven population boom contrast with City B's stable manufacturing-dependent demographic in shaping their urban planning?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and related to the content.\n    - Rule 2: Integrates information from both contexts.\n    - Rule 3: No direct references are made to the chunks.\n    - Rule 4: Shows an understanding of the economic-demographic relationship.\n    - Rule 5: Violated. 
The question explicitly refers to \"the context provided above,\" tying it directly to the task setup.\n    - Rule 6: The complexity of the urban-economic interplay is addressed.\n    - Rule 7: The question requires specific context to answer, hence not violated.\n\n    Modification to Comply with Rule 5:\n    Remove any reference to \"the context provided above\" to ensure the question stands alone and does not refer back to the task or instructions.\n\n    Modified Question: \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\"\n\n    Revised Analysis for Compliance:\n    - Rule 1: The question remains clear and understandable.\n    - Rule 2: Still requires integration of both contexts for a full answer.\n    - Rule 3: Maintains no direct chunk references.\n    - Rule 4: Continues to demonstrate deep understanding.\n    - Rule 5: Now complies by not referring back to the task or its context.\n    - Rule 6: The complexity is preserved in discussing the impact on urban planning.\n    - Rule 7: Specific context is still needed for an accurate answer, so not violated.\n\n    ${separator}\n    {\"question\": \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\", \"answer\": \"City A's significant population increase, driven by the booming tech industry, necessitates urban planning strategies that focus on expanding residential and technological infrastructure to accommodate the growing workforce. 
In contrast, City B, with its stable population anchored by a longstanding manufacturing sector, might prioritize urban planning efforts towards sustaining and modestly enhancing existing industrial and residential areas to support its steady economic base.\"}\n</Example>\n\nUSE ${separator} TO SEPARATE ANALYTICAL PROCESS FROM THE RESULTING JSON"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/generate_qna_short_single_context.txt",
    "content": "Your task is to create a question and answer pair from provided pieces of documents. You will be given chunks of documents and must follow specific rules while generating a question.\n\nRules for Crafting Questions:\n1. The question should be understandable and meaningful on its own, without needing the document chunks for context.\n2. Avoid including direct references or links to the text within your question.\n3. Ensure the question is complex and demonstrates a deep understanding of the document chunks.\n4. Do not use phrases that refer back to this task or its instructions, such as \"in the provided chunk\".\n5. Make the question as complex as you can, don't resort to simple questions.\n6. Analyze if generated question can be answered without contexts. If so, change the question.\n\nRules for Crafting Answers:\n1. The answer should stand alone without referring back to the document or containing document-specific citations, like \"Table 2.1\".\n2. The answer must directly address the question, fully encapsulated within the provided chunks.\n\nTask Execution:\n - Start by analyzing the provided chunks to identify key themes, relationships, and insights across multiple contexts.\n - Construct a question that connects information from at least two chunks, ensuring it meets the criteria for complexity and independence.\n - Develop a comprehensive answer based on your analysis, directly addressing the question without referring back to the chunks.\n\nExpected Response:\n - Begin with reasoning steps, highlighting important things and logic behind a potential question.\n - In reasoning steps mention, which contexts are being used to generate the question.\n - Propose a potential question\n - Evaluate the question against question generation rules.\n - If question does not comply with one or more rules, propose a modification of the question\n - Check modified question against rules\n - If the question does not comply, repeat prevuious two steps\n - If 
the question complies, generate an answer and check it against answer generation rules.\n - Provide your question-and-answer pair, clearly separated from the thought process.\n - Do not use characters \" or ' in question or answer since it will break JSON formatting!\nThis revision aims to preserve the original intent of your prompt while enhancing clarity, specificity, and engagement. The example is aligned with the task requirements, demonstrating how to interlink information from multiple contexts to formulate a complex question and a comprehensive answer.\n\nRespond with the following structure:\n<analytical process>\n${separator}\n{\"question\": <question>, \"answer\": <answer>}\n\nIn this structure <analytical process> is your chain-of-thought reasoning, <question> is a generated question, <answer> is a generated answer.\n\n<Example>:\nUser:\n    Context: \n    Discusses the population growth in City A due to its expanding technology sector, noting an increase to 1.5 million residents as per the 2020 census.\n    Contrasts with City B, where the population has remained stable at 1 million, attributed to its consistent but unexpanding manufacturing base.\n\nAssistant:\n    Reasoning Steps:\n    1. Highlight City A's significant population growth linked to the tech industry.\n    2. Note the stability in City B's population due to a steady manufacturing sector.\n\n    Question: \"In the context provided above, how does City A's tech-driven population boom contrast with City B's stable manufacturing-dependent demographic in shaping their urban planning?\"\n\n    Analysis if question complies with rules:\n    - Rule 1: The question is clear and related to the content.\n    - Rule 2: No direct references are made to the chunks.\n    - Rule 3: Shows an understanding of the economic-demographic relationship.\n    - Rule 4: Violated. 
The question explicitly refers to \"the context provided above,\" tying it directly to the task setup.\n    - Rule 5: The complexity of the urban-economic interplay is addressed.\n    - Rule 6: The question requires specific context to answer, hence not violated.\n\n    Modification to Comply with Rule 4:\n    Remove any reference to \"the context provided above\" to ensure the question stands alone and does not refer back to the task or instructions.\n\n    Modified Question: \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\"\n\n    Revised Analysis for Compliance:\n    - Rule 1: The question remains clear and understandable.\n    - Rule 2: Maintains no direct chunk references.\n    - Rule 3: Continues to demonstrate deep understanding.\n    - Rule 4: Now complies by not referring back to the task or its context.\n    - Rule 5: The complexity is preserved in discussing the impact on urban planning.\n    - Rule 6: Specific context is still needed for an accurate answer, so not violated.\n\n    ${separator}\n    {\"question\": \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\", \"answer\": \"City A's significant population increase, driven by the booming tech industry, necessitates urban planning strategies that focus on expanding residential and technological infrastructure to accommodate the growing workforce. In contrast, City B, with its stable population anchored by a longstanding manufacturing sector, might prioritize urban planning efforts towards sustaining and modestly enhancing existing industrial and residential areas to support its steady economic base.\"}\n</Example>\n\nUSE ${separator} TO SEPARATE ANALYTICAL PROCESS FROM THE RESULTING JSON"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/generate_qna_short_single_context_no_cot.txt",
    "content": "Your task is to create a question and answer pair from provided pieces of documents. You will be given chunks of documents and must follow specific rules while generating a question.\n\nRules for Crafting Questions:\n1. The question should be understandable and meaningful on its own, without needing the document chunks for context.\n2. Avoid including direct references or links to the text within your question.\n3. Ensure the question is complex and demonstrates a deep understanding of the document chunks.\n4. Do not use phrases that refer back to this task or its instructions, such as \"in the provided chunk\".\n5. Make the question as complex as you can, don't resort to simple questions.\n6. Analyze if generated question can be answered without contexts. If so, change the question.\n\nRules for Crafting Answers:\n1. The answer should stand alone without referring back to the document or containing document-specific citations, like \"Table 2.1\".\n2. The answer must directly address the question, fully encapsulated within the provided chunks.\n\nRESPOND WITH THE FOLLOWING JSON:\n{\"question\": <question>, \"answer\": <answer>}\n\n<Example 1>:\nUser:\n    Context: \n    Discusses the population growth in City A due to its expanding technology sector, noting an increase to 1.5 million residents as per the 2020 census.\n    Contrasts with City B, where the population has remained stable at 1 million, attributed to its consistent but unexpanding manufacturing base.\n\nAssistant:\n    {\"question\": \"How are City A's technological sector growth and City B's consistent manufacturing base differently influencing their urban planning strategies in light of their divergent population trends?\", \"answer\": \"City A's significant population increase, driven by the booming tech industry, necessitates urban planning strategies that focus on expanding residential and technological infrastructure to accommodate the growing workforce. 
In contrast, City B, with its stable population anchored by a longstanding manufacturing sector, might prioritize urban planning efforts towards sustaining and modestly enhancing existing industrial and residential areas to support its steady economic base.\"}\n</Example 1>\n\n<Example 2>:\nUser:\n    Context:\n    Details how the increasing adoption of remote work is leading to a significant decrease in demand for office spaces within city centers, with a noted 20% decline in office rentals over the past year.\n    Highlights the shift in consumer behavior towards online shopping, resulting in a 30% increase in e-commerce sales and prompting the repurposing of traditional retail spaces into distribution hubs and experiential centers.\n\nAssistant:\n    {\"question\": \"With a 20% decrease in office space rentals and a 30% increase in e-commerce sales over the past year, how are cities adapting urban development strategies to repurpose commercial and retail spaces?\", \"answer\": \"In response to the significant shifts in workplace and shopping behaviors, cities are reevaluating their urban development strategies to accommodate the new landscape. The marked decrease in office space demand has prompted a reimagining of city centers, with a focus on converting underutilized office buildings into residential units, co-working spaces, or community centers. Simultaneously, the surge in e-commerce has transformed traditional retail locations into distribution hubs or experiential centers, catering to the new consumer preferences. 
These adaptations reflect a broader move towards flexible, mixed-use urban environments that can respond dynamically to changing economic and social trends.\"}\n</Example 2>\n\n<Example 3>:\nUser:\n    Context:\n    Describes the innovative approaches to waste management in City X, focusing on recycling programs and the conversion of waste to energy.\n    Highlights the advancements in public transportation in City Y, including the introduction of electric buses and expanded bike-sharing systems.\n    Discusses the challenges faced by City Z's educational system, particularly the lack of resources and overcrowded classrooms.\n    Details the implementation of smart city technologies in City W, such as IoT sensors for monitoring traffic and air quality.\n    Explores the revival of local agriculture in City V, emphasizing community gardens and urban farms' role in enhancing food security.\n\nAssistant:\n    {\"question\": \"Considering City X's waste-to-energy programs, City Y's adoption of electric buses, and City W's IoT-based traffic and air quality monitoring, how are these specific initiatives redefining standards for urban living quality?\", \"answer\": \"The integration of City X's waste-to-energy programs and City Y's electric buses represents a significant step towards reducing urban pollution and carbon footprint, thereby enhancing environmental sustainability. Coupled with City W's implementation of IoT technologies for real-time traffic and air quality management, these initiatives collectively contribute to a substantial improvement in urban living standards. 
They not only ensure a cleaner and more efficient urban environment but also demonstrate the potential of combining green technologies and smart city solutions to create more livable, sustainable cities for future generations.\"}\n</Example 3>\n\n<Example 4>:\nUser:\n    Context:\n    Explores the correlation between urban population density and the usage of public transportation systems.\n    Presents data on the average daily water consumption per capita in various urban areas, with specific figures for cities like Metroville (350 liters), Aquacity (300 liters), Rapidsville (250 liters), Streamtown (400 liters), and Lakecity (450 liters).\n    Reviews the impact of water conservation campaigns on reducing average water usage in cities, with some cities reporting up to a 20% reduction in per capita water consumption following targeted campaigns.\n\nAssistant:\n    {\"question\": \"How does the interplay between urban population density and targeted water conservation campaigns affect per capita water consumption in cities like Metroville and Lakecity?\", \"answer\": \"In cities with high urban population densities like Metroville and Lakecity, per capita water consumption tends to be higher due to the concentrated demand. However, the implementation of targeted water conservation campaigns has been shown to significantly reduce these consumption rates. For instance, cities that have introduced such campaigns report up to a 20% reduction in water usage per capita. 
This suggests that effective conservation efforts, tailored to the specific needs and challenges of densely populated urban areas, can mitigate the higher water demands typically associated with such locales.\"}\n</Example 4>\n\nThe following example shows how to modify the question in case it doesn't comply with the rules.\n<Example 5>:\nUser:\n    Context:\n    Analyzes a recent study which found that in urban areas with populations over 500,000, the introduction of bike-sharing programs led to a 15% decrease in car usage during peak traffic hours.\n    Reviews data from a national health survey indicating that cities with extensive bike-sharing programs saw a 10% increase in reported physical activity levels among residents.\n    Discusses a policy initiative in several large cities aiming to expand green spaces and cycling infrastructure by 30% over the next five years, in response to the positive outcomes of bike-sharing programs.\n\nAssistant:\n    {\"question\": \"Given the 15% decrease in car usage during peak hours in cities with bike-sharing and a 10% increase in physical activity levels, what specific urban planning strategies are being formulated to capitalize on these trends, particularly the 30% expansion in green spaces and cycling infrastructure?\", \"answer\": \"The observed 15% reduction in car usage and the corresponding 10% increase in physical activity in cities with bike-sharing programs underscore the programs' effectiveness in promoting sustainable transportation and healthier lifestyles. In response, urban planners are formulating strategies to further encourage these trends, including a significant 30% expansion in green spaces and cycling infrastructure over the next five years. These initiatives aim not only to enhance the urban environment and resident well-being but also to sustain the momentum towards more eco-friendly and active urban lifestyles.\"}\n</Example 5>"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/llm_answer_relevance_instruction.txt",
    "content": "Generate question for the given answer.\nExample:\nUser:\n    The PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India\nAssistant:\n    When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/llm_context_precision_instruction.txt",
    "content": "Given a question and a context, verify if the information in the given context is useful in answering the question. Return a Yes/No answer.\n\n<Example 1>\nUser:\n    Context:\n    The latest software update for smartphones includes significant improvements in security protocols and user interface enhancements.\n    \n    Question:\n    What does the new software update include for smartphones?\n\nAssistant:\n    Yes\n\n<Example 2>\n\nUser:\n    Context:\n    Coffee consumption statistics in 2019 show that adults aged 25-34 are the largest group of coffee drinkers in the United States.\n    \n    Question:\n    What are the health benefits of drinking coffee?\n\nAssistant:\n    No"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/llm_context_recall_instruction.txt",
    "content": "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.\n\n<Example 1>\nUser:\n    question: What can you tell me about albert Albert Einstein?\n    context: Albert Einstein (14 March 1879 to 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called \"the world's most famous equation\". He received the 1921 Nobel Prize in Physics \"for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect\", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.\n    answer: Albert Einstein born in 14 March 1879 was  German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics \"for his services to theoretical physics. He published 4 papers in 1905.  
Einstein moved to Switzerland in 1895\n\nAssistant:\n    [\n        {  \"statement_1\":\"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.\",\n            \"reason\": \"The date of birth of Einstein is mentioned clearly in the context.\",\n            \"attributed\": \"1\"\n        },\n        {\n            \"statement_2\":\"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.\",\n            \"reason\": \"The exact sentence is present in the given context.\",\n            \"attributed\": \"1\"\n        },\n        {\n            \"statement_3\": \"He published 4 papers in 1905.\",\n            \"reason\": \"There is no mention about papers he wrote in the given context.\",\n            \"attributed\": \"0\"\n        },\n        {\n            \"statement_4\":\"Einstein moved to Switzerland in 1895.\",\n            \"reason\": \"There is no supporting evidence for this in the given context.\",\n            \"attributed\": \"0\"\n        }\n    ]\n</Example 1>\n<Example 2>\nUser:\n    question: who won 2020 icc world cup?\n    context: Who won the 2022 ICC Men's T20 World Cup? The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.\n    answer: England\nAssistant:\n    [\n        {\n            \"statement_1\":\"England won the 2022 ICC Men's T20 World Cup.\",\n            \"reason\": \"From context it is clear that England defeated Pakistan to win the World Cup.\",\n            \"attributed\": \"1\"\n        }\n    ]\n</Example 2>"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/main_instruction_long.txt",
    "content": "You provide answers to questions based solely on the information provided below.\nAnswer precisely and concisely, addressing only what is asked without extraneous details.\nIf the information needed to answer isn't available in the provided context, respond with \"I don't know.\".\nCite specific sources by filename whenever you reference data or excerpts from the provided context.\n\nInput format:\n\nContext:\n{context}\n\nQuestion:\n{question}\n\n<Example 1>:\nUser:\n    Context:\n    Sales data from the fourth quarter shows an increase in revenue from Region A, whereas Region B experienced a slight decline. (source: Q4_Sales_Report.txt)\n    \n    Question:\n    Did revenue increase in Region A in the fourth quarter?\n\nAssistant:\n    Yes, revenue in Region A increased in the fourth quarter according to the data provided in Q4_Sales_Report.txt.\n</Example 1>\n\n<Example 2>:\nUser:\n    Context:\n    The new software update includes improvements to security protocols and user interface enhancements. (source: Update_Release_Notes.txt)\n\n    Question:\n    What does the new software update include?\n\nAssistant:\n    The new software update includes improvements to security protocols and user interface enhancements, as detailed in Update_Release_Notes.txt.\n</Example 2>\n\n<Example 3>:\nUser:\n    Context:\n    Employee satisfaction has significantly improved due to recent changes in workplace policies. (source: Employee_Feedback_2023.txt)\n\n    Question:\n    What has improved due to recent changes in workplace policies?\n\nAssistant:\n    Employee satisfaction has significantly improved due to recent changes in workplace policies, as mentioned in Employee_Feedback_2023.txt.\n</Example 3>"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/main_instruction_short.txt",
    "content": "You provide answers to questions based solely on the information provided below.\nAnswer precisely and concisely, addressing only what is asked without extraneous details.\nIf the information needed to answer isn't available in the provided context, respond with \"I don't know.\".\nCite specific sources by filename whenever you reference data or excerpts from the provided context.\n\nInput format:\n\nContext:\n{context}\n\nQuestion:\n{question}"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/multiple_prompt_instruction.txt",
    "content": "Your task is to take a question as input and generate a maximum of three sub-questions that cover all aspects of the original question.\nThe output should be in strict JSON format, containing the list of sub-questions.\n\nRequirements:\n1. Analyze the original question to identify key aspects or components.\n2. Generate sub-questions that address each identified aspect.\n3. Ensure the sub-questions collectively cover the entire scope of the original question.\n4. Output the sub-questions as a JSON list with an array of the sub-questions.\n5. Produce a JSON output that is 100 percent structurally correct, with proper nesting, comma placement, and quotation marks.\n6. Format the JSON with proper indentation for readability.\n7. Ensure there is no trailing comma after the last element in the array.\n8. Generate anywhere from 2 up to 10 sub-questions.\n\n<Example 1>\nUser:\n  How does climate change affect ocean biodiversity?\n\nAssistant:\n  [\"What impact does rising ocean temperatures have on marine species?\", \"How does ocean acidification affect coral reefs and shellfish populations?\"]\n</Example 1>\n\n<Example 2>\nUser:\n  What are the key considerations when implementing AI technologies in healthcare?\n\nAssistant:\n  [\"What ethical concerns arise with the use of AI in patient care?\", \"How can AI improve diagnosis accuracy in healthcare?\", \"What are the data privacy implications of using AI in healthcare?\", \"How can AI be used to personalize patient treatment plans?\", \"What are the challenges of integrating AI with existing healthcare IT systems?\", \"How does AI impact the roles and responsibilities of healthcare professionals?\", \"What training is required for healthcare staff to effectively use AI tools?\", \"How can AI help in managing healthcare costs?\", \"What are the regulatory considerations for AI in healthcare?\", \"How can AI technologies enhance patient engagement and satisfaction?\"]\n</Example 2>\n\n<Example 3>\nUser:\n 
 What should someone consider when starting an online business?\n\nAssistant:\n  [\"What are the key legal requirements for starting an online business?\", \"How should one choose the right platform for their online business?\", \"What are effective digital marketing strategies for a new online business?\", \"How does one handle logistics and supply chain management for an online store?\", \"What customer service practices should be implemented for online businesses?\"]\n</Example 3>"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_generate_hypothetical_answer.txt",
    "content": "You are a helpful expert research assistant.\nGenerate a hypothetical answer to the given question as if it were found in a document.\nThis involves creating a detailed, expert-level response based on the context or nature of the question, even if specific data or real documents aren't available.\nYour answer should reflect what an expert might write in a relevant document or article.\n\n<Example 1>\nUser:\n    What are the potential impacts of artificial intelligence on job markets in the next decade?\nAssistant:\n    Experts predict that artificial intelligence will significantly automate tasks, potentially displacing jobs in sectors like manufacturing and customer service, while creating new opportunities in AI development and data analysis.\n\n<Example 2>\nUser:\n    How could climate change affect coastal cities by 2050?\nAssistant:\n    By 2050, climate change is expected to cause more frequent and severe flooding in coastal cities due to rising sea levels and increased storm intensity, necessitating major adaptations in urban planning and infrastructure.\n\n<Example 3>\nUser:\n    What are the latest advancements in renewable energy technologies?\nAssistant:\n    Recent advancements in renewable energy include improvements in solar panel efficiency, development of larger offshore wind turbines, and breakthroughs in battery storage technology, all contributing to more sustainable energy solutions."
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_generate_hypothetical_document.txt",
    "content": "You are a skilled research assistant capable of generating entire hypothetical documents that might contain answers to the given questions.\nCreate documents that could exist based on the context or nature of the question, simulating different formats such as detailed narratives, structured tables, or loosely structured text based on what an expert document might look like.\n\n<Example 1>\nUser:\n    Discuss the future challenges and opportunities for small businesses in emerging markets.\nAssistant:\n    This document examines both the challenges and opportunities facing small businesses in emerging markets over the next decade. Key challenges include limited access to capital and market volatility, while opportunities lie in increasing digitalization and growing consumer markets. Detailed analysis follows in the subsequent sections, providing a comprehensive overview with expert insights and projections.\n\n<Example 2>\nUser:\n    What are the current rates of deforestation in different regions of the Amazon?\nAssistant:\n    Table:\n    | Region | Deforestation Rate (sq km/year) |\n    |---------------|---------------------------------|\n    | Northern Amazon| 120 |\n    | Southern Amazon| 95 |\n    | Eastern Amazon | 110 |\n    | Western Amazon | 85 |\n    Summary: This table summarizes the annual rates of deforestation in various parts of the Amazon, highlighting significant environmental impacts.\"\n\n<Example 3>\nUser:\n    What are the latest theories in particle physics?\nAssistant:\n    New Theories in Particle Physics\n    - Lots of excitement around string theory updates.\n    - Evidence pointing to possible fifth force of nature; details still unclear.\n    - Recent experiments suggest discrepancies in the Standard Model; further investigation needed.\n    Notes: The field is rapidly evolving, with many research papers published recently. Some data is still under review, making concrete conclusions difficult at this stage."
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_generate_hypothetical_questions.txt",
    "content": "You are a helpful expert research assistant.\nWhen users ask a question, enhance their inquiry by suggesting up to five additional related questions.\nThese questions should help them delve deeper into the subject or explore various dimensions of the topic.\nEach suggested question should be concise and direct, avoiding compound sentences.\nEnsure the questions are complete, clearly formulated, and closely related to the original question.\nOutput the questions in JSON format, with each question as an item in a list.\n\n<Example>\nUser:\n    What impact does social media have on mental health?\n\nAssistant:\n    [\"How does social media usage correlate with anxiety levels?\", \"What are the effects of social media on teenagers’ self-esteem?\", \"Can social media influence depression among adults?\", \"Are there positive psychological impacts of social media?\", \"How do different social media platforms affect mood?\"]\n\nRespond with json. It should contain list of elements, where each element is a string, containing generated questions.\n\nExample output structure: [\"This is first question\", \"This is second question\", \"This is third question\"]"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_instruction_entities.txt",
    "content": "Identify the key entities (person, organization, location, date, year, brand, geography, proper nouns, month, etc.) from the given user text. Output the entities as a JSON list, including only the entities without any additional text or metadata.\n\n<Example>:\n\nUser:\n    In March 2021, Apple Inc. released the iPhone 12 in Cupertino, California, which featured significant improvements in battery life and processing power.\n\nAssistant:\n    [\"March 2021\", \"Apple Inc.\", \"iPhone 12\", \"Cupertino\", \"California\"]"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_instruction_keywords.txt",
    "content": "Provide unique keywords for the given user text. Output the keywords as a JSON list.\n\nExample:\n\nUser:\n    The latest smartphone model features an advanced camera system, enhanced battery life, and a new artificial intelligence chipset designed to improve user experience.\n\nAssistant:\n    [\"smartphone\", \"advanced camera\", \"enhanced battery life\", \"artificial intelligence\", \"user experience\"]"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_instruction_summary.txt",
    "content": "Summarize the given user text in a single sentence using few words. Ensure the output is concise and in one sentence without formatting as a list.\n\n<Example>:\nUser:\n    During the 19th century, industrialization led to significant urban growth, changes in employment patterns, and advancements in transportation technologies like railways and steamships.\n\nAssistant:\n    19th-century industrialization spurred urban growth and transportation advancements."
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/prompt_instruction_title.txt",
    "content": "Identify and provide an appropriate title for the given user text in a single sentence, ensuring the title is between 10 to 15 words long. Do not format the output as a list or include any additional text or metadata.\n\n<Example>:\nUser:\n    Exploring the impacts of climate change on global agriculture, this article discusses the shifts in crop yields, changes in weather patterns, and their effects on farming techniques.\n\nAssistant:\n    Climate Change Effects on Global Agriculture and Crop Production"
  },
  {
    "path": "rag_experiment_accelerator/llm/prompts_text/rerank_prompt_instruction.txt",
    "content": "You are provided with a list of documents, each identified by a number and accompanied by a summary.\nA user question is also given.\nRank these documents based on their relevance to the question, assigning each a relevance score from 1 to 10, where 10 indicates the highest relevance.\nRespond with the reranked document numbers and their relevance scores formatted as a JSON string, according to the schema below.\nEnsure the JSON string contains all listed documents and adheres to format specifications without any additional text or explanation.\n\n<Example>\nUser:\n    Document 1: \n    Overview of renewable energy trends and their economic impacts.\n\n    Document 2:\n    Analysis of fossil fuel dependency in developing countries.\n\n    Document 3:\n    Detailed report on the advancements in solar energy panels.\n\n    Document 4:\n    Comparison of wind energy efficiencies across continents.\n\n    Document 5:\n    Study on the environmental impacts of hydraulic fracturing.\n\n    Document 6:\n    Historical data on the use of renewable resources in Europe.\n\n    Question: What are the latest developments in solar energy technology?\n\nAssistant:\n    {\n        \"document_1\": 7,\n        \"document_2\": 3,\n        \"document_3\": 10,\n        \"document_4\": 6,\n        \"document_5\": 2,\n        \"document_6\": 4\n    }"
  },
  {
    "path": "rag_experiment_accelerator/llm/response_generator.py",
    "content": "import logging\n\nimport json\nimport openai\n\nfrom string import Template\n\nfrom openai import AzureOpenAI\nfrom tenacity import (\n    after_log,\n    before_sleep_log,\n    retry,\n    stop_after_attempt,\n    wait_random_exponential,\n    retry_if_not_exception_type,\n)\n\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.llm.exceptions import ContentFilteredException\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.llm.prompt.prompt import (\n    StructuredPrompt,\n    CoTPrompt,\n    Prompt,\n    PromptTag,\n)\n\nlogger = get_logger(__name__)\n\n\nclass ResponseGenerator:\n    def __init__(self, environment: Environment, config: Config, deployment_name: str):\n        self.config = config\n        self.deployment_name = deployment_name\n        self.temperature = self.config.openai.temperature\n        self.use_long_prompt = True\n        self.client = self._initialize_azure_openai_client(environment)\n        self.json_object_supported = True\n\n    def _initialize_azure_openai_client(self, environment: Environment):\n        return AzureOpenAI(\n            azure_endpoint=environment.openai_endpoint,\n            api_key=environment.openai_api_key,\n            api_version=environment.openai_api_version,\n        )\n\n    def _interpret_response(self, response: str, prompt: Prompt) -> any:\n        interpreted_response = response\n\n        if PromptTag.ChainOfThought in prompt.tags:\n            if not isinstance(prompt, CoTPrompt):\n                raise TypeError(\n                    \"Prompt is not a CoTPrompt but has Chain-of-thought tag\"\n                )\n\n            splitted = interpreted_response.split(prompt.separator)\n            assert len(splitted) != 1, f\"Separator not found in response: {response}\"\n            assert (\n                len(splitted) <= 2\n 
           ), f\"More than one separator found in response: {response}\"\n            interpreted_response = splitted[1]\n\n        if PromptTag.Structured in prompt.tags:\n            if not isinstance(prompt, StructuredPrompt):\n                raise TypeError(\n                    \"Prompt is not a StructuredPrompt but has Structured tag\"\n                )\n            assert prompt.validator(\n                interpreted_response\n            ), f\"Response {response} does not match the expected format\"\n\n        if PromptTag.JSON in prompt.tags:\n            interpreted_response = json.loads(interpreted_response)\n\n        return interpreted_response\n\n    @retry(\n        before_sleep=before_sleep_log(logger, logging.CRITICAL),\n        after=after_log(logger, logging.CRITICAL),\n        wait=wait_random_exponential(min=1, max=60),\n        stop=stop_after_attempt(6),\n        retry=retry_if_not_exception_type(\n            (ContentFilteredException, TypeError, KeyboardInterrupt)\n        ),\n    )\n    def _get_response(\n        self, messages, prompt: Prompt, temperature: float | None = None\n    ) -> any:\n        kwargs = {}\n\n        if self.json_object_supported and PromptTag.JSON in prompt.tags:\n            kwargs[\"response_format\"] = {\"type\": \"json_object\"}\n\n        try:\n            response = self.client.chat.completions.create(\n                model=self.deployment_name,\n                messages=messages,\n                temperature=temperature\n                if temperature is not None\n                else self.temperature,\n                **kwargs,\n            )\n        except openai.BadRequestError as e:\n            if e.param == \"response_format\":\n                self.json_object_supported = False\n                return self._get_response(messages, prompt, temperature)\n            raise e\n\n        if response.choices[0].finish_reason == \"content_filter\":\n            logger.error(\n                f\"Response 
was filtered {response.choices[0].finish_reason}:\\n{response}\"\n            )\n            raise ContentFilteredException(\"Content was filtered.\")\n\n        response_text = response.choices[0].message.content\n\n        return self._interpret_response(response_text, prompt)\n\n    def generate_response(\n        self,\n        prompt: Prompt,\n        temperature: float | None = None,\n        prompt_last: bool = False,\n        **kwargs,\n    ) -> any:\n        system_arguments = Prompt.arguments_in_prompt(prompt.system_message)\n        user_arguments = Prompt.arguments_in_prompt(prompt.user_template)\n\n        for key in system_arguments:\n            assert key in kwargs, f\"Missing argument {key} in system message.\"\n\n        for key in user_arguments:\n            assert key in kwargs, f\"Missing argument {key} in user template.\"\n\n        sys_template = Template(prompt.system_message)\n        sys_message = sys_template.safe_substitute(\n            **{key: value for key, value in kwargs.items() if key in system_arguments}\n        )\n\n        user_template = Template(prompt.user_template)\n        user_template = user_template.safe_substitute(\n            **{key: value for key, value in kwargs.items() if key in user_arguments}\n        )\n\n        if prompt_last:\n            messages = [\n                {\"role\": \"system\", \"content\": \"\"},\n                {\"role\": \"user\", \"content\": f\"{user_template}\\n{sys_message}\"},\n            ]\n        else:\n            messages = [\n                {\"role\": \"system\", \"content\": sys_message},\n                {\"role\": \"user\", \"content\": user_template},\n            ]\n\n        try:\n            response = self._get_response(messages, prompt, temperature)\n        except KeyboardInterrupt as e:\n            raise e\n        except Exception as e:\n            if PromptTag.NonStrict in prompt.tags:\n                logger.debug(f\"Failed to generate response: {e}\")\n         
       return None\n            else:\n                raise e\n\n        return response\n"
  },
  {
    "path": "rag_experiment_accelerator/llm/tests/test_response_generator.py",
    "content": "import unittest\nimport json\nfrom unittest.mock import patch, Mock\nfrom rag_experiment_accelerator.llm.exceptions import ContentFilteredException\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.llm.prompt import (\n    StructuredPrompt,\n    CoTPrompt,\n    Prompt,\n    PromptTag,\n)\n\n\nclass TestResponseGenerator(unittest.TestCase):\n    def setUp(self):\n        self.generator = ResponseGenerator.__new__(ResponseGenerator)\n        self.generator.config = Mock()\n        self.generator.temperature = 0.5\n        self.generator.deployment_name = \"deployment_name\"\n        self.generator.client = Mock()\n        self.generator.json_object_supported = False\n        self.prompt = Mock(spec=Prompt)\n        self.prompt.tags = {}\n\n    def create_mock_prompt(self, prompt_type, tags, separator=None, validator=None):\n        mock_prompt = Mock(spec=prompt_type)\n        mock_prompt.tags = tags\n        if separator:\n            mock_prompt.separator = separator\n        if validator:\n            mock_prompt.validator = validator\n        return mock_prompt\n\n    def test_interpret_response_with_cot_prompt(self):\n        response = \"Introduction##RESPONSE##Detailed explanation\"\n        prompt = self.create_mock_prompt(\n            CoTPrompt, [PromptTag.ChainOfThought], separator=\"##RESPONSE##\"\n        )\n        result = self.generator._interpret_response(response, prompt)\n        self.assertEqual(result, \"Detailed explanation\")\n\n    def test_interpret_response_with_structured_prompt(self):\n        response = '{\"key\": \"value\"}'\n        prompt = self.create_mock_prompt(\n            StructuredPrompt,\n            [PromptTag.Structured, PromptTag.JSON],\n            validator=lambda x: \"key\" in x,\n        )\n        result = self.generator._interpret_response(response, prompt)\n        expected = json.loads(response)\n        self.assertEqual(result, 
expected)\n\n    def test_interpret_response_with_invalid_separator(self):\n        response = \"No separator present here\"\n        prompt = self.create_mock_prompt(\n            CoTPrompt, [PromptTag.ChainOfThought], separator=\"##RESPONSE##\"\n        )\n        with self.assertRaises(AssertionError):\n            self.generator._interpret_response(response, prompt)\n\n    def test_interpret_response_non_strict_mode(self):\n        response = \"Some response\"\n        prompt = self.create_mock_prompt(Prompt, [PromptTag.NonStrict])\n        result = self.generator._interpret_response(response, prompt)\n        self.assertEqual(result, response)\n\n    @patch(\"rag_experiment_accelerator.llm.response_generator.logger\")\n    def test_get_response_normal(self, mock_logger):\n        # Mocking the API response\n        responses = [\n            Mock(message=Mock(content=\"test response\"), finish_reason=\"completed\"),\n        ]\n        mock_response = Mock()\n        mock_response.choices = responses\n        self.generator.client.chat.completions.create.return_value = mock_response\n\n        # Test\n        result = self.generator._get_response(\"message\", self.prompt)\n        self.assertEqual(result, \"test response\")\n        self.generator.client.chat.completions.create.assert_called_once()\n\n    @patch(\"rag_experiment_accelerator.llm.response_generator.logger\")\n    def test_get_response_content_filtered(self, mock_logger):\n        # Mocking the API response for content filtering\n        mock_response = Mock()\n        mock_response.choices = [\n            Mock(message=Mock(content=None), finish_reason=\"content_filter\")\n        ]\n        self.generator.client.chat.completions.create.return_value = mock_response\n\n        # Test and assert exception\n        with self.assertRaises(ContentFilteredException):\n            self.generator._get_response(\"message\", self.prompt)\n\n    
@patch(\"rag_experiment_accelerator.llm.response_generator.logger\")\n    def test_get_response_retries_on_random_exception(self, mock_logger):\n        # Simulating an exception that should trigger a retry\n        self.generator.client.chat.completions.create.side_effect = [\n            Exception(\"Simulate API failure\"),\n            Mock(\n                choices=[\n                    Mock(\n                        message=Mock(content=\"recovered response\"),\n                        finish_reason=\"completed\",\n                    )\n                ]\n            ),\n        ]\n\n        # Test\n        result = self.generator._get_response(\"message\", self.prompt)\n        self.assertEqual(result, \"recovered response\")\n        self.assertEqual(self.generator.client.chat.completions.create.call_count, 2)\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\"\n    )\n    def test_generate_response_full_system_message(self, mock_get_response):\n        # Setup\n        mock_get_response.return_value = \"valid response\"\n        prompt = Prompt(\"${argument_1} ${argument_2}\", \"\", [])\n        kwargs = {\"argument_1\": 1, \"argument_2\": 2}\n\n        # Action\n        response = self.generator.generate_response(prompt, None, **kwargs)\n\n        # Assert\n        mock_get_response.assert_called_once()\n        self.assertEqual(response, \"valid response\")\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\"\n    )\n    def test_generate_response_full_user_template(self, mock_get_response):\n        # Setup\n        mock_get_response.return_value = \"valid response\"\n        prompt = Prompt(\"\", \"${argument_1} ${argument_2}\", [])\n        kwargs = {\"argument_1\": 1, \"argument_2\": 2}\n\n        # Action\n        response = self.generator.generate_response(prompt, None, **kwargs)\n\n        # Assert\n        
mock_get_response.assert_called_once()\n        self.assertEqual(response, \"valid response\")\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\"\n    )\n    def test_generate_response_mixed_messages(self, mock_get_response):\n        # Setup\n        mock_get_response.return_value = \"valid response\"\n        prompt = Prompt(\"${argument_1}\", \"${argument_2}\", [])\n        kwargs = {\"argument_1\": 1, \"argument_2\": 2}\n\n        # Action\n        response = self.generator.generate_response(prompt, None, **kwargs)\n\n        # Assert\n        mock_get_response.assert_called_once()\n        self.assertEqual(response, \"valid response\")\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\"\n    )\n    def test_generate_response_missing_system_argument(self, mock_get_response):\n        # Setup\n        prompt = Prompt(\"${argument_1}\", \"${argument_2}\", [])\n        kwargs = {\"argument_1\": 1}\n\n        # Action & Assert\n        with self.assertRaises(AssertionError):\n            self.generator.generate_response(prompt, None, **kwargs)\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\"\n    )\n    def test_generate_response_missing_user_argument_non_strict(\n        self, mock_get_response\n    ):\n        # Setup\n        mock_get_response.side_effect = Exception(\"Random failure\")\n        prompt = Prompt(\"${argument_1}\", \"${argument_2}\", [PromptTag.NonStrict])\n        kwargs = {\"argument_1\": 1, \"argument_2\": 2}\n\n        # Action\n        response = self.generator.generate_response(prompt, None, **kwargs)\n\n        # Assert\n        self.assertIsNone(response)\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._get_response\",\n        side_effect=Exception(\"Random failure\"),\n    )\n    def 
test_generate_response_exception_handling_strict(self, mock_get_response):\n        # Setup\n        prompt = Prompt(\"${argument_1}\", \"${argument_2}\", [])\n        kwargs = {\"argument_1\": 1, \"argument_2\": 2}\n\n        # Action & Assert\n        with self.assertRaises(Exception):\n            self.generator.generate_response(prompt, None, **kwargs)\n\n    @patch(\n        \"rag_experiment_accelerator.llm.response_generator.ResponseGenerator._initialize_azure_openai_client\"\n    )\n    def test_initialize_azure_openai_client(self, mock_initialize_azure_openai_client):\n        # Arrange\n        mock_initialize_azure_openai_client.return_value = \"mock client\"\n\n        # Act\n        result = self.generator._initialize_azure_openai_client()\n\n        # Assert\n        self.assertEqual(result, \"mock client\")\n        mock_initialize_azure_openai_client.assert_called_once()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "rag_experiment_accelerator/nlp/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/nlp/language_evaluator.py",
    "content": "from azure.ai.textanalytics import TextAnalyticsClient\nfrom azure.core.credentials import AzureKeyCredential\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nlogger = get_logger(__name__)\n\n\nclass LanguageEvaluator:\n    \"\"\"\n    A class for detecting language on text using the built-in Language Detection skill in Azure AI Services.\n\n    Args:\n        query_language: The language of the query. Possible values include: \"none\", \"en-us\",\n         \"en-gb\", \"en-in\", \"en-ca\", \"en-au\", \"fr-fr\", \"fr-ca\", \"de-de\", \"es-es\", \"es-mx\", \"zh-cn\",\n         \"zh-tw\", \"pt-br\", \"pt-pt\", \"it-it\", \"ja-jp\", \"ko-kr\", \"ru-ru\", \"cs-cz\", \"nl-be\", \"nl-nl\",\n         \"hu-hu\", \"pl-pl\", \"sv-se\", \"tr-tr\", \"hi-in\", \"ar-sa\", \"ar-eg\", \"ar-ma\", \"ar-kw\", \"ar-jo\",\n         \"da-dk\", \"no-no\", \"bg-bg\", \"hr-hr\", \"hr-ba\", \"ms-my\", \"ms-bn\", \"sl-sl\", \"ta-in\", \"vi-vn\",\n         \"el-gr\", \"ro-ro\", \"is-is\", \"id-id\", \"th-th\", \"lt-lt\", \"uk-ua\", \"lv-lv\", \"et-ee\", \"ca-es\",\n         \"fi-fi\", \"sr-ba\", \"sr-me\", \"sr-rs\", \"sk-sk\", \"nb-no\", \"hy-am\", \"bn-in\", \"eu-es\", \"gl-es\",\n         \"gu-in\", \"he-il\", \"ga-ie\", \"kn-in\", \"ml-in\", \"mr-in\", \"fa-ae\", \"pa-in\", \"te-in\", \"ur-pk\".\n        default_language: The ISO 6391 language code for the language identified. For example, \"en\".\n        country_hint (str): An ISO 3166-1 alpha-2 two letter country code to use as a hint to the language detection model if it cannot disambiguate the language.\n        confidence_threshold (float): The minimum confidence score required for language detected to be considered reliable.\n\n    Attributes:\n        query_language: The language of the query\n        default_language: The ISO 6391 language code for the language identified. 
For example, \"en\".\n        country_hint (str): An ISO 3166-1 alpha-2 two letter country code to use as a hint to the language detection model if it cannot disambiguate the language.\n        confidence_threshold (float): The minimum confidence score required for two documents to be considered relevant.\n        max_content_length (int): The maximum size of a content allowed measured by length (e.g. 50,000 characters)\n\n    Methods:\n        detect_language(text: str | list[str]) -> Dict[str, str] | None: Detect language for a text sample or a batch of documents.\n        is_confident(text: str) -> bool: Determines whether language detected is reliable based on confidence score.\n        is_language_match(text: str, language_code: str) -> bool: Determines whether language matches language detected.\n        check_string(text: str) -> Check the length of an input string.\n    \"\"\"\n\n    def __init__(\n        self,\n        environment: Environment,\n        query_language=\"en-us\",\n        default_language=\"en\",\n        country_hint=\"\",\n        confidence_threshold=0.8,\n    ) -> None:\n        try:\n            self.query_language = query_language\n            self.default_language = (\n                default_language if default_language else query_language.split(\"-\")[0]\n            )\n            self.country_hint = (\n                country_hint if country_hint else query_language.split(\"-\")[1]\n            )\n            self.confidence_threshold = confidence_threshold\n            self.max_content_length = 50000  # Data limit\n            self.environment = environment\n        except Exception as e:\n            logger.error(str(e))\n\n    def check_string(self, input_string):\n        try:\n            if not isinstance(input_string, str):\n                raise ValueError(\"Input must be a string\")\n            if len(input_string) < self.max_content_length:\n                return True\n            else:\n                return 
False\n        except Exception as e:\n            logger.error(f\"An error occurred: {e}\")\n            return None\n\n    def detect_language(self, text: str):\n        try:\n            client = TextAnalyticsClient(\n                endpoint=self.environment.azure_language_service_endpoint,\n                credential=AzureKeyCredential(\n                    self.environment.azure_language_service_key\n                ),\n            )\n            response = client.detect_language(documents=[text])\n\n            for doc in response:\n                if not doc.is_error:\n                    logger.info(f\"Detected language: {doc.primary_language}\")\n                else:\n                    logger.error(f\"Unable to detect language: {doc.id} {doc.error}\")\n            client.close()\n            return {\n                \"name\": doc.primary_language.name,\n                \"confidence_score\": doc.primary_language.confidence_score,\n                \"iso6391_name\": doc.primary_language.iso6391_name,\n            }\n        except Exception as e:\n            logger.error(f\"An error occurred: {e}\")\n            return None\n\n    def is_confident(self, text: str):\n        primary_language = self.detect_language(text)\n        confidence_score = primary_language.get(\"confidence_score\")\n        language = primary_language.get(\"name\")\n        logger.info(f\"Language: {language} Confidence Score: {confidence_score}\")\n\n        return confidence_score >= self.confidence_threshold\n\n    def is_language_match(self, text: str, language_code: str):\n        primary_language = self.detect_language(text)\n        confidence_score = primary_language.get(\"confidence_score\")\n        language = primary_language.get(\"name\")\n        logger.info(f\"Language: {language} Confidence Score: {confidence_score}\")\n\n        return (\n            language_code == primary_language.get(\"iso6391_name\")\n            and confidence_score >= 
self.confidence_threshold\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/nlp/preprocess.py",
    "content": "import re\nfrom string import punctuation\nfrom spacy import load\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\nclass Preprocess:\n    __enabled: bool\n\n    def __init__(self, enabled=False):\n        self.__enabled = enabled\n        if self.__enabled:\n            try:\n                self.nlp = load(\"en_core_web_lg\")\n            except OSError:\n                logger.info(\"Downloading spacy language model: en_core_web_lg\")\n                from spacy.cli import download\n\n                download(\"en_core_web_lg\")\n                self.nlp = load(\"en_core_web_lg\")\n\n    def preprocess(self, text) -> str:\n        \"\"\"\n        Preprocess the input text by converting it to lowercase, removing punctuation and tags, removing stop words, and tokenizing the words.\n\n        Args:\n            text (str): The input text to preprocess (if enabled).\n\n        Returns:\n            Union[str, list[str]]:  If enabled - list of preprocessed words otherwise the original text.\n        \"\"\"\n        if self.__enabled:\n            lower_text = text.lower().strip()\n            sentence_tokens = self.sentence_tokenize(lower_text)\n            word_list = []\n            for each_sent in sentence_tokens:\n                clean_text = self.remove_punctuation(each_sent)\n                clean_text = self.remove_tags(clean_text)\n                clean_text = self.remove_stop_words(clean_text)\n                word_tokens = self.word_tokenize(clean_text)\n                for i in word_tokens:\n                    word_list.append(i)\n            return \" \".join(word_list)\n        else:\n            return text\n\n    def remove_punctuation(self, text):\n        \"\"\"\n        Removes all punctuation from the given text and returns the result.\n\n        Args:\n            text (str): The text to remove punctuation from.\n\n        Returns:\n            str: The text with all 
punctuation removed.\n        \"\"\"\n        return \"\".join(c for c in text if c not in punctuation)\n\n    def remove_tags(self, text):\n        \"\"\"\n        Removes HTML tags from the given text.\n\n        Args:\n            text (str): The text to remove HTML tags from.\n\n        Returns:\n            str: The cleaned text with HTML tags removed.\n        \"\"\"\n        cleaned_text = re.sub(\"<[^<]+?>\", \"\", text)\n        return cleaned_text\n\n    def sentence_tokenize(self, text):\n        \"\"\"\n        Tokenize a given text into sentences using spacy.\n\n        Args:\n            text (str): The text to be tokenized.\n\n        Returns:\n            list: A list of sentences extracted from the input text.\n        \"\"\"\n        doc = self.nlp(text)\n        return [sent.text.strip() for sent in doc.sents]\n\n    def word_tokenize(self, text):\n        \"\"\"\n        Tokenize the input text into individual words.\n\n        Args:\n            text (str): The text to tokenize.\n\n        Returns:\n            list: A list of individual words in the input text.\n        \"\"\"\n        return [w.text for w in self.nlp(text)]\n\n    def remove_stop_words(self, sentence):\n        \"\"\"\n        Removes stop words from a given sentence.\n\n        Args:\n            sentence (str): The sentence to remove stop words from.\n\n        Returns:\n            str: The sentence with stop words removed.\n        \"\"\"\n        doc = self.nlp(sentence)\n        filtered_tokens = [\n            token for token in doc if not token.is_stop and str(token) != \"\\n\"\n        ]\n\n        return \" \".join([token.text for token in filtered_tokens])\n\n    def lemmatize(self, text):\n        \"\"\"\n        Lemmatizes the input text using the WordNet lemmatizer.\n\n        Args:\n            text (str): The text to lemmatize.\n\n        Returns:\n            str: The lemmatized text.\n        \"\"\"\n        doc = self.nlp(text)\n        return \" 
\".join([token.lemma_ for token in doc])\n"
  },
  {
    "path": "rag_experiment_accelerator/nlp/tests/test_language_evaluator.py",
    "content": "from unittest.mock import patch, Mock\n\nfrom rag_experiment_accelerator.nlp.language_evaluator import LanguageEvaluator\n\ndocuments = [\n    \"This is a test.\",\n    \"C'est un test.\",\n    \"Dies ist ein Test.\",\n    \"Questa è una prova.\",\n]\ndetect_language_response = {\n    \"name\": \"English\",\n    \"iso6391_name\": \"en\",\n    \"confidence_score\": 1,\n}\ndetect_languages_response = [\n    {\n        \"name\": \"English\",\n        \"iso6391_name\": \"en\",\n        \"confidence_score\": 1,\n    },\n    {\n        \"name\": \"French\",\n        \"iso6391_name\": \"fr\",\n        \"confidence_score\": 1,\n    },\n    {\n        \"name\": \"German\",\n        \"iso6391_name\": \"de\",\n        \"confidence_score\": 1,\n    },\n    {\n        \"name\": \"Italian\",\n        \"iso6391_name\": \"it\",\n        \"confidence_score\": 1,\n    },\n]\n\n\ndef test_language_evaluator_init():\n    language_evaluator = LanguageEvaluator(Mock(), \"en-us\", \"en\", \"\", 0.77)\n    assert language_evaluator.query_language == \"en-us\"\n    assert language_evaluator.country_hint == \"us\"\n    assert language_evaluator.max_content_length == 50000\n    assert language_evaluator.confidence_threshold == 0.77\n    assert language_evaluator.default_language == \"en\"\n\n\ndef test_detect_language():\n    with patch(\n        \"rag_experiment_accelerator.nlp.language_evaluator.LanguageEvaluator\"\n    ) as language_evaluator:\n        language_evaluator.detect_language.return_value = detect_language_response\n        primary_language = language_evaluator.detect_language(\"This is a test.\")\n        assert primary_language.get(\"name\") == \"English\"\n        assert primary_language.get(\"iso6391_name\") == \"en\"\n        assert primary_language.get(\"confidence_score\") == 1\n\n\ndef test_detect_languages():\n    with patch(\n        \"rag_experiment_accelerator.nlp.language_evaluator.LanguageEvaluator\"\n    ) as language_evaluator:\n        response 
= language_evaluator.detect_language(\n            documents\n        ).return_value = detect_languages_response\n\n        for i, doc in enumerate(detect_languages_response):\n            assert doc[\"name\"] == response[i].get(\"name\")\n            assert doc[\"iso6391_name\"] == response[i].get(\"iso6391_name\")\n            assert doc[\"confidence_score\"] == response[i].get(\"confidence_score\")\n\n\ndef test_is_confident_returns_certainty():\n    with patch.object(\n        LanguageEvaluator, \"detect_language\", create=True\n    ) as language_evaluator:\n        language_evaluator.detect_language.return_value = detect_language_response\n        language_evaluator.is_confident(\"This is a test.\")\n        language_evaluator.is_confident.assert_called()\n        language_evaluator.is_confident.assert_called_with(\"This is a test.\")\n\n\ndef test_is_language_match():\n    with patch.object(\n        LanguageEvaluator, \"detect_language\", create=True\n    ) as language_evaluator:\n        language_evaluator.is_language_match(\"C'est un test.\", \"fr\")\n        language_evaluator.is_language_match.assert_called()\n        language_evaluator.is_language_match.assert_called_with(\"C'est un test.\", \"fr\")\n\n\ndef test_check_string():\n    language_evaluator = LanguageEvaluator(Mock())\n    assert language_evaluator.check_string(\"This is string\")\n"
  },
  {
    "path": "rag_experiment_accelerator/nlp/tests/test_preprocessor.py",
    "content": "from unittest.mock import MagicMock, patch\n\nfrom rag_experiment_accelerator.nlp.preprocess import Preprocess\n\n\n@patch(\"rag_experiment_accelerator.nlp.preprocess.load\")\ndef test_sentence_tokenize(mock_nlp):\n    mock_sent_1 = MagicMock()\n    mock_sent_1.text = \"This is a sentence.\"\n    mock_sent_2 = MagicMock()\n    mock_sent_2.text = \"This is another sentence.   \"\n    mock_nlp().return_value.sents = [mock_sent_1, mock_sent_2]\n    preprocessor = Preprocess(True)\n    expected = [\"This is a sentence.\", \"This is another sentence.\"]\n\n    actual = preprocessor.sentence_tokenize(\"text is mocked\")\n\n    assert actual == expected\n\n\n@patch(\"rag_experiment_accelerator.nlp.preprocess.load\")\ndef test_word_tokenize(mock_nlp):\n    mock_sent_1 = MagicMock()\n    mock_sent_1.text = \"This\"\n    mock_sent_2 = MagicMock()\n    mock_sent_2.text = \"is\"\n    mock_sent_3 = MagicMock()\n    mock_sent_3.text = \"a\"\n    mock_sent_4 = MagicMock()\n    mock_sent_4.text = \"sentence\"\n    mock_nlp().return_value = [mock_sent_1, mock_sent_2, mock_sent_3, mock_sent_4]\n    preprocessor = Preprocess(True)\n    expected = [\"This\", \"is\", \"a\", \"sentence\"]\n    actual = preprocessor.word_tokenize(\"text is mocked\")\n    assert actual == expected\n\n\n@patch(\"rag_experiment_accelerator.nlp.preprocess.load\")\ndef test_remove_stopwords(mock_nlp):\n    mock_token_1 = MagicMock()\n    mock_token_1.text = \"This\"\n    mock_token_1.is_stop = True\n    mock_token_2 = MagicMock()\n    mock_token_2.text = \"is\"\n    mock_token_2.is_stop = True\n    mock_token_3 = MagicMock()\n    mock_token_3.text = \"a\"\n    mock_token_3.is_stop = True\n    mock_token_4 = MagicMock()\n    mock_token_4.text = \"sentence\"\n    mock_token_4.is_stop = False\n    mock_token_5 = MagicMock()\n    mock_token_5.text = \".\"\n    mock_token_5.is_stop = False\n    mock_nlp().return_value = [\n        mock_token_1,\n        mock_token_2,\n        mock_token_3,\n        
mock_token_4,\n        mock_token_5,\n    ]\n\n    preprocessor = Preprocess(True)\n    sentence = \"This is a sentence.\"\n    expected = \"sentence .\"\n    actual = preprocessor.remove_stop_words(sentence)\n    assert actual == expected\n\n\n@patch(\"rag_experiment_accelerator.nlp.preprocess.load\")\ndef test_lemmatize(mock_nlp):\n    mock_token_1 = MagicMock()\n    mock_token_1.lemma_ = \"kite\"\n    mock_token_1.is_stop = True\n    mock_token_2 = MagicMock()\n    mock_token_2.lemma_ = \"baby\"\n    mock_token_2.is_stop = True\n    mock_token_3 = MagicMock()\n    mock_token_3.lemma_ = \"dog\"\n    mock_token_3.is_stop = True\n    mock_token_4 = MagicMock()\n    mock_token_4.lemma_ = \"fly\"\n    mock_token_4.is_stop = False\n    mock_nlp().return_value = [mock_token_1, mock_token_2, mock_token_3, mock_token_4]\n    preprocessor = Preprocess(True)\n    text = \"kites babies dogs flying\"\n    expected = \"kite baby dog fly\"\n    assert preprocessor.lemmatize(text) == expected\n\n\n@patch(\"rag_experiment_accelerator.nlp.preprocess.load\")\ndef test_remove_punct(mock_nlp):\n    preprocessor = Preprocess(True)\n    text = \"\"\"this!\" is*+,-. /a#$ sentence%& with'() a:;<= lot>?@[ of\\\\]^_ punctuation`{|}~\"\"\"\n    expected = \"this is a sentence with a lot of punctuation\"\n    assert preprocessor.remove_punctuation(text) == expected\n"
  },
  {
    "path": "rag_experiment_accelerator/reranking/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/reranking/reranker.py",
    "content": "import re\n\nfrom sentence_transformers import CrossEncoder\n\nfrom rag_experiment_accelerator.llm.prompt import rerank_prompt_instruction\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef cross_encoder_rerank_documents(\n    documents, user_prompt, output_prompt, model_name, k\n):\n    \"\"\"\n    Reranks a list of documents based on their relevance to a user prompt using a cross-encoder model.\n\n    Args:\n        documents (list): A list of documents to be reranked.\n        user_prompt (str): The user prompt to be used as the query.\n        output_prompt (str): The output prompt to be used as the context.\n        model_name (str): The name of the pre-trained cross-encoder model to be used.\n        k (int): The number of top documents to be returned.\n\n    Returns:\n        list: A list of the top k documents, sorted by their relevance to the user prompt.\n    \"\"\"\n    if not documents:\n        return []\n\n    model = CrossEncoder(model_name)\n    cross_scores_ques = model.predict(\n        [[user_prompt, item] for item in documents],\n        apply_softmax=True,\n        convert_to_numpy=True,\n    )\n\n    top_indices_ques = cross_scores_ques.argsort()[-k:][::-1]\n    sub_context = []\n    for idx in list(top_indices_ques):\n        sub_context.append(documents[idx])\n\n    return sub_context\n\n\ndef llm_rerank_documents(\n    documents, question, response_generator: ResponseGenerator, rerank_threshold\n):\n    \"\"\"\n    Reranks a list of documents based on a given question using the LLM model.\n\n    Args:\n        documents (list): A list of documents to be reranked.\n        question (str): The question to be used for reranking.\n        response_generator (ResponseGenerator): The initialised ResponseGenerator to use.\n        rerank_threshold (int): The threshold for reranking documents.\n\n  
  Returns:\n        list: A list of reranked documents.\n    \"\"\"\n    rerank_context = \"\"\n    for index, docs in enumerate(documents):\n        rerank_context += \"\\ndocument \" + str(index) + \":\\n\"\n        rerank_context += docs + \"\\n\"\n\n    response: dict[str, int] | None = response_generator.generate_response(\n        rerank_prompt_instruction,\n        documents=rerank_context,\n        question=question,\n        prompt_last=True,\n    )\n\n    logger.debug(\"Reranker response:\\n%s\", response)\n\n    if response is None:\n        return documents\n\n    result = []\n    for key, _ in sorted(response.items(), key=lambda x: x[1], reverse=True):\n        document_index = int(re.search(r\"document_(\\d+)\", key).group(1))\n        result.append(documents[document_index])\n\n    return result\n"
  },
  {
    "path": "rag_experiment_accelerator/run/evaluation.py",
    "content": "from typing import MutableMapping\nfrom azure.ai.ml import MLClient\nfrom dotenv import load_dotenv\nimport mlflow\n\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.evaluation import eval\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\n\nload_dotenv(override=True)\nlogger = get_logger(__name__)\n\n\ndef _flatten_dict_gen(d, parent_key, sep):\n    for k, v in d.items():\n        new_key = parent_key + sep + str(k) if parent_key else k\n        if isinstance(v, MutableMapping):\n            yield from flatten_dict(v, new_key, sep=sep).items()\n        else:\n            yield new_key, v\n\n\ndef flatten_dict(d: MutableMapping, parent_key: str = \"\", sep: str = \".\"):\n    return dict(_flatten_dict_gen(d, parent_key, sep))\n\n\ndef get_job_hyper_params(config: Config, index_config: IndexConfig) -> dict:\n    \"\"\"\n    Returns the hyper parameters for the current job.\n    \"\"\"\n    config_dict = config.to_dict()\n\n    # Remove not needed parameters\n    for param in [\"path\", \"main_instruction\", \"use_checkpoints\"]:\n        config_dict.__delitem__(param)\n\n    config_flatten_dict = flatten_dict(config_dict)\n\n    return config_flatten_dict\n\n\ndef run(\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    mlflow_client: MLClient,\n    name_suffix: str,\n):\n    \"\"\"\n    Runs the evaluation process for the RAG experiment accelerator.\n\n    This function initializes the configuration, sets up the ML client, and runs the evaluation process\n    for all combinations of chunk sizes, overlap sizes, embedding dimensions, EF constructions, and EF searches.\n\n    Returns:\n        None\n    \"\"\"\n    logger.info(f\"Evaluating Index: {index_config.index_name()}\")\n\n    params = 
get_job_hyper_params(config, index_config)\n    mlflow.log_params(params)\n\n    eval.evaluate_prompts(\n        environment=environment,\n        config=config,\n        index_config=index_config,\n        mlflow_client=mlflow_client,\n        name_suffix=name_suffix,\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/run/index.py",
    "content": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom contextlib import ExitStack\nimport ntpath\nfrom dotenv import load_dotenv\nimport mlflow\n\nfrom rag_experiment_accelerator.checkpoint import cache_with_checkpoint\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.doc_loader.documentLoader import load_documents\nfrom rag_experiment_accelerator.ingest_data.acs_ingest import upload_data\nfrom rag_experiment_accelerator.init_Index.create_index import create_acs_index\n\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.llm.prompt import (\n    prompt_instruction_title,\n    prompt_instruction_summary,\n)\n\nfrom rag_experiment_accelerator.sampling.clustering import cluster, load_parser\nfrom rag_experiment_accelerator.nlp.preprocess import Preprocess\nfrom rag_experiment_accelerator.utils.timetook import TimeTook\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\nload_dotenv(override=True)\n\n\ndef run(\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    file_paths: list[str],\n    mlflow_client: mlflow.MlflowClient,\n    is_local: bool = False,\n) -> str:\n    \"\"\"\n    Runs the main experiment loop, which chunks and uploads data to Azure AI Search indexes based on the configuration specified in the Config class.\n\n    Returns:\n        The name of the index created.\n    \"\"\"\n    pre_process = Preprocess(index_config.chunking.preprocess)\n\n    index_name = index_config.index_name()\n    with TimeTook(f\"create Azure Search Index {index_name}\", logger=logger):\n        create_acs_index(\n            environment.azure_search_service_endpoint,\n            index_name,\n            
environment.azure_search_admin_key,\n            config.get_embedding_model(\n                index_config.embedding_model.model_name\n            ).dimension,\n            index_config.ef_construction,\n            index_config.ef_search,\n            config.language.analyzer,\n        )\n\n    docs = load_documents(\n        environment,\n        index_config.chunking.chunking_strategy,\n        config.data_formats,\n        file_paths,\n        index_config.chunking.chunk_size,\n        index_config.chunking.overlap_size,\n        index_config.chunking.azure_document_intelligence_model,\n    )\n\n    if is_local and index_config.sampling.sample_data:\n        parser = load_parser()\n        docs = cluster(index_config.index_name(), docs, config, parser)\n\n    if index_config.sampling.only_sample:\n        return index_name\n\n    mlflow.log_metric(\"Number of documents\", len(docs))\n    docs_ready_to_index = convert_docs_to_vector_db_records(docs)\n    mlflow.log_metric(\"Number of document chunks\", len(docs_ready_to_index))\n    embed_chunks(config, index_config, pre_process, docs_ready_to_index)\n\n    generate_titles_from_chunks(\n        config, index_config, pre_process, docs_ready_to_index, environment\n    )\n    generate_summaries_from_chunks(\n        config, index_config, pre_process, docs_ready_to_index, environment\n    )\n\n    with TimeTook(\n        f\"load documents to Azure Search index {index_name}\",\n        logger=logger,\n    ):\n        upload_data(\n            environment=environment,\n            config=config,\n            chunks=docs_ready_to_index,\n            index_name=index_name,\n        )\n\n    return index_name\n\n\ndef convert_docs_to_vector_db_records(docs):\n    \"\"\"\n    Converts a list of documents into a list of dictionaries ready to be loaded into Azure Search.\n\n    This function takes a list of documents and converts each one into a dictionary.\n    The dictionary contains the document's content and 
metadata.\n\n    Args:\n        docs (list): A list of documents to be converted.\n\n    Returns:\n        list: A list of dictionaries, each representing a document.\n    \"\"\"\n    dicts = []\n    for doc in docs:\n        doc_id = list(doc.keys())[0]\n        doc_dict = doc[doc_id]\n        filename = ntpath.basename(doc_dict[\"metadata\"].get(\"source\", \"\"))\n        page = doc_dict[\"metadata\"].get(\"page\", None)\n        dict = {\n            \"id\": doc_id,\n            \"content\": doc_dict.get(\"content\", \"\"),\n            \"filename\": filename,\n            \"source_display_name\": f\"{filename}#page={page}\"\n            if str(page).isnumeric()\n            else filename,\n        }\n        dicts.append(dict)\n    return dicts\n\n\ndef embed_chunks(config: Config, index_config: IndexConfig, pre_process, chunks):\n    \"\"\"\n    Generates embeddings for chunks of documents.\n\n    Args:\n        config (object): A configuration object that holds various settings.\n        pre_process (object): An object with a method for preprocessing text.\n        chunks (list): A list of all documents chunks to be embedded.\n\n    Returns:\n        tuple: A tuple containing the index name and the list of processed documents.\n    \"\"\"\n    with TimeTook(\n        f\"generate embeddings for {index_config.index_name()} \", logger=logger\n    ):\n        embedded_chunks = []\n        with ExitStack() as stack:\n            executor = stack.enter_context(ThreadPoolExecutor())\n\n            futures = {\n                executor.submit(\n                    embed_chunk,\n                    pre_process,\n                    config.get_embedding_model(index_config.embedding_model.model_name),\n                    doc,\n                ): doc\n                for doc in chunks\n            }\n\n            for future in as_completed(futures):\n                doc = futures[future]\n                try:\n                    chunk_dict = future.result()\n         
       except Exception as exc:\n                    logger.error(\n                        f\"{embed_chunk.__name__} generated an exception: {exc} for doc {doc}\"\n                    )\n                else:\n                    embedded_chunks.append(chunk_dict)\n\n    if index_config.chunking.override_content_with_summary:\n        for chunk in chunks:\n            if \"summary\" in chunk:\n                chunk[\"content\"] = chunk[\"summary\"]\n                chunk[\"content_vector\"] = chunk[\"summary_vector\"]\n                chunk[\"summary\"] = \"\"\n                chunk[\"summary_vector\"] = []\n            else:\n                logger.warn(\"summary was not generated\")\n\n    return embedded_chunks\n\n\n@cache_with_checkpoint(id=\"chunk['content']+embedding_model.name\")\ndef embed_chunk(pre_process, embedding_model, chunk):\n    \"\"\"\n    Generates an embedding for a chunk of content.\n\n    This function takes a chunk of content, preprocess it and generates an\n    embedding for it using the `generate_embedding` function.\n    The generated embedding is then added to the chunk dictionary under the\n    key \"content_vector\".\n\n    Args:\n        pre_process (object): An object with a method for preprocessing text.\n        embedding_model (object): The embedding model which was created using `EmbeddingModelFactory`.\n        chunk (dict): A dictionary containing a chunk of content.\n\n    Returns:\n        dict: The chunk dictionary with the added \"content_vector\" key.\n    \"\"\"\n    chunk[\"content_vector\"] = embedding_model.generate_embedding(\n        pre_process.preprocess(chunk[\"content\"])\n    )\n\n    return chunk\n\n\ndef generate_titles_from_chunks(\n    config: Config, index_config: IndexConfig, pre_process, chunks, environment\n):\n    \"\"\"\n    Generates titles for each chunk of content in parallel using LLM and\n    multithreading.\n\n    This function uses a ThreadPoolExecutor to process each chunk in parallel.\n    It 
submits a task to the executor for each chunk, which involves\n    processing the title of the chunk.\n    If an exception occurs during the processing of a chunk, it logs an error\n    message with the exception and the first 20 characters of the chunk\n    content.\n\n    Args:\n        config (object): A configuration object that holds various settings.\n        index_config (object): An object that holds the index configuration settings.\n        pre_process (object): An object with a method for preprocessing text.\n        chunks (list): A list of dictionaries, each containing a chunk of content to be processed.\n        environment (object): An object that holds the environment settings.\n    \"\"\"\n    with ExitStack() as stack:\n        executor = stack.enter_context(ThreadPoolExecutor(config.max_worker_threads))\n\n        futures = {\n            executor.submit(\n                process_title, config, index_config, pre_process, chunk, environment\n            ): chunk\n            for chunk in chunks\n        }\n\n        for future in as_completed(futures):\n            chunk = futures[future]\n            try:\n                chunk = future.result()\n            except Exception as exc:\n                logger.error(\n                    f\"{process_title.__name__} generated an exception: {exc} for chunk {chunk['content'][0:20]}...\"\n                )\n\n\ndef generate_summaries_from_chunks(\n    config: Config, index_config: IndexConfig, pre_process, chunks, environment\n):\n    \"\"\"\n    Generates summaries for each chunk of content in parallel using multithreading.\n\n    This function uses a ThreadPoolExecutor to process each chunk in parallel.\n    It submits a task to the executor for each chunk, which involves\n    processing the summary of the chunk.\n    If an exception occurs during the processing of a chunk, it logs an error\n    message with the exception and the first 20 characters of the chunk content.\n\n    Args:\n        config 
(object): A configuration object that holds various settings.\n        index_config (object): An object that holds the index configuration settings.\n        pre_process (object): An object with a method for preprocessing text.\n        chunks (list): A list of dictionaries, each containing a chunk of content to be processed.\n        environment (object): An object that holds the environment settings.\n    \"\"\"\n    with ExitStack() as stack:\n        executor = stack.enter_context(ThreadPoolExecutor(config.max_worker_threads))\n\n        futures = {\n            executor.submit(\n                process_summary, config, index_config, pre_process, chunk, environment\n            ): chunk\n            for chunk in chunks\n        }\n\n        for future in as_completed(futures):\n            chunk = futures[future]\n            try:\n                chunk = future.result()\n            except Exception as exc:\n                logger.error(\n                    f\"{process_summary.__name__} generated an exception: {exc} for chunk {chunk['content'][0:20]}....\"\n                )\n\n\n@cache_with_checkpoint(id=\"chunk['content']+str(config.index.chunking.generate_title)\")\ndef process_title(\n    config: Config, index_config: IndexConfig, pre_process, chunk, environment\n):\n    \"\"\"\n    Processes the title of a chunk of content.\n\n    If the generate_title configuration is set to True, a title is generated for the chunk of content and an embedding is created for it.\n    If it's set to False, the title is set to an empty string and the title vector is set to an empty list.\n\n    Args:\n        config (object): A configuration object that holds various settings.\n        pre_process (object): An object with a method for preprocessing text.\n        chunk (dict): A dictionary that contains the content to be processed.\n\n    Returns:\n        dict: The chunk dictionary with the added title and title vector.\n    \"\"\"\n    if 
index_config.chunking.generate_title:\n        title = generate_title(\n            chunk[\"content\"],\n            config.openai.azure_oai_chat_deployment_name,\n            environment,\n            config,\n        )\n        title_vector = config.get_embedding_model(\n            index_config.embedding_model.model_name\n        ).generate_embedding(pre_process.preprocess(title))\n    else:\n        title = \"\"\n        title_vector = []\n\n    chunk[\"title\"] = title\n    chunk[\"title_vector\"] = title_vector\n\n    return chunk\n\n\n@cache_with_checkpoint(\n    id=\"chunk['content']+str(config.index.chunking.generate_summary)\"\n)\ndef process_summary(\n    config: Config, index_config: IndexConfig, pre_process, chunk, environment\n):\n    \"\"\"\n    Processes the summary of a chunk of content.\n\n    If the generate_summary configuration is set to True,\n    a summary is generated for the chunk of content and an embedding is\n    created for it.\n    If it's set to False, the summary is set to an empty string and the summary vector is set to an empty list.\n\n    Args:\n        config (object): A configuration object that holds various settings.\n        index_config (object): An object that holds the index configuration settings.\n        pre_process (object): An object with a method for preprocessing text.\n        chunk (dict): A dictionary that contains the content to be processed.\n        environment (object): An object that holds the environment settings.\n\n    Returns:\n        dict: The chunk dictionary with the added summary and summary vector.\n    \"\"\"\n    if index_config.chunking.generate_summary:\n        summary = generate_summary(\n            chunk[\"content\"],\n            config.openai.azure_oai_chat_deployment_name,\n            environment,\n            config,\n        )\n        summaryVector = config.get_embedding_model(\n            index_config.embedding_model.model_name\n        ).generate_embedding(pre_process.preprocess(summary))\n    else:\n        summary = \"\"\n        summaryVector = []\n\n    
chunk[\"summary\"] = summary\n    chunk[\"summary_vector\"] = summaryVector\n\n    return chunk\n\n\ndef generate_title(chunk, azure_oai_deployment_name, environment, config):\n    \"\"\"\n    Generates a title for a given chunk of text using a language model.\n\n    Args:\n        chunk (str): The input text to generate a title for.\n        azure_oai_deployment_name (str): The name of Azure Open AI deployment to use.\n        environment (object): An object that holds the environment settings.\n        config (object): An object that holds the configuration settings.\n    Returns:\n        str: The generated title.\n    \"\"\"\n    response = ResponseGenerator(\n        environment=environment,\n        config=config,\n        deployment_name=azure_oai_deployment_name,\n    ).generate_response(prompt_instruction_title, text=chunk)\n    return response\n\n\ndef generate_summary(chunk, azure_oai_deployment_name, environment, config):\n    \"\"\"\n    Generates a summary of the given chunk of text using the specified\n    language model.\n\n    Args:\n        chunk (str): The text to summarize.\n        azure_oai_deployment_name (str): The name of Azure Open AI deployment\n        to use.\n        environment (object): An object that holds the environment settings.\n        config (object): An object that holds the configuration settings.\n    Returns:\n        str: The generated summary.\n    \"\"\"\n    response = ResponseGenerator(\n        environment=environment,\n        config=config,\n        deployment_name=azure_oai_deployment_name,\n    ).generate_response(prompt_instruction_summary, text=chunk)\n    return response\n"
  },
  {
    "path": "rag_experiment_accelerator/run/qa_generation.py",
    "content": "import pandas as pd\nfrom os.path import exists\n\nfrom dotenv import load_dotenv\n\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.data_assets.data_asset import create_data_asset\nfrom rag_experiment_accelerator.doc_loader.documentLoader import load_documents\nfrom rag_experiment_accelerator.ingest_data.acs_ingest import generate_qna\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.sampling.clustering import (\n    dataframe_to_chunk_dict,\n    load_parser,\n)\nfrom rag_experiment_accelerator.sampling.clustering import cluster\n\nload_dotenv(override=True)\n\nlogger = get_logger(__name__)\n\n\ndef run(\n    environment: Environment,\n    config: Config,\n    file_paths: list[str],\n):\n    \"\"\"\n    Runs the main experiment loop for the QA generation process using the provided configuration and data.\n\n    Returns:\n        None\n    \"\"\"\n    logger.info(\"Running QA generation\")\n\n    all_docs = {}\n    # Check if we have already sampled\n    if config.index.sampling.sample_data:\n        logger.info(\"Running QA Generation process with sampling\")\n        sampled_cluster_predictions_path = config.path.sampled_cluster_predictions_path(\n            config.index.sampling.optimum_k\n        )\n        if exists(sampled_cluster_predictions_path):\n            df = pd.read_csv(sampled_cluster_predictions_path)\n            all_docs = dataframe_to_chunk_dict(df)\n            logger.info(\"Loaded sampled data\")\n        else:\n            all_docs = load_documents(\n                environment,\n                config.index.chunking.chunking_strategy,\n                config.data_formats,\n                file_paths,\n                2000,\n                0,\n            )\n            parser = load_parser()\n            all_docs = cluster(\n                \"\", all_docs, 
config.path.sampling_output_dir, config, parser\n            )\n    else:\n        all_docs = load_documents(\n            environment,\n            config.index.chunking.chunking_strategy,\n            config.data_formats,\n            file_paths,\n            2000,\n            0,\n        )\n\n    # generate qna\n    df = generate_qna(\n        environment, config, all_docs, config.openai.azure_oai_chat_deployment_name\n    )\n    # write to jsonl\n    df.to_json(config.path.eval_data_file, orient=\"records\", lines=True)\n    # create data asset in mlstudio\n    create_data_asset(config.path.eval_data_file, \"eval_data\", environment)\n"
  },
  {
    "path": "rag_experiment_accelerator/run/querying.py",
    "content": "from concurrent.futures import ThreadPoolExecutor, as_completed\nfrom contextlib import ExitStack\nimport json\nimport numpy as np\nfrom azure.search.documents import SearchClient\nfrom dotenv import load_dotenv\nimport mlflow\nfrom openai import BadRequestError\n\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nfrom rag_experiment_accelerator.artifact.handlers.query_output_handler import (\n    QueryOutputHandler,\n)\nfrom rag_experiment_accelerator.artifact.models.query_output import QueryOutput\nfrom rag_experiment_accelerator.checkpoint import cache_with_checkpoint\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\nfrom rag_experiment_accelerator.evaluation.search_eval import (\n    evaluate_search_result,\n)\nfrom rag_experiment_accelerator.evaluation.spacy_evaluator import (\n    SpacyEvaluator,\n)\n\nfrom rag_experiment_accelerator.ingest_data.acs_ingest import (\n    do_we_need_multiple_questions,\n    generate_multiple_questions,\n)\nfrom rag_experiment_accelerator.reranking.reranker import (\n    cross_encoder_rerank_documents,\n    llm_rerank_documents,\n)\nfrom rag_experiment_accelerator.search_type.acs_search_methods import (\n    create_client,\n    search_for_manual_hybrid,\n    search_for_match_Hybrid_cross,\n    search_for_match_Hybrid_multi,\n    search_for_match_pure_vector,\n    search_for_match_pure_vector_cross,\n    search_for_match_pure_vector_multi,\n    search_for_match_semantic,\n    search_for_match_text,\n)\nfrom rag_experiment_accelerator.utils.logging import get_logger\nfrom rag_experiment_accelerator.config.environment import Environment\n\nfrom rag_experiment_accelerator.llm.response_generator import ResponseGenerator\nfrom rag_experiment_accelerator.llm.prompt import (\n    prompt_generate_hypothetical_answer,\n    
prompt_generate_hypothetical_document,\n    prompt_generate_hypothetical_questions,\n    main_instruction,\n)\n\nload_dotenv(override=True)\n\nlogger = get_logger(__name__)\n\nsearch_mapping = {\n    \"search_for_match_semantic\": search_for_match_semantic,\n    \"search_for_match_Hybrid_multi\": search_for_match_Hybrid_multi,\n    \"search_for_match_Hybrid_cross\": search_for_match_Hybrid_cross,\n    \"search_for_match_text\": search_for_match_text,\n    \"search_for_match_pure_vector\": search_for_match_pure_vector,\n    \"search_for_match_pure_vector_multi\": search_for_match_pure_vector_multi,\n    \"search_for_match_pure_vector_cross\": search_for_match_pure_vector_cross,\n    \"search_for_manual_hybrid\": search_for_manual_hybrid,\n}\n\n\ndef query_acs(\n    search_client: SearchClient,\n    embedding_model: EmbeddingModel,\n    user_prompt: str,\n    s_v: str,\n    retrieve_num_of_documents: str,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Queries the Azure AI Search service using the specified search client and search parameters.\n\n    Args:\n        search_client (SearchClient): The Azure AI Search client to use for querying the service.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        user_prompt (str): The user's search query.\n        s_v (str): The name of the search method to use; must be a key of `search_mapping`.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n        preprocess (bool): Forwarded unchanged to the selected search method.\n\n    Returns:\n        list: A list of documents matching the search query.\n\n    Raises:\n        KeyError: If `s_v` is not a key of `search_mapping`.\n    \"\"\"\n    if s_v not in search_mapping:\n        pass\n\n    return search_mapping[s_v](\n        client=search_client,\n        embedding_model=embedding_model,\n        query=user_prompt,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n        preprocess=preprocess,\n    )\n\n\ndef rerank_documents(\n    docs: list[str],\n    user_prompt: str,\n    output_prompt: str,\n    config: Config,\n    response_generator: ResponseGenerator,\n) -> 
list[str]:\n    \"\"\"\n    Reranks a list of documents based on a given user prompt and configuration.\n\n    Args:\n        docs (list[str]): A list of documents to be reranked.\n        user_prompt (str): The user prompt to be used for reranking.\n        output_prompt (str): The output prompt to be used for reranking.\n        config (Config): A configuration object containing reranking parameters.\n\n    Returns:\n        list[str]: A list of reranked documents.\n    \"\"\"\n    match config.rerank.type:\n        case \"llm\":\n            return llm_rerank_documents(\n                docs,\n                user_prompt,\n                response_generator,\n                config.rerank.llm_rerank_threshold,\n            )\n        case \"cross_encoder\":\n            return cross_encoder_rerank_documents(\n                docs,\n                user_prompt,\n                output_prompt,\n                config.rerank.cross_encoder_model,\n                config.rerank.cross_encoder_at_k,\n            )\n        case _:\n            return []\n\n\ndef hyde(\n    config: Config,\n    response_generator: ResponseGenerator,\n    queries: list[str],\n):\n    if config.query_expansion.hyde == \"disabled\":\n        return queries\n\n    hyde_prompt = {\n        \"generated_hypothetical_answer\": prompt_generate_hypothetical_answer,\n        \"generated_hypothetical_document_to_answer\": prompt_generate_hypothetical_document,\n    }\n\n    if config.query_expansion.hyde not in hyde_prompt:\n        raise NotImplementedError(\n            f\"configuration for hyde with value of [{config.query_expansion.hyde}] is not supported\"\n        )\n\n    generated_queries = [\n        response_generator.generate_response(\n            hyde_prompt[config.query_expansion.hyde], text=query\n        )\n        for query in queries\n    ]\n    return generated_queries\n\n\ndef query_expansion(\n    config: Config,\n    response_generator: ResponseGenerator,\n    embedding_model: 
EmbeddingModel,\n    query: str,\n) -> list[str]:\n    # Query expansion with generated questions\n    augmented_questions = response_generator.generate_response(\n        prompt_generate_hypothetical_questions,\n        text=query,\n        prompt_last=True,\n    )\n\n    if not augmented_questions:\n        return [query]\n\n    # Filter out non related questions\n    questions = filter_non_related_questions(\n        query,\n        augmented_questions,\n        embedding_model,\n        config.query_expansion.min_query_expansion_related_question_similarity_score,\n    )\n\n    return questions\n\n\ndef deduplicate_search_results(search_results: list[dict]) -> list[dict]:\n    doc_set = set()\n    score_dict = {}\n\n    # deduplicate and sort retrieved documents by using a set\n    for doc in search_results:\n        doc_set.add(doc[\"content\"])\n        score_dict[doc[\"content\"]] = doc[\"@search.score\"]\n\n    search_result = list(doc_set)\n    search_result = [\n        {\"content\": doc, \"@search.score\": score_dict[doc]} for doc in search_result\n    ]\n    search_result.sort(key=lambda x: x[\"@search.score\"], reverse=True)\n\n    return search_result\n\n\nclass QueryAndEvalACSResult:\n    def __init__(self, documents: list[str], evaluations: dict[str, any]):\n        self.documents = documents\n        self.evaluations = evaluations\n\n\ndef query_and_eval_acs(\n    search_client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    search_type: str,\n    evaluation_content: str,\n    retrieve_num_of_documents: int,\n    evaluator: SpacyEvaluator,\n    config: Config,\n    response_generator: ResponseGenerator,\n    preprocess: bool = False,\n) -> QueryAndEvalACSResult:\n    \"\"\"\n    Queries the Azure AI Search service using the provided search client and parameters, and evaluates the search\n    results using the provided evaluator and evaluation content. 
Returns a QueryAndEvalACSResult object containing\n    the retrieved documents and the evaluation results.\n\n    Args:\n        search_client (SearchClient): The Azure AI Search client to use for querying the service.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query (str): The search query to execute.\n        search_type (str): The type of search to execute (e.g. 'semantic', 'vector', etc.).\n        evaluation_content (str): The content to use for evaluating the search results.\n        retrieve_num_of_documents (int): The number of documents to retrieve from the search results.\n        evaluator (SpacyEvaluator): The evaluator to use for evaluating the search results.\n        config (Config): The configuration object.\n        response_generator (ResponseGenerator): The response generator object.\n\n    Returns:\n        QueryAndEvalACSResult: An object containing the retrieved documents and the evaluation results.\n    \"\"\"\n\n    if config.query_expansion.query_expansion:\n        generated_queries = query_expansion(\n            config, response_generator, embedding_model, query\n        )\n    else:\n        generated_queries = [query]\n\n    generated_queries = hyde(config, response_generator, generated_queries)\n    search_results = []\n    for generated_query in generated_queries:\n        search_result = query_acs(\n            search_client=search_client,\n            embedding_model=embedding_model,\n            user_prompt=generated_query,\n            s_v=search_type,\n            retrieve_num_of_documents=retrieve_num_of_documents,\n            preprocess=preprocess,\n        )\n        search_results.extend(search_result)\n\n    search_results = deduplicate_search_results(search_results)\n    search_result = search_result[: config.search.retrieve_num_of_documents]\n\n    docs, evaluation = evaluate_search_result(\n        search_results, evaluation_content, evaluator\n    )\n\n    
evaluation[\"query\"] = query\n    return QueryAndEvalACSResult(docs, evaluation)\n\n\ndef filter_non_related_questions(\n    query,\n    generated_questions,\n    embedding_model,\n    min_query_expansion_related_question_similarity_score,\n):\n    questions = [query]\n\n    query_vector = embedding_model.generate_embedding(query)\n\n    for generated_question in generated_questions:\n        generated_question_vector = embedding_model.generate_embedding(\n            generated_question\n        )\n        similarity_score_array = (\n            cosine_similarity(\n                np.array(query_vector).reshape(1, -1),\n                np.array(generated_question_vector).reshape(1, -1),\n            )\n            * 100\n        )\n        similarity_score = int(\n            sum(similarity_score_array) / len(similarity_score_array)\n        )\n        if similarity_score >= min_query_expansion_related_question_similarity_score:\n            questions.append(generated_question)\n\n    return questions\n\n\ndef query_and_eval_acs_multi(\n    search_client: SearchClient,\n    embedding_model: EmbeddingModel,\n    questions: list[str],\n    original_prompt: str,\n    output_prompt: str,\n    search_type: str,\n    evaluation_content: str,\n    config: Config,\n    evaluator: SpacyEvaluator,\n    response_generator: ResponseGenerator,\n    preprocess: bool = False,\n) -> QueryAndEvalACSResult:\n    \"\"\"\n    Queries the Azure AI Search service with multiple questions, evaluates the results, and generates a response\n    using OpenAI's GPT-3 model.\n\n    Args:\n        search_client (SearchClient): The Azure AI Search client.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        questions (list[str]): A list of questions to query the search service with.\n        original_prompt (str): The original prompt to generate the response from.\n        output_prompt (str): The output prompt to use for reranking the search results.\n  
      search_type (str): The type of search to perform (e.g. 'semantic', 'exact').\n        evaluation_content (str): The content to use for evaluation.\n        config (Config): The configuration object.\n        evaluator (SpacyEvaluator): The evaluator object.\n        response_generator (ResponseGenerator): The response generator object.\n        preprocess (bool): Forwarded to `query_and_eval_acs` for every question.\n\n    Returns:\n        QueryAndEvalACSResult: An object containing the response generated for each question (as `documents`) and the evaluation results for each question.\n    \"\"\"\n    context = []\n    evaluations = []\n\n    for question in questions:\n        result = query_and_eval_acs(\n            search_client=search_client,\n            embedding_model=embedding_model,\n            query=question,\n            search_type=search_type,\n            evaluation_content=evaluation_content,\n            retrieve_num_of_documents=config.search.retrieve_num_of_documents,\n            evaluator=evaluator,\n            config=config,\n            response_generator=response_generator,\n            preprocess=preprocess,\n        )\n        if len(result.documents) == 0:\n            logger.warning(f\"No documents found for question: {question}\")\n            continue\n\n        evaluations.append(result.evaluations)\n\n        if config.rerank.enabled:\n            prompt_instruction_context = rerank_documents(\n                result.documents, question, output_prompt, config, response_generator\n            )\n        else:\n            prompt_instruction_context = result.documents\n\n        # TODO: Here was a bug, caused by the fact that we are not limiting the number of documents to retrieve\n        # Current solution is just forcefully limiting the number of documents to retrieve assuming they are sorted\n        if len(prompt_instruction_context) > config.search.retrieve_num_of_documents:\n            prompt_instruction_context = prompt_instruction_context[\n                : config.search.retrieve_num_of_documents\n            ]\n\n        request_context = \"\\n\".join(prompt_instruction_context)\n        request_question 
= original_prompt\n\n        openai_response = response_generator.generate_response(\n            main_instruction,\n            context=request_context,\n            question=request_question,\n        )\n\n        context.append(openai_response)\n        logger.debug(openai_response)\n\n    return QueryAndEvalACSResult(documents=context, evaluations=evaluations)\n\n\ndef query_and_eval_single_line(\n    line: str,\n    line_number: int,\n    handler: QueryOutputHandler,\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    response_generator: ResponseGenerator,\n    search_client: SearchClient,\n    evaluator: SpacyEvaluator,\n    question_count: int,\n):\n    logger.info(f\"Processing question {line_number + 1} out of {question_count}\\n\\n\")\n    data: dict[str, any] = json.loads(line)\n    user_prompt = data.get(\"user_prompt\")\n    output_prompt = data.get(\"output_prompt\")\n    qna_context = data.get(\"context\", \"\")\n\n    is_multi_question = (\n        config.query_expansion.expand_to_multiple_questions\n        and do_we_need_multiple_questions(user_prompt, response_generator, config)\n    )\n\n    new_questions = []\n    if is_multi_question:\n        new_questions = generate_multiple_questions(user_prompt, response_generator)\n\n        if new_questions is None:\n            logger.warning(\n                f\"Unable to generate multiple questions for: {user_prompt}. 
Skipping...\"\n            )\n            is_multi_question = False\n        else:\n            new_questions.append(user_prompt)\n\n    evaluation_content = user_prompt + qna_context\n\n    try:\n        for s_v in config.search.search_type:\n            output = get_query_output(\n                environment,\n                config,\n                index_config,\n                response_generator,\n                search_client,\n                evaluator,\n                question_count,\n                user_prompt,\n                output_prompt,\n                qna_context,\n                is_multi_question,\n                new_questions,\n                evaluation_content,\n                s_v,\n            )\n            handler.save(\n                index_name=index_config.index_name(),\n                data=output,\n                experiment_name=config.experiment_name,\n                job_name=config.job_name,\n            )\n    except BadRequestError as e:\n        logger.error(\n            \"Invalid request. 
Skipping question: {user_prompt}\",\n            exc_info=e,\n        )\n\n\n@cache_with_checkpoint(\n    id=\"user_prompt+output_prompt+qna_context+index_config.index_name()\"\n)\ndef get_query_output(\n    environment,\n    config,\n    index_config,\n    response_generator,\n    search_client,\n    evaluator,\n    question_count,\n    user_prompt,\n    output_prompt,\n    qna_context,\n    is_multi_question,\n    new_questions,\n    evaluation_content,\n    s_v,\n):\n    search_evals = []\n\n    response_generator = ResponseGenerator(\n        environment, config, config.openai.azure_oai_chat_deployment_name\n    )\n\n    embedding_model = config.get_embedding_model(\n        index_config.embedding_model.model_name\n    )\n\n    if is_multi_question:\n        result = query_and_eval_acs_multi(\n            search_client=search_client,\n            embedding_model=embedding_model,\n            questions=new_questions,\n            original_prompt=user_prompt,\n            output_prompt=output_prompt,\n            search_type=s_v,\n            evaluation_content=evaluation_content,\n            config=config,\n            evaluator=evaluator,\n            response_generator=response_generator,\n            preprocess=index_config.chunking.preprocess,\n        )\n    else:\n        result = query_and_eval_acs(\n            search_client=search_client,\n            embedding_model=embedding_model,\n            query=user_prompt,\n            search_type=s_v,\n            evaluation_content=evaluation_content,\n            retrieve_num_of_documents=config.search.retrieve_num_of_documents,\n            evaluator=evaluator,\n            config=config,\n            response_generator=response_generator,\n            preprocess=index_config.chunking.preprocess,\n        )\n        search_evals.append(result.evaluations)\n    if config.rerank.enabled and len(result.documents) > 0:\n        prompt_instruction_context = rerank_documents(\n            result.documents,\n     
       user_prompt,\n            output_prompt,\n            config,\n            response_generator,\n        )\n    else:\n        prompt_instruction_context = result.documents\n\n    openai_response = response_generator.generate_response(\n        main_instruction,\n        context=\"\\n\".join(prompt_instruction_context),\n        question=user_prompt,\n    )\n\n    output = QueryOutput(\n        rerank=config.rerank.enabled,\n        rerank_type=config.rerank.type,\n        cross_encoder_model=config.rerank.cross_encoder_model,\n        llm_rerank_threshold=config.rerank.llm_rerank_threshold,\n        retrieve_num_of_documents=config.search.retrieve_num_of_documents,\n        cross_encoder_at_k=config.rerank.cross_encoder_at_k,\n        question_count=question_count,\n        actual=openai_response,\n        expected=output_prompt,\n        search_type=s_v,\n        search_evals=search_evals,\n        context=qna_context,\n        retrieved_contexts=prompt_instruction_context,\n        question=user_prompt,\n    )\n\n    return output\n\n\ndef run(\n    environment: Environment,\n    config: Config,\n    index_config: IndexConfig,\n    mlflow_client: mlflow.MlflowClient,\n):\n    \"\"\"\n    Runs the main experiment loop, which evaluates a set of search configurations against a given dataset.\n\n    Returns:\n        None\n    \"\"\"\n    question_count = 0\n    try:\n        with open(config.path.eval_data_file, \"r\") as file:\n            for line in file:\n                question_count += 1\n    except FileNotFoundError as e:\n        logger.error(\"The file does not exist: \" + config.path.eval_data_file)\n        raise e\n\n    mlflow.log_metric(\"question_count\", question_count)\n\n    evaluator = SpacyEvaluator(config.search.search_relevancy_threshold)\n    handler = QueryOutputHandler(config.path.query_data_dir)\n    response_generator = ResponseGenerator(\n        environment, config, config.openai.azure_oai_chat_deployment_name\n    )\n    for 
index_config in config.index.flatten():\n        index_name = index_config.index_name()\n        logger.info(f\"Processing index: {index_name}\")\n\n        handler.handle_archive_by_index(\n            index_name, config.experiment_name, config.job_name\n        )\n\n        search_client = create_client(\n            environment.azure_search_service_endpoint,\n            index_name,\n            environment.azure_search_admin_key,\n        )\n        with open(config.path.eval_data_file, \"r\") as file:\n            with ExitStack() as stack:\n                executor = stack.enter_context(\n                    ThreadPoolExecutor(config.max_worker_threads)\n                )\n                futures = {\n                    executor.submit(\n                        query_and_eval_single_line,\n                        line,\n                        line_number,\n                        handler,\n                        environment,\n                        config,\n                        index_config,\n                        response_generator,\n                        search_client,\n                        evaluator,\n                        question_count,\n                    ): line\n                    for line_number, line in enumerate(file)\n                }\n\n                for future in as_completed(futures):\n                    try:\n                        future.result()\n                    except Exception as exc:\n                        logger.error(\n                            f\"query generated an exception: {exc} for line {line}...\"\n                        )\n\n        search_client.close()\n"
  },
  {
    "path": "rag_experiment_accelerator/run/tests/data/test_data.jsonl",
    "content": "{\"user_prompt\":\"What happens when there is a lack of dopamine in the brain?\",\"output_prompt\":\"When there is a lack of dopamine in the brain, it can lead to movement disorders such as Parkinson's disease.\",\"context\":\"Normally, there are brain cells (neurons) in the human brain that produce dopamine. These neurons concentrate in a particular area of the brain, called the substantia nigra. Dopamine is a chemical that relays messages between the substantia nigra and other parts of the brain to control movements of the human body.\"}\n"
  },
  {
    "path": "rag_experiment_accelerator/run/tests/test_index.py",
    "content": "from unittest.mock import MagicMock, patch\n\nfrom rag_experiment_accelerator.checkpoint.checkpoint_factory import init_checkpoint\nfrom rag_experiment_accelerator.config.chunking_config import ChunkingConfig\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.embedding_model_config import (\n    EmbeddingModelConfig,\n)\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.language_config import LanguageConfig\nfrom rag_experiment_accelerator.config.openai_config import OpenAIConfig\nfrom rag_experiment_accelerator.config.query_expansion import QueryExpansionConfig\nfrom rag_experiment_accelerator.config.sampling_config import SamplingConfig\nfrom rag_experiment_accelerator.run.index import run\nfrom rag_experiment_accelerator.config.paths import get_all_file_paths\n\n\n@patch(\"rag_experiment_accelerator.run.index.mlflow.MlflowClient\")\n@patch(\"rag_experiment_accelerator.config.config.create_embedding_model\")\n@patch(\"rag_experiment_accelerator.run.index.upload_data\")\n@patch(\"rag_experiment_accelerator.run.index.cluster\")\n@patch(\"rag_experiment_accelerator.run.index.load_documents\")\n@patch(\"rag_experiment_accelerator.run.index.create_acs_index\")\n@patch(\"rag_experiment_accelerator.run.index.Preprocess\")\n@patch(\"rag_experiment_accelerator.run.index.Environment\")\ndef test_run(\n    mock_environment,\n    mock_preprocess,\n    mock_create_acs_index,\n    mock_load_documents,\n    mock_cluster,\n    mock_upload_data,\n    mock_create_embedding_model,\n    mock_mlflow_client,\n):\n    # Arrange\n    data_dir = \"./data\"\n\n    embedding_model_1 = MagicMock()\n    embedding_model_1.model_name.return_value = \"all-MiniLM-L6-v2\"\n    embedding_model_1.dimension.return_value = 384\n\n    embedding_model_2 = MagicMock()\n    embedding_model_2.model_name.return_value = \"text-embedding-ada-002\"\n    
embedding_model_2.dimension.return_value = 1536\n    mock_create_embedding_model.side_effect = [embedding_model_1, embedding_model_2]\n\n    mock_config = MagicMock(spec=Config)\n    mock_config.data_formats = [\"format1\", \"format2\"]\n    mock_config.use_checkpoints = False\n    mock_config.max_worker_threads = 1\n\n    mock_config.use_checkpoints = False\n\n    mock_config.index = IndexConfig(\n        index_name_prefix=\"prefix\",\n        ef_construction=[300, 400],\n        ef_search=[300, 400],\n        chunking=ChunkingConfig(\n            preprocess=False,\n            chunk_size=[10, 20],\n            overlap_size=[5, 10],\n            chunking_strategy=\"chunking_strategy\",\n            generate_title=False,\n            generate_summary=False,\n            override_content_with_summary=False,\n            azure_document_intelligence_model=\"prebuilt-read\",\n        ),\n        embedding_model=[\n            EmbeddingModelConfig(model_name=\"model1\"),\n            EmbeddingModelConfig(model_name=\"model2\"),\n        ],\n        sampling=SamplingConfig(sample_data=False),\n    )\n\n    mock_config.language = MagicMock(\n        spec=LanguageConfig, analyzer=[\"analyzer1\", \"analyzer2\"]\n    )\n\n    mock_config.query_expansion = MagicMock(\n        spec=QueryExpansionConfig, query_expansion=False\n    )\n\n    mock_config.openai = MagicMock(spec=OpenAIConfig)\n    mock_config.openai.azure_oai_chat_deployment_name = \"test-deployment\"\n\n    mock_environment.azure_search_service_endpoint = \"service_endpoint\"\n    mock_environment.azure_search_admin_key = \"admin_key\"\n    mock_environment.azure_document_intelligence_endpoint = (\n        \"document_intelligence_endpoint\"\n    )\n    mock_environment.azure_document_intelligence_key = \"document_intelligence_key\"\n\n    mock_preprocess.return_value.preprocess.return_value = \"preprocessed_value\"\n\n    mock_load_documents.return_value = [\n        {\"key1\": {\"content\": \"content1\", 
\"metadata\": {\"source\": \"source1\"}}},\n        {\"key2\": {\"content\": \"content2\", \"metadata\": {\"source\": \"source2\"}}},\n        {\"key3\": {\"content\": \"content3\", \"metadata\": {\"source\": \"source3\"}}},\n    ]\n\n    mock_cluster.return_value = [\n        {\"cluster1\": {\"content\": \"content1\", \"metadata\": {\"source\": \"source1\"}}},\n        {\"cluster2\": {\"content\": \"content2\", \"metadata\": {\"source\": \"source2\"}}},\n        {\"cluster3\": {\"content\": \"content3\", \"metadata\": {\"source\": \"source3\"}}},\n    ]\n    file_paths = get_all_file_paths(data_dir)\n\n    # Act\n    for index_config in mock_config.index.flatten():\n        init_checkpoint(mock_config)\n        run(mock_environment, mock_config, index_config, file_paths, mock_mlflow_client)\n\n    # Assert\n    assert mock_preprocess.call_count == 32\n    assert mock_create_acs_index.call_count == 32\n    assert mock_load_documents.call_count == 32\n    assert mock_upload_data.call_count == 32\n    assert mock_create_acs_index.call_args_list[0][0][0] == \"service_endpoint\"\n    assert mock_create_acs_index.call_args_list[0][0][2] == \"admin_key\"\n\n    assert mock_load_documents.call_args_list[0][0][1] == \"chunking_strategy\"\n    assert mock_load_documents.call_args_list[0][0][2] == [\"format1\", \"format2\"]\n    assert mock_load_documents.call_args_list[0][0][3] == file_paths\n    assert mock_load_documents.call_args_list[0][0][4] == 10\n    assert mock_load_documents.call_args_list[0][0][5] == 5\n"
  },
  {
    "path": "rag_experiment_accelerator/run/tests/test_qa_generation.py",
    "content": "from unittest.mock import MagicMock, patch\n\nfrom rag_experiment_accelerator.run.qa_generation import run\n\n\n@patch(\"rag_experiment_accelerator.run.qa_generation.Environment\")\n@patch(\"rag_experiment_accelerator.run.qa_generation.exists\")\n@patch(\"rag_experiment_accelerator.run.qa_generation.load_documents\")\n@patch(\"rag_experiment_accelerator.run.qa_generation.cluster\")\n@patch(\"rag_experiment_accelerator.run.qa_generation.generate_qna\")\n@patch(\"rag_experiment_accelerator.run.qa_generation.create_data_asset\")\ndef test_run(\n    mock_create_data_asset,\n    mock_generate_qna,\n    mock_cluster,\n    mock_load_documents,\n    mock_exists,\n    mock_environment,\n):\n    # Arrange\n    data_dir = \"test_data_dir\"\n    df_instance = MagicMock()\n\n    mock_config = MagicMock()\n    mock_config.index.sampling.sample_data = True\n    mock_config.index.sampling.optimum_k = 3\n\n    sampled_input_data_csv_path = f\"{data_dir}/sampling/sampled_cluster_predictions_cluster_number_{mock_config.index.sampling.optimum_k}.csv\"\n    mock_config.path.sampled_cluster_predictions_path.return_value = (\n        sampled_input_data_csv_path\n    )\n    mock_exists.return_value = False\n\n    mock_load_documents.return_value = all_docs_instance = MagicMock()\n    mock_cluster.return_value = all_docs_instance = MagicMock()\n    mock_generate_qna.return_value = df_instance\n    filepaths = [\"file_path_one\", \"file_path_two\"]\n\n    # Act\n    run(mock_environment, mock_config, filepaths)\n\n    # Assert\n    mock_load_documents.assert_called_once_with(\n        mock_environment,\n        mock_config.index.chunking.chunking_strategy,\n        mock_config.data_formats,\n        filepaths,\n        2000,\n        0,\n    )\n    mock_generate_qna.assert_called_once_with(\n        mock_environment,\n        mock_config,\n        all_docs_instance,\n        mock_config.openai.azure_oai_chat_deployment_name,\n    )\n    
df_instance.to_json.assert_called_once_with(\n        mock_config.path.eval_data_file, orient=\"records\", lines=True\n    )\n    mock_create_data_asset.assert_called_once_with(\n        mock_config.path.eval_data_file, \"eval_data\", mock_environment\n    )\n"
  },
  {
    "path": "rag_experiment_accelerator/run/tests/test_querying.py",
    "content": "import unittest\nimport os\nfrom unittest.mock import MagicMock, patch\nfrom azure.search.documents import SearchClient\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\nfrom rag_experiment_accelerator.config.chunking_config import ChunkingConfig\nfrom rag_experiment_accelerator.config.openai_config import OpenAIConfig\nfrom rag_experiment_accelerator.config.path_config import PathConfig\nfrom rag_experiment_accelerator.config.query_expansion import QueryExpansionConfig\nfrom rag_experiment_accelerator.config.rerank_config import RerankConfig\nfrom rag_experiment_accelerator.config.search_config import SearchConfig\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.config.index_config import IndexConfig\nfrom rag_experiment_accelerator.config.environment import Environment\nfrom rag_experiment_accelerator.run.querying import (\n    QueryAndEvalACSResult,\n    query_acs,\n    query_and_eval_single_line,\n    rerank_documents,\n    query_and_eval_acs,\n    query_and_eval_acs_multi,\n)\nfrom rag_experiment_accelerator.llm.prompt import Prompt, main_instruction\n\n\nclass TestQuerying(unittest.TestCase):\n    def setUp(self):\n        self.mock_config = MagicMock(spec=Config)\n\n        self.mock_config.use_checkpoints = False\n\n        self.mock_config.index = MagicMock(spec=IndexConfig)\n        self.mock_config.index.index_name_prefix = \"prefix\"\n        self.mock_config.index.ef_construction = [400]\n        self.mock_config.index.ef_search = [400]\n        self.mock_config.index.chunking = MagicMock(spec=ChunkingConfig)\n        self.mock_config.index.chunking.preprocess = False\n        self.mock_config.index.chunking.chunk_size = [1]\n        self.mock_config.index.chunking.overlap_size = [1]\n        self.mock_config.index.embedding_model = MagicMock(spec=EmbeddingModel)\n        
self.mock_config.index.embedding_model.model_name = \"test-embedding-model\"\n\n        self.mock_config.query_expansion = MagicMock(spec=QueryExpansionConfig)\n        self.mock_config.query_expansion.query_expansion = False\n        self.mock_config.query_expansion.hyde = \"disabled\"\n        self.mock_config.query_expansion.min_query_expansion_related_question_similarity_score = (\n            90\n        )\n        self.mock_config.query_expansion.expand_to_multiple_questions = True\n\n        self.mock_config.openai = MagicMock(spec=OpenAIConfig)\n        self.mock_config.openai.azure_oai_chat_deployment_name = \"test-deployment\"\n\n        self.mock_config.rerank = MagicMock(spec=RerankConfig)\n        self.mock_config.rerank.enabled = True\n        self.mock_config.rerank.type = \"llm\"\n        self.mock_config.rerank.llm_rerank_threshold = 3\n\n        self.mock_config.search = MagicMock(spec=SearchConfig)\n        self.mock_config.search.retrieve_num_of_documents = 10\n        self.mock_config.search.search_type = [\"search_for_match_semantic\"]\n\n        self.mock_config.path = MagicMock(spec=PathConfig)\n        self.mock_config.path.eval_data_file = \"test_data.jsonl\"\n\n        self.mock_environment = MagicMock(spec=Environment)\n        self.mock_search_client = MagicMock(spec=SearchClient)\n        self.mock_embedding_model = MagicMock(spec=EmbeddingModel)\n\n        self.prompt = MagicMock(spec=Prompt)\n        self.prompt.tags = {}\n        self.prompt.system_message = \"system message\"\n        self.prompt.user_template = \"user template\"\n\n    @patch(\"rag_experiment_accelerator.run.querying.search_mapping\")\n    def test_query_acs(self, mock_search_mapping):\n        user_prompt = \"test prompt\"\n        s_v = \"search_for_match_semantic\"\n        retrieve_num_of_documents = \"10\"\n\n        query_acs(\n            self.mock_search_client,\n            self.mock_embedding_model,\n            user_prompt,\n            s_v,\n           
 retrieve_num_of_documents,\n        )\n\n        mock_search_mapping[s_v].assert_called_once_with(\n            client=self.mock_search_client,\n            embedding_model=self.mock_embedding_model,\n            query=user_prompt,\n            retrieve_num_of_documents=retrieve_num_of_documents,\n            preprocess=False,\n        )\n\n    @patch(\"rag_experiment_accelerator.run.querying.llm_rerank_documents\")\n    @patch(\"rag_experiment_accelerator.run.querying.cross_encoder_rerank_documents\")\n    @patch(\"rag_experiment_accelerator.run.querying.ResponseGenerator\")\n    def test_rerank_documents(\n        self,\n        mock_response_generator,\n        mock_cross_encoder_rerank_documents,\n        mock_llm_rerank_documents,\n    ):\n        docs = [\"doc1\", \"doc2\"]\n        user_prompt = \"test prompt\"\n        output_prompt = \"output prompt\"\n\n        rerank_documents(\n            docs, user_prompt, output_prompt, self.mock_config, mock_response_generator\n        )\n\n        mock_llm_rerank_documents.assert_called_once()\n        mock_cross_encoder_rerank_documents.assert_not_called()\n\n    @patch(\"rag_experiment_accelerator.run.querying.query_acs\")\n    @patch(\"rag_experiment_accelerator.run.querying.evaluate_search_result\")\n    @patch(\"rag_experiment_accelerator.run.querying.ResponseGenerator\")\n    def test_query_and_eval_acs(\n        self, mock_response_generator, mock_evaluate_search_result, mock_query_acs\n    ):\n        # Arrange\n        query = \"test query\"\n        search_type = \"test search type\"\n        evaluation_content = \"test evaluation content\"\n        retrieve_num_of_documents = 10\n        mock_evaluator = MagicMock()\n        mock_search_result = [\n            {\"content\": \"text1\", \"@search.score\": 10},\n            {\"content\": \"text2\", \"@search.score\": 9},\n        ]\n        mock_docs = [\"doc1\", \"doc2\"]\n        mock_evaluation = {\"score\": 0.8}\n\n        mock_query_acs.return_value = 
mock_search_result\n        mock_evaluate_search_result.return_value = (mock_docs, mock_evaluation)\n\n        # Act\n        result = query_and_eval_acs(\n            self.mock_search_client,\n            self.mock_embedding_model,\n            query,\n            search_type,\n            evaluation_content,\n            retrieve_num_of_documents,\n            mock_evaluator,\n            self.mock_config,\n            mock_response_generator(),\n        )\n\n        # Assert\n        mock_query_acs.assert_called_once_with(\n            search_client=self.mock_search_client,\n            embedding_model=self.mock_embedding_model,\n            user_prompt=query,\n            s_v=search_type,\n            retrieve_num_of_documents=retrieve_num_of_documents,\n            preprocess=False,\n        )\n        mock_evaluate_search_result.assert_called_once_with(\n            mock_search_result, evaluation_content, mock_evaluator\n        )\n        self.assertEqual(result.documents, mock_docs)\n        self.assertEqual(result.evaluations, mock_evaluation)\n\n    @patch(\"rag_experiment_accelerator.run.querying.query_and_eval_acs\")\n    @patch(\"rag_experiment_accelerator.run.querying.rerank_documents\")\n    @patch(\"rag_experiment_accelerator.run.querying.ResponseGenerator\")\n    def test_query_and_eval_acs_multi_rerank(\n        self,\n        mock_response_generator,\n        mock_rerank_documents,\n        mock_query_and_eval_acs,\n    ):\n        # Arrange\n        questions = [\"question1\", \"question2\"]\n        original_prompt = \"original prompt\"\n        output_prompt = \"output prompt\"\n        search_type = \"search type\"\n        evaluation_content = \"evaluation content\"\n        evaluator = MagicMock()\n        mock_docs = [\"doc1\", \"doc2\"]\n        mock_evaluation = {\"score\": 0.8}\n\n        mock_query_and_eval_acs.side_effect = [\n            QueryAndEvalACSResult(mock_docs, mock_evaluation),\n            QueryAndEvalACSResult(mock_docs, 
mock_evaluation),\n        ]\n        mock_rerank_documents.return_value = prompt_instruction_context = [\n            \"context1\",\n            \"context2\",\n        ]\n        mock_response_generator.return_value.generate_response.return_value = (\n            \"openai response\"\n        )\n\n        # Act\n        result = query_and_eval_acs_multi(\n            self.mock_search_client,\n            self.mock_embedding_model,\n            questions,\n            original_prompt,\n            output_prompt,\n            search_type,\n            evaluation_content,\n            self.mock_config,\n            evaluator,\n            mock_response_generator(),\n        )\n\n        # Assert\n        mock_query_and_eval_acs.assert_called_with(\n            search_client=self.mock_search_client,\n            embedding_model=self.mock_embedding_model,\n            query=questions[1] or questions[0],\n            search_type=search_type,\n            evaluation_content=evaluation_content,\n            retrieve_num_of_documents=self.mock_config.search.retrieve_num_of_documents,\n            evaluator=evaluator,\n            config=self.mock_config,\n            response_generator=mock_response_generator(),\n            preprocess=False,\n        )\n        # mock_rerank_documents.assert_not_called()\n        mock_rerank_documents.assert_called_with(\n            mock_docs,\n            questions[1] or questions[0],\n            output_prompt,\n            self.mock_config,\n            mock_response_generator(),\n        )\n        mock_response_generator.return_value.generate_response.assert_called_with(\n            main_instruction,\n            context=\"\\n\".join(prompt_instruction_context),\n            question=original_prompt,\n        )\n        self.assertEqual(result.documents, [\"openai response\", \"openai response\"])\n        self.assertEqual(result.evaluations, [mock_evaluation, mock_evaluation])\n\n    
@patch(\"rag_experiment_accelerator.run.querying.query_and_eval_acs\")\n    @patch(\"rag_experiment_accelerator.run.querying.rerank_documents\")\n    @patch(\"rag_experiment_accelerator.run.querying.ResponseGenerator\")\n    def test_query_and_eval_acs_multi_no_rerank(\n        self,\n        mock_response_generator,\n        mock_rerank_documents,\n        mock_query_and_eval_acs,\n    ):\n        # Arrange\n        questions = [\"question1\", \"question2\"]\n        original_prompt = \"original prompt\"\n        output_prompt = \"output prompt\"\n        search_type = \"search type\"\n        evaluation_content = \"evaluation content\"\n        self.mock_config.rerank = MagicMock(spec=RerankConfig)\n        self.mock_config.rerank.enabled = False\n        evaluator = MagicMock()\n        mock_docs = [\"doc1\", \"doc2\"]\n        mock_evaluation = {\"score\": 0.8}\n\n        mock_query_and_eval_acs.side_effect = [\n            QueryAndEvalACSResult(mock_docs, mock_evaluation),\n            QueryAndEvalACSResult(mock_docs, mock_evaluation),\n        ]\n\n        mock_response_generator.return_value.generate_response.return_value = (\n            \"openai response\"\n        )\n\n        # Act\n        result = query_and_eval_acs_multi(\n            self.mock_search_client,\n            self.mock_embedding_model,\n            questions,\n            original_prompt,\n            output_prompt,\n            search_type,\n            evaluation_content,\n            self.mock_config,\n            evaluator,\n            response_generator=mock_response_generator(),\n        )\n\n        # Assert\n        mock_query_and_eval_acs.assert_called_with(\n            search_client=self.mock_search_client,\n            embedding_model=self.mock_embedding_model,\n            query=questions[1] or questions[0],\n            search_type=search_type,\n            evaluation_content=evaluation_content,\n            
retrieve_num_of_documents=self.mock_config.search.retrieve_num_of_documents,\n            evaluator=evaluator,\n            config=self.mock_config,\n            response_generator=mock_response_generator(),\n            preprocess=False,\n        )\n        mock_rerank_documents.assert_not_called()\n        mock_response_generator.return_value.generate_response.assert_called_with(\n            main_instruction,\n            context=\"\\n\".join(mock_docs),\n            question=original_prompt,\n        )\n        self.assertEqual(result.documents, [\"openai response\", \"openai response\"])\n        self.assertEqual(result.evaluations, [mock_evaluation, mock_evaluation])\n\n    @patch(\"rag_experiment_accelerator.run.querying.Environment\")\n    @patch(\"rag_experiment_accelerator.run.querying.SpacyEvaluator\")\n    @patch(\"rag_experiment_accelerator.run.querying.QueryOutputHandler\")\n    @patch(\"rag_experiment_accelerator.run.querying.ResponseGenerator\")\n    @patch(\"rag_experiment_accelerator.run.querying.QueryOutput\")\n    @patch(\"rag_experiment_accelerator.run.querying.do_we_need_multiple_questions\")\n    @patch(\"rag_experiment_accelerator.run.querying.query_and_eval_acs\")\n    def test_run_no_multi_no_rerank(\n        self,\n        mock_query_and_eval_acs,\n        mock_do_we_need_multiple_questions,\n        mock_query_output,\n        mock_response_generator,\n        mock_query_output_handler,\n        mock_spacy_evaluator,\n        mock_environment,\n    ):\n        # Arrange\n        mock_query_output_handler.return_value.load.return_value = [mock_query_output]\n        mock_query_output_handler.return_value.save.side_effect = None\n        test_dir = os.path.dirname(os.path.abspath(__file__))\n        data_file_path = test_dir + \"/data/test_data.jsonl\"\n        self.mock_config.path.eval_data_file = data_file_path\n        self.mock_config.rerank = MagicMock(spec=RerankConfig)\n        self.mock_config.rerank.enabled = False\n        
mock_do_we_need_multiple_questions.return_value = False\n        mock_query_and_eval_acs.return_value = MagicMock()\n        mock_search_client = MagicMock(SearchClient)\n\n        init_checkpoint(self.mock_config)\n        # Act\n        with open(data_file_path, \"r\") as file:\n            line = file.readline()\n        query_and_eval_single_line(\n            line,\n            1,\n            mock_query_output_handler,\n            mock_environment,\n            self.mock_config,\n            self.mock_config.index,\n            mock_response_generator,\n            mock_search_client,\n            mock_spacy_evaluator,\n            1,\n        )\n\n        # Assert\n        mock_query_and_eval_acs.assert_called()\n        mock_query_output_handler.save.assert_called()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "rag_experiment_accelerator/sampling/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/sampling/clustering.py",
    "content": "import os\nimport warnings\nimport numpy as np\nimport matplotlib\nimport matplotlib.pyplot as plt\nimport plotly.express as px\nimport pandas as pd\nfrom tqdm import tqdm\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.cluster import KMeans\nfrom umap import UMAP\nfrom scipy.spatial.distance import cdist\nfrom rag_experiment_accelerator.checkpoint import cache_with_checkpoint\nfrom rag_experiment_accelerator.config.config import Config\nfrom rag_experiment_accelerator.utils.logging import get_logger\nimport shutil\n\nmatplotlib.use(\"Agg\")\nplt.style.use(\"ggplot\")\nwarnings.filterwarnings(\"ignore\")\nlogger = get_logger(__name__)\n\n\ndef load_parser():\n    from spacy import load\n\n    try:\n        parser = load(\"en_core_web_lg\", disable=[\"ner\"])\n    except OSError:\n        logger.info(\"Downloading spacy language model: en_core_web_lg\")\n        from spacy.cli import download\n\n        download(\"en_core_web_lg\")\n        parser = load(\"en_core_web_lg\", disable=[\"ner\"])\n\n    parser.max_length = 7000000\n\n    return parser\n\n\ndef spacy_tokenizer(sentence, parser):\n    \"\"\"\n    Tokenizes a sentence using the Spacy library.\n\n    Args:\n        sentence (str): The input sentence to be tokenized.\n\n    Returns:\n        str: The tokenized sentence.\n\n    \"\"\"\n\n    if not isinstance(sentence, str):\n        sentence = sentence[\"content\"]\n\n    tokens = [\n        word.lemma_.lower().strip() if word.lemma_ != \"-PRON-\" else word.lower_\n        for word in parser(sentence)\n        if not word.is_stop and not word.is_punct\n    ]\n    tokenized_sentence = \" \".join([token for token in tokens])\n    return tokenized_sentence\n\n\ndef determine_optimum_k_elbow(embeddings_2d, X, min_cluster, max_cluster, result_dir):\n    \"\"\"\n    Determines the optimal number of clusters using the Elbow Method.\n\n    Args:\n        embeddings_2d (numpy.ndarray): 2D embeddings of the data.\n        X 
(numpy.ndarray): Input data.\n        min_cluster (int): Minimum number of clusters to consider.\n        max_cluster (int): Maximum number of clusters to consider.\n        result_dir (str): Directory to save the output files.\n\n    Returns:\n        int: The optimum number of clusters.\n\n    \"\"\"\n    logger.info(\"Determining optimal k\")\n\n    # Run kmeans with many different k\n    distortions = []\n    inertias = []\n    mapping1 = {}\n    mapping2 = {}\n\n    K = range(min_cluster, max_cluster)\n    for k in tqdm(K):\n        k_means = KMeans(n_clusters=k, random_state=42).fit(embeddings_2d)\n        k_means.fit(embeddings_2d)\n        distortions.append(\n            sum(\n                np.min(\n                    cdist(embeddings_2d, k_means.cluster_centers_, \"euclidean\"), axis=1\n                )\n            )\n            / X.shape[0]\n        )\n        inertias.append(k_means.inertia_)\n        mapping1[k] = (\n            sum(\n                np.min(\n                    cdist(embeddings_2d, k_means.cluster_centers_, \"euclidean\"), axis=1\n                )\n            )\n            / X.shape[0]\n        )\n        mapping2[k] = k_means.inertia_\n\n    lbig = max(abs(x - y) for (x, y) in zip(distortions[1:], distortions[:-1]))\n\n    x_line = [K[0], K[-1]]\n    y_line = [distortions[0], distortions[-1]]\n\n    opt_k = []\n\n    for i, distortion in enumerate(distortions):\n        if i != len(distortions) - 1:\n            if distortion - distortions[i + 1] == lbig:\n                opt_k.append(\n                    list(mapping1.keys())[\n                        list(mapping1.values()).index(distortions[i + 3])\n                    ]\n                )\n\n    logger.info(f\"The optimum cluster number is {opt_k[0]}\")\n    optimum_k = opt_k[0]\n\n    plt.plot(K, distortions, \"b-\")\n    plt.plot(x_line, y_line, \"r\")\n    plt.xlabel(\"k\")\n    plt.ylabel(\"Distortion\")\n    plt.title(\"The Elbow Method showing the optimal k\")\n   
 plt.savefig(f\"{result_dir}/elbow_{optimum_k}.png\")\n\n    return optimum_k\n\n\ndef vectorize_tfidf(text, max_features):\n    \"\"\"\n    Vectorizes the given text using TF-IDF representation.\n\n    Args:\n        text (list): A list of strings representing the text documents.\n        max_features (int): The maximum number of features to keep.\n\n    Returns:\n        scipy.sparse.csr_matrix: The TF-IDF matrix representation of the text.\n    \"\"\"\n    vectorizer = TfidfVectorizer(max_features=max_features)\n    X = vectorizer.fit_transform(text)\n    return X\n\n\ndef dataframe_to_chunk_dict(df_concat):\n    \"\"\"\n    Convert a dataframe to a dictionary of chunks.\n\n    Args:\n        df_concat (pandas.DataFrame): The dataframe containing the chunks.\n\n    Returns:\n        list: A list of dictionaries, where each dictionary represents a chunk with its corresponding text.\n    \"\"\"\n    sampled_chunks = []\n    for i, row in enumerate(df_concat.itertuples()):\n        chunk = {row.chunk: row.text}\n        sampled_chunks.append(chunk)\n\n    return sampled_chunks\n\n\ndef chunk_dict_to_dataframe(all_chunks):\n    \"\"\"\n    Convert a list of dictionaries containing chunks and text into a pandas DataFrame.\n\n    Parameters:\n    all_chunks (list[dict]): A list of dictionaries where each dictionary contains a chunk and its corresponding text.\n\n    Returns:\n    df (pandas.DataFrame): A DataFrame with three columns - 'chunk', 'text' and 'filename, where 'chunk' contains the chunks and 'text' contains the corresponding text and 'filename' the file name.\n    \"\"\"\n\n    chunks = []\n    text = []\n    filename = []\n\n    for row in all_chunks:\n        key, value = list(row.items())[0]\n        chunks.append(key)\n        text.append(value)\n        filename.append(value[\"metadata\"][\"source\"])\n\n    df = pd.DataFrame({\"chunk\": chunks, \"text\": text, \"filename\": filename})\n\n    return df\n\n\ndef cluster_kmeans(embeddings_2d, optimum_k, 
df, result_dir):\n    \"\"\"\n    Perform K-means clustering on 2D embeddings.\n\n    Args:\n        embeddings_2d (numpy.ndarray): 2D embeddings array.\n        optimum_k (int): Number of clusters to create.\n        df (pandas.DataFrame): DataFrame containing additional data.\n        result_dir (str): Directory to save the clustering results.\n\n    Returns:\n        tuple: A tuple containing the following lists:\n            - x (list): X-coordinates of the embeddings.\n            - y (list): Y-coordinates of the embeddings.\n            - text (list): Text data from the DataFrame.\n            - processed_text (list): Processed text data from the DataFrame.\n            - chunk (list): Chunk data from the DataFrame.\n            - prediction (list): Cluster labels assigned by K-means.\n            - prediction_values (list): Unique cluster labels.\n            - filenames (list): File names of the sampled data.\n\n    \"\"\"\n    logger.info(\"Clustering chunks\")\n    kmeans = KMeans(n_clusters=optimum_k)\n    kmeans.fit(embeddings_2d)\n\n    # Plot\n    fig = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], color=kmeans.labels_)\n    fig.write_image(\n        f\"{result_dir}/all_cluster_predictions_cluster_number_{optimum_k}.jpg\"\n    )\n\n    # Save\n    filenames = (\n        x\n    ) = y = text = processed_text = chunk = prediction = prediction_values = []\n    x = embeddings_2d[:, 0].tolist()\n    y = embeddings_2d[:, 1].tolist()\n    text = df[\"text\"].tolist()\n    processed_text = df[\"processed_text\"].tolist()\n    chunk = df[\"chunk\"].tolist()\n    prediction = kmeans.labels_.tolist()\n    prediction_values = list(set(kmeans.labels_.tolist()))\n    filenames = list(set(df[\"filename\"].tolist()))\n\n    return x, y, text, processed_text, chunk, prediction, prediction_values, filenames\n\n\n@cache_with_checkpoint(id=\"index_name\")\ndef cluster(index_name, all_chunks, config: Config, parser):\n    \"\"\"\n    Clusters the given chunks of 
documents using TF-IDF and K-means clustering.\n\n    Args:\n        index_name (str): The name of the index - used only for uniqueness as checkpoint key.\n        all_chunks (list): A list of document chunks.\n        config (object): The configuration object.\n\n    Returns:\n        dict: A dictionary containing the sampled document chunks.\n\n    \"\"\"\n    logger.info(f\"Sampling - Original Document chunk length {len(all_chunks)}\")\n    df = chunk_dict_to_dataframe(all_chunks)\n\n    # Tokenize and remove punctuation and stop words\n    tqdm.pandas()\n    df[\"processed_text\"] = df[\"text\"].progress_apply(\n        lambda text: spacy_tokenizer(text, parser)\n    )\n\n    # Run TF-IDF\n    logger.info(\"Run TF-IDF\")\n    text = df[\"processed_text\"].values\n    max_features = 2**12\n    X = vectorize_tfidf(text, max_features)\n\n    logger.info(\"Reducing Umap\")\n    reducer = UMAP()\n    embeddings_2d = reducer.fit_transform(X)\n\n    if config.index.sampling.optimum_k == \"auto\":\n        optimum_k = determine_optimum_k_elbow(\n            embeddings_2d,\n            X,\n            config.index.sampling.min_cluster,\n            config.index.sampling.max_cluster,\n            config.path.sampling_output_dir,\n        )\n    else:\n        optimum_k = config.index.sampling.optimum_k\n\n    # Cluster\n\n    (\n        x,\n        y,\n        text,\n        processed_text,\n        chunk,\n        prediction,\n        prediction_values,\n        filenames,\n    ) = cluster_kmeans(embeddings_2d, optimum_k, df, config.sampling_output_dir)\n\n    # Capture all predictions\n    data = {\"x\": x, \"y\": y, \"text\": text, \"prediction\": prediction, \"chunk\": chunk}\n    df = pd.DataFrame(data)\n    df.to_csv(\n        f\"{config.path.sampling_output_dir}/all_cluster_predictions_cluster_number_{config.index.sampling.optimum_k}.csv\",\n        sep=\",\",\n    )\n\n    # Sample the clusters as dataframes\n    g = globals()\n    for i in prediction_values:\n   
     g[\"l_{0}\".format(i)] = df[df[\"prediction\"] == i]\n\n        if len(g[\"l_{0}\".format(i)]) > round(\n            (len(df) * (config.index.sampling.percentage / 100))\n            / len(prediction_values)\n        ):\n            g[\"l_{0}\".format(i)] = g[\"l_{0}\".format(i)].sample(\n                n=round(\n                    (len(df) * (config.index.sampling.percentage / 100))\n                    / len(prediction_values)\n                ),\n                random_state=42,\n            )\n\n    df_list = [g[\"l_{0}\".format(i)] for i in prediction_values]\n\n    # Concatenate the list of DataFrames into a single DataFrame\n    df_concat = pd.concat(df_list)\n    df_concat.to_csv(\n        config.path.sampled_cluster_predictions_path(config.index.sampling.optimum_k),\n        sep=\",\",\n    )\n    # Rebuild sampled chunks dict\n    sampled_chunks = dataframe_to_chunk_dict(df_concat)\n    logger.info(f\"Sampled Document chunk length {len(sampled_chunks)}\")\n\n    # Preserve the sampled files into directory\n    for filename in filenames:\n        try:\n            fn = os.path.basename(filename)\n            os.makedirs(\n                config.sampling_output_dir + \"/\" + config.JOB_NAME, exist_ok=True\n            )\n            shutil.copy2(\n                filename, config.sampling_output_dir + \"/\" + config.JOB_NAME + \"/\" + fn\n            )\n        except OSError as e:\n            logger.info(f\"file {filename} could not be copied with metadata {e}\")\n            continue\n    logger.info(\n        f\"Sampled Documents have been copied to {config.sampling_output_dir + '/' + config.JOB_NAME + '/'}\"\n    )\n\n    return sampled_chunks\n"
  },
  {
    "path": "rag_experiment_accelerator/sampling/tests/data/test1.txt",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/sampling/tests/test_clustering.py",
    "content": "import os\nimport pandas as pd\nimport pytest\nfrom unittest.mock import MagicMock, patch\n\nfrom rag_experiment_accelerator.checkpoint import init_checkpoint\nfrom rag_experiment_accelerator.sampling.clustering import cluster, load_parser\n\n\n@pytest.fixture\ndef mock_logger():\n    return MagicMock()\n\n\n@pytest.fixture\ndef mock_df():\n    return pd.DataFrame(\n        {\n            \"text\": [\n                \"Pigeons, also known as rock doves, are a common sight in urban areas around the world. These birds \"\n                \"are known for their distinctive cooing call and their ability to navigate long distances. Pigeons \"\n                \"are also appreciated for their beauty, with their colorful feathers and iridescent sheen.\",\n                \"Pigeons have been domesticated for thousands of years and have been used for a variety of purposes, \"\n                \"including delivering messages during wartime and racing competitions. They are also popular as pets \"\n                \"and can be trained to perform tricks.\",\n                \"Despite their reputation as pests, pigeons play an important role in the ecosystem. They help to \"\n                \"spread seeds and nutrients throughout their environment and are even considered a keystone species \"\n                \"in some areas.\",\n            ],\n            \"processed_text\": [\n                \"Pigeons, also known as rock doves, are a common sight in urban areas around the world. These birds \"\n                \"are known for their distinctive cooing call and their ability to navigate long distances. Pigeons \"\n                \"are also appreciated for their beauty, with their colorful feathers and iridescent sheen.\",\n                \"Pigeons have been domesticated for thousands of years and have been used for a variety of purposes, \"\n                \"including delivering messages during wartime and racing competitions. 
They are also popular as pets \"\n                \"and can be trained to perform tricks.\",\n                \"Despite their reputation as pests, pigeons play an important role in the ecosystem. They help to \"\n                \"spread seeds and nutrients throughout their environment and are even considered a keystone species \"\n                \"in some areas.\",\n            ],\n        }\n    )\n\n\n@pytest.fixture\ndef mock_reducer():\n    return MagicMock()\n\n\n@pytest.fixture\ndef mock_df_concat():\n    return pd.DataFrame(\n        {\n            \"x\": [0, 1, 2, 3, 4],\n            \"y\": [0, 1, 2, 3, 4],\n            \"text\": [\n                \"Pigeons, also known as rock doves, are a common sight in urban areas around the world. These birds are known for their distinctive cooing call and their ability to navigate long distances. Pigeons are also appreciated for their beauty, with their colorful feathers and iridescent sheen.\",\n                \"Pigeons have been domesticated for thousands of years and have been used for a variety of purposes, including delivering messages during wartime and racing competitions. They are also popular as pets and can be trained to perform tricks.\",\n                \"Despite their reputation as pests, pigeons play an important role in the ecosystem. They help to spread seeds and nutrients throughout their environment and are even considered a keystone species in some areas.\",\n                \"Overall, pigeons are fascinating and complex creatures that have captured the attention of people for centuries. Whether you love them or hate them, there is no denying the impact that pigeons have had on human society and the natural world.\",\n                \"However, pigeons can also be carriers of diseases and can cause damage to buildings and other structures. 
It is important to take proper precautions when dealing with pigeons, such as wearing gloves and avoiding direct contact with their droppings.\",\n            ],\n            \"prediction\": [0, 1, 0, 1, 0],\n            \"chunk\": [0, 1, 0, 1, 0],\n        }\n    )\n\n\n@pytest.fixture\ndef mock_data_dir(tmpdir):\n    return tmpdir.mkdir(\"data\")\n\n\ndef test_cluster(mock_logger, mock_df, mock_reducer, mock_df_concat, mock_data_dir):\n    # Arrange\n    all_chunks = [\n        {\n            \"content\": \"Pigeons, also known as rock doves, are a common sight in urban areas around the world. These birds are known for their distinctive cooing call and their ability to navigate long distances. Pigeons are also appreciated for their beauty, with their colorful feathers and iridescent sheen.\",\n            \"metadata\": {\"source\": mock_data_dir + \"/sampling/tests/data/test1.txt\"},\n        },\n        {\n            \"content\": \"Pigeons have been domesticated for thousands of years and have been used for a variety of purposes, including delivering messages during wartime and racing competitions. They are also popular as pets and can be trained to perform tricks.\",\n            \"metadata\": {\"source\": mock_data_dir + \"/sampling/tests/data/test2.txt\"},\n        },\n        {\n            \"content\": \"Despite their reputation as pests, pigeons play an important role in the ecosystem. 
They help to spread seeds and nutrients throughout their environment and are even considered a keystone species in some areas.\",\n            \"metadata\": {\"source\": mock_data_dir + \"/sampling/tests/data/test3.txt\"},\n        },\n    ]\n\n    config = MagicMock()\n    config.use_checkpoints = False\n    config.index.sampling.optimum_k = 2\n    config.index.sampling.min_cluster = 1\n    config.index.sampling.max_cluster = 10\n    config.index.sampling.percentage = 100\n    config.path.sampling_output_dir = os.path.join(mock_data_dir, \"sampling\")\n    os.makedirs(config.path.sampling_output_dir)\n\n    sampled_input_data_csv_path = f\"{config.path.sampling_output_dir}/sampled_cluster_predictions_cluster_number_{config.index.sampling.optimum_k}.csv\"\n    config.path.sampled_cluster_predictions_path.return_value = (\n        sampled_input_data_csv_path\n    )\n\n    init_checkpoint(config)\n\n    with patch(\n        \"rag_experiment_accelerator.sampling.clustering.logger\", mock_logger\n    ), patch(\n        \"rag_experiment_accelerator.sampling.clustering.chunk_dict_to_dataframe\",\n        return_value=mock_df,\n    ), patch(\n        \"rag_experiment_accelerator.sampling.clustering.vectorize_tfidf\"\n    ), patch(\n        \"rag_experiment_accelerator.sampling.clustering.UMAP\", return_value=mock_reducer\n    ), patch(\n        \"rag_experiment_accelerator.sampling.clustering.determine_optimum_k_elbow\",\n        return_value=2,\n    ), patch(\n        \"rag_experiment_accelerator.sampling.clustering.cluster_kmeans\",\n        return_value=(\n            0,\n            0,\n            \"text\",\n            \"processed_text\",\n            0,\n            [0, 1],\n            [0.5, 0.6],\n            [\n                mock_data_dir + \"/sampling/tests/data/test1.txt\",\n                mock_data_dir + \"/sampling/tests/data/test2.txt\",\n                mock_data_dir + \"/sampling/tests/data/test3.txt\",\n            ],\n        ),\n    ), patch(\n      
  \"rag_experiment_accelerator.sampling.clustering.pd.DataFrame\",\n        return_value=mock_df_concat,\n    ):\n        # Act\n        parser = load_parser()\n        result = cluster(\"\", all_chunks, config, parser)\n        assert len(result) == 0\n        # Assert\n        assert os.path.exists(\n            os.path.join(\n                config.path.sampling_output_dir,\n                \"all_cluster_predictions_cluster_number_2.csv\",\n            )\n        )\n        assert os.path.exists(\n            os.path.join(\n                config.path.sampling_output_dir,\n                \"sampled_cluster_predictions_cluster_number_2.csv\",\n            )\n        )\n        assert (\n            mock_logger.info.call_args_list[0][0][0]\n            == \"Sampling - Original Document chunk length 3\"\n        )\n        assert mock_logger.info.call_args_list[1][0][0] == \"Run TF-IDF\"\n        assert mock_logger.info.call_args_list[2][0][0] == \"Reducing Umap\"\n        assert (\n            mock_logger.info.call_args_list[3][0][0]\n            == \"Sampled Document chunk length 0\"\n        )\n"
  },
  {
    "path": "rag_experiment_accelerator/search_type/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/search_type/acs_search_methods.py",
    "content": "from azure.core.credentials import AzureKeyCredential\nfrom azure.search.documents import SearchClient\nfrom azure.search.documents.models import (\n    QueryAnswerType,\n    QueryCaptionType,\n    QueryLanguage,\n    QueryType,\n    RawVectorQuery,\n)\n\nfrom rag_experiment_accelerator.embedding.embedding_model import EmbeddingModel\nfrom rag_experiment_accelerator.nlp.preprocess import Preprocess\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef create_client(service_endpoint, index_name, key):\n    \"\"\"\n    Creates and returns a tuple of SearchClient and SearchIndexClient objects\n    using the provided service endpoint, index name, and API key.\n\n    Args:\n        service_endpoint (str): The URL of the Azure AI Search service.\n        index_name (str): The name of the search index.\n        key (str): The API key for the search service.\n\n    Returns:\n        Tuple[SearchClient, SearchIndexClient]: A tuple containing the SearchClient and SearchIndexClient objects.\n    \"\"\"\n    credential = AzureKeyCredential(key)\n    client = SearchClient(\n        endpoint=service_endpoint, index_name=index_name, credential=credential\n    )\n    return client\n\n\ndef format_results(results):\n    \"\"\"\n    Formats the search results by extracting the score and content fields from each result.\n\n    Args:\n        results (list): A list of search results.\n\n    Returns:\n        list: A list of dictionaries, where each dictionary contains the score and content fields of a search result.\n    \"\"\"\n    formatted_results = []\n    for result in results:\n        context_item = {}\n        context_item[\"@search.score\"] = result[\"@search.score\"]\n        context_item[\"content\"] = result[\"content\"]\n        formatted_results.append(context_item)\n\n    return formatted_results\n\n\ndef search_for_match_semantic(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    
query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for documents in the Azure AI Search index that match the given query using semantic search.\n\n    Args:\n        client (SearchClient): The Azure AI Search client.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query (str): The query string to search for.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n\n    Returns:\n        list: A list of formatted search results.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    vector2 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"titleVector, summaryVector\",\n        vector=embedding,\n    )\n\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=query,\n            vector_queries=[vector1, vector2],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n            query_type=QueryType.SEMANTIC,\n            query_language=QueryLanguage.EN_US,\n            semantic_configuration_name=\"my-semantic-config\",\n            query_caption=QueryCaptionType.EXTRACTIVE,\n            query_answer=QueryAnswerType.EXTRACTIVE,\n        )\n\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\n# TODO: Figure out what is going on here. 
For some of these search functions,\n# I cannot iterate over the results after it leaves this python file, so\n# calling format_results on search_results which enables me to do so\n# This also will provide the same format that comes back from\n# search_for_manual_hybrid\ndef search_for_match_Hybrid_multi(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for matching documents in Azure AI Search using a hybrid approach that combines\n    multiple vectors (contentVector, title, and summary) to retrieve the most relevant\n    results.\n\n    Args:\n        client (SearchClient): The Azure AI Search client.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query (str): The search query.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n\n    Returns:\n        list: A list of formatted search results.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    vector2 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"titleVector\",\n        vector=embedding,\n    )\n    vector3 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"summaryVector\",\n        vector=embedding,\n    )\n\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=query,\n            vector_queries=[vector1, vector2, vector3],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return 
formatted_search_results\n\n\ndef search_for_match_Hybrid_cross(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for matching documents using a hybrid cross search method.\n\n    Args:\n        client: An instance of the Azure AI Search client.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query (str): The query string to search for.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n\n    Returns:\n        A list of formatted search results.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    vector2 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"titleVector, summaryVector\",\n        vector=embedding,\n    )\n\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=query,\n            vector_queries=[vector1, vector2],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\ndef search_for_match_text(\n    client: SearchClient,\n    query: str,\n    retrieve_num_of_documents: int,\n    **kwargs,\n):\n    \"\"\"\n    Searches for matching text in the given client using the specified query.\n\n    Args:\n        client: The client to search in.\n        query: The query to search for.\n        retrieve_num_of_documents: The number of documents to retrieve.\n        **kwargs: Additional keyword arguments.\n\n    Returns:\n        
A list of formatted search results.\n    \"\"\"\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=query,\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\ndef search_for_match_pure_vector(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for documents in the client's database that match the given query using pure vector search.\n\n    Args:\n        client (Client): The client object used to connect to the database.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query (str): The query string to search for.\n        retrieve_num_of_documents (int): The number of documents to retrieve.\n\n    Returns:\n        A list of dictionaries containing the search results, where each dictionary represents a single document and\n        contains the following keys: 'title', 'content', and 'summary'.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=None,\n            vector_queries=[vector1],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\ndef 
search_for_match_pure_vector_multi(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for matching documents in the given client using the provided query and retrieves the specified number\n    of documents.\n\n    Args:\n        client: The client to search in.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query: The query to search for.\n        retrieve_num_of_documents: The number of documents to retrieve.\n\n    Returns:\n        A list of formatted search results.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    vector2 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"title\",\n        vector=embedding,\n    )\n    vector3 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"summary\",\n        vector=embedding,\n    )\n\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=None,\n            vector_queries=[vector1, vector2, vector3],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\ndef search_for_match_pure_vector_cross(\n    client: SearchClient,\n    embedding_model: EmbeddingModel,\n    query: str,\n    retrieve_num_of_documents: int,\n    preprocess: bool = False,\n):\n    \"\"\"\n    Searches for documents that match the given query using pure vector cross search method.\n\n    Args:\n        client: An instance of 
the search client.\n        embedding_model (EmbeddingModel): The model used to generate the embeddings.\n        query: The query to search for.\n        retrieve_num_of_documents: The number of documents to retrieve.\n\n    Returns:\n        A list of dictionaries containing the formatted search results.\n    \"\"\"\n    pre_process = Preprocess(enabled=preprocess)\n    embedding = embedding_model.generate_embedding(chunk=pre_process.preprocess(query))\n\n    vector1 = RawVectorQuery(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector, titleVector, summaryVector\",\n        vector=embedding,\n    )\n\n    formatted_search_results = []\n    try:\n        results = client.search(\n            search_text=None,\n            vector_queries=[vector1],\n            top=retrieve_num_of_documents,\n            select=[\"title\", \"content\", \"summary\"],\n        )\n\n        formatted_search_results = format_results(results)\n\n    except Exception as e:\n        logger.error(str(e))\n    return formatted_search_results\n\n\ndef search_for_manual_hybrid(**kwargs):\n    \"\"\"\n    Searches for documents using a combination of text, vector, and semantic matching.\n\n    Args:\n        client: The Azure AI Search client.\n        embedding_model: The model used to generate the embeddings.\n        query: Query string to search for.\n        retrieve_num_of_documents: Number of documents to retrieve.\n\n    Returns:\n        A list of documents matching the search query.\n    \"\"\"\n    context = []\n    text_context = search_for_match_text(**kwargs)\n    vector_context = search_for_match_pure_vector_cross(**kwargs)\n    semantic_context = search_for_match_semantic(**kwargs)\n\n    context.extend(text_context)\n    context.extend(vector_context)\n    context.extend(semantic_context)\n\n    return context\n"
  },
  {
    "path": "rag_experiment_accelerator/search_type/tests/test_acs_search_methods.py",
    "content": "from unittest.mock import MagicMock, patch, call\n\nfrom azure.search.documents.models import (\n    QueryAnswerType,\n    QueryCaptionType,\n    QueryLanguage,\n    QueryType,\n)\n\nfrom rag_experiment_accelerator.search_type.acs_search_methods import (\n    create_client,\n    format_results,\n    search_for_manual_hybrid,\n    search_for_match_Hybrid_cross,\n    search_for_match_Hybrid_multi,\n    search_for_match_pure_vector,\n    search_for_match_pure_vector_cross,\n    search_for_match_pure_vector_multi,\n    search_for_match_semantic,\n    search_for_match_text,\n)\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.SearchClient\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.AzureKeyCredential\")\ndef test_create_client(mock_azure_key_credential, mock_search_client):\n    # Arrange\n    service_endpoint = \"http://test-service-endpoint\"\n    index_name = \"test-index-name\"\n    key = \"test-key\"\n\n    # Act\n    result = create_client(service_endpoint, index_name, key)\n\n    # Assert\n    mock_azure_key_credential.assert_called_once_with(key)\n    mock_search_client.assert_called_once_with(\n        endpoint=service_endpoint,\n        index_name=index_name,\n        credential=mock_azure_key_credential.return_value,\n    )\n    assert result == mock_search_client.return_value\n\n\ndef test_format_results():\n    search_results = [\n        {\"content\": \"some content 1\", \"@search.score\": 1.1},\n        {\"content\": \"some content 2\", \"@search.score\": 1.2},\n    ]\n    formatted_results = format_results(search_results)\n\n    assert formatted_results[0][\"content\"] == \"some content 1\"\n    assert formatted_results[0][\"@search.score\"] == 1.1\n    assert formatted_results[1][\"content\"] == \"some content 2\"\n    assert formatted_results[1][\"@search.score\"] == 1.2\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef 
test_search_for_match_semantic(mock_vector_query):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    mock_vector_query.side_effect = [vector1, vector2]\n\n    # Act\n    results = search_for_match_semantic(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 2\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(\n                k=retrieve_num_of_documents,\n                fields=\"titleVector, summaryVector\",\n                vector=embedding,\n            ),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=query,\n        vector_queries=[vector1, vector2],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n        query_type=QueryType.SEMANTIC,\n        query_language=QueryLanguage.EN_US,\n        semantic_configuration_name=\"my-semantic-config\",\n        query_caption=QueryCaptionType.EXTRACTIVE,\n        query_answer=QueryAnswerType.EXTRACTIVE,\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 
1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\ndef test_search_for_match_semantic_handles_exception(mock_logger, mock_vector_query):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Test exception\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    mock_vector_query.side_effect = [vector1, vector2]\n\n    # Act\n    results = search_for_match_semantic(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    mock_logger.error.assert_called_once_with(\"Test exception\")\n    assert results == []\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_Hybrid_multi(mock_vector_query):\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    vector3 = \"vector3\"\n    mock_vector_query.side_effect = [vector1, vector2, vector3]\n\n    # Act\n    results = search_for_match_Hybrid_multi(\n        client=client,\n        
embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 3\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(\n                k=retrieve_num_of_documents,\n                fields=\"titleVector\",\n                vector=embedding,\n            ),\n            call(\n                k=retrieve_num_of_documents,\n                fields=\"summaryVector\",\n                vector=embedding,\n            ),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=query,\n        vector_queries=[vector1, vector2, vector3],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\ndef test_search_for_match_Hybrid_multi_handles_exception(\n    mock_logger, mock_vector_query\n):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Test exception\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    vector3 = \"vector3\"\n    mock_vector_query.side_effect = [vector1, vector2, vector3]\n\n    # Act\n    results = search_for_match_Hybrid_multi(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 3\n    
mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(k=retrieve_num_of_documents, fields=\"titleVector\", vector=embedding),\n            call(k=retrieve_num_of_documents, fields=\"summaryVector\", vector=embedding),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=query,\n        vector_queries=[vector1, vector2, vector3],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    mock_logger.error.assert_called_once_with(\"Test exception\")\n    assert results == []\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_Hybrid_cross(mock_vector_query):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    mock_vector_query.side_effect = [vector1, vector2]\n\n    # Act\n    results = search_for_match_Hybrid_cross(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 2\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(\n             
   k=retrieve_num_of_documents,\n                fields=\"titleVector, summaryVector\",\n                vector=embedding,\n            ),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=query,\n        vector_queries=[vector1, vector2],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\ndef test_search_for_match_Hybrid_cross_handles_exception(\n    mock_logger, mock_vector_query\n):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Test exception\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    mock_vector_query.side_effect = [vector1, vector2]\n\n    # Act\n    results = search_for_match_Hybrid_cross(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 2\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(\n                k=retrieve_num_of_documents,\n                fields=\"titleVector, summaryVector\",\n                vector=embedding,\n            ),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=query,\n        vector_queries=[vector1, vector2],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    
mock_logger.error.assert_called_once_with(\"Test exception\")\n    assert results == []\n\n\ndef test_search_for_match_text():\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    # Act\n    results = search_for_match_text(\n        client=client,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    client.search.assert_called_once_with(\n        search_text=query,\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\ndef test_search_for_match_text_handles_exception(mock_logger):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Test exception\")\n\n    # Act\n    result = search_for_match_text(\n        client=client,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    client.search.assert_called_once_with(\n        search_text=query,\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    mock_logger.error.assert_called_once_with(\"Test exception\")\n    assert result == []\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_pure_vector(mock_vector_query):\n    # Arrange\n    query = 
\"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    mock_vector_query.side_effect = [vector1]\n\n    # Act\n    results = search_for_match_pure_vector(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 1\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.Preprocess\")\ndef test_search_for_match_pure_vector_handles_exception(\n    mock_preprocess, mock_vector_query, mock_logger\n):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Search failed\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 
3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    mock_vector_query.return_value = vector1\n\n    mock_preprocess.preprocess.return_value = query\n\n    # Act\n    results = search_for_match_pure_vector(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    mock_preprocess.assert_called_once_with(enabled=False)\n    mock_vector_query.assert_called_once_with(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector\",\n        vector=embedding,\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    mock_logger.error.assert_called_once_with(\"Search failed\")\n    assert results == []\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_pure_vector_multi(mock_vector_query):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    vector3 = \"vector3\"\n    mock_vector_query.side_effect = [vector1, vector2, vector3]\n\n    # Act\n    results = search_for_match_pure_vector_multi(\n        client=client,\n        
embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 3\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            call(k=retrieve_num_of_documents, fields=\"title\", vector=embedding),\n            call(k=retrieve_num_of_documents, fields=\"summary\", vector=embedding),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1, vector2, vector3],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_pure_vector_multi_handles_exception(\n    mock_vector_query, mock_logger\n):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Search failed\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    vector2 = \"vector2\"\n    vector3 = \"vector3\"\n    mock_vector_query.side_effect = [vector1, vector2, vector3]\n\n    # Act\n    results = search_for_match_pure_vector_multi(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 3\n    mock_vector_query.assert_has_calls(\n        [\n            call(k=retrieve_num_of_documents, fields=\"contentVector\", vector=embedding),\n            
call(k=retrieve_num_of_documents, fields=\"title\", vector=embedding),\n            call(k=retrieve_num_of_documents, fields=\"summary\", vector=embedding),\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1, vector2, vector3],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    mock_logger.error.assert_called_once_with(\"Search failed\")\n    assert results == []\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_pure_vector_cross(mock_vector_query):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.return_value = [\n        {\n            \"content\": \"some content\",\n            \"title\": \"some title\",\n            \"summary\": \"A list of items with titles and content.\",\n            \"@search.score\": 1.1,\n            \"@search.reranker_score\": None,\n            \"@search.highlights\": None,\n            \"@search.captions\": None,\n        }\n    ]\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    mock_vector_query.return_value = vector1\n\n    # Act\n    results = search_for_match_pure_vector_cross(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    assert mock_vector_query.call_count == 1\n    mock_vector_query.assert_has_calls(\n        [\n            call(\n                k=retrieve_num_of_documents,\n                fields=\"contentVector, titleVector, summaryVector\",\n                vector=embedding,\n            )\n        ]\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1],\n       
 top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n\n    assert results == [{\"content\": \"some content\", \"@search.score\": 1.1}]\n\n\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.logger\")\n@patch(\"rag_experiment_accelerator.search_type.acs_search_methods.RawVectorQuery\")\ndef test_search_for_match_pure_vector_cross_handles_exception(\n    mock_vector_query, mock_logger\n):\n    # Arrange\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n\n    client = MagicMock()\n    client.search.side_effect = Exception(\"Search failed\")\n\n    embedding_model = MagicMock()\n    embedding = [1, 2, 3]\n    embedding_model.generate_embedding.return_value = embedding\n\n    vector1 = \"vector1\"\n    mock_vector_query.return_value = vector1\n\n    # Act\n    results = search_for_match_pure_vector_cross(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    # Assert\n    mock_vector_query.assert_called_once_with(\n        k=retrieve_num_of_documents,\n        fields=\"contentVector, titleVector, summaryVector\",\n        vector=embedding,\n    )\n    client.search.assert_called_once_with(\n        search_text=None,\n        vector_queries=[vector1],\n        top=retrieve_num_of_documents,\n        select=[\"title\", \"content\", \"summary\"],\n    )\n    mock_logger.error.assert_called_once_with(\"Search failed\")\n    assert results == []\n\n\n@patch(\n    \"rag_experiment_accelerator.search_type.acs_search_methods.search_for_match_text\"\n)\n@patch(\n    \"rag_experiment_accelerator.search_type.acs_search_methods.search_for_match_pure_vector_cross\"\n)\n@patch(\n    \"rag_experiment_accelerator.search_type.acs_search_methods.search_for_match_semantic\"\n)\ndef test_search_for_manual_hybrid(\n    mock_search_for_match_semantic,\n    mock_search_for_match_pure_vector_cross,\n    
mock_search_for_match_text,\n):\n    query = \"some query\"\n    retrieve_num_of_documents = 5\n    client = MagicMock()\n    embedding_model = MagicMock()\n\n    mock_search_for_match_text.return_value = [\n        {\"content\": \"some content 1\", \"@search.score\": 1.1}\n    ]\n    mock_search_for_match_pure_vector_cross.return_value = [\n        {\"content\": \"some content 2\", \"@search.score\": 1.2}\n    ]\n    mock_search_for_match_semantic.return_value = [\n        {\"content\": \"some content 3\", \"@search.score\": 1.3}\n    ]\n\n    results = search_for_manual_hybrid(\n        client=client,\n        embedding_model=embedding_model,\n        query=query,\n        retrieve_num_of_documents=retrieve_num_of_documents,\n    )\n\n    assert results == [\n        {\"content\": \"some content 1\", \"@search.score\": 1.1},\n        {\"content\": \"some content 2\", \"@search.score\": 1.2},\n        {\"content\": \"some content 3\", \"@search.score\": 1.3},\n    ]\n"
  },
  {
    "path": "rag_experiment_accelerator/utils/__init__.py",
    "content": ""
  },
  {
    "path": "rag_experiment_accelerator/utils/auth.py",
    "content": "from azure.identity import DefaultAzureCredential\n\nfrom rag_experiment_accelerator.utils.logging import get_logger\n\nlogger = get_logger(__name__)\n\n\ndef get_default_az_cred():\n    \"\"\"\n    Returns a DefaultAzureCredential object that can be used to authenticate with Azure services.\n    If the credential cannot be obtained, an error is logged and an exception is raised.\n    \"\"\"\n    try:\n        credential = DefaultAzureCredential()\n        # Check if credential can get token successfully.\n        credential.get_token(\"https://management.azure.com/.default\")\n    except Exception as ex:\n        logger.error(\n            \"Unable to get a token from DefaultAzureCredential. Please run 'az\"\n            \" login' in your terminal and try again.\"\n        )\n        raise ex\n    return credential\n"
  },
  {
    "path": "rag_experiment_accelerator/utils/logging.py",
    "content": "import logging\nimport os\nimport sys\n\n# Global variable to cache the logging level\n_cached_logging_level = None\n\n\ndef get_logger(name: str) -> logging.Logger:\n    \"\"\"Get Logger\n\n    Args:\n        name (str): Logger name\n\n    Returns:\n        logging.Logger: named logger\n    \"\"\"\n    logger = logging.getLogger(name)\n    if logger.hasHandlers():\n        return logger\n\n    global _cached_logging_level\n    if not _cached_logging_level:\n        _cached_logging_level = os.getenv(\"LOGGING_LEVEL\", \"INFO\").upper()\n\n    handler = logging.StreamHandler(sys.stdout)\n    formatter = logging.Formatter(\n        \"%(asctime)s - %(levelname)s - %(name)s - %(message)s\"\n    )\n    handler.setFormatter(formatter)\n    logger.setLevel(_cached_logging_level)\n    logger.addHandler(handler)\n\n    return logger\n"
  },
  {
    "path": "rag_experiment_accelerator/utils/timetook.py",
    "content": "import time\nfrom .logging import get_logger\n\n\nclass TimeTook(object):\n    \"\"\"\n    Calculates the time a block took to run.\n    Example usage (logger is a required argument; pass None to fall back to this module's logger):\n    with TimeTook(\"sample\", logger=None):\n        s = [x for x in range(10000000)]\n    Modified from: https://blog.usejournal.com/how-to-create-your-own-timing-context-manager-in-python-a0e944b48cf8 # noqa\n    \"\"\"\n\n    def __init__(self, description, logger):\n        self.description = description\n        self.logger = logger if logger else get_logger(__name__)\n        self.start = None\n        self.end = None\n\n    def __enter__(self):\n        self.start = time.perf_counter()\n        self.logger.info(f\"Starting {self.description}\")\n\n    def __exit__(self, type, value, traceback):\n        self.end = time.perf_counter()\n        self.logger.info(\n            f\"Time took for {self.description}: \" f\"{self.end - self.start} seconds\"\n        )\n"
  },
  {
    "path": "requirements.txt",
    "content": "azure-ai-ml==1.25.0\nazure-ai-textanalytics==5.3.0\nazure-core==1.31.0\nazure-identity==1.18.0\nazure-keyvault==4.2.0\nazure-keyvault-secrets==4.8.*\nazure-search-documents==11.4.b11\nazure.ai.documentintelligence==1.0.0b4\nazureml-core==1.57.0.post1\nazureml-mlflow==1.57.0.post1\nbeautifulsoup4==4.12.3\ndatasets==3.0.0\ndocx2txt==0.8\nevaluate==0.4.3\nhnswlib==0.8.0\njsonschema==4.23.0\nkaleido==0.2.1\nlangchain==0.3.0\nlangchain-community==0.3.0\nlevenshtein==0.26.0\nlxml==5.3.0\nmatplotlib==3.9.2\nmlflow==2.16.1\nopenai==1.64.0\nplotly==5.24.1\npypdf==4.3.1\npytesseract==0.3.13\npython-dotenv==1.0.1\nPyMuPDF==1.24.10\nPyPDF2~=3.0\nrapidfuzz==3.9.7\nrouge-score==0.1.2\nscikit-learn==1.5.2\nsentence-transformers==3.1.1\nspacy==3.7.6\ntextdistance==4.6.3\ntiktoken==0.7.0\ntqdm==4.66.5\numap-learn==0.5.6\nunstructured==0.15.13\n"
  },
  {
    "path": "setup.cfg",
    "content": "[metadata]\nname = RAG Experiment Accelerator\nversion = 0.9\ndescription = A tool to accelerate the process of running experiments with RAG\n\n[options]\npython_requires = >=3.11, <4\n"
  },
  {
    "path": "setup.py",
    "content": "from setuptools import find_packages, setup\n\nwith open(\"requirements.txt\", \"r\") as f:\n    requirements = f.read().splitlines()\n\nsetup(\n    name=\"rag-experiment-accelerator\",\n    version=\"0.9\",\n    packages=find_packages(),\n    install_requires=requirements,\n    python_requires=\">=3.11, <4\",\n    package_data={\n        'rag_experiment_accelerator': ['llm/prompts_text/*.txt'],\n    }\n)\n"
  }
]