[
  {
    "path": ".dockerignore",
    "content": "# Ignore files that are already ignored by git\n.gitignore\n\nscripts/\ntests/\nexamples/\nlocal_server/\nassets/\n*.md\n*.pyc\n.dockerignore\nDockerfile\n"
  },
  {
    "path": ".env.example",
    "content": "# Core environment variables\nDATASTORE=\"<your_datastore>\"\nBEARER_TOKEN=\"<your_bearer_token>\"\nOPENAI_API_KEY=\"<your_openai_api_key>\"\nEMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use\nEMBEDDING_MODEL=\"text-embedding-3-large\" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002\n \n# Optional environment variables for Azure OpenAI\nOPENAI_API_BASE=\"https://<AzureOpenAIName>.openai.azure.com/\"\nOPENAI_API_TYPE=\"azure\"\nOPENAI_EMBEDDINGMODEL_DEPLOYMENTID=\"<Name of embedding model deployment>\"\nOPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID=\"<Name of deployment of model for metadata>\"\nOPENAI_COMPLETIONMODEL_DEPLOYMENTID=\"<Name of general model deployment used for completion>\"\nOPENAI_EMBEDDING_BATCH_SIZE=\"<Batch size of embedding, for AzureOAI, this value needs to be set to 1>\"\n\n# Pinecone configuration\nPINECONE_API_KEY=\"<your_pinecone_api_key>\"\nPINECONE_ENVIRONMENT=\"<your_pinecone_environment>\"\nPINECONE_INDEX=\"<your_pinecone_index>\"\n\n# Weaviate configuration\nWEAVIATE_URL=\"<your_weaviate_instance_url>\"\nWEAVIATE_API_KEY=\"<your_api_key_for_WCS>\"\nWEAVIATE_CLASS=\"<your_optional_weaviate_class>\"\n\n# Zilliz configuration\nZILLIZ_COLLECTION=\"<your_zilliz_collection>\"\nZILLIZ_URI=\"<your_zilliz_uri>\"\nZILLIZ_USER=\"<your_zilliz_username>\"\nZILLIZ_PASSWORD=\"<your_zilliz_password>\"\n\n# Milvus configuration\nMILVUS_COLLECTION=\"<your_milvus_collection>\"\nMILVUS_HOST=\"<your_milvus_host>\"\nMILVUS_PORT=\"<your_milvus_port>\"\nMILVUS_USER=\"<your_milvus_username>\"\nMILVUS_PASSWORD=\"<your_milvus_password>\"\n\n# Qdrant configuration\nQDRANT_URL=\"<your_qdrant_url>\"\nQDRANT_PORT=\"<your_qdrant_port>\"\nQDRANT_GRPC_PORT=\"<your_qdrant_grpc_port>\"\nQDRANT_API_KEY=\"<your_qdrant_api_key>\"\nQDRANT_COLLECTION=\"<your_qdrant_collection>\"\n\n# AnalyticDB 
configuration\nPG_HOST=\"<your_analyticdb_host>\"\nPG_PORT=\"<your_analyticdb_port>\"\nPG_USER=\"<your_analyticdb_username>\"\nPG_PASSWORD=\"<your_analyticdb_password>\"\nPG_DATABASE=\"<your_analyticdb_database>\"\nPG_COLLECTION=\"<your_analyticdb_collection>\"\n\n# Redis configuration\nREDIS_HOST=\"<your_redis_host>\"\nREDIS_PORT=\"<your_redis_port>\"\nREDIS_PASSWORD=\"<your_redis_password>\"\nREDIS_INDEX_NAME=\"<your_redis_index_name>\"\nREDIS_DOC_PREFIX=\"<your_redis_doc_prefix>\"\nREDIS_DISTANCE_METRIC=\"<your_redis_distance_metric>\"\nREDIS_INDEX_TYPE=\"<your_redis_index_type>\"\n\n# Llama configuration\nLLAMA_INDEX_TYPE=\"<gpt_vector_index_type>\"\nLLAMA_INDEX_JSON_PATH=\"<path_to_saved_index_json_file>\"\nLLAMA_QUERY_KWARGS_JSON_PATH=\"<path_to_saved_query_kwargs_json_file>\"\nLLAMA_RESPONSE_MODE=\"<response_mode_for_query>\"\n\n# Chroma configuration\nCHROMA_COLLECTION=\"<your_chroma_collection>\"\nCHROMA_IN_MEMORY=\"<true_or_false>\"\nCHROMA_PERSISTENCE_DIR=\"<your_chroma_persistence_directory>\"\nCHROMA_HOST=\"<your_chroma_host>\"\nCHROMA_PORT=\"<your_chroma_port>\"\n\n# Azure Cognitive Search configuration\nAZURESEARCH_SERVICE=\"<your_search_service_name>\"\nAZURESEARCH_INDEX=\"<your_search_index_name>\"\nAZURESEARCH_API_KEY=\"<your_api_key>\" # (optional, uses key-free managed identity if not set)\n\n# Azure CosmosDB Mongo vCore configuration\nAZCOSMOS_API=\"<your azure cosmos db api, for now it only supports mongo>\"\nAZCOSMOS_CONNSTR=\"<your azure cosmos db mongo vcore connection string>\"\nAZCOSMOS_DATABASE_NAME=\"<your mongo database name>\"\nAZCOSMOS_CONTAINER_NAME=\"<your mongo container name>\"\n\n# Supabase configuration\nSUPABASE_URL=\"<supabase_project_url>\"\nSUPABASE_ANON_KEY=\"<supabase_project_api_anon_key>\"\n\n# Postgres configuration\nPG_HOST=\"<postgres_host>\"\nPG_PORT=\"<postgres_port>\"\nPG_USER=\"<postgres_user>\"\nPG_PASSWORD=\"<postgres_password>\"\nPG_DB=\"<postgres_database>\"\n\n# Elasticsearch 
configuration\nELASTICSEARCH_URL=\"<elasticsearch_host_and_port>\" # (either specify host or cloud_id)\nELASTICSEARCH_CLOUD_ID=\"<elasticsearch_cloud_id>\"\nELASTICSEARCH_USERNAME=\"<elasticsearch_username>\"\nELASTICSEARCH_PASSWORD=\"<elasticsearch_password>\"\nELASTICSEARCH_API_KEY=\"<elasticsearch_api_key>\"\nELASTICSEARCH_INDEX=\"<elasticsearch_index_name>\"\nELASTICSEARCH_REPLICAS=\"<elasticsearch_replicas>\"\nELASTICSEARCH_SHARDS=\"<elasticsearch_shards>\""
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "## Pull Request (PR) Checklist\nIf you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing!\n\n1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`.\n\n2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made.\n\n3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456).\n\n4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch.\n\n5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed.\n\n6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit.\n\n7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR.\n\n8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made.\n\n9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository.\n\n10. **Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. 
This will help reviewers to better understand your contribution and expedite the review process.\n\n## Pull Request Naming Convention\n\nUse the following naming convention for your PR branches:\n\n```\n<type>/<short-description>-<issue-number>\n```\n\n- `<type>`: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are ok and should appear as `<type>, <type2>`.\n- `<short-description>`: A brief description of the changes made, using hyphens to separate words.\n- `<issue-number>`: The issue number associated with the changes made (if applicable).\n\nExample:\n\n```\nfeature/advanced-chunking-strategy-123\n```"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# .vscode files\n.vscode/*\n\n# Pycharm\n.idea/\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\nmyvenv/\n\n# Exception for .env.example\n!.env.example\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# macOS .DS_Store files\n.DS_Store"
  },
  {
    "path": ".well-known/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\": {\n    \"type\": \"user_http\",\n    \"authorization_type\": \"bearer\"\n  },\n  \"api\": {\n    \"type\": \"openapi\",\n    \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\",\n    \"has_user_authentication\": false\n  },\n  \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"http://example.com/legal-info\"\n}\n"
  },
  {
    "path": ".well-known/openapi.yaml",
    "content": "openapi: 3.0.2\ninfo:\n  title: Retrieval Plugin API\n  description: A retrieval API for querying and filtering documents based on natural language queries and metadata\n  version: 1.0.0\nservers:\n  - url: https://your-app-url.com\npaths:\n  /query:\n    post:\n      summary: Query\n      description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs.\n      operationId: query_query_post\n      requestBody:\n        content:\n          application/json:\n            schema:\n              $ref: \"#/components/schemas/QueryRequest\"\n        required: true\n      responses:\n        \"200\":\n          description: Successful Response\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/QueryResponse\"\n        \"422\":\n          description: Validation Error\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/HTTPValidationError\"\n      security:\n        - HTTPBearer: []\ncomponents:\n  schemas:\n    DocumentChunkMetadata:\n      title: DocumentChunkMetadata\n      type: object\n      properties:\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        url:\n          title: Url\n          type: string\n        created_at:\n          title: Created At\n          type: string\n        author:\n          title: Author\n          type: string\n        document_id:\n          title: Document Id\n          type: string\n    DocumentChunkWithScore:\n      title: DocumentChunkWithScore\n      required:\n        - text\n        - metadata\n        - score\n      type: object\n      properties:\n        id:\n          title: Id\n          type: 
string\n        text:\n          title: Text\n          type: string\n        metadata:\n          $ref: \"#/components/schemas/DocumentChunkMetadata\"\n        embedding:\n          title: Embedding\n          type: array\n          items:\n            type: number\n        score:\n          title: Score\n          type: number\n    DocumentMetadataFilter:\n      title: DocumentMetadataFilter\n      type: object\n      properties:\n        document_id:\n          title: Document Id\n          type: string\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        author:\n          title: Author\n          type: string\n        start_date:\n          title: Start Date\n          type: string\n        end_date:\n          title: End Date\n          type: string\n    HTTPValidationError:\n      title: HTTPValidationError\n      type: object\n      properties:\n        detail:\n          title: Detail\n          type: array\n          items:\n            $ref: \"#/components/schemas/ValidationError\"\n    Query:\n      title: Query\n      required:\n        - query\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        filter:\n          $ref: \"#/components/schemas/DocumentMetadataFilter\"\n        top_k:\n          title: Top K\n          type: integer\n          default: 3\n    QueryRequest:\n      title: QueryRequest\n      required:\n        - queries\n      type: object\n      properties:\n        queries:\n          title: Queries\n          type: array\n          items:\n            $ref: \"#/components/schemas/Query\"\n    QueryResponse:\n      title: QueryResponse\n      required:\n        - results\n      type: object\n      properties:\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/QueryResult\"\n    QueryResult:\n      title: 
QueryResult\n      required:\n        - query\n        - results\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/DocumentChunkWithScore\"\n    Source:\n      title: Source\n      enum:\n        - email\n        - file\n        - chat\n      type: string\n      description: An enumeration.\n    ValidationError:\n      title: ValidationError\n      required:\n        - loc\n        - msg\n        - type\n      type: object\n      properties:\n        loc:\n          title: Location\n          type: array\n          items:\n            anyOf:\n              - type: string\n              - type: integer\n        msg:\n          title: Message\n          type: string\n        type:\n          title: Error Type\n          type: string\n  securitySchemes:\n    HTTPBearer:\n      type: http\n      scheme: bearer\n"
  },
  {
    "path": "Dockerfile",
    "content": "\nFROM python:3.10 as requirements-stage\n\nWORKDIR /tmp\n\nRUN pip install poetry\n\nCOPY ./pyproject.toml ./poetry.lock* /tmp/\n\n\nRUN poetry export -f requirements.txt --output requirements.txt --without-hashes\n\nFROM python:3.10\n\nWORKDIR /code\n\nCOPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt\n\nRUN pip install --no-cache-dir --upgrade -r /code/requirements.txt\n\nCOPY . /code/\n\n# Heroku uses PORT, Azure App Services uses WEBSITES_PORT, Fly.io uses 8080 by default\nCMD [\"sh\", \"-c\", \"uvicorn server.main:app --host 0.0.0.0 --port ${PORT:-${WEBSITES_PORT:-8080}}\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2023 OpenAI\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Makefile",
    "content": "# Heroku\n# make heroku-login\n# make heroku-push\n\nHEROKU_APP = <your app name> \n\nheroku-push:\n\tdocker buildx build --platform linux/amd64 -t ${HEROKU_APP} .\n\tdocker tag ${HEROKU_APP} registry.heroku.com/${HEROKU_APP}/web\n\tdocker push registry.heroku.com/${HEROKU_APP}/web\n\theroku container:release web -a ${HEROKU_APP}\n\nheroku-login:\n\theroku container:login\n"
  },
  {
    "path": "README.md",
    "content": "# ChatGPT Retrieval Plugin\n\nBuild Custom GPTs with a Retrieval Plugin backend to give ChatGPT access to personal documents.\n![Example Custom GPT Screenshot](/assets/example.png)\n\n## Introduction\n\nThe ChatGPT Retrieval Plugin repository provides a flexible solution for semantic search and retrieval of personal or organizational documents using natural language queries. It is a standalone retrieval backend, and can be used with [ChatGPT custom GPTs](https://chat.openai.com/gpts/discovery), [function calling](https://platform.openai.com/docs/guides/function-calling) with the [chat completions](https://platform.openai.com/docs/guides/text-generation) or [assistants APIs](https://platform.openai.com/docs/assistants/overview), or with the [ChatGPT plugins model (deprecated)](https://chat.openai.com/?model=gpt-4-plugins). ChatGPT and the Assistants API both natively support retrieval from uploaded files, so you should use the Retrieval Plugin as a backend only if you want more granular control of your retrieval system (e.g. document text chunk length, embedding model / size, etc.).\n\nThe repository is organized into several directories:\n\n| Directory                       | Description                                                                                                                |\n| ------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |\n| [`datastore`](/datastore)       | Contains the core logic for storing and querying document embeddings using various vector database providers.              |\n| [`docs`](/docs)                 | Includes documentation for setting up and using each vector database provider, webhooks, and removing unused dependencies. |\n| [`examples`](/examples)         | Provides example configurations, authentication methods, and provider-specific examples.                                   
|\n| [`local_server`](/local_server) | Contains an implementation of the Retrieval Plugin configured for localhost testing.                                       |\n| [`models`](/models)             | Contains the data models used by the plugin, such as document and metadata models.                                         |\n| [`scripts`](/scripts)           | Offers scripts for processing and uploading documents from different data sources.                                         |\n| [`server`](/server)             | Houses the main FastAPI server implementation.                                                                             |\n| [`services`](/services)         | Contains utility services for tasks like chunking, metadata extraction, and PII detection.                                 |\n| [`tests`](/tests)               | Includes integration tests for various vector database providers.                                                          |\n| [`.well-known`](/.well-known)   | Stores the plugin manifest file and OpenAPI schema, which define the plugin configuration and API specification.           
|\n\nThis README provides detailed information on how to set up, develop, and deploy the ChatGPT Retrieval Plugin (stand-alone retrieval backend).\n\n## Table of Contents\n\n- [Quickstart](#quickstart)\n- [About](#about)\n  - [Retrieval Plugin](#retrieval-plugin)\n  - [Retrieval Plugin with custom GPTs](#retrieval-plugin-with-custom-gpts)\n  - [Retrieval Plugin with function calling](#retrieval-plugin-with-function-calling)\n  - [Retrieval Plugin with the plugins model (deprecated)](#chatgpt-plugins-model)\n  - [API Endpoints](#api-endpoints)\n  - [Memory Feature](#memory-feature)\n  - [Security](#security)\n  - [Choosing an Embeddings Model](#choosing-an-embeddings-model)\n- [Development](#development)\n  - [Setup](#setup)\n    - [General Environment Variables](#general-environment-variables)\n  - [Choosing a Vector Database](#choosing-a-vector-database)\n    - [Pinecone](#pinecone)\n    - [Elasticsearch](#elasticsearch)\n    - [MongoDB Atlas](#mongodb-atlas)\n    - [Weaviate](#weaviate)\n    - [Zilliz](#zilliz)\n    - [Milvus](#milvus)\n    - [Qdrant](#qdrant)\n    - [Redis](#redis)\n    - [Llama Index](#llamaindex)\n    - [Chroma](#chroma)\n    - [Azure Cognitive Search](#azure-cognitive-search)\n    - [Azure CosmosDB Mongo vCore](#azure-cosmosdb-mongo-vcore)\n    - [Supabase](#supabase)\n    - [Postgres](#postgres)\n    - [AnalyticDB](#analyticdb)\n  - [Running the API Locally](#running-the-api-locally)\n  - [Personalization](#personalization)\n  - [Authentication Methods](#authentication-methods)\n- [Deployment](#deployment)\n- [Webhooks](#webhooks)\n- [Scripts](#scripts)\n- [Limitations](#limitations)\n- [Contributors](#contributors)\n- [Future Directions](#future-directions)\n\n## Quickstart\n\nFollow these steps to quickly set up and run the ChatGPT Retrieval Plugin:\n\n1. Install Python 3.10, if not already installed.\n2. Clone the repository: `git clone https://github.com/openai/chatgpt-retrieval-plugin.git`\n3. 
Navigate to the cloned repository directory: `cd /path/to/chatgpt-retrieval-plugin`\n4. Install poetry: `pip install poetry`\n5. Create a new virtual environment with Python 3.10: `poetry env use python3.10`\n6. Activate the virtual environment: `poetry shell`\n7. Install app dependencies: `poetry install`\n8. Create a [bearer token](#general-environment-variables)\n9. Set the required environment variables:\n\n   ```\n   export DATASTORE=<your_datastore>\n   export BEARER_TOKEN=<your_bearer_token>\n   export OPENAI_API_KEY=<your_openai_api_key>\n   export EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use\n   export EMBEDDING_MODEL=text-embedding-3-large # edit this based on your model preference, e.g. text-embedding-3-small, text-embedding-ada-002\n\n   # Optional environment variables used when running Azure OpenAI\n   export OPENAI_API_BASE=https://<AzureOpenAIName>.openai.azure.com/\n   export OPENAI_API_TYPE=azure\n   export OPENAI_EMBEDDINGMODEL_DEPLOYMENTID=<Name of embedding model deployment>\n   export OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID=<Name of deployment of model for metadata>\n   export OPENAI_COMPLETIONMODEL_DEPLOYMENTID=<Name of general model deployment used for completion>\n   export OPENAI_EMBEDDING_BATCH_SIZE=<Batch size of embedding, for AzureOAI, this value needs to be set to 1>\n\n   # Add the environment variables for your chosen vector DB.\n   # Some of these are optional; read the provider's setup docs in /docs/providers for more information.\n\n   # Pinecone\n   export PINECONE_API_KEY=<your_pinecone_api_key>\n   export PINECONE_ENVIRONMENT=<your_pinecone_environment>\n   export PINECONE_INDEX=<your_pinecone_index>\n\n   # Weaviate\n   export WEAVIATE_URL=<your_weaviate_instance_url>\n   export WEAVIATE_API_KEY=<your_api_key_for_WCS>\n   export WEAVIATE_CLASS=<your_optional_weaviate_class>\n\n   # Zilliz\n   export ZILLIZ_COLLECTION=<your_zilliz_collection>\n   export 
ZILLIZ_URI=<your_zilliz_uri>\n   export ZILLIZ_USER=<your_zilliz_username>\n   export ZILLIZ_PASSWORD=<your_zilliz_password>\n\n   # Milvus\n   export MILVUS_COLLECTION=<your_milvus_collection>\n   export MILVUS_HOST=<your_milvus_host>\n   export MILVUS_PORT=<your_milvus_port>\n   export MILVUS_USER=<your_milvus_username>\n   export MILVUS_PASSWORD=<your_milvus_password>\n\n   # Qdrant\n   export QDRANT_URL=<your_qdrant_url>\n   export QDRANT_PORT=<your_qdrant_port>\n   export QDRANT_GRPC_PORT=<your_qdrant_grpc_port>\n   export QDRANT_API_KEY=<your_qdrant_api_key>\n   export QDRANT_COLLECTION=<your_qdrant_collection>\n\n   # AnalyticDB\n   export PG_HOST=<your_analyticdb_host>\n   export PG_PORT=<your_analyticdb_port>\n   export PG_USER=<your_analyticdb_username>\n   export PG_PASSWORD=<your_analyticdb_password>\n   export PG_DATABASE=<your_analyticdb_database>\n   export PG_COLLECTION=<your_analyticdb_collection>\n\n\n   # Redis\n   export REDIS_HOST=<your_redis_host>\n   export REDIS_PORT=<your_redis_port>\n   export REDIS_PASSWORD=<your_redis_password>\n   export REDIS_INDEX_NAME=<your_redis_index_name>\n   export REDIS_DOC_PREFIX=<your_redis_doc_prefix>\n   export REDIS_DISTANCE_METRIC=<your_redis_distance_metric>\n   export REDIS_INDEX_TYPE=<your_redis_index_type>\n\n   # Llama\n   export LLAMA_INDEX_TYPE=<gpt_vector_index_type>\n   export LLAMA_INDEX_JSON_PATH=<path_to_saved_index_json_file>\n   export LLAMA_QUERY_KWARGS_JSON_PATH=<path_to_saved_query_kwargs_json_file>\n   export LLAMA_RESPONSE_MODE=<response_mode_for_query>\n\n   # Chroma\n   export CHROMA_COLLECTION=<your_chroma_collection>\n   export CHROMA_IN_MEMORY=<true_or_false>\n   export CHROMA_PERSISTENCE_DIR=<your_chroma_persistence_directory>\n   export CHROMA_HOST=<your_chroma_host>\n   export CHROMA_PORT=<your_chroma_port>\n\n   # Azure Cognitive Search\n   export AZURESEARCH_SERVICE=<your_search_service_name>\n   export AZURESEARCH_INDEX=<your_search_index_name>\n   export 
AZURESEARCH_API_KEY=<your_api_key> # (optional, uses key-free managed identity if not set)\n\n   # Azure CosmosDB Mongo vCore\n   export AZCOSMOS_API=<your azure cosmos db api, for now it only supports mongo>\n   export AZCOSMOS_CONNSTR=<your azure cosmos db mongo vcore connection string>\n   export AZCOSMOS_DATABASE_NAME=<your mongo database name>\n   export AZCOSMOS_CONTAINER_NAME=<your mongo container name>\n\n   # Supabase\n   export SUPABASE_URL=<supabase_project_url>\n   export SUPABASE_ANON_KEY=<supabase_project_api_anon_key>\n\n   # Postgres\n   export PG_HOST=<postgres_host>\n   export PG_PORT=<postgres_port>\n   export PG_USER=<postgres_user>\n   export PG_PASSWORD=<postgres_password>\n   export PG_DB=<postgres_database>\n\n   # Elasticsearch\n   export ELASTICSEARCH_URL=<elasticsearch_host_and_port> # (either specify host or cloud_id)\n   export ELASTICSEARCH_CLOUD_ID=<elasticsearch_cloud_id>\n\n   export ELASTICSEARCH_USERNAME=<elasticsearch_username>\n   export ELASTICSEARCH_PASSWORD=<elasticsearch_password>\n   export ELASTICSEARCH_API_KEY=<elasticsearch_api_key>\n\n   export ELASTICSEARCH_INDEX=<elasticsearch_index_name>\n   export ELASTICSEARCH_REPLICAS=<elasticsearch_replicas>\n   export ELASTICSEARCH_SHARDS=<elasticsearch_shards>\n\n   # MongoDB Atlas\n   export MONGODB_URI=<mongodb_uri>\n   export MONGODB_DATABASE=<mongodb_database>\n   export MONGODB_COLLECTION=<mongodb_collection>\n   export MONGODB_INDEX=<mongodb_index>\n   ```\n\n10. Run the API locally: `poetry run start`\n11. 
Access the API documentation at `http://0.0.0.0:8000/docs` and test the API endpoints (make sure to add your bearer token).\n\n## About\n\n### Retrieval Plugin\n\nThis is a standalone retrieval backend that can be used with [ChatGPT custom GPTs](https://chat.openai.com/gpts/discovery), [function calling](https://platform.openai.com/docs/guides/function-calling) with the [chat completions](https://platform.openai.com/docs/guides/text-generation) or [assistants APIs](https://platform.openai.com/docs/assistants/overview), or with the [ChatGPT plugins model (deprecated)](https://chat.openai.com/?model=gpt-4-plugins).\n\nIt enables a model to carry out semantic search and retrieval of personal or organizational documents, and write answers informed by relevant retrieved context (sometimes referred to as \"Retrieval-Augmented Generation\" or \"RAG\"). It allows users to obtain the most relevant document snippets from their data sources, such as files, notes, or emails, by asking questions or expressing needs in natural language. Enterprises can make their internal documents available to their employees through ChatGPT using this plugin.\n\nThe plugin uses OpenAI's embeddings model (`text-embedding-3-large` 256 dimension embeddings by default) to generate embeddings of document chunks, and then stores and queries them using a vector database on the backend. As an open-source and self-hosted solution, developers can deploy their own Retrieval Plugin and register it with ChatGPT. The Retrieval Plugin supports several vector database providers, allowing developers to choose their preferred one from a list.\n\nA FastAPI server exposes the plugin's endpoints for upserting, querying, and deleting documents. Users can refine their search results by using metadata filters by source, date, author, or other criteria. The plugin can be hosted on any cloud platform that supports Docker containers, such as Fly.io, Heroku, Render, or Azure Container Apps. 
To keep the vector database updated with the latest documents, the plugin can process and store documents from various data sources continuously, using incoming webhooks to the upsert and delete endpoints. Tools like [Zapier](https://zapier.com) or [Make](https://www.make.com) can help configure the webhooks based on events or schedules.\n\n### Retrieval Plugin with Custom GPTs\n\nTo create a custom GPT that can use your Retrieval Plugin for semantic search and retrieval of your documents, and even store new information back to the database, you first need to have deployed a Retrieval Plugin. For detailed instructions on how to do this, please refer to the [Deployment section](#deployment). Once you have your app URL (e.g., `https://your-app-url.com`), take the following steps:\n\n1. Navigate to the create GPT page at `https://chat.openai.com/gpts/editor`.\n2. Follow the standard creation flow to set up your GPT.\n3. Navigate to the \"Configure\" tab. Here, you can manually fill in fields such as name, description, and instructions, or use the smart creator for assistance.\n4. Under the \"Actions\" section, click on \"Create new action\".\n5. Choose an authentication method. The Retrieval Plugin supports None, API key (Basic or Bearer) and OAuth. For more information on these methods, refer to the [Authentication Methods Section](#authentication-methods).\n6. Import the OpenAPI schema. You can either:\n   - Import directly from the OpenAPI schema hosted in your app at `https://your-app-url.com/.well-known/openapi.yaml`.\n   - Copy and paste the contents of [this file](/.well-known/openapi.yaml) into the Schema input area if you only want to expose the query endpoint to the GPT. Remember to change the URL under the `servers` section of the OpenAPI schema you paste in.\n7. Optionally, you might want to add a fetch endpoint. This would involve editing the [`/server/main.py`](/server/main.py) file to add an endpoint and implement this for your chosen vector database. 
If you make this change, please consider contributing it back to the project by opening a pull request! Adding the fetch endpoint to the OpenAPI schema would allow the model to fetch more content from a document by ID if some text is cut off in the retrieved result. It might also be useful to pass in a string with the text from the retrieved result and an option to return a fixed length of context before and after the retrieved result.\n8. If you want the GPT to be able to save information back to the vector database, you can give it access to the Retrieval Plugin's `/upsert` endpoint. To do this, copy the contents of [this file](/examples/memory/openapi.yaml) into the schema area. This allows the GPT to store new information it generates or learns during the conversation. More details on this feature can be found at [Memory Feature](#memory-feature) and [in the docs here](/examples/memory).\n\nRemember: ChatGPT and custom GPTs natively support retrieval from uploaded files, so you should use the Retrieval Plugin as a backend only if you want more granular control of your retrieval system (e.g. self-hosting, embedding chunk length, embedding model / size, etc.).\n\n### Retrieval Plugin with Function Calling\n\nThe Retrieval Plugin can be integrated with function calling in both the [Chat Completions API](https://platform.openai.com/docs/guides/function-calling) and the [Assistants API](https://platform.openai.com/docs/assistants/overview). This allows the model to decide when to use your functions (query, fetch, upsert) based on the conversation context.\n\n#### Function Calling with Chat Completions\n\nIn a call to the chat completions API, you can describe functions and have the model generate a JSON object containing arguments to call one or many functions. 
The latest models (gpt-3.5-turbo-0125 and gpt-4-turbo-preview) have been trained to detect when a function should be called and to respond with JSON that adheres to the function signature.\n\nYou can define the functions for the Retrieval Plugin endpoints and pass them in as tools when you use the Chat Completions API with one of the latest models. The model will then intelligently call the functions. You can use function calling to write queries to your APIs, call the endpoint on the backend, and return the response as a tool message to the model to continue the conversation. The function definitions/schemas and an example can be found [here](/examples/function-calling/).\n\n#### Function Calling with Assistants API\n\nYou can use the same function definitions with the OpenAI [Assistants API](https://platform.openai.com/docs/assistants/overview), specifically the [function calling in tool use](https://platform.openai.com/docs/assistants/tools/function-calling). The Assistants API allows you to build AI assistants within your own applications, leveraging models, tools, and knowledge to respond to user queries. The function definitions/schemas and an example can be found [here](/examples/function-calling/). The Assistants API natively supports retrieval from uploaded files, so you should use the Retrieval Plugin with function calling only if you want more granular control of your retrieval system (e.g. embedding chunk length, embedding model / size, etc.).\n\nParallel function calling is supported for both the Chat Completions API and the Assistants API. This means you can perform multiple tasks, such as querying something and saving something back to the vector database, in the same message.\n\nRead more about function calling with the Retrieval Plugin [here](/examples/function-calling/).\n\n### ChatGPT Plugins Model\n\n(deprecated) We recommend using custom actions with GPTs to make use of the Retrieval Plugin through ChatGPT. 
Instructions for using retrieval with the deprecated plugins model can be found [here](/docs/deprecated/plugins.md).\n\n### API Endpoints\n\nThe Retrieval Plugin is built using FastAPI, a web framework for building APIs with Python. FastAPI allows for easy development, validation, and documentation of API endpoints. Find the FastAPI documentation [here](https://fastapi.tiangolo.com/).\n\nOne of the benefits of using FastAPI is the automatic generation of interactive API documentation with Swagger UI. When the API is running locally, Swagger UI at `<local_host_url i.e. http://0.0.0.0:8000>/docs` can be used to interact with the API endpoints, test their functionality, and view the expected request and response models.\n\nThe plugin exposes the following endpoints for upserting, querying, and deleting documents from the vector database. All requests and responses are in JSON format, and require a valid bearer token as an authorization header.\n\n- `/upsert`: This endpoint allows uploading one or more documents and storing their text and metadata in the vector database. The documents are split into chunks of around 200 tokens, each with a unique ID. The endpoint expects a list of documents in the request body, each with a `text` field, and optional `id` and `metadata` fields. The `metadata` field can contain the following optional subfields: `source`, `source_id`, `url`, `created_at`, and `author`. The endpoint returns a list of the IDs of the inserted documents (an ID is generated if not initially provided).\n\n- `/upsert-file`: This endpoint allows uploading a single file (PDF, TXT, DOCX, PPTX, or MD) and storing its text and metadata in the vector database. The file is converted to plain text and split into chunks of around 200 tokens, each with a unique ID. 
The endpoint returns a list containing the generated id of the inserted file.\n\n- `/query`: This endpoint allows querying the vector database using one or more natural language queries and optional metadata filters. The endpoint expects a list of queries in the request body, each with a `query` and optional `filter` and `top_k` fields. The `filter` field should contain a subset of the following subfields: `source`, `source_id`, `document_id`, `url`, `created_at`, and `author`. The `top_k` field specifies how many results to return for a given query, and the default value is 3. The endpoint returns a list of objects that each contain a list of the most relevant document chunks for the given query, along with their text, metadata and similarity scores.\n\n- `/delete`: This endpoint allows deleting one or more documents from the vector database using their IDs, a metadata filter, or a delete_all flag. The endpoint expects at least one of the following parameters in the request body: `ids`, `filter`, or `delete_all`. The `ids` parameter should be a list of document IDs to delete; all document chunks for the documents with these IDs will be deleted. The `filter` parameter should contain a subset of the following subfields: `source`, `source_id`, `document_id`, `url`, `created_at`, and `author`. The `delete_all` parameter should be a boolean indicating whether to delete all documents from the vector database. The endpoint returns a boolean indicating whether the deletion was successful.\n\nThe detailed specifications and examples of the request and response models can be found by running the app locally and navigating to http://0.0.0.0:8000/openapi.json, or in the OpenAPI schema [here](/.well-known/openapi.yaml). Note that the OpenAPI schema only contains the `/query` endpoint, because that is the only function that ChatGPT needs to access. This way, ChatGPT can use the plugin only to retrieve relevant documents based on natural language queries or needs. 
However, if developers want to also give ChatGPT the ability to remember things for later, they can use the `/upsert` endpoint to save snippets from the conversation to the vector database. An example of a manifest and OpenAPI schema that gives ChatGPT access to the `/upsert` endpoint can be found [here](/examples/memory).\n\nTo include custom metadata fields, edit the `DocumentMetadata` and `DocumentMetadataFilter` data models [here](/models/models.py), and update the OpenAPI schema [here](/.well-known/openapi.yaml). You can update this easily by running the app locally, copying the JSON found at http://0.0.0.0:8000/sub/openapi.json, and converting it to YAML format with [Swagger Editor](https://editor.swagger.io/). Alternatively, you can replace the `openapi.yaml` file with an `openapi.json` file.\n\n### Memory Feature\n\nA notable feature of the Retrieval Plugin is its capacity to provide ChatGPT with memory. By using the plugin's upsert endpoint, ChatGPT can save snippets from the conversation to the vector database for later reference (only when prompted to do so by the user). This functionality contributes to a more context-aware chat experience by allowing ChatGPT to remember and retrieve information from previous conversations. Learn how to configure the Retrieval Plugin with memory [here](/examples/memory).\n\n### Security\n\nThe Retrieval Plugin allows ChatGPT to search a vector database of content, and then add the best results into the ChatGPT session. This means it doesn’t have any external effects, and the main risk consideration is data authorization and privacy. Developers should only add content into their Retrieval Plugin that they have authorization for and that they are fine with appearing in users’ ChatGPT sessions. 
You can choose from a number of different authentication methods to secure the plugin (more information [here](#authentication-methods)).\n\n### Choosing an Embeddings Model\n\nThe ChatGPT Retrieval Plugin uses OpenAI's embeddings models to generate embeddings of document chunks. The default model for the Retrieval Plugin is `text-embedding-3-large` with 256 dimensions. OpenAI offers two latest embeddings models, `text-embedding-3-small` and `text-embedding-3-large`, as well as an older model, `text-embedding-ada-002`.\n\nThe new models support shortening embeddings without significant loss of retrieval accuracy, allowing you to balance retrieval accuracy, cost, and speed.\n\nHere's a comparison of the models:\n\n| Model                  | Embedding Size | Average MTEB Score | Cost per 1k Tokens |\n| ---------------------- | -------------- | ------------------ | ------------------ |\n| text-embedding-3-large | 3072           | 64.6%              | $0.00013           |\n| text-embedding-3-large | 1024           | 64.1%              | $0.00013           |\n| text-embedding-3-large | 256            | 62.0%              | $0.00013           |\n| text-embedding-3-small | 1536           | 62.3%              | $0.00002           |\n| text-embedding-3-small | 512            | 61.6%              | $0.00002           |\n| text-embedding-ada-002 | 1536           | 61.0%              | $0.0001            |\n\nWhen choosing a model, consider:\n\n1. **Retrieval Accuracy vs Cost**: `text-embedding-3-large` offers the highest accuracy but at a higher cost. `text-embedding-3-small` is more cost-effective with competitive accuracy. The older `text-embedding-ada-002` model has the lowest accuracy.\n\n2. **Embedding Size**: Larger embeddings provide better accuracy but consume more storage and could be slower to query. 
You can adjust the size of the embeddings to balance these factors.\n\nFor example, if your vector database supports up to 1024 dimensions, you can use `text-embedding-3-large` and set the dimensions API parameter to 1024. This shortens the embedding from 3072 dimensions, trading off some accuracy for lower storage and query costs.\n\nTo change your chosen embeddings model and size, edit the following environment variables:\n\n```\nEMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use\nEMBEDDING_MODEL=\"text-embedding-3-large\" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002\n```\n\n## Development\n\n### Setup\n\nThis app uses Python 3.10, and [poetry](https://python-poetry.org/) for dependency management.\n\nInstall Python 3.10 on your machine if it isn't already installed. It can be downloaded from the official [Python website](https://www.python.org/downloads/) or with a package manager like `brew` or `apt`, depending on your system.\n\nClone the repository from GitHub:\n\n```\ngit clone https://github.com/openai/chatgpt-retrieval-plugin.git\n```\n\nNavigate to the cloned repository directory:\n\n```\ncd /path/to/chatgpt-retrieval-plugin\n```\n\nInstall poetry:\n\n```\npip install poetry\n```\n\nCreate a new virtual environment that uses Python 3.10:\n\n```\npoetry env use python3.10\npoetry shell\n```\n\nInstall app dependencies using poetry:\n\n```\npoetry install\n```\n\n**Note:** If adding dependencies in the `pyproject.toml`, make sure to run `poetry lock` and `poetry install`.\n\n#### General Environment Variables\n\nThe API requires the following environment variables to work:\n\n| Name             | Required | Description                                                                                                                                                                                                                                              
     |\n| ---------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| `DATASTORE`      | Yes      | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `elasticsearch`, `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`, `mongodb-atlas`. |\n| `BEARER_TOKEN`   | Yes      | This is a secret token that you need to authenticate your requests to the API. You can generate one using any tool or method you prefer, such as [jwt.io](https://jwt.io/).                                                                                   |\n| `OPENAI_API_KEY` | Yes      | This is your OpenAI API key that you need to generate embeddings using the one of the OpenAI embeddings model. You can get an API key by creating an account on [OpenAI](https://openai.com/).                                                                |\n\n### Using the plugin with Azure OpenAI\n\nThe Azure Open AI uses URLs that are specific to your resource and references models not by model name but by the deployment id. As a result, you need to set additional environment variables for this case.\n\nIn addition to the `OPENAI_API_BASE` (your specific URL) and `OPENAI_API_TYPE` (azure), you should also set `OPENAI_EMBEDDINGMODEL_DEPLOYMENTID` which specifies the model to use for getting embeddings on upsert and query. 
For this, we recommend deploying `text-embedding-ada-002` model and using the deployment name here.\n\nIf you wish to use the data preparation scripts, you will also need to set `OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID`, used for metadata extraction and\n`OPENAI_COMPLETIONMODEL_DEPLOYMENTID`, used for PII handling.\n\n### Choosing a Vector Database\n\nThe plugin supports several vector database providers, each with different features, performance, and pricing. Depending on which one you choose, you will need to use a different Dockerfile and set different environment variables. The following sections provide brief introductions to each vector database provider.\n\nFor more detailed instructions on setting up and using each vector database provider, please refer to the respective documentation in the `/docs/providers/<datastore_name>/setup.md` file ([folders here](/docs/providers)).\n\n#### Pinecone\n\n[Pinecone](https://www.pinecone.io) is a managed vector database designed for speed, scale, and rapid deployment to production. It supports hybrid search and is currently the only datastore to natively support SPLADE sparse vectors. For detailed setup instructions, refer to [`/docs/providers/pinecone/setup.md`](/docs/providers/pinecone/setup.md).\n\n#### Weaviate\n\n[Weaviate](https://weaviate.io/) is an open-source vector search engine built to scale seamlessly into billions of data objects. It supports hybrid search out-of-the-box, making it suitable for users who require efficient keyword searches. Weaviate can be self-hosted or managed, offering flexibility in deployment. For detailed setup instructions, refer to [`/docs/providers/weaviate/setup.md`](/docs/providers/weaviate/setup.md).\n\n#### Zilliz\n\n[Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for billion-scale data. 
It offers a wide range of features, including multiple indexing algorithms, distance metrics, scalar filtering, time travel searches, rollback with snapshots, full RBAC, 99.9% uptime, separated storage and compute, and multi-language SDKs. For detailed setup instructions, refer to [`/docs/providers/zilliz/setup.md`](/docs/providers/zilliz/setup.md).\n\n#### Milvus\n\n[Milvus](https://milvus.io/) is an open-source, cloud-native vector database that scales to billions of vectors. It is the open-source version of Zilliz and shares many of its features, such as various indexing algorithms, distance metrics, scalar filtering, time travel searches, rollback with snapshots, multi-language SDKs, storage and compute separation, and cloud scalability. For detailed setup instructions, refer to [`/docs/providers/milvus/setup.md`](/docs/providers/milvus/setup.md).\n\n#### Qdrant\n\n[Qdrant](https://qdrant.tech/) is a vector database capable of storing documents and vector embeddings. It offers both self-hosted and managed [Qdrant Cloud](https://cloud.qdrant.io/) deployment options, providing flexibility for users with different requirements. For detailed setup instructions, refer to [`/docs/providers/qdrant/setup.md`](/docs/providers/qdrant/setup.md).\n\n#### Redis\n\n[Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform suitable for a variety of use cases, including everyday applications and AI/ML workloads. It can be used as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, [Redis Cloud](https://app.redislabs.com/#/) is available. 
For detailed setup instructions, refer to [`/docs/providers/redis/setup.md`](/docs/providers/redis/setup.md).\n\n#### LlamaIndex\n\n[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data.\nIt provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT.\nUnlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use-cases.\nIt is light-weight, easy-to-use, and requires no additional deployment.\nAll you need to do is to specify a few environment variables (optionally pointing to an existing saved Index JSON file).\nNote that metadata filters in queries are not yet supported.\nFor detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/docs/providers/llama/setup.md).\n\n#### Chroma\n\n[Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make getting started as easy as possible. Chroma runs in-memory, or in a client-server setup. It supports metadata and keyword filtering out of the box. For detailed instructions, refer to [`/docs/providers/chroma/setup.md`](/docs/providers/chroma/setup.md).\n\n#### Azure Cognitive Search\n\n[Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). It also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality. 
For detailed setup instructions, refer to [`/docs/providers/azuresearch/setup.md`](/docs/providers/azuresearch/setup.md)\n\n#### Azure CosmosDB Mongo vCore\n\n[Azure CosmosDB Mongo vCore](https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/) supports vector search on embeddings, and it could be used to seamlessly integrate your AI-based applications with your data stored in the Azure CosmosDB. For detailed instructions, refer to [`/docs/providers/azurecosmosdb/setup.md`](/docs/providers/azurecosmosdb/setup.md)\n\n#### Supabase\n\n[Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. For detailed setup instructions, refer to [`/docs/providers/supabase/setup.md`](/docs/providers/supabase/setup.md).\n\n#### Postgres\n\n[Postgres](https://www.postgresql.org) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension. To use pgvector, you will need to set up a PostgreSQL database with the pgvector extension enabled. For example, you can [use docker](https://www.docker.com/blog/how-to-use-the-postgres-docker-official-image/) to run locally. For a hosted/managed solution, you can use any of the cloud vendors which support [pgvector](https://github.com/pgvector/pgvector#hosted-postgres). 
For detailed setup instructions, refer to [`/docs/providers/postgres/setup.md`](/docs/providers/postgres/setup.md).\n\n#### AnalyticDB\n\n[AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is fully compatible with PostgreSQL syntax and managed by Alibaba Cloud. AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing features such as indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. For detailed setup instructions, refer to [`/docs/providers/analyticdb/setup.md`](/docs/providers/analyticdb/setup.md).\n\n#### Elasticsearch\n\n[Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html) currently supports storing vectors through the `dense_vector` field type and uses them to calculate document scores. Elasticsearch 8.0 builds on this functionality to support fast, approximate nearest neighbor search (ANN). This represents a much more scalable approach, allowing vector search to run efficiently on large datasets. For detailed setup instructions, refer to [`/docs/providers/elasticsearch/setup.md`](/docs/providers/elasticsearch/setup.md).\n\n#### Mongodb-Atlas\n\nWith [MongoDB Atlas](https://www.mongodb.com/docs/atlas/getting-started/), the procedure currently involves generating an Atlas Vector Search index for all collections featuring vector embeddings of 2048 dimensions or fewer in width. 
This applies to diverse data types coexisting with additional data on your Atlas cluster, and the process is executed through the Atlas UI and Atlas Administration API. For detailed setup instructions, refer to [`/docs/providers/mongodb_atlas/setup.md`](/docs/providers/mongodb_atlas/setup.md).\n\n### Running the API locally\n\nTo run the API locally, you first need to set the requisite environment variables with the `export` command:\n\n```\nexport DATASTORE=<your_datastore>\nexport BEARER_TOKEN=<your_bearer_token>\nexport OPENAI_API_KEY=<your_openai_api_key>\n<Add the environment variables for your chosen vector DB here>\n```\n\nStart the API with:\n\n```\npoetry run start\n```\n\nAppend `docs` to the URL shown in the terminal and open it in a browser to access the API documentation and try out the endpoints (i.e. http://0.0.0.0:8000/docs). Make sure to enter your bearer token and test the API endpoints.\n\n**Note:** If you add new dependencies to the pyproject.toml file, you need to run `poetry lock` and `poetry install` to update the lock file and install the new dependencies.\n\n### Personalization\n\nYou can personalize the Retrieval Plugin for your own use case by doing the following:\n\n- **Replace the logo**: Replace the image in [logo.png](/.well-known/logo.png) with your own logo.\n\n- **Edit the data models**: Edit the `DocumentMetadata` and `DocumentMetadataFilter` data models in [models.py](/models/models.py) to add custom metadata fields. Update the OpenAPI schema in [openapi.yaml](/.well-known/openapi.yaml) accordingly. To update the OpenAPI schema more easily, you can run the app locally, then navigate to `http://0.0.0.0:8000/sub/openapi.json` and copy the contents of the webpage. Then go to [Swagger Editor](https://editor.swagger.io/) and paste in the JSON to convert it to a YAML format. 
You could also replace the [openapi.yaml](/.well-known/openapi.yaml) file with an openapi.json file in the [.well-known](/.well-known) folder.\n\n- **Change the plugin name, description, and usage instructions**: Update the plugin name, user-facing description, and usage instructions for the model. You can either edit the descriptions in the [main.py](/server/main.py) file or update the [openapi.yaml](/.well-known/openapi.yaml) file. Follow the same instructions as in the previous step to update the OpenAPI schema.\n\n- **Enable ChatGPT to save information from conversations**: See the instructions in the [memory example folder](/examples/memory).\n\n### Authentication Methods\n\nYou can choose from four options for authenticating requests to your plugin:\n\n1. **No Authentication**: Anyone can add your plugin and use its API without any credentials. This option is suitable if you are only exposing documents that are not sensitive or already public. It provides no security for your data. If using this method, copy the contents of this [main.py](/examples/authentication-methods/no-auth/main.py) into the [actual main.py file](/server/main.py). Example manifest [here](/examples/authentication-methods/no-auth/ai-plugin.json).\n\n2. **HTTP Bearer**: You can use a secret token as a header to authorize requests to your plugin. There are two variants of this option:\n\n   - **User Level** (default for this implementation): Each user who adds your plugin to ChatGPT must provide the bearer token when adding the plugin. You can generate and distribute these tokens using any tool or method you prefer, such as [jwt.io](https://jwt.io/). This method provides better security as each user has to enter the shared access token. If you require a unique access token for each user, you will need to implement this yourself in the [main.py](/server/main.py) file. 
Example manifest [here](/examples/authentication-methods/user-http/ai-plugin.json).\n\n   - **Service Level**: Anyone can add your plugin and use its API without credentials, but you must add a bearer token when registering the plugin. When you install your plugin, you need to add your bearer token, and will then receive a token from ChatGPT that you must include in your hosted manifest file. Your token will be used by ChatGPT to authorize requests to your plugin on behalf of all users who add it. This method is more convenient for users, but it may be less secure as all users share the same token and do not need to add a token to install the plugin. Example manifest [here](/examples/authentication-methods/service-http/ai-plugin.json).\n\n3. **OAuth**: Users must go through an OAuth flow to add your plugin. You can use an OAuth provider to authenticate users who add your plugin and grant them access to your API. This method offers the highest level of security and control, as users authenticate through a trusted third-party provider. However, you will need to implement the OAuth flow yourself in the [main.py](/server/main.py) file and provide the necessary parameters in your manifest file. Example manifest [here](/examples/authentication-methods/oauth/ai-plugin.json).\n\nConsider the benefits and drawbacks of each authentication method before choosing the one that best suits your use case and security requirements. If you choose to use a method different to the default (User Level HTTP), make sure to update the manifest file [here](/.well-known/ai-plugin.json).\n\n## Deployment\n\nYou can deploy your app to different cloud providers, depending on your preferences and requirements. However, regardless of the provider you choose, you will need to update two files in your app: [openapi.yaml](/.well-known/openapi.yaml) and [ai-plugin.json](/.well-known/ai-plugin.json). 
As outlined above, these files define the API specification and the AI plugin configuration for your app, respectively. You need to change the url field in both files to match the address of your deployed app.\n\nRender has a 1-click deploy option that automatically updates the url field in both files:\n\n[<img src=\"https://render.com/images/deploy-to-render-button.svg\" alt=\"Deploy to Render\" />](https://render.com/deploy?repo=https://github.com/render-examples/chatgpt-retrieval-plugin/tree/main)\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. Refer to the respective documentation in the [`/docs/deployment/removing-unused-dependencies.md`](/docs/deployment/removing-unused-dependencies.md) file for information on removing unused dependencies for each provider.\n\nInstructions:\n\n- [Deploying to Fly.io](/docs/deployment/flyio.md)\n- [Deploying to Heroku](/docs/deployment/heroku.md)\n- [Deploying to Render](/docs/deployment/render.md)\n- [Other Deployment Options](/docs/deployment/other-options.md) (Azure Container Apps, Google Cloud Run, AWS Elastic Container Service, etc.)\n\nOnce you have deployed your app, consider uploading an initial batch of documents using one of [these scripts](/scripts) or by calling the `/upsert` endpoint.\n\n## Webhooks\n\nTo keep the documents stored in the vector database up-to-date, consider using tools like [Zapier](https://zapier.com) or [Make](https://www.make.com) to configure incoming webhooks to your plugin's API based on events or schedules. For example, this could allow you to sync new information as you update your notes or receive emails. 
You can also use a [Zapier Transfer](https://zapier.com/blog/zapier-transfer-guide/) to batch process a collection of existing documents and upload them to the vector database.\n\nIf you need to pass custom fields from these tools to your plugin, you might want to create an additional Retrieval Plugin API endpoint that calls the datastore's upsert function, such as `upsert-email`. This custom endpoint can be designed to accept specific fields from the webhook and process them accordingly.\n\nTo set up an incoming webhook, follow these general steps:\n\n- Choose a webhook tool like Zapier or Make and create an account.\n- Set up a new webhook or transfer in the tool, and configure it to trigger based on events or schedules.\n- Specify the target URL for the webhook, which should be the API endpoint of your Retrieval Plugin (e.g. `https://your-plugin-url.com/upsert`).\n- Configure the webhook payload to include the necessary data fields and format them according to your Retrieval Plugin's API requirements.\n- Test the webhook to ensure it's working correctly and sending data to your Retrieval Plugin as expected.\n\nAfter setting up the webhook, you may want to run a backfill to ensure that any previously missed data is included in the vector database.\n\nRemember that if you want to use incoming webhooks to continuously sync data, you should consider running a backfill after setting these up to avoid missing any data.\n\nIn addition to using tools like Zapier and Make, you can also build your own custom integrations to sync data with your Retrieval Plugin. This allows you to have more control over the data flow and tailor the integration to your specific needs and requirements.\n\n## Scripts\n\nThe `scripts` folder contains scripts to batch upsert or process text documents from different data sources, such as a zip file, JSON file, or JSONL file. 
These scripts use the plugin's upsert utility functions to upload the documents and their metadata to the vector database, after converting them to plain text and splitting them into chunks. Each script folder has a README file that explains how to use it and what parameters it requires. You can also optionally screen the documents for personally identifiable information (PII) using a language model and skip them if detected, with the [`services.pii_detection`](/services/pii_detection.py) module. This can be helpful if you want to avoid uploading sensitive or private documents to the vector database unintentionally. Additionally, you can optionally extract metadata from the document text using a language model, with the [`services.extract_metadata`](/services/extract_metadata.py) module. This can be useful if you want to enrich the document metadata. **Note:** if using incoming webhooks to continuously sync data, consider running a backfill after setting these up to avoid missing any data.\n\nThe scripts are:\n\n- [`process_json`](scripts/process_json/): This script processes a file dump of documents in a JSON format and stores them in the vector database with some metadata. The format of the JSON file should be a list of JSON objects, where each object represents a document. The JSON object should have a `text` field and optionally other fields to populate the metadata. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.\n- [`process_jsonl`](scripts/process_jsonl/): This script processes a file dump of documents in a JSONL format and stores them in the vector database with some metadata. The format of the JSONL file should be a newline-delimited JSON file, where each line is a valid JSON object representing a document. The JSON object should have a `text` field and optionally other fields to populate the metadata. 
You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.\n- [`process_zip`](scripts/process_zip/): This script processes a file dump of documents in a zip file and stores them in the vector database with some metadata. The format of the zip file should be a flat zip file folder of docx, pdf, txt, md, pptx or csv files. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.\n\n## Pull Request (PR) Checklist\n\nIf you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing!\n\n1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`.\n\n2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made.\n\n3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456).\n\n4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch.\n\n5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed.\n\n6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit.\n\n7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR.\n\n8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made.\n\n9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository.\n\n10. 
**Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. This will help reviewers to better understand your contribution and expedite the review process.\n\n## Pull Request Naming Convention\n\nUse the following naming convention for your PR branches:\n\n```\n<type>/<short-description>-<issue-number>\n```\n\n- `<type>`: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are ok and should appear as `<type>, <type2>`.\n- `<short-description>`: A brief description of the changes made, using hyphens to separate words.\n- `<issue-number>`: The issue number associated with the changes made (if applicable).\n\nExample:\n\n```\nfeature/advanced-chunking-strategy-123\n```\n\n## Limitations\n\nWhile the ChatGPT Retrieval Plugin is designed to provide a flexible solution for semantic search and retrieval, it does have some limitations:\n\n- **Keyword search limitations**: The embeddings generated by the chosen OpenAI embeddings model may not always be effective at capturing exact keyword matches. As a result, the plugin might not return the most relevant results for queries that rely heavily on specific keywords. Some vector databases, like Elasticsearch, Pinecone, Weaviate and Azure Cognitive Search, use hybrid search and might perform better for keyword searches.\n- **Sensitive data handling**: The plugin does not automatically detect or filter sensitive data. 
It is the responsibility of the developers to ensure that they have the necessary authorization to include content in the Retrieval Plugin and that the content complies with data privacy requirements.\n- **Scalability**: The performance of the plugin may vary depending on the chosen vector database provider and the size of the dataset. Some providers may offer better scalability and performance than others.\n- **Metadata extraction**: The optional metadata extraction feature relies on a language model to extract information from the document text. This process may not always be accurate, and the quality of the extracted metadata may vary depending on the document content and structure.\n- **PII detection**: The optional PII detection feature is not foolproof and may not catch all instances of personally identifiable information. Use this feature with caution and verify its effectiveness for your specific use case.\n\n## Future Directions\n\nThe ChatGPT Retrieval Plugin provides a flexible solution for semantic search and retrieval, but there is always potential for further development. We encourage users to contribute to the project by submitting pull requests for new features or enhancements. 
Notable contributions may be acknowledged with OpenAI credits.\n\nSome ideas for future directions include:\n\n- **More vector database providers**: If you are interested in integrating another vector database provider with the ChatGPT Retrieval Plugin, feel free to submit an implementation.\n- **Additional scripts**: Expanding the range of scripts available for processing and uploading documents from various data sources would make the plugin even more versatile.\n- **User Interface**: Developing a user interface for managing documents and interacting with the plugin could improve the user experience.\n- **Hybrid search / TF-IDF option**: Enhancing the [datastore's upsert function](/datastore/datastore.py#L18) with an option to use hybrid search or TF-IDF indexing could improve the plugin's performance for keyword-based queries.\n- **Advanced chunking strategies and embeddings calculations**: Implementing more sophisticated chunking strategies and embeddings calculations, such as embedding document titles and summaries, performing weighted averaging of document chunks and summaries, or calculating the average embedding for a document, could lead to better search results.\n- **Custom metadata**: Allowing users to add custom metadata to document chunks, such as titles or other relevant information, might improve the retrieved results in some use cases.\n- **Additional optional services**: Integrating more optional services, such as summarizing documents or pre-processing documents before embedding them, could enhance the plugin's functionality and quality of retrieved results. These services could be implemented using language models and integrated directly into the plugin, rather than just being available in the scripts.\n\nWe welcome contributions from the community to help improve the ChatGPT Retrieval Plugin and expand its capabilities. 
If you have an idea or feature you'd like to contribute, please submit a pull request to the repository.\n\n## Contributors\n\nWe would like to extend our gratitude to the following contributors for their code / documentation contributions, and support in integrating various vector database providers with the ChatGPT Retrieval Plugin:\n\n- [Pinecone](https://www.pinecone.io/)\n  - [acatav](https://github.com/acatav)\n  - [gkogan](https://github.com/gkogan)\n  - [jamescalam](https://github.com/jamescalam)\n- [Weaviate](https://weaviate.io/)\n  - [byronvoorbach](https://github.com/byronvoorbach)\n  - [hsm207](https://github.com/hsm207)\n  - [sebawita](https://github.com/sebawita)\n- [Zilliz](https://zilliz.com/)\n  - [filip-halt](https://github.com/filip-halt)\n- [Milvus](https://milvus.io/)\n  - [filip-halt](https://github.com/filip-halt)\n- [Qdrant](https://qdrant.tech/)\n  - [kacperlukawski](https://github.com/kacperlukawski)\n- [Redis](https://redis.io/)\n  - [spartee](https://github.com/spartee)\n  - [tylerhutcherson](https://github.com/tylerhutcherson)\n- [LlamaIndex](https://github.com/jerryjliu/llama_index)\n  - [jerryjliu](https://github.com/jerryjliu)\n  - [Disiok](https://github.com/Disiok)\n- [Supabase](https://supabase.com/)\n  - [egor-romanov](https://github.com/egor-romanov)\n- [Postgres](https://www.postgresql.org/)\n  - [egor-romanov](https://github.com/egor-romanov)\n  - [mmmaia](https://github.com/mmmaia)\n- [Elasticsearch](https://www.elastic.co/)\n  - [joemcelroy](https://github.com/joemcelroy)\n"
  },
  {
    "path": "datastore/__init__.py",
    "content": ""
  },
  {
    "path": "datastore/datastore.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Dict, List, Optional\nimport asyncio\n\nfrom models.models import (\n    Document,\n    DocumentChunk,\n    DocumentMetadataFilter,\n    Query,\n    QueryResult,\n    QueryWithEmbedding,\n)\nfrom services.chunks import get_document_chunks\nfrom services.openai import get_embeddings\n\n\nclass DataStore(ABC):\n    async def upsert(\n        self, documents: List[Document], chunk_token_size: Optional[int] = None\n    ) -> List[str]:\n        \"\"\"\n        Takes in a list of documents and inserts them into the database.\n        First deletes all the existing vectors with the document id (if necessary, depends on the vector db), then inserts the new ones.\n        Return a list of document ids.\n        \"\"\"\n        # Delete any existing vectors for documents with the input document ids\n        await asyncio.gather(\n            *[\n                self.delete(\n                    filter=DocumentMetadataFilter(\n                        document_id=document.id,\n                    ),\n                    delete_all=False,\n                )\n                for document in documents\n                if document.id\n            ]\n        )\n\n        chunks = get_document_chunks(documents, chunk_token_size)\n\n        return await self._upsert(chunks)\n\n    @abstractmethod\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n\n        raise NotImplementedError\n\n    async def query(self, queries: List[Query]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        # get a list of just the queries from the Query list\n        query_texts = [query.query for query in queries]\n     
   query_embeddings = get_embeddings(query_texts)\n        # hydrate the queries with embeddings\n        queries_with_embeddings = [\n            QueryWithEmbedding(**query.dict(), embedding=embedding)\n            for query, embedding in zip(queries, query_embeddings)\n        ]\n        return await self._query(queries_with_embeddings)\n\n    @abstractmethod\n    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Multiple parameters can be used at once.\n        Returns whether the operation was successful.\n        \"\"\"\n        raise NotImplementedError\n"
  },
  {
    "path": "datastore/factory.py",
    "content": "from datastore.datastore import DataStore\nimport os\n\n\nasync def get_datastore() -> DataStore:\n    datastore = os.environ.get(\"DATASTORE\")\n    assert datastore is not None\n\n    match datastore:\n        case \"chroma\":\n            from datastore.providers.chroma_datastore import ChromaDataStore\n\n            return ChromaDataStore()\n        case \"llama\":\n            from datastore.providers.llama_datastore import LlamaDataStore\n\n            return LlamaDataStore()\n\n        case \"pinecone\":\n            from datastore.providers.pinecone_datastore import PineconeDataStore\n\n            return PineconeDataStore()\n        case \"weaviate\":\n            from datastore.providers.weaviate_datastore import WeaviateDataStore\n\n            return WeaviateDataStore()\n        case \"milvus\":\n            from datastore.providers.milvus_datastore import MilvusDataStore\n\n            return MilvusDataStore()\n        case \"zilliz\":\n            from datastore.providers.zilliz_datastore import ZillizDataStore\n\n            return ZillizDataStore()\n        case \"redis\":\n            from datastore.providers.redis_datastore import RedisDataStore\n\n            return await RedisDataStore.init()\n        case \"azurecosmosdb\":\n            from datastore.providers.azurecosmosdb_datastore import (\n                AzureCosmosDBDataStore,\n            )\n\n            return await AzureCosmosDBDataStore.create()\n        case \"qdrant\":\n            from datastore.providers.qdrant_datastore import QdrantDataStore\n\n            return QdrantDataStore()\n        case \"azuresearch\":\n            from datastore.providers.azuresearch_datastore import AzureSearchDataStore\n\n            return AzureSearchDataStore()\n        case \"supabase\":\n            from datastore.providers.supabase_datastore import SupabaseDataStore\n\n            return SupabaseDataStore()\n        case \"postgres\":\n            from 
datastore.providers.postgres_datastore import PostgresDataStore\n\n            return PostgresDataStore()\n        case \"analyticdb\":\n            from datastore.providers.analyticdb_datastore import AnalyticDBDataStore\n\n            return AnalyticDBDataStore()\n        case \"elasticsearch\":\n            from datastore.providers.elasticsearch_datastore import (\n                ElasticsearchDataStore,\n            )\n\n            return ElasticsearchDataStore()\n        case \"mongodb\":\n            from datastore.providers.mongodb_atlas_datastore import (\n                MongoDBAtlasDataStore,\n            )\n\n            return MongoDBAtlasDataStore()\n        case _:\n            raise ValueError(\n                f\"Unsupported vector database: {datastore}. \"\n                f\"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, or qdrant\"\n            )\n"
  },
  {
    "path": "datastore/providers/__init__.py",
    "content": ""
  },
  {
    "path": "datastore/providers/analyticdb_datastore.py",
    "content": "import os\nimport asyncio\nfrom typing import Dict, List, Optional, Tuple, Any\nfrom datetime import datetime\nfrom loguru import logger\n\nfrom psycopg2cffi import compat\n\ncompat.register()\nimport psycopg2\nfrom psycopg2.extras import DictCursor\nfrom psycopg2.pool import SimpleConnectionPool\n\nfrom services.date import to_unix_timestamp\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    DocumentChunkWithScore,\n)\n\nPG_CONFIG = {\n    \"collection\": os.environ.get(\"PG_COLLECTION\", \"document_chunks\"),\n    \"database\": os.environ.get(\"PG_DATABASE\", \"postgres\"),\n    \"user\": os.environ.get(\"PG_USER\", \"user\"),\n    \"password\": os.environ.get(\"PG_PASSWORD\", \"password\"),\n    \"host\": os.environ.get(\"PG_HOST\", \"localhost\"),\n    \"port\": int(os.environ.get(\"PG_PORT\", \"5432\")),\n}\nOUTPUT_DIM = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\nclass AnalyticDBDataStore(DataStore):\n    def __init__(self, config: Dict[str, str] = PG_CONFIG):\n        self.collection_name = config[\"collection\"]\n        self.user = config[\"user\"]\n        self.password = config[\"password\"]\n        self.database = config[\"database\"]\n        self.host = config[\"host\"]\n        self.port = config[\"port\"]\n\n        self.connection_pool = SimpleConnectionPool(\n            minconn=1,\n            maxconn=100,\n            dbname=self.database,\n            user=self.user,\n            password=self.password,\n            host=self.host,\n            port=self.port,\n        )\n\n        self._initialize_db()\n\n    def _initialize_db(self):\n        conn = self.connection_pool.getconn()\n        try:\n            with conn.cursor() as cur:\n                self._create_table(cur)\n                self._create_embedding_index(cur)\n                conn.commit()\n        
finally:\n            self.connection_pool.putconn(conn)\n\n    def _create_table(self, cur: psycopg2.extensions.cursor):\n        cur.execute(\n            f\"\"\"\n              CREATE TABLE IF NOT EXISTS {self.collection_name} (\n                id TEXT PRIMARY KEY DEFAULT uuid_generate_v4()::TEXT,\n                source TEXT,\n                source_id TEXT,\n                content TEXT,\n                document_id TEXT,\n                author TEXT,\n                url TEXT,\n                created_at TIMESTAMPTZ DEFAULT NOW(),\n                embedding real[]\n            );\n            \"\"\"\n        )\n\n    def _create_embedding_index(self, cur: psycopg2.extensions.cursor):\n        cur.execute(\n            f\"\"\"\n            SELECT * FROM pg_indexes WHERE tablename='{self.collection_name}';\n            \"\"\"\n        )\n        index_exists = any(\n            index[2] == f\"{self.collection_name}_embedding_idx\"\n            for index in cur.fetchall()\n        )\n        if not index_exists:\n            cur.execute(\n                f\"\"\"\n                CREATE INDEX {self.collection_name}_embedding_idx\n                ON {self.collection_name}\n                USING ann(embedding)\n                WITH (\n                    distancemeasure=L2,\n                    dim=OUTPUT_DIM,\n                    pq_segments=64,\n                    hnsw_m=100,\n                    pq_centers=2048\n                );\n                \"\"\"\n            )\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a dict of document_ids to list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        loop = asyncio.get_event_loop()\n        tasks = [\n            loop.run_in_executor(None, self._upsert_chunk, chunk)\n            for document_chunks in chunks.values()\n            for chunk in document_chunks\n        ]\n       
 await asyncio.gather(*tasks)\n\n        return list(chunks.keys())\n\n    def _upsert_chunk(self, chunk: DocumentChunk):\n        created_at = (\n            datetime.fromtimestamp(to_unix_timestamp(chunk.metadata.created_at))\n            if chunk.metadata.created_at\n            else None\n        )\n        data = (\n            chunk.id,\n            chunk.text,\n            chunk.embedding,\n            chunk.metadata.document_id,\n            chunk.metadata.source,\n            chunk.metadata.source_id,\n            chunk.metadata.url,\n            chunk.metadata.author,\n            created_at,\n        )\n\n        conn = self.connection_pool.getconn()\n        try:\n            with conn.cursor() as cur:\n                # Construct the SQL query and data\n                query = f\"\"\"\n                        INSERT INTO {self.collection_name} (id, content, embedding, document_id, source, source_id, url, author, created_at)\n                        VALUES (%s::text, %s::text, %s::real[], %s::text, %s::text, %s::text, %s::text, %s::text, %s::timestamp with time zone)\n                        ON CONFLICT (id) DO UPDATE SET\n                            content = EXCLUDED.content,\n                            embedding = EXCLUDED.embedding,\n                            document_id = EXCLUDED.document_id,\n                            source = EXCLUDED.source,\n                            source_id = EXCLUDED.source_id,\n                            url = EXCLUDED.url,\n                            author = EXCLUDED.author,\n                            created_at = EXCLUDED.created_at;\n                \"\"\"\n\n                # Execute the query\n                cur.execute(query, data)\n\n                # Commit the transaction\n                conn.commit()\n        finally:\n            self.connection_pool.putconn(conn)\n\n    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of 
queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        query_results: List[QueryResult] = []\n\n        def generate_query(query: QueryWithEmbedding) -> Tuple[str, List[Any]]:\n            embedding = \"[\" + \", \".join(str(x) for x in query.embedding) + \"]\"\n            q = f\"\"\"\n                SELECT\n                    id,\n                    content,\n                    source,\n                    source_id,\n                    document_id,\n                    url,\n                    created_at,\n                    author,\n                    embedding,\n                    l2_distance(embedding,array{embedding}::real[]) AS similarity\n                FROM\n                    {self.collection_name}\n            \"\"\"\n            where_clause, params = generate_where_clause(query.filter)\n            q += where_clause\n            q += f\"ORDER BY embedding <-> array{embedding}::real[] LIMIT {query.top_k};\"\n            return q, params\n\n        def generate_where_clause(\n            query_filter: Optional[DocumentMetadataFilter],\n        ) -> Tuple[str, List[Any]]:\n            if query_filter is None:\n                return \"\", []\n\n            conditions = [\n                (\"document_id=%s\", query_filter.document_id),\n                (\"source_id=%s\", query_filter.source_id),\n                (\"source LIKE %s\", query_filter.source),\n                (\"author LIKE %s\", query_filter.author),\n                (\"created_at >= %s\", query_filter.start_date),\n                (\"created_at <= %s\", query_filter.end_date),\n            ]\n\n            where_clause = \"WHERE \" + \" AND \".join(\n                [cond[0] for cond in conditions if cond[1] is not None]\n            )\n\n            values = [cond[1] for cond in conditions if cond[1] is not None]\n\n            return where_clause, values\n\n        def fetch_data(cur, q: str, 
params: List[Any]):\n            cur.execute(q, params)\n            return cur.fetchall()\n\n        def create_results(data):\n            results = []\n            for row in data:\n                document_chunk = DocumentChunkWithScore(\n                    id=row[\"id\"],\n                    text=row[\"content\"],\n                    score=float(row[\"similarity\"]),\n                    metadata=DocumentChunkMetadata(\n                        source=row[\"source\"],\n                        source_id=row[\"source_id\"],\n                        document_id=row[\"document_id\"],\n                        url=row[\"url\"],\n                        created_at=str(row[\"created_at\"]),\n                        author=row[\"author\"],\n                    ),\n                )\n                results.append(document_chunk)\n            return results\n\n        conn = self.connection_pool.getconn()\n        try:\n            for query in queries:\n                try:\n                    cur = conn.cursor(cursor_factory=DictCursor)\n                    for query in queries:\n                        q, params = generate_query(query)\n                        data = fetch_data(cur, q, params)\n                        results = create_results(data)\n                        query_results.append(\n                            QueryResult(query=query.query, results=results)\n                        )\n                except Exception as e:\n                    logger.error(e)\n                    query_results.append(QueryResult(query=query.query, results=[]))\n            return query_results\n        finally:\n            self.connection_pool.putconn(conn)\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        async def execute_delete(query: str, params: Optional[List] = None) -> bool:\n            conn = 
self.connection_pool.getconn()\n            try:\n                with conn.cursor() as cur:\n                    if params:\n                        cur.execute(query, params)\n                    else:\n                        cur.execute(query)\n                    self.conn.commit()\n                return True\n            except Exception as e:\n                logger.error(e)\n                return False\n            finally:\n                self.connection_pool.putconn(conn)\n\n        if delete_all:\n            query = f\"DELETE FROM {self.collection_name} WHERE document_id LIKE %s;\"\n            return await execute_delete(query, [\"%\"])\n        elif ids:\n            query = f\"DELETE FROM {self.collection_name} WHERE document_id IN ({','.join(['%s'] * len(ids))});\"\n            return await execute_delete(query, ids)\n        elif filter is not None:\n            query, params = self._generate_delete_query(filter)\n            return await execute_delete(query, params)\n        else:\n            return True\n\n    def _generate_delete_query(\n        self, filter: DocumentMetadataFilter\n    ) -> Tuple[str, List]:\n        conditions = [\n            (filter.document_id, \"document_id = %s\"),\n            (filter.source, \"source = %s\"),\n            (filter.source_id, \"source_id = %s\"),\n            (filter.author, \"author = %s\"),\n            (filter.start_date, \"created_at >= %s\"),\n            (filter.end_date, \"created_at <= %s\"),\n        ]\n\n        where_conditions = [f for value, f in conditions if value]\n        where_values = [value for value, _ in conditions if value]\n\n        query = f\"DELETE FROM {self.collection_name} WHERE {' AND '.join(where_conditions)};\"\n        return query, where_values\n"
  },
  {
    "path": "datastore/providers/azurecosmosdb_datastore.py",
    "content": "import logging\nimport os\n\nimport certifi\nimport numpy as np\nimport pymongo\n\nfrom pymongo.mongo_client import MongoClient\nfrom abc import ABC, abstractmethod\n\nfrom typing import Dict, List, Optional\nfrom datetime import datetime\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentMetadataFilter,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n)\nfrom services.date import to_unix_timestamp\n\n\n# Read environment variables for CosmosDB Mongo vCore\nAZCOSMOS_API = os.environ.get(\"AZCOSMOS_API\", \"mongo-vcore\")\nAZCOSMOS_CONNSTR = os.environ.get(\"AZCOSMOS_CONNSTR\")\nAZCOSMOS_DATABASE_NAME = os.environ.get(\"AZCOSMOS_DATABASE_NAME\")\nAZCOSMOS_CONTAINER_NAME = os.environ.get(\"AZCOSMOS_CONTAINER_NAME\")\nassert AZCOSMOS_API is not None\nassert AZCOSMOS_CONNSTR is not None\nassert AZCOSMOS_DATABASE_NAME is not None\nassert AZCOSMOS_CONTAINER_NAME is not None\n\n# OpenAI Ada Embeddings Dimension\nVECTOR_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\n# Abstract class similar to the original data store that allows API level abstraction\nclass AzureCosmosDBStoreApi(ABC):\n    @abstractmethod\n    async def ensure(self, num_lists, similarity):\n        raise NotImplementedError\n\n    @abstractmethod\n    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:\n        raise NotImplementedError\n\n    @abstractmethod\n    async def query_core(\n        self, query: QueryWithEmbedding\n    ) -> List[DocumentChunkWithScore]:\n        raise NotImplementedError\n\n    @abstractmethod\n    async def drop_container(self):\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete_filter(self, filter: DocumentMetadataFilter):\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete_ids(self, ids: List[str]):\n        raise NotImplementedError\n\n    
@abstractmethod\n    async def delete_document_ids(self, documentIds: List[str]):\n        raise NotImplementedError\n\n\nclass MongoStoreApi(AzureCosmosDBStoreApi):\n    def __init__(self, mongoClient: MongoClient):\n        self.mongoClient = mongoClient\n\n    @staticmethod\n    def _get_metadata_filter(filter: DocumentMetadataFilter) -> dict:\n        returnedFilter: dict = {}\n        if filter.document_id is not None:\n            returnedFilter[\"document_id\"] = filter.document_id\n        if filter.author is not None:\n            returnedFilter[\"metadata.author\"] = filter.author\n        if filter.start_date is not None:\n            returnedFilter[\"metadata.created_at\"] = {\n                \"$gt\": datetime.fromisoformat(filter.start_date)\n            }\n        if filter.end_date is not None:\n            returnedFilter[\"metadata.created_at\"] = {\n                \"$lt\": datetime.fromisoformat(filter.end_date)\n            }\n        if filter.source is not None:\n            returnedFilter[\"metadata.source\"] = filter.source\n        if filter.source_id is not None:\n            returnedFilter[\"metadata.source_id\"] = filter.source_id\n        return returnedFilter\n\n    async def ensure(self, num_lists, similarity):\n        assert self.mongoClient.is_mongos\n        self.collection = self.mongoClient[AZCOSMOS_DATABASE_NAME][\n            AZCOSMOS_CONTAINER_NAME\n        ]\n\n        indexes = self.collection.index_information()\n        if indexes.get(\"embedding_cosmosSearch\") is None:\n            # Ensure the vector index exists.\n            indexDefs: List[any] = [\n                {\n                    \"name\": \"embedding_cosmosSearch\",\n                    \"key\": {\"embedding\": \"cosmosSearch\"},\n                    \"cosmosSearchOptions\": {\n                        \"kind\": \"vector-ivf\",\n                        \"numLists\": num_lists,\n                        \"similarity\": similarity,\n                        
\"dimensions\": VECTOR_DIMENSION,\n                    },\n                }\n            ]\n            self.mongoClient[AZCOSMOS_DATABASE_NAME].command(\n                \"createIndexes\", AZCOSMOS_CONTAINER_NAME, indexes=indexDefs\n            )\n\n    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:\n        # Until nested doc embedding support is done, treat each chunk as a separate doc.\n        doc_ids: List[str] = []\n        for chunk in chunks:\n            finalDocChunk: dict = {\n                \"_id\": f\"doc:{docId}:chunk:{chunk.id}\",\n                \"document_id\": docId,\n                \"embedding\": chunk.embedding,\n                \"text\": chunk.text,\n                \"metadata\": chunk.metadata.__dict__,\n            }\n\n            if chunk.metadata.created_at is not None:\n                finalDocChunk[\"metadata\"][\"created_at\"] = datetime.fromisoformat(\n                    chunk.metadata.created_at\n                )\n            self.collection.insert_one(finalDocChunk)\n            doc_ids.append(finalDocChunk[\"_id\"])\n        return doc_ids\n\n    async def query_core(\n        self, query: QueryWithEmbedding\n    ) -> List[DocumentChunkWithScore]:\n        pipeline = [\n            {\n                \"$search\": {\n                    \"cosmosSearch\": {\n                        \"vector\": query.embedding,\n                        \"path\": \"embedding\",\n                        \"k\": query.top_k,\n                    },\n                    \"returnStoredSource\": True,\n                }\n            },\n            {\n                \"$project\": {\n                    \"similarityScore\": {\"$meta\": \"searchScore\"},\n                    \"document\": \"$$ROOT\",\n                }\n            },\n        ]\n\n        # TODO: Add in match filter (once it can be satisfied).\n        # Perform vector search\n        query_results: List[DocumentChunkWithScore] = []\n        for 
aggResult in self.collection.aggregate(pipeline):\n            finalMetadata = aggResult[\"document\"][\"metadata\"]\n            if finalMetadata[\"created_at\"] is not None:\n                finalMetadata[\"created_at\"] = datetime.isoformat(\n                    finalMetadata[\"created_at\"]\n                )\n            result = DocumentChunkWithScore(\n                id=aggResult[\"_id\"],\n                score=aggResult[\"similarityScore\"],\n                text=aggResult[\"document\"][\"text\"],\n                metadata=finalMetadata,\n            )\n            query_results.append(result)\n        return query_results\n\n    async def drop_container(self):\n        self.collection.drop()\n\n    async def delete_filter(self, filter: DocumentMetadataFilter):\n        delete_filter = self._get_metadata_filter(filter)\n        self.collection.delete_many(delete_filter)\n\n    async def delete_ids(self, ids: List[str]):\n        self.collection.delete_many({\"_id\": {\"$in\": ids}})\n\n    async def delete_document_ids(self, documentIds: List[str]):\n        self.collection.delete_many({\"document_id\": {\"$in\": documentIds}})\n\n\n# Datastore implementation.\n\"\"\"\nA class representing a memory store for Azure CosmosDB DataStore, currently only supports Mongo vCore\n\"\"\"\n\n\nclass AzureCosmosDBDataStore(DataStore):\n    def __init__(self, cosmosStore: AzureCosmosDBStoreApi):\n        self.cosmosStore = cosmosStore\n\n    \"\"\"\n    Creates a new datastore based on the Cosmos Api provided in the environment variables, \n    only supports Mongo vCore for now\n    \n    Args:\n        numLists (int)   : This integer is the number of clusters that the inverted file (IVF) index \n                                  uses to group the vector data. 
We recommend that numLists is set to \n                                  documentCount/1000 for up to 1 million documents and to sqrt(documentCount) \n                                  for more than 1 million documents. Using a numLists value of 1 is akin to \n                                  performing brute-force search, which has limited performance.\n        similarity (str) : Similarity metric to use with the IVF index. Possible options are COS (cosine distance),\n                           L2 (Euclidean distance), and IP (inner product). \n                      \n    \"\"\"\n\n    @staticmethod\n    async def create(num_lists, similarity) -> DataStore:\n        # Create underlying data store based on the API definition.\n        # Right now this only supports Mongo, but set up to support more.\n        apiStore: AzureCosmosDBStoreApi = None\n        if AZCOSMOS_API == \"mongo-vcore\":\n            mongoClient = MongoClient(AZCOSMOS_CONNSTR)\n            apiStore = MongoStoreApi(mongoClient)\n        else:\n            raise NotImplementedError\n\n        await apiStore.ensure(num_lists, similarity)\n        store = AzureCosmosDBDataStore(apiStore)\n        return store\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        # Initialize a list of ids to return\n        doc_ids: List[str] = []\n        for doc_id, chunk_list in chunks.items():\n            returnedIds = await self.cosmosStore.upsert_core(doc_id, chunk_list)\n            for returnedId in returnedIds:\n                doc_ids.append(returnedId)\n        return doc_ids\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and\n        returns a list of query 
results with matching document chunks and scores.\n        \"\"\"\n        # Prepare query responses and results object\n        results: List[QueryResult] = []\n\n        # Gather query results in a pipeline\n        logging.info(f\"Gathering {len(queries)} query results\")\n        for query in queries:\n            logging.info(f\"Query: {query.query}\")\n            query_results = await self.cosmosStore.query_core(query)\n\n            # Add to overall results\n            results.append(QueryResult(query=query.query, results=query_results))\n        return results\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n        if delete_all:\n            # fast path - truncate/delete all items.\n            await self.cosmosStore.drop_container()\n            return True\n\n        if filter:\n            if filter.document_id is not None:\n                await self.cosmosStore.delete_document_ids([filter.document_id])\n            else:\n                await self.cosmosStore.delete_filter(filter)\n\n        if ids:\n            await self.cosmosStore.delete_ids(ids)\n\n        return True\n"
  },
  {
    "path": "datastore/providers/azuresearch_datastore.py",
    "content": "import asyncio\nimport base64\nimport os\nimport re\nimport time\nfrom typing import Dict, List, Optional, Union\n\nfrom azure.core.credentials import AzureKeyCredential\nfrom azure.identity import DefaultAzureCredential as DefaultAzureCredentialSync\nfrom azure.identity.aio import DefaultAzureCredential\nfrom azure.search.documents.aio import SearchClient\nfrom azure.search.documents.indexes import SearchIndexClient\nfrom azure.search.documents.indexes.models import *\nfrom azure.search.documents.models import QueryType, Vector\nfrom loguru import logger\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    Query,\n    QueryResult,\n    QueryWithEmbedding,\n)\n\nAZURESEARCH_SERVICE = os.environ.get(\"AZURESEARCH_SERVICE\")\nAZURESEARCH_INDEX = os.environ.get(\"AZURESEARCH_INDEX\")\nAZURESEARCH_API_KEY = os.environ.get(\"AZURESEARCH_API_KEY\")\nAZURESEARCH_SEMANTIC_CONFIG = os.environ.get(\"AZURESEARCH_SEMANTIC_CONFIG\")\nAZURESEARCH_LANGUAGE = os.environ.get(\"AZURESEARCH_LANGUAGE\", \"en-us\")\nAZURESEARCH_DISABLE_HYBRID = os.environ.get(\"AZURESEARCH_DISABLE_HYBRID\")\nAZURESEARCH_DIMENSIONS = os.environ.get(\n    \"AZURESEARCH_DIMENSIONS\", 256\n)  # Default to 256 dimensions, change if using a different embeddings model\nassert AZURESEARCH_SERVICE is not None\nassert AZURESEARCH_INDEX is not None\n\n# Allow overriding field names for Azure Search\nFIELDS_ID = os.environ.get(\"AZURESEARCH_FIELDS_ID\", \"id\")\nFIELDS_TEXT = os.environ.get(\"AZURESEARCH_FIELDS_TEXT\", \"text\")\nFIELDS_EMBEDDING = os.environ.get(\"AZURESEARCH_FIELDS_EMBEDDING\", \"embedding\")\nFIELDS_DOCUMENT_ID = os.environ.get(\"AZURESEARCH_FIELDS_DOCUMENT_ID\", \"document_id\")\nFIELDS_SOURCE = os.environ.get(\"AZURESEARCH_FIELDS_SOURCE\", \"source\")\nFIELDS_SOURCE_ID = os.environ.get(\"AZURESEARCH_FIELDS_SOURCE_ID\", \"source_id\")\nFIELDS_URL = 
os.environ.get(\"AZURESEARCH_FIELDS_URL\", \"url\")\nFIELDS_CREATED_AT = os.environ.get(\"AZURESEARCH_FIELDS_CREATED_AT\", \"created_at\")\nFIELDS_AUTHOR = os.environ.get(\"AZURESEARCH_FIELDS_AUTHOR\", \"author\")\n\nMAX_UPLOAD_BATCH_SIZE = 1000\nMAX_DELETE_BATCH_SIZE = 1000\n\n\nclass AzureSearchDataStore(DataStore):\n    def __init__(self):\n        self.client = SearchClient(\n            endpoint=f\"https://{AZURESEARCH_SERVICE}.search.windows.net\",\n            index_name=AZURESEARCH_INDEX,\n            credential=AzureSearchDataStore._create_credentials(True),\n            user_agent=\"retrievalplugin\",\n        )\n\n        mgmt_client = SearchIndexClient(\n            endpoint=f\"https://{AZURESEARCH_SERVICE}.search.windows.net\",\n            credential=AzureSearchDataStore._create_credentials(False),\n            user_agent=\"retrievalplugin\",\n        )\n        if AZURESEARCH_INDEX not in [name for name in mgmt_client.list_index_names()]:\n            self._create_index(mgmt_client)\n        else:\n            logger.info(\n                f\"Using existing index {AZURESEARCH_INDEX} in service {AZURESEARCH_SERVICE}\"\n            )\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        azdocuments: List[Dict] = []\n\n        async def upload():\n            r = await self.client.upload_documents(documents=azdocuments)\n            count = sum(1 for rr in r if rr.succeeded)\n            logger.info(f\"Upserted {count} chunks out of {len(azdocuments)}\")\n            if count < len(azdocuments):\n                raise Exception(f\"Failed to upload {len(azdocuments) - count} chunks\")\n\n        ids = []\n        for document_id, document_chunks in chunks.items():\n            ids.append(document_id)\n            for chunk in document_chunks:\n                azdocuments.append(\n                    {\n                        # base64-encode the id string to stay within Azure Search's valid characters for keys\n    
                    FIELDS_ID: base64.urlsafe_b64encode(\n                            bytes(chunk.id, \"utf-8\")\n                        ).decode(\"ascii\"),\n                        FIELDS_TEXT: chunk.text,\n                        FIELDS_EMBEDDING: chunk.embedding,\n                        FIELDS_DOCUMENT_ID: document_id,\n                        FIELDS_SOURCE: chunk.metadata.source,\n                        FIELDS_SOURCE_ID: chunk.metadata.source_id,\n                        FIELDS_URL: chunk.metadata.url,\n                        FIELDS_CREATED_AT: chunk.metadata.created_at,\n                        FIELDS_AUTHOR: chunk.metadata.author,\n                    }\n                )\n\n                if len(azdocuments) >= MAX_UPLOAD_BATCH_SIZE:\n                    await upload()\n                    azdocuments = []\n\n        if len(azdocuments) > 0:\n            await upload()\n\n        return ids\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        filter = None if delete_all else self._translate_filter(filter)\n        if delete_all or filter is not None:\n            deleted = set()\n            while True:\n                search_result = await self.client.search(\n                    None,\n                    filter=filter,\n                    top=MAX_DELETE_BATCH_SIZE,\n                    include_total_count=True,\n                    select=FIELDS_ID,\n                )\n                if await search_result.get_count() == 0:\n                    break\n                documents = [\n                    {FIELDS_ID: d[FIELDS_ID]}\n                    async for d in search_result\n                    if d[FIELDS_ID] not in deleted\n                ]\n                if len(documents) > 0:\n                    logger.info(\n                        f\"Deleting {len(documents)} chunks \"\n            
            + (\n                            \"using a filter\"\n                            if filter is not None\n                            else \"using delete_all\"\n                        )\n                    )\n                    del_result = await self.client.delete_documents(documents=documents)\n                    if not all([rr.succeeded for rr in del_result]):\n                        raise Exception(\"Failed to delete documents\")\n                    deleted.update([d[FIELDS_ID] for d in documents])\n                else:\n                    # All repeats, delay a bit to let the index refresh and try again\n                    time.sleep(0.25)\n\n        if ids is not None and len(ids) > 0:\n            for id in ids:\n                logger.info(f\"Deleting chunks for document id {id}\")\n                await self.delete(filter=DocumentMetadataFilter(document_id=id))\n\n        return True\n\n    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        return await asyncio.gather(*(self._single_query(query) for query in queries))\n\n    async def _single_query(self, query: QueryWithEmbedding) -> QueryResult:\n        \"\"\"\n        Takes in a single query and filters and returns a query result with matching document chunks and scores.\n        \"\"\"\n        filter = (\n            self._translate_filter(query.filter) if query.filter is not None else None\n        )\n        try:\n            vector_top_k = query.top_k if filter is None else query.top_k * 2\n            if not AZURESEARCH_DISABLE_HYBRID:\n                vector_top_k *= 2\n            q = query.query if not AZURESEARCH_DISABLE_HYBRID else None\n            vector_q = Vector(\n                value=query.embedding, k=vector_top_k, fields=FIELDS_EMBEDDING\n            )\n    
        if AZURESEARCH_SEMANTIC_CONFIG != None and not AZURESEARCH_DISABLE_HYBRID:\n                # Ensure we're feeding a good number of candidates to the L2 reranker\n                vector_top_k = max(50, vector_top_k)\n                r = await self.client.search(\n                    q,\n                    filter=filter,\n                    top=query.top_k,\n                    vectors=[vector_q],\n                    query_type=QueryType.SEMANTIC,\n                    query_language=AZURESEARCH_LANGUAGE,\n                    semantic_configuration_name=AZURESEARCH_SEMANTIC_CONFIG,\n                )\n            else:\n                r = await self.client.search(\n                    q, filter=filter, top=query.top_k, vectors=[vector_q]\n                )\n            results: List[DocumentChunkWithScore] = []\n            async for hit in r:\n                f = lambda field: hit.get(field) if field != \"-\" else None\n                results.append(\n                    DocumentChunkWithScore(\n                        id=hit[FIELDS_ID],\n                        text=hit[FIELDS_TEXT],\n                        metadata=DocumentChunkMetadata(\n                            document_id=f(FIELDS_DOCUMENT_ID),\n                            source=f(FIELDS_SOURCE) or \"file\",\n                            source_id=f(FIELDS_SOURCE_ID),\n                            url=f(FIELDS_URL),\n                            created_at=f(FIELDS_CREATED_AT),\n                            author=f(FIELDS_AUTHOR),\n                        ),\n                        score=hit[\"@search.score\"],\n                    )\n                )\n\n            return QueryResult(query=query.query, results=results)\n        except Exception as e:\n            raise Exception(f\"Error querying the index: {e}\")\n\n    @staticmethod\n    def _translate_filter(filter: DocumentMetadataFilter) -> str:\n        \"\"\"\n        Translates a DocumentMetadataFilter into an Azure Search filter 
string\n        \"\"\"\n        if filter is None:\n            return None\n\n        escape = lambda s: s.replace(\"'\", \"''\")\n\n        # regex to validate dates are in OData format\n        date_re = re.compile(r\"\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z\")\n\n        filter_list = []\n        if filter.document_id is not None:\n            filter_list.append(\n                f\"{FIELDS_DOCUMENT_ID} eq '{escape(filter.document_id)}'\"\n            )\n        if filter.source is not None:\n            filter_list.append(f\"{FIELDS_SOURCE} eq '{escape(filter.source)}'\")\n        if filter.source_id is not None:\n            filter_list.append(f\"{FIELDS_SOURCE_ID} eq '{escape(filter.source_id)}'\")\n        if filter.author is not None:\n            filter_list.append(f\"{FIELDS_AUTHOR} eq '{escape(filter.author)}'\")\n        if filter.start_date is not None:\n            if not date_re.match(filter.start_date):\n                raise ValueError(\n                    f\"start_date must be in OData format, got {filter.start_date}\"\n                )\n            filter_list.append(f\"{FIELDS_CREATED_AT} ge {filter.start_date}\")\n        if filter.end_date is not None:\n            if not date_re.match(filter.end_date):\n                raise ValueError(\n                    f\"end_date must be in OData format, got {filter.end_date}\"\n                )\n            filter_list.append(f\"{FIELDS_CREATED_AT} le {filter.end_date}\")\n        return \" and \".join(filter_list) if len(filter_list) > 0 else None\n\n    def _create_index(self, mgmt_client: SearchIndexClient):\n        \"\"\"\n        Creates an Azure Cognitive Search index, including a semantic search configuration if a name is specified for it\n        \"\"\"\n        logger.info(\n            f\"Creating index {AZURESEARCH_INDEX} in service {AZURESEARCH_SERVICE}\"\n            + (\n                f\" with semantic search configuration {AZURESEARCH_SEMANTIC_CONFIG}\"\n                if 
AZURESEARCH_SEMANTIC_CONFIG is not None\n                else \"\"\n            )\n        )\n        mgmt_client.create_index(\n            SearchIndex(\n                name=AZURESEARCH_INDEX,\n                fields=[\n                    SimpleField(\n                        name=FIELDS_ID, type=SearchFieldDataType.String, key=True\n                    ),\n                    SearchableField(\n                        name=FIELDS_TEXT,\n                        type=SearchFieldDataType.String,\n                        analyzer_name=\"standard.lucene\",\n                    ),\n                    SearchField(\n                        name=FIELDS_EMBEDDING,\n                        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n                        hidden=False,\n                        searchable=True,\n                        filterable=False,\n                        sortable=False,\n                        facetable=False,\n                        vector_search_dimensions=AZURESEARCH_DIMENSIONS,\n                        vector_search_configuration=\"default\",\n                    ),\n                    SimpleField(\n                        name=FIELDS_DOCUMENT_ID,\n                        type=SearchFieldDataType.String,\n                        filterable=True,\n                        sortable=True,\n                    ),\n                    SimpleField(\n                        name=FIELDS_SOURCE,\n                        type=SearchFieldDataType.String,\n                        filterable=True,\n                        sortable=True,\n                    ),\n                    SimpleField(\n                        name=FIELDS_SOURCE_ID,\n                        type=SearchFieldDataType.String,\n                        filterable=True,\n                        sortable=True,\n                    ),\n                    SimpleField(name=FIELDS_URL, type=SearchFieldDataType.String),\n                    SimpleField(\n                
        name=FIELDS_CREATED_AT,\n                        type=SearchFieldDataType.DateTimeOffset,\n                        filterable=True,\n                        sortable=True,\n                    ),\n                    SimpleField(\n                        name=FIELDS_AUTHOR,\n                        type=SearchFieldDataType.String,\n                        filterable=True,\n                        sortable=True,\n                    ),\n                ],\n                semantic_settings=None\n                if AZURESEARCH_SEMANTIC_CONFIG is None\n                else SemanticSettings(\n                    configurations=[\n                        SemanticConfiguration(\n                            name=AZURESEARCH_SEMANTIC_CONFIG,\n                            prioritized_fields=PrioritizedFields(\n                                title_field=None,\n                                prioritized_content_fields=[\n                                    SemanticField(field_name=FIELDS_TEXT)\n                                ],\n                            ),\n                        )\n                    ]\n                ),\n                vector_search=VectorSearch(\n                    algorithm_configurations=[\n                        HnswVectorSearchAlgorithmConfiguration(\n                            name=\"default\",\n                            kind=\"hnsw\",\n                            # Could change to dotproduct for OpenAI's embeddings since they normalize vectors to unit length\n                            hnsw_parameters=HnswParameters(metric=\"cosine\"),\n                        )\n                    ]\n                ),\n            )\n        )\n\n    @staticmethod\n    def _create_credentials(\n        use_async: bool,\n    ) -> Union[AzureKeyCredential, DefaultAzureCredential, DefaultAzureCredentialSync]:\n        if AZURESEARCH_API_KEY is None:\n            logger.info(\n                \"Using DefaultAzureCredential for Azure Search, make 
sure local identity or managed identity are set up appropriately\"\n            )\n            credential = (\n                DefaultAzureCredential() if use_async else DefaultAzureCredentialSync()\n            )\n        else:\n            logger.info(\"Using an API key to authenticate with Azure Search\")\n            credential = AzureKeyCredential(AZURESEARCH_API_KEY)\n        return credential\n"
  },
  {
    "path": "datastore/providers/chroma_datastore.py",
    "content": "\"\"\"\nChroma datastore support for the ChatGPT retrieval plugin.\n\nConsult the Chroma docs and GitHub repo for more information:\n- https://docs.trychroma.com/usage-guide?lang=py\n- https://github.com/chroma-core/chroma\n- https://www.trychroma.com/\n\"\"\"\n\nimport os\nfrom datetime import datetime\nfrom typing import Dict, List, Optional\n\nimport chromadb\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    Document,\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    Source,\n)\nfrom services.chunks import get_document_chunks\n\nCHROMA_IN_MEMORY = os.environ.get(\"CHROMA_IN_MEMORY\", \"True\")\nCHROMA_PERSISTENCE_DIR = os.environ.get(\"CHROMA_PERSISTENCE_DIR\", \"openai\")\nCHROMA_HOST = os.environ.get(\"CHROMA_HOST\", \"http://127.0.0.1\")\nCHROMA_PORT = os.environ.get(\"CHROMA_PORT\", \"8000\")\nCHROMA_COLLECTION = os.environ.get(\"CHROMA_COLLECTION\", \"openaiembeddings\")\n\n\nclass ChromaDataStore(DataStore):\n    def __init__(\n        self,\n        in_memory: bool = CHROMA_IN_MEMORY,  # type: ignore\n        persistence_dir: Optional[str] = CHROMA_PERSISTENCE_DIR,\n        collection_name: str = CHROMA_COLLECTION,\n        host: str = CHROMA_HOST,\n        port: str = CHROMA_PORT,\n        client: Optional[chromadb.Client] = None,\n    ):\n        if client:\n            self._client = client\n        else:\n            if in_memory:\n                settings = (\n                    chromadb.config.Settings(\n                        chroma_db_impl=\"duckdb+parquet\",\n                        persist_directory=persistence_dir,\n                    )\n                    if persistence_dir\n                    else chromadb.config.Settings()\n                )\n\n                self._client = chromadb.Client(settings=settings)\n            else:\n                self._client = chromadb.Client(\n              
      settings=chromadb.config.Settings(\n                        chroma_api_impl=\"rest\",\n                        chroma_server_host=host,\n                        chroma_server_http_port=port,\n                    )\n                )\n        self._collection = self._client.get_or_create_collection(\n            name=collection_name,\n            embedding_function=None,\n        )\n\n    async def upsert(\n        self, documents: List[Document], chunk_token_size: Optional[int] = None\n    ) -> List[str]:\n        \"\"\"\n        Takes in a list of documents and inserts them into the database. If an id already exists, the document is updated.\n        Return a list of document ids.\n        \"\"\"\n\n        chunks = get_document_chunks(documents, chunk_token_size)\n\n        # Chroma has a true upsert, so we don't need to delete first\n        return await self._upsert(chunks)\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n\n        self._collection.upsert(\n            ids=[chunk.id for chunk_list in chunks.values() for chunk in chunk_list],\n            embeddings=[\n                chunk.embedding\n                for chunk_list in chunks.values()\n                for chunk in chunk_list\n            ],\n            documents=[\n                chunk.text for chunk_list in chunks.values() for chunk in chunk_list\n            ],\n            metadatas=[\n                self._process_metadata_for_storage(chunk.metadata)\n                for chunk_list in chunks.values()\n                for chunk in chunk_list\n            ],\n        )\n        return list(chunks.keys())\n\n    def _where_from_query_filter(self, query_filter: DocumentMetadataFilter) -> Dict:\n        output = {\n            k: v\n            for (k, v) in query_filter.dict().items()\n          
  if v is not None and k != \"start_date\" and k != \"end_date\" and k != \"source\"\n        }\n        if query_filter.source:\n            output[\"source\"] = query_filter.source.value\n        if query_filter.start_date and query_filter.end_date:\n            output[\"$and\"] = [\n                {\n                    \"created_at\": {\n                        \"$gte\": int(\n                            datetime.fromisoformat(query_filter.start_date).timestamp()\n                        )\n                    }\n                },\n                {\n                    \"created_at\": {\n                        \"$lte\": int(\n                            datetime.fromisoformat(query_filter.end_date).timestamp()\n                        )\n                    }\n                },\n            ]\n        elif query_filter.start_date:\n            output[\"created_at\"] = {\n                \"$gte\": int(datetime.fromisoformat(query_filter.start_date).timestamp())\n            }\n        elif query_filter.end_date:\n            output[\"created_at\"] = {\n                \"$lte\": int(datetime.fromisoformat(query_filter.end_date).timestamp())\n            }\n\n        return output\n\n    def _process_metadata_for_storage(self, metadata: DocumentChunkMetadata) -> Dict:\n        stored_metadata = {}\n        if metadata.source:\n            stored_metadata[\"source\"] = metadata.source.value\n        if metadata.source_id:\n            stored_metadata[\"source_id\"] = metadata.source_id\n        if metadata.url:\n            stored_metadata[\"url\"] = metadata.url\n        if metadata.created_at:\n            stored_metadata[\"created_at\"] = int(\n                datetime.fromisoformat(metadata.created_at).timestamp()\n            )\n        if metadata.author:\n            stored_metadata[\"author\"] = metadata.author\n        if metadata.document_id:\n            stored_metadata[\"document_id\"] = metadata.document_id\n\n        return stored_metadata\n\n    
def _process_metadata_from_storage(self, metadata: Dict) -> DocumentChunkMetadata:\n        return DocumentChunkMetadata(\n            source=Source(metadata[\"source\"]) if \"source\" in metadata else None,\n            source_id=metadata.get(\"source_id\", None),\n            url=metadata.get(\"url\", None),\n            created_at=datetime.fromtimestamp(metadata[\"created_at\"]).isoformat()\n            if \"created_at\" in metadata\n            else None,\n            author=metadata.get(\"author\", None),\n            document_id=metadata.get(\"document_id\", None),\n        )\n\n    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        results = [\n            self._collection.query(\n                query_embeddings=[query.embedding],\n                include=[\"documents\", \"distances\", \"metadatas\"],  # embeddings\n                n_results=min(query.top_k, self._collection.count()),  # type: ignore\n                where=(\n                    self._where_from_query_filter(query.filter) if query.filter else {}\n                ),\n            )\n            for query in queries\n        ]\n\n        output = []\n        for query, result in zip(queries, results):\n            inner_results = []\n            (ids,) = result[\"ids\"]\n            # (embeddings,) = result[\"embeddings\"]\n            (documents,) = result[\"documents\"]\n            (metadatas,) = result[\"metadatas\"]\n            (distances,) = result[\"distances\"]\n            for id_, text, metadata, distance in zip(\n                ids,\n                documents,\n                metadatas,\n                distances,  # embeddings (https://github.com/openai/chatgpt-retrieval-plugin/pull/59#discussion_r1154985153)\n            ):\n                inner_results.append(\n       
             DocumentChunkWithScore(\n                        id=id_,\n                        text=text,\n                        metadata=self._process_metadata_from_storage(metadata),\n                        # embedding=embedding,\n                        score=distance,\n                    )\n                )\n            output.append(QueryResult(query=query.query, results=inner_results))\n\n        return output\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Multiple parameters can be used at once.\n        Returns whether the operation was successful.\n        \"\"\"\n        if delete_all:\n            self._collection.delete()\n            return True\n\n        if ids and len(ids) > 0:\n            if len(ids) > 1:\n                where_clause = {\"$or\": [{\"document_id\": id_} for id_ in ids]}\n            else:\n                (id_,) = ids\n                where_clause = {\"document_id\": id_}\n\n            if filter:\n                where_clause = {\n                    \"$and\": [self._where_from_query_filter(filter), where_clause]\n                }\n        elif filter:\n            where_clause = self._where_from_query_filter(filter)\n\n        self._collection.delete(where=where_clause)\n        return True\n"
  },
  {
    "path": "datastore/providers/elasticsearch_datastore.py",
    "content": "import os\nfrom typing import Dict, List, Any, Optional\n\nimport elasticsearch\nfrom elasticsearch import Elasticsearch, helpers\nfrom loguru import logger\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n)\nfrom services.date import to_unix_timestamp\n\nELASTICSEARCH_URL = os.environ.get(\"ELASTICSEARCH_URL\", \"http://localhost:9200\")\nELASTICSEARCH_CLOUD_ID = os.environ.get(\"ELASTICSEARCH_CLOUD_ID\")\nELASTICSEARCH_USERNAME = os.environ.get(\"ELASTICSEARCH_USERNAME\")\nELASTICSEARCH_PASSWORD = os.environ.get(\"ELASTICSEARCH_PASSWORD\")\nELASTICSEARCH_API_KEY = os.environ.get(\"ELASTICSEARCH_API_KEY\")\n\nELASTICSEARCH_INDEX = os.environ.get(\"ELASTICSEARCH_INDEX\")\nELASTICSEARCH_REPLICAS = int(os.environ.get(\"ELASTICSEARCH_REPLICAS\", \"1\"))\nELASTICSEARCH_SHARDS = int(os.environ.get(\"ELASTICSEARCH_SHARDS\", \"1\"))\n\nVECTOR_SIZE = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\nUPSERT_BATCH_SIZE = 100\n\n\nclass ElasticsearchDataStore(DataStore):\n    def __init__(\n        self,\n        index_name: Optional[str] = None,\n        vector_size: int = VECTOR_SIZE,\n        similarity: str = \"cosine\",\n        replicas: int = ELASTICSEARCH_REPLICAS,\n        shards: int = ELASTICSEARCH_SHARDS,\n        recreate_index: bool = True,\n    ):\n        \"\"\"\n        Args:\n            index_name: Name of the index to be used\n            vector_size: Size of the embedding stored in a collection\n            similarity:\n                Any of \"cosine\" / \"l2_norm\" / \"dot_product\".\n\n        \"\"\"\n        assert similarity in [\n            \"cosine\",\n            \"l2_norm\",\n            \"dot_product\",\n        ], \"Similarity must be one of 'cosine' / 'l2_norm' / 'dot_product'.\"\n        assert replicas > 0, \"Replicas must be greater than or equal to 0.\"\n        assert shards 
> 0, \"Shards must be greater than or equal to 0.\"\n\n        self.client = connect_to_elasticsearch(\n            ELASTICSEARCH_URL,\n            ELASTICSEARCH_CLOUD_ID,\n            ELASTICSEARCH_API_KEY,\n            ELASTICSEARCH_USERNAME,\n            ELASTICSEARCH_PASSWORD,\n        )\n        assert (\n            index_name != \"\" or ELASTICSEARCH_INDEX != \"\"\n        ), \"Please provide an index name.\"\n        self.index_name = index_name or ELASTICSEARCH_INDEX or \"\"\n\n        replicas = replicas or ELASTICSEARCH_REPLICAS\n        shards = shards or ELASTICSEARCH_SHARDS\n\n        # Set up the collection so the documents might be inserted or queried\n        self._set_up_index(vector_size, similarity, replicas, shards, recreate_index)\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        actions = []\n        for _, chunkList in chunks.items():\n            for chunk in chunkList:\n                actions = (\n                    actions\n                    + self._convert_document_chunk_to_es_document_operation(chunk)\n                )\n\n        self.client.bulk(operations=actions, index=self.index_name)\n        return list(chunks.keys())\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        searches = self._convert_queries_to_msearch_query(queries)\n        results = self.client.msearch(searches=searches)\n        return [\n            QueryResult(\n                query=query.query,\n                results=[\n                    self._convert_hit_to_document_chunk_with_score(hit)\n                    for hit in 
result[\"hits\"][\"hits\"]\n                ],\n            )\n            for query, result in zip(queries, results[\"responses\"])\n        ]\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n\n        # Delete all vectors from the index if delete_all is True\n        if delete_all:\n            try:\n                logger.info(f\"Deleting all vectors from index\")\n                self.client.delete_by_query(\n                    index=self.index_name, query={\"match_all\": {}}\n                )\n                logger.info(f\"Deleted all vectors successfully\")\n                return True\n            except Exception as e:\n                logger.error(f\"Error deleting all vectors: {e}\")\n                raise e\n\n        # Convert the metadata filter object to a dict with elasticsearch filter expressions\n        es_filters = self._get_es_filters(filter)\n        # Delete vectors that match the filter from the index if the filter is not empty\n        if es_filters != {}:\n            try:\n                logger.info(f\"Deleting vectors with filter {es_filters}\")\n                self.client.delete_by_query(index=self.index_name, query=es_filters)\n                logger.info(f\"Deleted vectors with filter successfully\")\n            except Exception as e:\n                logger.error(f\"Error deleting vectors with filter: {e}\")\n                raise e\n\n        if ids:\n            try:\n                documents_to_delete = [doc_id for doc_id in ids]\n                logger.info(f\"Deleting {len(documents_to_delete)} documents\")\n                res = self.client.delete_by_query(\n                    index=self.index_name,\n               
     query={\"terms\": {\"metadata.document_id\": documents_to_delete}},\n                )\n                logger.info(f\"Deleted documents successfully\")\n            except Exception as e:\n                logger.error(f\"Error deleting documents: {e}\")\n                raise e\n\n        return True\n\n    def _get_es_filters(\n        self, filter: Optional[DocumentMetadataFilter] = None\n    ) -> Dict[str, Any]:\n        if filter is None:\n            return {}\n\n        es_filters = {\n            \"bool\": {\n                \"must\": [],\n            }\n        }\n\n        # For each field in the MetadataFilter, check if it has a value and add the corresponding pinecone filter expression\n        # For start_date and end_date, uses the range query - gte and lte operators respectively\n        # For other fields, uses the term query\n        for field, value in filter.dict().items():\n            if value is not None:\n                if field == \"start_date\":\n                    es_filters[\"bool\"][\"must\"].append(\n                        {\"range\": {\"created_at\": {\"gte\": to_unix_timestamp(value)}}}\n                    )\n                elif field == \"end_date\":\n                    es_filters[\"bool\"][\"must\"].append(\n                        {\"range\": {\"created_at\": {\"lte\": to_unix_timestamp(value)}}}\n                    )\n                else:\n                    es_filters[\"bool\"][\"must\"].append(\n                        {\"term\": {f\"metadata.{field}\": value}}\n                    )\n\n        return es_filters\n\n    def _convert_document_chunk_to_es_document_operation(\n        self, document_chunk: DocumentChunk\n    ) -> List[Dict]:\n        created_at = (\n            to_unix_timestamp(document_chunk.metadata.created_at)\n            if document_chunk.metadata.created_at is not None\n            else None\n        )\n\n        action_and_metadata = {\n            \"index\": {\n                \"_index\": 
self.index_name,\n                \"_id\": document_chunk.id,\n            }\n        }\n\n        source = {\n            \"id\": document_chunk.id,\n            \"text\": document_chunk.text,\n            \"metadata\": document_chunk.metadata.dict(),\n            \"created_at\": created_at,\n            \"embedding\": document_chunk.embedding,\n        }\n\n        return [action_and_metadata, source]\n\n    def _convert_queries_to_msearch_query(self, queries: List[QueryWithEmbedding]):\n        searches = []\n\n        for query in queries:\n            searches.append({\"index\": self.index_name})\n            searches.append(\n                {\n                    \"_source\": True,\n                    \"knn\": {\n                        \"field\": \"embedding\",\n                        \"query_vector\": query.embedding,\n                        \"k\": query.top_k,\n                        \"num_candidates\": query.top_k,\n                    },\n                    \"size\": query.top_k,\n                }\n            )\n\n        return searches\n\n    def _convert_hit_to_document_chunk_with_score(self, hit) -> DocumentChunkWithScore:\n        return DocumentChunkWithScore(\n            id=hit[\"_id\"],\n            text=hit[\"_source\"][\"text\"],  # type: ignore\n            metadata=hit[\"_source\"][\"metadata\"],  # type: ignore\n            embedding=hit[\"_source\"][\"embedding\"],  # type: ignore\n            score=hit[\"_score\"],\n        )\n\n    def _set_up_index(\n        self,\n        vector_size: int,\n        similarity: str,\n        replicas: int,\n        shards: int,\n        recreate_index: bool,\n    ) -> None:\n        if recreate_index:\n            self._recreate_index(similarity, vector_size, replicas, shards)\n\n        try:\n            index_mapping = self.client.indices.get_mapping(index=self.index_name)\n            current_similarity = 
index_mapping[self.index_name][\"mappings\"][\"properties\"][\"embedding\"][\"similarity\"]  # type: ignore\n            current_vector_size = index_mapping[self.index_name][\"mappings\"][\"properties\"][\"embedding\"][\"dims\"]  # type: ignore\n\n            if current_similarity != similarity:\n                raise ValueError(\n                    f\"Collection '{self.index_name}' already exists in Elasticsearch, \"\n                    f\"but it is configured with a similarity '{current_similarity}'. \"\n                    f\"If you want to use that collection, but with a different \"\n                    f\"similarity, please set `recreate_index=True` argument.\"\n                )\n\n            if current_vector_size != vector_size:\n                raise ValueError(\n                    f\"Collection '{self.index_name}' already exists in Elasticsearch, \"\n                    f\"but it is configured with a vector size '{current_vector_size}'. \"\n                    f\"If you want to use that collection, but with a different \"\n                    f\"vector size, please set `recreate_index=True` argument.\"\n                )\n        except elasticsearch.exceptions.NotFoundError:\n            self._recreate_index(similarity, vector_size, replicas, shards)\n\n    def _recreate_index(\n        self, similarity: str, vector_size: int, replicas: int, shards: int\n    ) -> None:\n        settings = {\n            \"index\": {\n                \"number_of_shards\": shards,\n                \"number_of_replicas\": replicas,\n                \"refresh_interval\": \"1s\",\n            }\n        }\n        mappings = {\n            \"properties\": {\n                \"embedding\": {\n                    \"type\": \"dense_vector\",\n                    \"dims\": vector_size,\n                    \"index\": True,\n                    \"similarity\": similarity,\n                }\n            }\n        }\n\n        self.client.indices.delete(\n            
index=self.index_name, ignore_unavailable=True, allow_no_indices=True\n        )\n        self.client.indices.create(\n            index=self.index_name, mappings=mappings, settings=settings\n        )\n\n\ndef connect_to_elasticsearch(\n    elasticsearch_url=None, cloud_id=None, api_key=None, username=None, password=None\n):\n    # Check if both elasticsearch_url and cloud_id are defined\n    if elasticsearch_url and cloud_id:\n        raise ValueError(\n            \"Both elasticsearch_url and cloud_id are defined. Please provide only one.\"\n        )\n\n    # Initialize connection parameters dictionary\n    connection_params = {}\n\n    # Define the connection based on the provided parameters\n    if elasticsearch_url:\n        connection_params[\"hosts\"] = [elasticsearch_url]\n    elif cloud_id:\n        connection_params[\"cloud_id\"] = cloud_id\n    else:\n        raise ValueError(\"Please provide either elasticsearch_url or cloud_id.\")\n\n    # Add authentication details based on the provided parameters\n    if api_key:\n        connection_params[\"api_key\"] = api_key\n    elif username and password:\n        connection_params[\"basic_auth\"] = (username, password)\n    else:\n        logger.warning(\n            \"No authentication details provided. Please consider using an api_key or username and password to secure your connection.\"\n        )\n\n    # Establish the Elasticsearch client connection\n    es_client = Elasticsearch(**connection_params)\n    try:\n        es_client.info()\n    except Exception as e:\n        logger.error(f\"Error connecting to Elasticsearch: {e}\")\n        raise e\n\n    return es_client\n"
  },
  {
    "path": "datastore/providers/llama_datastore.py",
    "content": "import json\nimport os\nfrom typing import Dict, List, Optional, Type\nfrom loguru import logger\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    Query,\n    QueryResult,\n    QueryWithEmbedding,\n)\n\nfrom llama_index.indices.base import BaseGPTIndex\nfrom llama_index.indices.vector_store.base import GPTVectorStoreIndex\nfrom llama_index.indices.query.schema import QueryBundle\nfrom llama_index.response.schema import Response\nfrom llama_index.data_structs.node_v2 import Node, DocumentRelationship, NodeWithScore\nfrom llama_index.indices.registry import INDEX_STRUCT_TYPE_TO_INDEX_CLASS\nfrom llama_index.data_structs.struct_type import IndexStructType\nfrom llama_index.indices.response.builder import ResponseMode\n\nINDEX_STRUCT_TYPE_STR = os.environ.get(\n    \"LLAMA_INDEX_TYPE\", IndexStructType.SIMPLE_DICT.value\n)\nINDEX_JSON_PATH = os.environ.get(\"LLAMA_INDEX_JSON_PATH\", None)\nQUERY_KWARGS_JSON_PATH = os.environ.get(\"LLAMA_QUERY_KWARGS_JSON_PATH\", None)\nRESPONSE_MODE = os.environ.get(\"LLAMA_RESPONSE_MODE\", ResponseMode.NO_TEXT.value)\n\nEXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES = [\n    IndexStructType.DICT,\n    IndexStructType.WEAVIATE,\n    IndexStructType.PINECONE,\n    IndexStructType.QDRANT,\n    IndexStructType.CHROMA,\n    IndexStructType.VECTOR_STORE,\n]\n\n\ndef _create_or_load_index(\n    index_type_str: Optional[str] = None,\n    index_json_path: Optional[str] = None,\n    index_type_to_index_cls: Optional[dict[str, Type[BaseGPTIndex]]] = None,\n) -> BaseGPTIndex:\n    \"\"\"Create or load index from json path.\"\"\"\n    index_json_path = index_json_path or INDEX_JSON_PATH\n    index_type_to_index_cls = (\n        index_type_to_index_cls or INDEX_STRUCT_TYPE_TO_INDEX_CLASS\n    )\n    index_type_str = index_type_str or INDEX_STRUCT_TYPE_STR\n    index_type = IndexStructType(index_type_str)\n\n 
   if index_type not in index_type_to_index_cls:\n        raise ValueError(f\"Unknown index type: {index_type}\")\n\n    if index_type in EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES:\n        raise ValueError(\"Please use vector store directly.\")\n\n    index_cls = index_type_to_index_cls[index_type]\n    if index_json_path is None:\n        return index_cls(nodes=[])  # Create empty index\n    else:\n        return index_cls.load_from_disk(index_json_path)  # Load index from disk\n\n\ndef _create_or_load_query_kwargs(\n    query_kwargs_json_path: Optional[str] = None,\n) -> Optional[dict]:\n    \"\"\"Create or load query kwargs from json path.\"\"\"\n    query_kwargs_json_path = query_kwargs_json_path or QUERY_KWARGS_JSON_PATH\n    query_kargs: Optional[dict] = None\n    if query_kwargs_json_path is not None:\n        with open(INDEX_JSON_PATH, \"r\") as f:\n            query_kargs = json.load(f)\n    return query_kargs\n\n\ndef _doc_chunk_to_node(doc_chunk: DocumentChunk, source_doc_id: str) -> Node:\n    \"\"\"Convert document chunk to Node\"\"\"\n    return Node(\n        doc_id=doc_chunk.id,\n        text=doc_chunk.text,\n        embedding=doc_chunk.embedding,\n        extra_info=doc_chunk.metadata.dict(),\n        relationships={DocumentRelationship.SOURCE: source_doc_id},\n    )\n\n\ndef _query_with_embedding_to_query_bundle(query: QueryWithEmbedding) -> QueryBundle:\n    return QueryBundle(\n        query_str=query.query,\n        embedding=query.embedding,\n    )\n\n\ndef _source_node_to_doc_chunk_with_score(\n    node_with_score: NodeWithScore,\n) -> DocumentChunkWithScore:\n    node = node_with_score.node\n    if node.extra_info is not None:\n        metadata = DocumentChunkMetadata(**node.extra_info)\n    else:\n        metadata = DocumentChunkMetadata()\n\n    return DocumentChunkWithScore(\n        id=node.doc_id,\n        text=node.text,\n        score=node_with_score.score if node_with_score.score is not None else 1.0,\n        metadata=metadata,\n    
)\n\n\ndef _response_to_query_result(\n    response: Response, query: QueryWithEmbedding\n) -> QueryResult:\n    results = [\n        _source_node_to_doc_chunk_with_score(node) for node in response.source_nodes\n    ]\n    return QueryResult(\n        query=query.query,\n        results=results,\n    )\n\n\nclass LlamaDataStore(DataStore):\n    def __init__(\n        self, index: Optional[BaseGPTIndex] = None, query_kwargs: Optional[dict] = None\n    ):\n        self._index = index or _create_or_load_index()\n        self._query_kwargs = query_kwargs or _create_or_load_query_kwargs()\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        doc_ids = []\n        for doc_id, doc_chunks in chunks.items():\n            logger.debug(f\"Upserting {doc_id} with {len(doc_chunks)} chunks\")\n\n            nodes = [\n                _doc_chunk_to_node(doc_chunk=doc_chunk, source_doc_id=doc_id)\n                for doc_chunk in doc_chunks\n            ]\n\n            self._index.insert_nodes(nodes)\n            doc_ids.append(doc_id)\n        return doc_ids\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and\n        returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        query_result_all = []\n        for query in queries:\n            if query.filter is not None:\n                logger.warning(\"Filters are not supported yet, ignoring for now.\")\n\n            query_bundle = _query_with_embedding_to_query_bundle(query)\n\n            # Setup query kwargs\n            if self._query_kwargs is not None:\n                query_kwargs = self._query_kwargs\n            else:\n                
query_kwargs = {}\n            # TODO: support top_k for other indices\n            if isinstance(self._index, GPTVectorStoreIndex):\n                query_kwargs[\"similarity_top_k\"] = query.top_k\n\n            response = await self._index.aquery(\n                query_bundle, response_mode=RESPONSE_MODE, **query_kwargs\n            )\n\n            query_result = _response_to_query_result(response, query)\n            query_result_all.append(query_result)\n\n        return query_result_all\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n        if delete_all:\n            logger.warning(\"Delete all not supported yet.\")\n            return False\n\n        if filter is not None:\n            logger.warning(\"Filters are not supported yet.\")\n            return False\n\n        if ids is not None:\n            for id_ in ids:\n                try:\n                    self._index.delete(id_)\n                except NotImplementedError:\n                    # NOTE: some indices do not support delete yet.\n                    logger.warning(f\"{type(self._index)} does not support delete yet.\")\n                    return False\n\n        return True\n"
  },
  {
    "path": "datastore/providers/milvus_datastore.py",
    "content": "import json\nimport os\nimport asyncio\n\nfrom loguru import logger\nfrom typing import Dict, List, Optional\nfrom pymilvus import (\n    Collection,\n    connections,\n    utility,\n    FieldSchema,\n    DataType,\n    CollectionSchema,\n    MilvusException,\n)\nfrom uuid import uuid4\n\n\nfrom services.date import to_unix_timestamp\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    Source,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    DocumentChunkWithScore,\n)\n\nMILVUS_COLLECTION = os.environ.get(\"MILVUS_COLLECTION\") or \"c\" + uuid4().hex\nMILVUS_HOST = os.environ.get(\"MILVUS_HOST\") or \"localhost\"\nMILVUS_PORT = os.environ.get(\"MILVUS_PORT\") or 19530\nMILVUS_USER = os.environ.get(\"MILVUS_USER\")\nMILVUS_PASSWORD = os.environ.get(\"MILVUS_PASSWORD\")\nMILVUS_USE_SECURITY = False if MILVUS_PASSWORD is None else True\n\nMILVUS_INDEX_PARAMS = os.environ.get(\"MILVUS_INDEX_PARAMS\")\nMILVUS_SEARCH_PARAMS = os.environ.get(\"MILVUS_SEARCH_PARAMS\")\nMILVUS_CONSISTENCY_LEVEL = os.environ.get(\"MILVUS_CONSISTENCY_LEVEL\")\n\nUPSERT_BATCH_SIZE = 100\nOUTPUT_DIM = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\nEMBEDDING_FIELD = \"embedding\"\n\n\nclass Required:\n    pass\n\n\n# The fields names that we are going to be storing within Milvus, the field declaration for schema creation, and the default value\nSCHEMA_V1 = [\n    (\n        \"pk\",\n        FieldSchema(name=\"pk\", dtype=DataType.INT64, is_primary=True, auto_id=True),\n        Required,\n    ),\n    (\n        EMBEDDING_FIELD,\n        FieldSchema(name=EMBEDDING_FIELD, dtype=DataType.FLOAT_VECTOR, dim=OUTPUT_DIM),\n        Required,\n    ),\n    (\n        \"text\",\n        FieldSchema(name=\"text\", dtype=DataType.VARCHAR, max_length=65535),\n        Required,\n    ),\n    (\n        \"document_id\",\n        FieldSchema(name=\"document_id\", dtype=DataType.VARCHAR, 
max_length=65535),\n        \"\",\n    ),\n    (\n        \"source_id\",\n        FieldSchema(name=\"source_id\", dtype=DataType.VARCHAR, max_length=65535),\n        \"\",\n    ),\n    (\n        \"id\",\n        FieldSchema(\n            name=\"id\",\n            dtype=DataType.VARCHAR,\n            max_length=65535,\n        ),\n        \"\",\n    ),\n    (\n        \"source\",\n        FieldSchema(name=\"source\", dtype=DataType.VARCHAR, max_length=65535),\n        \"\",\n    ),\n    (\"url\", FieldSchema(name=\"url\", dtype=DataType.VARCHAR, max_length=65535), \"\"),\n    (\"created_at\", FieldSchema(name=\"created_at\", dtype=DataType.INT64), -1),\n    (\n        \"author\",\n        FieldSchema(name=\"author\", dtype=DataType.VARCHAR, max_length=65535),\n        \"\",\n    ),\n]\n\n# V2 schema, remove the \"pk\" field\nSCHEMA_V2 = SCHEMA_V1[1:]\nSCHEMA_V2[4][1].is_primary = True\n\n\nclass MilvusDataStore(DataStore):\n    def __init__(\n        self,\n        create_new: Optional[bool] = False,\n        consistency_level: str = \"Bounded\",\n    ):\n        \"\"\"Create a Milvus DataStore.\n\n        The Milvus Datastore allows for storing your indexes and metadata within a Milvus instance.\n\n        Args:\n            create_new (Optional[bool], optional): Whether to overwrite if collection already exists. 
Defaults to True.\n            consistency_level(str, optional): Specify the collection consistency level.\n                                                Defaults to \"Bounded\" for search performance.\n                                                Set to \"Strong\" in test cases for result validation.\n        \"\"\"\n        # Overwrite the default consistency level by MILVUS_CONSISTENCY_LEVEL\n        self._consistency_level = MILVUS_CONSISTENCY_LEVEL or consistency_level\n        self._create_connection()\n\n        self._create_collection(MILVUS_COLLECTION, create_new)  # type: ignore\n        self._create_index()\n\n    def _get_schema(self):\n        return SCHEMA_V1 if self._schema_ver == \"V1\" else SCHEMA_V2\n\n    def _create_connection(self):\n        try:\n            self.alias = \"\"\n            # Check if the connection already exists\n            for x in connections.list_connections():\n                addr = connections.get_connection_addr(x[0])\n                if (\n                    x[1]\n                    and (\"address\" in addr)\n                    and (addr[\"address\"] == \"{}:{}\".format(MILVUS_HOST, MILVUS_PORT))\n                ):\n                    self.alias = x[0]\n                    logger.info(\n                        \"Reuse connection to Milvus server '{}:{}' with alias '{:s}'\".format(\n                            MILVUS_HOST, MILVUS_PORT, self.alias\n                        )\n                    )\n                    break\n\n            # Connect to the Milvus instance using the passed in Environment variables\n            if len(self.alias) == 0:\n                self.alias = uuid4().hex\n                connections.connect(\n                    alias=self.alias,\n                    host=MILVUS_HOST,\n                    port=MILVUS_PORT,\n                    user=MILVUS_USER,  # type: ignore\n                    password=MILVUS_PASSWORD,  # type: ignore\n                    secure=MILVUS_USE_SECURITY,\n    
            )\n                logger.info(\n                    \"Create connection to Milvus server '{}:{}' with alias '{:s}'\".format(\n                        MILVUS_HOST, MILVUS_PORT, self.alias\n                    )\n                )\n        except Exception as e:\n            logger.error(\n                \"Failed to create connection to Milvus server '{}:{}', error: {}\".format(\n                    MILVUS_HOST, MILVUS_PORT, e\n                )\n            )\n\n    def _create_collection(self, collection_name, create_new: bool) -> None:\n        \"\"\"Create a collection based on environment and passed in variables.\n\n        Args:\n            create_new (bool): Whether to overwrite if collection already exists.\n        \"\"\"\n        try:\n            self._schema_ver = \"V1\"\n            # If the collection exists and create_new is True, drop the existing collection\n            if utility.has_collection(collection_name, using=self.alias) and create_new:\n                utility.drop_collection(collection_name, using=self.alias)\n\n            # Check if the collection doesnt exist\n            if utility.has_collection(collection_name, using=self.alias) is False:\n                # If it doesnt exist use the field params from init to create a new schem\n                schema = [field[1] for field in SCHEMA_V2]\n                schema = CollectionSchema(schema)\n                # Use the schema to create a new collection\n                self.col = Collection(\n                    collection_name,\n                    schema=schema,\n                    using=self.alias,\n                    consistency_level=self._consistency_level,\n                )\n                self._schema_ver = \"V2\"\n                logger.info(\n                    \"Create Milvus collection '{}' with schema {} and consistency level {}\".format(\n                        collection_name, self._schema_ver, self._consistency_level\n                    )\n             
   )\n            else:\n                # If the collection exists, point to it\n                self.col = Collection(collection_name, using=self.alias)  # type: ignore\n                # Which schema is used\n                for field in self.col.schema.fields:\n                    if field.name == \"id\" and field.is_primary:\n                        self._schema_ver = \"V2\"\n                        break\n                logger.info(\n                    \"Milvus collection '{}' already exists with schema {}\".format(\n                        collection_name, self._schema_ver\n                    )\n                )\n        except Exception as e:\n            logger.error(\n                \"Failed to create collection '{}', error: {}\".format(collection_name, e)\n            )\n\n    def _create_index(self):\n        # TODO: verify index/search params passed by os.environ\n        self.index_params = MILVUS_INDEX_PARAMS or None\n        self.search_params = MILVUS_SEARCH_PARAMS or None\n        try:\n            # If no index on the collection, create one\n            if len(self.col.indexes) == 0:\n                if self.index_params is not None:\n                    # Convert the string format to JSON format parameters passed by MILVUS_INDEX_PARAMS\n                    self.index_params = json.loads(self.index_params)\n                    logger.info(\"Create Milvus index: {}\".format(self.index_params))\n                    # Create an index on the 'embedding' field with the index params found in init\n                    self.col.create_index(\n                        EMBEDDING_FIELD, index_params=self.index_params\n                    )\n                else:\n                    # If no index params are supplied, first try to create an HNSW index for Milvus\n                    try:\n                        i_p = {\n                            \"metric_type\": \"IP\",\n                            \"index_type\": \"HNSW\",\n                            
\"params\": {\"M\": 8, \"efConstruction\": 64},\n                        }\n                        logger.info(\n                            \"Attempting creation of Milvus '{}' index\".format(\n                                i_p[\"index_type\"]\n                            )\n                        )\n                        self.col.create_index(EMBEDDING_FIELD, index_params=i_p)\n                        self.index_params = i_p\n                        logger.info(\n                            \"Creation of Milvus '{}' index successful\".format(\n                                i_p[\"index_type\"]\n                            )\n                        )\n                    # If create fails, most likely due to being Zilliz Cloud instance, try to create an AutoIndex\n                    except MilvusException:\n                        logger.info(\"Attempting creation of Milvus default index\")\n                        i_p = {\n                            \"metric_type\": \"IP\",\n                            \"index_type\": \"AUTOINDEX\",\n                            \"params\": {},\n                        }\n                        self.col.create_index(EMBEDDING_FIELD, index_params=i_p)\n                        self.index_params = i_p\n                        logger.info(\"Creation of Milvus default index successful\")\n            # If an index already exists, grab its params\n            else:\n                # How about if the first index is not vector index?\n                for index in self.col.indexes:\n                    idx = index.to_dict()\n                    if idx[\"field\"] == EMBEDDING_FIELD:\n                        logger.info(\"Index already exists: {}\".format(idx))\n                        self.index_params = idx[\"index_param\"]\n                        break\n\n            self.col.load()\n\n            if self.search_params is not None:\n                # Convert the string format to JSON format parameters passed by 
MILVUS_SEARCH_PARAMS\n                self.search_params = json.loads(self.search_params)\n            else:\n                # The default search params\n                metric_type = \"IP\"\n                if \"metric_type\" in self.index_params:\n                    metric_type = self.index_params[\"metric_type\"]\n                default_search_params = {\n                    \"IVF_FLAT\": {\"metric_type\": metric_type, \"params\": {\"nprobe\": 10}},\n                    \"IVF_SQ8\": {\"metric_type\": metric_type, \"params\": {\"nprobe\": 10}},\n                    \"IVF_PQ\": {\"metric_type\": metric_type, \"params\": {\"nprobe\": 10}},\n                    \"HNSW\": {\"metric_type\": metric_type, \"params\": {\"ef\": 10}},\n                    \"RHNSW_FLAT\": {\"metric_type\": metric_type, \"params\": {\"ef\": 10}},\n                    \"RHNSW_SQ\": {\"metric_type\": metric_type, \"params\": {\"ef\": 10}},\n                    \"RHNSW_PQ\": {\"metric_type\": metric_type, \"params\": {\"ef\": 10}},\n                    \"IVF_HNSW\": {\n                        \"metric_type\": metric_type,\n                        \"params\": {\"nprobe\": 10, \"ef\": 10},\n                    },\n                    \"ANNOY\": {\"metric_type\": metric_type, \"params\": {\"search_k\": 10}},\n                    \"AUTOINDEX\": {\"metric_type\": metric_type, \"params\": {}},\n                }\n                # Set the search params\n                self.search_params = default_search_params[\n                    self.index_params[\"index_type\"]\n                ]\n            logger.info(\"Milvus search parameters: {}\".format(self.search_params))\n        except Exception as e:\n            logger.error(\"Failed to create index, error: {}\".format(e))\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"Upsert chunks into the datastore.\n\n        Args:\n            chunks (Dict[str, List[DocumentChunk]]): A list of 
DocumentChunks to insert\n\n        Raises:\n            e: Error in upserting data.\n\n        Returns:\n            List[str]: The document_id's that were inserted.\n        \"\"\"\n        try:\n            # The doc id's to return for the upsert\n            doc_ids: List[str] = []\n            # List to collect all the insert data, skip the \"pk\" for schema V1\n            offset = 1 if self._schema_ver == \"V1\" else 0\n            insert_data = [[] for _ in range(len(self._get_schema()) - offset)]\n\n            # Go through each document chunklist and grab the data\n            for doc_id, chunk_list in chunks.items():\n                # Append the doc_id to the list we are returning\n                doc_ids.append(doc_id)\n                # Examine each chunk in the chunklist\n                for chunk in chunk_list:\n                    # Extract data from the chunk\n                    list_of_data = self._get_values(chunk)\n                    # Check if the data is valid\n                    if list_of_data is not None:\n                        # Append each field to the insert_data\n                        for x in range(len(insert_data)):\n                            insert_data[x].append(list_of_data[x])\n            # Slice up our insert data into batches\n            batches = [\n                insert_data[i : i + UPSERT_BATCH_SIZE]\n                for i in range(0, len(insert_data), UPSERT_BATCH_SIZE)\n            ]\n\n            # Attempt to insert each batch into our collection\n            # batch data can work with both V1 and V2 schema\n            for batch in batches:\n                if len(batch[0]) != 0:\n                    try:\n                        logger.info(f\"Upserting batch of size {len(batch[0])}\")\n                        self.col.insert(batch)\n                        logger.info(f\"Upserted batch successfully\")\n                    except Exception as e:\n                        logger.error(f\"Failed to insert 
batch records, error: {e}\")\n                        raise e\n\n            # This setting performs flushes after insert. Small insert == bad to use\n            # self.col.flush()\n            return doc_ids\n        except Exception as e:\n            logger.error(\"Failed to insert records, error: {}\".format(e))\n            return []\n\n    def _get_values(self, chunk: DocumentChunk) -> List[any] | None:  # type: ignore\n        \"\"\"Convert the chunk into a list of values to insert whose indexes align with fields.\n\n        Args:\n            chunk (DocumentChunk): The chunk to convert.\n\n        Returns:\n            List (any): The values to insert.\n        \"\"\"\n        # Convert DocumentChunk and its sub models to dict\n        values = chunk.dict()\n        # Unpack the metadata into the same dict\n        meta = values.pop(\"metadata\")\n        values.update(meta)\n\n        # Convert date to int timestamp form\n        if values[\"created_at\"]:\n            values[\"created_at\"] = to_unix_timestamp(values[\"created_at\"])\n\n        # If source exists, change from Source object to the string value it holds\n        if values[\"source\"]:\n            values[\"source\"] = values[\"source\"].value\n        # List to collect data we will return\n        ret = []\n        # Grab data corresponding to each field, excluding the hidden auto pk field for schema V1\n        offset = 1 if self._schema_ver == \"V1\" else 0\n        for key, _, default in self._get_schema()[offset:]:\n            # Grab the data at the key and default to our defaults set in init\n            x = values.get(key) or default\n            # If one of our required fields is missing, ignore the entire entry\n            if x is Required:\n                logger.info(\"Chunk \" + values[\"id\"] + \" missing \" + key + \" skipping\")\n                return None\n            # Add the corresponding value if it passes the tests\n            ret.append(x)\n        return ret\n\n    
async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"Query the QueryWithEmbedding against the MilvusDocumentSearch\n\n        Search the embedding and its filter in the collection.\n\n        Args:\n            queries (List[QueryWithEmbedding]): The list of searches to perform.\n\n        Returns:\n            List[QueryResult]: Results for each search.\n        \"\"\"\n\n        # Async to perform the query, adapted from pinecone implementation\n        async def _single_query(query: QueryWithEmbedding) -> QueryResult:\n            try:\n                filter = None\n                # Set the filter to expression that is valid for Milvus\n                if query.filter is not None:\n                    # Either a valid filter or None will be returned\n                    filter = self._get_filter(query.filter)\n\n                # Perform our search\n                return_from = 2 if self._schema_ver == \"V1\" else 1\n                res = self.col.search(\n                    data=[query.embedding],\n                    anns_field=EMBEDDING_FIELD,\n                    param=self.search_params,\n                    limit=query.top_k,\n                    expr=filter,\n                    output_fields=[\n                        field[0] for field in self._get_schema()[return_from:]\n                    ],  # Ignoring pk, embedding\n                )\n                # Results that will hold our DocumentChunkWithScores\n                results = []\n                # Parse every result for our search\n                for hit in res[0]:  # type: ignore\n                    # The distance score for the search result, falls under DocumentChunkWithScore\n                    score = hit.score\n                    # Our metadata info, falls under DocumentChunkMetadata\n                    metadata = {}\n                    # Grab the values that correspond to our fields, ignore pk and embedding.\n    
                for x in [field[0] for field in self._get_schema()[return_from:]]:\n                        metadata[x] = hit.entity.get(x)\n                    # If the source isn't valid, convert to None\n                    if metadata[\"source\"] not in Source.__members__:\n                        metadata[\"source\"] = None\n                    # Text falls under the DocumentChunk\n                    text = metadata.pop(\"text\")\n                    # Id falls under the DocumentChunk\n                    ids = metadata.pop(\"id\")\n                    chunk = DocumentChunkWithScore(\n                        id=ids,\n                        score=score,\n                        text=text,\n                        metadata=DocumentChunkMetadata(**metadata),\n                    )\n                    results.append(chunk)\n\n                # TODO: decide on doing queries to grab the embedding itself, slows down performance as double query occurs\n\n                return QueryResult(query=query.query, results=results)\n            except Exception as e:\n                logger.error(\"Failed to query, error: {}\".format(e))\n                return QueryResult(query=query.query, results=[])\n\n        results: List[QueryResult] = await asyncio.gather(\n            *[_single_query(query) for query in queries]\n        )\n        return results\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"Delete the entities based either on the chunk_id of the vector,\n\n        Args:\n            ids (Optional[List[str]], optional): The document_ids to delete. Defaults to None.\n            filter (Optional[DocumentMetadataFilter], optional): The filter to delete by. Defaults to None.\n            delete_all (Optional[bool], optional): Whether to drop the collection and recreate it. 
Defaults to None.\n        \"\"\"\n        # If deleting all, drop and create the new collection\n        if delete_all:\n            coll_name = self.col.name\n            logger.info(\n                \"Delete the entire collection {} and create new one\".format(coll_name)\n            )\n            # Release the collection from memory\n            self.col.release()\n            # Drop the collection\n            self.col.drop()\n            # Recreate the new collection\n            self._create_collection(coll_name, True)\n            self._create_index()\n            return True\n\n        # Keep track of how many we have deleted for later printing\n        delete_count = 0\n        batch_size = 100\n        pk_name = \"pk\" if self._schema_ver == \"V1\" else \"id\"\n        try:\n            # According to the api design, the ids is a list of document_id,\n            # document_id is not primary key, use query+delete to workaround,\n            # in future version we can delete by expression\n            if (ids is not None) and len(ids) > 0:\n                # Add quotation marks around the string format id\n                ids = ['\"' + str(id) + '\"' for id in ids]\n                # Query for the pk's of entries that match id's\n                ids = self.col.query(f\"document_id in [{','.join(ids)}]\")\n                # Convert to list of pks\n                pks = [str(entry[pk_name]) for entry in ids]  # type: ignore\n                # for schema V2, the \"id\" is varchar, rewrite the expression\n                if self._schema_ver != \"V1\":\n                    pks = ['\"' + pk + '\"' for pk in pks]\n\n                # Delete by ids batch by batch(avoid too long expression)\n                logger.info(\n                    \"Apply {:d} deletions to schema {:s}\".format(\n                        len(pks), self._schema_ver\n                    )\n                )\n                while len(pks) > 0:\n                    batch_pks = 
pks[:batch_size]\n                    pks = pks[batch_size:]\n                    # Delete the entries batch by batch\n                    res = self.col.delete(f\"{pk_name} in [{','.join(batch_pks)}]\")\n                    # Increment our deleted count\n                    delete_count += int(res.delete_count)  # type: ignore\n        except Exception as e:\n            logger.error(\"Failed to delete by ids, error: {}\".format(e))\n\n        try:\n            # Check if empty filter\n            if filter is not None:\n                # Convert filter to milvus expression\n                filter = self._get_filter(filter)  # type: ignore\n                # Check if there is anything to filter\n                if len(filter) != 0:  # type: ignore\n                    # Query for the pk's of entries that match filter\n                    res = self.col.query(filter)  # type: ignore\n                    # Convert to list of pks\n                    pks = [str(entry[pk_name]) for entry in res]  # type: ignore\n                    # for schema V2, the \"id\" is varchar, rewrite the expression\n                    if self._schema_ver != \"V1\":\n                        pks = ['\"' + pk + '\"' for pk in pks]\n                    # Check to see if there are valid pk's to delete, delete batch by batch(avoid too long expression)\n                    while len(pks) > 0:  # type: ignore\n                        batch_pks = pks[:batch_size]\n                        pks = pks[batch_size:]\n                        # Delete the entries batch by batch\n                        res = self.col.delete(f\"{pk_name} in [{','.join(batch_pks)}]\")  # type: ignore\n                        # Increment our delete count\n                        delete_count += int(res.delete_count)  # type: ignore\n        except Exception as e:\n            logger.error(\"Failed to delete by filter, error: {}\".format(e))\n\n        logger.info(\"{:d} records deleted\".format(delete_count))\n\n        # 
This setting performs flushes after delete. Small delete == bad to use\n        # self.col.flush()\n\n        return True\n\n    def _get_filter(self, filter: DocumentMetadataFilter) -> Optional[str]:\n        \"\"\"Converts a DocumentMetadataFilter to the expression that Milvus takes.\n\n        Args:\n            filter (DocumentMetadataFilter): The Filter to convert to Milvus expression.\n\n        Returns:\n            Optional[str]: The filter if valid, otherwise None.\n        \"\"\"\n        filters = []\n        # Go through all the fields and their values\n        for field, value in filter.dict().items():\n            # Check if the Value is empty\n            if value is not None:\n                # Convert start_date to int and add greater than or equal logic\n                if field == \"start_date\":\n                    filters.append(\n                        \"(created_at >= \" + str(to_unix_timestamp(value)) + \")\"\n                    )\n                # Convert end_date to int and add less than or equal logic\n                elif field == \"end_date\":\n                    filters.append(\n                        \"(created_at <= \" + str(to_unix_timestamp(value)) + \")\"\n                    )\n                # Convert Source to its string value and check equivalency\n                elif field == \"source\":\n                    filters.append(\"(\" + field + ' == \"' + str(value.value) + '\")')\n                # Check equivalency of rest of string fields\n                else:\n                    filters.append(\"(\" + field + ' == \"' + str(value) + '\")')\n        # Join all our expressions with `and`\n        return \" and \".join(filters)\n"
  },
  {
    "path": "datastore/providers/mongodb_atlas_datastore.py",
    "content": "import os\nfrom typing import Dict, List, Any, Optional\nfrom loguru import logger\nfrom importlib.metadata import version\nfrom motor.motor_asyncio import AsyncIOMotorClient\nfrom pymongo.driver_info import DriverInfo\nfrom pymongo import UpdateOne\n\nfrom datastore.datastore import DataStore\nfrom functools import cached_property\nfrom models.models import (\n    Document,\n    DocumentChunk,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n)\nfrom services.chunks import get_document_chunks\nfrom services.date import to_unix_timestamp\n\n\nMONGODB_CONNECTION_URI = os.environ.get(\"MONGODB_URI\")\nMONGODB_DATABASE = os.environ.get(\"MONGODB_DATABASE\", \"default\")\nMONGODB_COLLECTION = os.environ.get(\"MONGODB_COLLECTION\", \"default\")\nMONGODB_INDEX = os.environ.get(\"MONGODB_INDEX\", \"default\")\nOVERSAMPLING_FACTOR = 10\nMAX_CANDIDATES = 10_000\n\n\nclass MongoDBAtlasDataStore(DataStore):\n\n    def __init__(\n        self,\n        atlas_connection_uri: str = MONGODB_CONNECTION_URI,\n        index_name: str = MONGODB_INDEX,\n        database_name: str = MONGODB_DATABASE,\n        collection_name: str = MONGODB_COLLECTION,\n        oversampling_factor: float = OVERSAMPLING_FACTOR,\n    ):\n        \"\"\"\n        Initialize a MongoDBAtlasDataStore instance.\n\n        Parameters:\n        - index_name (str, optional): Vector search index. If not provided, default index name is used.\n        - database_name (str, optional): Database. If not provided, default database name is used.\n        - collection_name (str, optional): Collection. 
If not provided, default collection name is used.\n        - oversampling_factor (float, optional): Oversampling factor for data augmentation.\n                                                 Default is OVERSAMPLING_FACTOR.\n\n        Raises:\n        - ValueError: If index_name is not a valid string.\n\n        Attributes:\n        - index_name (str): Name of the index.\n        - database_name (str): Name of the database.\n        - collection_name (str): Name of the collection.\n        - oversampling_factor (float): Oversampling factor for data augmentation.\n        \"\"\"\n\n        self.atlas_connection_uri = atlas_connection_uri\n        self.oversampling_factor = oversampling_factor\n        self.database_name = database_name\n        self.collection_name = collection_name\n\n        if not (index_name and isinstance(index_name, str)):\n            raise ValueError(\"Provide a valid index name\")\n        self.index_name = index_name\n\n        # TODO: Create index via driver https://jira.mongodb.org/browse/PYTHON-4175\n        # self._create_search_index(num_dimensions=1536, path=\"embedding\", similarity=\"dotProduct\", type=\"vector\")\n\n    @cached_property\n    def client(self):\n        return self._connect_to_mongodb_atlas(\n            atlas_connection_uri=MONGODB_CONNECTION_URI\n        )\n\n    async def upsert(\n        self, documents: List[Document], chunk_token_size: Optional[int] = None\n    ) -> List[str]:\n        \"\"\"\n        Takes in a list of Documents, chunks them, and upserts the chunks into the database.\n        Return a list of the ids of the document chunks.\n        \"\"\"\n        chunks = get_document_chunks(documents, chunk_token_size)\n        return await self._upsert(chunks)\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        
documents_to_upsert = []\n        inserted_ids = []\n        for chunk_list in chunks.values():\n            for chunk in chunk_list:\n                inserted_ids.append(chunk.id)\n                documents_to_upsert.append(\n                        UpdateOne({'_id': chunk.id}, {\"$set\": chunk.dict()}, upsert=True)\n                )\n        logger.info(f\"Upsert documents into MongoDB collection: {self.database_name}: {self.collection_name}\")\n        await self.client[self.database_name][self.collection_name].bulk_write(documents_to_upsert)\n        logger.info(\"Upsert successful\")\n\n        return inserted_ids\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns\n        a list of query results with matching document chunks and scores.\n        \"\"\"\n        results = []\n        for query in queries:\n            query_result = await self._execute_embedding_query(query)\n            results.append(query_result)\n\n        return results\n\n    async def _execute_embedding_query(self, query: QueryWithEmbedding) -> QueryResult:\n        \"\"\"\n        Execute a MongoDB query using vector search on the specified collection and\n        return the result of the query, including matched documents and their scores.\n        \"\"\"\n        pipeline = [\n            {\n                '$vectorSearch': {\n                    'index': self.index_name,\n                    'path': 'embedding',\n                    'queryVector': query.embedding,\n                    'numCandidates': min(query.top_k * self.oversampling_factor, MAX_CANDIDATES),\n                    'limit': query.top_k\n                 }\n            }, {\n                '$project': {\n                    'text': 1,\n                    'metadata': 1,\n                    'score': {\n                        '$meta': 'vectorSearchScore'\n         
           }\n                }\n            }\n        ]\n\n        async with self.client[self.database_name][self.collection_name].aggregate(pipeline) as cursor:\n            results = [\n                self._convert_mongodb_document_to_document_chunk_with_score(doc)\n                async for doc in cursor\n            ]\n\n            return QueryResult(\n                query=query.query,\n                results=results,\n            )\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes documents by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n\n        Note that ids refer to those in the datastore,\n        which are those of the **DocumentChunks**\n        \"\"\"\n        # Delete all documents from the collection if delete_all is True\n        if delete_all:\n            logger.info(\"Deleting all documents from collection\")\n            mg_filter = {}\n\n        # Delete by ids\n        elif ids:\n            logger.info(f\"Deleting documents with ids: {ids}\")\n            mg_filter = {\"_id\": {\"$in\": ids}}\n\n        # Delete by filters\n        elif filter:\n            mg_filter = self._build_mongo_filter(filter)\n            logger.info(f\"Deleting documents with filter: {mg_filter}\")\n        # Do nothing\n        else:\n            logger.warning(\"No criteria set; nothing to delete args: ids: %s, filter: %s delete_all: %s\", ids, filter, delete_all)\n            return True\n\n        try:\n            await self.client[self.database_name][self.collection_name].delete_many(mg_filter)\n            logger.info(\"Deleted documents successfully\")\n        except Exception as e:\n            logger.error(\"Error deleting documents with filter: %s -- error: %s\", mg_filter, e)\n            return False\n\n        
return True\n\n    def _convert_mongodb_document_to_document_chunk_with_score(\n        self, document: Dict\n    ) -> DocumentChunkWithScore:\n        # Convert MongoDB document to DocumentChunkWithScore\n        return DocumentChunkWithScore(\n            id=document.get(\"_id\"),\n            text=document[\"text\"],\n            metadata=document.get(\"metadata\"),\n            score=document.get(\"score\"),\n        )\n\n    def _build_mongo_filter(\n        self, filter: Optional[DocumentMetadataFilter] = None\n    ) -> Dict[str, Any]:\n        \"\"\"\n        Generate MongoDB query filters based on the provided DocumentMetadataFilter.\n        \"\"\"\n        if filter is None:\n            return {}\n\n        mongo_filters = {\n            \"$and\": [],\n        }\n\n        # For each field in the MetadataFilter,\n        # check if it has a value and add the corresponding MongoDB filter expression\n        for field, value in filter.dict().items():\n            if value is not None:\n                if field == \"start_date\":\n                    mongo_filters[\"$and\"].append(\n                        {\"created_at\": {\"$gte\": to_unix_timestamp(value)}}\n                    )\n                elif field == \"end_date\":\n                    mongo_filters[\"$and\"].append(\n                        {\"created_at\": {\"$lte\": to_unix_timestamp(value)}}\n                    )\n                else:\n                    mongo_filters[\"$and\"].append(\n                        {f\"metadata.{field}\": value}\n                    )\n\n        return mongo_filters\n\n    @staticmethod\n    def _connect_to_mongodb_atlas(atlas_connection_uri: str):\n        \"\"\"\n        Establish a connection to MongoDB Atlas.\n        \"\"\"\n\n        client = AsyncIOMotorClient(\n            atlas_connection_uri,\n            driver=DriverInfo(name=\"Chatgpt Retrieval Plugin\", version=version(\"chatgpt_retrieval_plugin\")))\n        return client\n"
  },
  {
    "path": "datastore/providers/pgvector_datastore.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Dict, List, Optional\nfrom datetime import datetime\nfrom loguru import logger\n\nfrom services.date import to_unix_timestamp\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    DocumentChunkWithScore,\n)\n\n\n# interface for Postgres client to implement pg based Datastore providers\nclass PGClient(ABC):\n    @abstractmethod\n    async def upsert(self, table: str, json: dict[str, Any]) -> None:\n        \"\"\"\n        Takes in a list of documents and inserts them into the table.\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    async def rpc(self, function_name: str, params: dict[str, Any]) -> Any:\n        \"\"\"\n        Calls a stored procedure in the database with the given parameters.\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete_like(self, table: str, column: str, pattern: str) -> None:\n        \"\"\"\n        Deletes rows in the table that match the pattern.\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete_in(self, table: str, column: str, ids: List[str]) -> None:\n        \"\"\"\n        Deletes rows in the table that match the ids.\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    async def delete_by_filters(\n        self, table: str, filter: DocumentMetadataFilter\n    ) -> None:\n        \"\"\"\n        Deletes rows in the table that match the filter.\n        \"\"\"\n        raise NotImplementedError\n\n\n# abstract class for Postgres based Datastore providers that implements DataStore interface\nclass PgVectorDataStore(DataStore):\n    def __init__(self):\n        self.client = self.create_db_client()\n\n    @abstractmethod\n    def create_db_client(self) -> PGClient:\n        
\"\"\"\n        Create db client, can be accessing postgres database via different APIs.\n        Can be supabase client or psycopg2 based client.\n        Return a client for postgres DB.\n        \"\"\"\n\n        raise NotImplementedError\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a dict of document_ids to list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        for document_id, document_chunks in chunks.items():\n            for chunk in document_chunks:\n                json = {\n                    \"id\": chunk.id,\n                    \"content\": chunk.text,\n                    \"embedding\": chunk.embedding,\n                    \"document_id\": document_id,\n                    \"source\": chunk.metadata.source,\n                    \"source_id\": chunk.metadata.source_id,\n                    \"url\": chunk.metadata.url,\n                    \"author\": chunk.metadata.author,\n                }\n                if chunk.metadata.created_at:\n                    json[\"created_at\"] = (\n                        datetime.fromtimestamp(\n                            to_unix_timestamp(chunk.metadata.created_at)\n                        ),\n                    )\n                await self.client.upsert(\"documents\", json)\n\n        return list(chunks.keys())\n\n    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        query_results: List[QueryResult] = []\n        for query in queries:\n            # get the top 3 documents with the highest cosine similarity using rpc function in the database called \"match_page_sections\"\n            params = {\n                \"in_embedding\": query.embedding,\n           
 }\n            if query.top_k:\n                params[\"in_match_count\"] = query.top_k\n            if query.filter:\n                if query.filter.document_id:\n                    params[\"in_document_id\"] = query.filter.document_id\n                if query.filter.source:\n                    params[\"in_source\"] = query.filter.source.value\n                if query.filter.source_id:\n                    params[\"in_source_id\"] = query.filter.source_id\n                if query.filter.author:\n                    params[\"in_author\"] = query.filter.author\n                if query.filter.start_date:\n                    params[\"in_start_date\"] = datetime.fromtimestamp(\n                        to_unix_timestamp(query.filter.start_date)\n                    )\n                if query.filter.end_date:\n                    params[\"in_end_date\"] = datetime.fromtimestamp(\n                        to_unix_timestamp(query.filter.end_date)\n                    )\n            try:\n                data = await self.client.rpc(\"match_page_sections\", params=params)\n                results: List[DocumentChunkWithScore] = []\n                for row in data:\n                    document_chunk = DocumentChunkWithScore(\n                        id=row[\"id\"],\n                        text=row[\"content\"],\n                        # TODO: add embedding to the response ?\n                        # embedding=row[\"embedding\"],\n                        score=float(row[\"similarity\"]),\n                        metadata=DocumentChunkMetadata(\n                            source=row[\"source\"],\n                            source_id=row[\"source_id\"],\n                            document_id=row[\"document_id\"],\n                            url=row[\"url\"],\n                            created_at=row[\"created_at\"],\n                            author=row[\"author\"],\n                        ),\n                    )\n                    
results.append(document_chunk)\n                query_results.append(QueryResult(query=query.query, results=results))\n            except Exception as e:\n                logger.error(e)\n                query_results.append(QueryResult(query=query.query, results=[]))\n        return query_results\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Only one parameter is applied per call, checked in this order: delete_all, then ids, then filter.\n        Returns whether the operation was successful.\n        \"\"\"\n        if delete_all:\n            try:\n                await self.client.delete_like(\"documents\", \"document_id\", \"%\")\n            except:\n                return False\n        elif ids:\n            try:\n                await self.client.delete_in(\"documents\", \"document_id\", ids)\n            except:\n                return False\n        elif filter:\n            try:\n                await self.client.delete_by_filters(\"documents\", filter)\n            except:\n                return False\n        return True\n"
  },
  {
    "path": "datastore/providers/pinecone_datastore.py",
    "content": "import os\nfrom typing import Any, Dict, List, Optional\nimport pinecone\nfrom tenacity import retry, wait_random_exponential, stop_after_attempt\nimport asyncio\nfrom loguru import logger\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    Source,\n)\nfrom services.date import to_unix_timestamp\n\n# Read environment variables for Pinecone configuration\nPINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\")\nPINECONE_ENVIRONMENT = os.environ.get(\"PINECONE_ENVIRONMENT\")\nPINECONE_INDEX = os.environ.get(\"PINECONE_INDEX\")\nassert PINECONE_API_KEY is not None\nassert PINECONE_ENVIRONMENT is not None\nassert PINECONE_INDEX is not None\n\n# Initialize Pinecone with the API key and environment\npinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)\n\n# Set the batch size for upserting vectors to Pinecone\nUPSERT_BATCH_SIZE = 100\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\nclass PineconeDataStore(DataStore):\n    def __init__(self):\n        # Check if the index name is specified and exists in Pinecone\n        if PINECONE_INDEX and PINECONE_INDEX not in pinecone.list_indexes():\n            # Get all fields in the metadata object in a list\n            fields_to_index = list(DocumentChunkMetadata.__fields__.keys())\n\n            # Create a new index with the specified name, dimension, and metadata configuration\n            try:\n                logger.info(\n                    f\"Creating index {PINECONE_INDEX} with metadata config {fields_to_index}\"\n                )\n                pinecone.create_index(\n                    PINECONE_INDEX,\n                    dimension=EMBEDDING_DIMENSION,\n                    metadata_config={\"indexed\": fields_to_index},\n                )\n                self.index = 
pinecone.Index(PINECONE_INDEX)\n                logger.info(f\"Index {PINECONE_INDEX} created successfully\")\n            except Exception as e:\n                logger.error(f\"Error creating index {PINECONE_INDEX}: {e}\")\n                raise e\n        elif PINECONE_INDEX and PINECONE_INDEX in pinecone.list_indexes():\n            # Connect to an existing index with the specified name\n            try:\n                logger.info(f\"Connecting to existing index {PINECONE_INDEX}\")\n                self.index = pinecone.Index(PINECONE_INDEX)\n                logger.info(f\"Connected to index {PINECONE_INDEX} successfully\")\n            except Exception as e:\n                logger.error(f\"Error connecting to index {PINECONE_INDEX}: {e}\")\n                raise e\n\n    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a dict from document id to list of document chunks and inserts them into the index.\n        Return a list of document ids.\n        \"\"\"\n        # Initialize a list of ids to return\n        doc_ids: List[str] = []\n        # Initialize a list of vectors to upsert\n        vectors = []\n        # Loop through the dict items\n        for doc_id, chunk_list in chunks.items():\n            # Append the id to the ids list\n            doc_ids.append(doc_id)\n            logger.info(f\"Upserting document_id: {doc_id}\")\n            for chunk in chunk_list:\n                # Create a vector tuple of (id, embedding, metadata)\n                # Convert the metadata object to a dict with unix timestamps for dates\n                pinecone_metadata = self._get_pinecone_metadata(chunk.metadata)\n                # Add the text and document id to the metadata dict\n                pinecone_metadata[\"text\"] = chunk.text\n                pinecone_metadata[\"document_id\"] = doc_id\n                
vector = (chunk.id, chunk.embedding, pinecone_metadata)\n                vectors.append(vector)\n\n        # Split the vectors list into batches of the specified size\n        batches = [\n            vectors[i : i + UPSERT_BATCH_SIZE]\n            for i in range(0, len(vectors), UPSERT_BATCH_SIZE)\n        ]\n        # Upsert each batch to Pinecone\n        for batch in batches:\n            try:\n                logger.info(f\"Upserting batch of size {len(batch)}\")\n                self.index.upsert(vectors=batch)\n                logger.info(f\"Upserted batch successfully\")\n            except Exception as e:\n                logger.error(f\"Error upserting batch: {e}\")\n                raise e\n\n        return doc_ids\n\n    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n\n        # Define a helper coroutine that performs a single query and returns a QueryResult\n        async def _single_query(query: QueryWithEmbedding) -> QueryResult:\n            logger.debug(f\"Query: {query.query}\")\n\n            # Convert the metadata filter object to a dict with pinecone filter expressions\n            pinecone_filter = self._get_pinecone_filter(query.filter)\n\n            try:\n                # Query the index with the query embedding, filter, and top_k\n                query_response = self.index.query(\n                    # namespace=namespace,\n                    top_k=query.top_k,\n                    vector=query.embedding,\n                    filter=pinecone_filter,\n                    include_metadata=True,\n                )\n            except Exception as e:\n                logger.error(f\"Error querying index: {e}\")\n                
raise e\n\n            query_results: List[DocumentChunkWithScore] = []\n            for result in query_response.matches:\n                score = result.score\n                metadata = result.metadata\n                # Remove document id and text from metadata and store it in a new variable\n                metadata_without_text = (\n                    {key: value for key, value in metadata.items() if key != \"text\"}\n                    if metadata\n                    else None\n                )\n\n                # If the source is not a valid Source in the Source enum, set it to None\n                if (\n                    metadata_without_text\n                    and \"source\" in metadata_without_text\n                    and metadata_without_text[\"source\"] not in Source.__members__\n                ):\n                    metadata_without_text[\"source\"] = None\n\n                # Create a document chunk with score object with the result data\n                result = DocumentChunkWithScore(\n                    id=result.id,\n                    score=score,\n                    text=str(metadata[\"text\"])\n                    if metadata and \"text\" in metadata\n                    else \"\",\n                    metadata=metadata_without_text,\n                )\n                query_results.append(result)\n            return QueryResult(query=query.query, results=query_results)\n\n        # Use asyncio.gather to run multiple _single_query coroutines concurrently and collect their results\n        results: List[QueryResult] = await asyncio.gather(\n            *[_single_query(query) for query in queries]\n        )\n\n        return results\n\n    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        
\"\"\"\n        Removes vectors by ids, filter, or everything from the index.\n        \"\"\"\n        # Delete all vectors from the index if delete_all is True\n        if delete_all:\n            try:\n                logger.info(f\"Deleting all vectors from index\")\n                self.index.delete(delete_all=True)\n                logger.info(f\"Deleted all vectors successfully\")\n                return True\n            except Exception as e:\n                logger.error(f\"Error deleting all vectors: {e}\")\n                raise e\n\n        # Convert the metadata filter object to a dict with pinecone filter expressions\n        pinecone_filter = self._get_pinecone_filter(filter)\n        # Delete vectors that match the filter from the index if the filter is not empty\n        if pinecone_filter != {}:\n            try:\n                logger.info(f\"Deleting vectors with filter {pinecone_filter}\")\n                self.index.delete(filter=pinecone_filter)\n                logger.info(f\"Deleted vectors with filter successfully\")\n            except Exception as e:\n                logger.error(f\"Error deleting vectors with filter: {e}\")\n                raise e\n\n        # Delete vectors that match the document ids from the index if the ids list is not empty\n        if ids is not None and len(ids) > 0:\n            try:\n                logger.info(f\"Deleting vectors with ids {ids}\")\n                pinecone_filter = {\"document_id\": {\"$in\": ids}}\n                self.index.delete(filter=pinecone_filter)  # type: ignore\n                logger.info(f\"Deleted vectors with ids successfully\")\n            except Exception as e:\n                logger.error(f\"Error deleting vectors with ids: {e}\")\n                raise e\n\n        return True\n\n    def _get_pinecone_filter(\n        self, filter: Optional[DocumentMetadataFilter] = None\n    ) -> Dict[str, Any]:\n        if filter is None:\n            return {}\n\n        
pinecone_filter = {}\n\n        # For each field in the MetadataFilter, check if it has a value and add the corresponding pinecone filter expression\n        # For start_date and end_date, uses the $gte and $lte operators respectively\n        # For other fields, uses the $eq operator\n        for field, value in filter.dict().items():\n            if value is not None:\n                if field == \"start_date\":\n                    pinecone_filter[\"created_at\"] = pinecone_filter.get(\n                        \"created_at\", {}\n                    )\n                    pinecone_filter[\"created_at\"][\"$gte\"] = to_unix_timestamp(value)\n                elif field == \"end_date\":\n                    pinecone_filter[\"created_at\"] = pinecone_filter.get(\n                        \"created_at\", {}\n                    )\n                    pinecone_filter[\"created_at\"][\"$lte\"] = to_unix_timestamp(value)\n                else:\n                    pinecone_filter[field] = value\n\n        return pinecone_filter\n\n    def _get_pinecone_metadata(\n        self, metadata: Optional[DocumentChunkMetadata] = None\n    ) -> Dict[str, Any]:\n        if metadata is None:\n            return {}\n\n        pinecone_metadata = {}\n\n        # For each field in the Metadata, check if it has a value and add it to the pinecone metadata dict\n        # For fields that are dates, convert them to unix timestamps\n        for field, value in metadata.dict().items():\n            if value is not None:\n                if field in [\"created_at\"]:\n                    pinecone_metadata[field] = to_unix_timestamp(value)\n                else:\n                    pinecone_metadata[field] = value\n\n        return pinecone_metadata\n"
  },
  {
    "path": "datastore/providers/postgres_datastore.py",
    "content": "import os\nfrom typing import Any, List\nfrom datetime import datetime\nimport numpy as np\n\nfrom psycopg2 import connect\nfrom psycopg2.extras import DictCursor\nfrom pgvector.psycopg2 import register_vector\n\nfrom services.date import to_unix_timestamp\nfrom datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore\nfrom models.models import (\n    DocumentMetadataFilter,\n)\n\nPG_HOST = os.environ.get(\"PG_HOST\", \"localhost\")\nPG_PORT = int(os.environ.get(\"PG_PORT\", 5432))\nPG_DB = os.environ.get(\"PG_DB\", \"postgres\")\nPG_USER = os.environ.get(\"PG_USER\", \"postgres\")\nPG_PASSWORD = os.environ.get(\"PG_PASSWORD\", \"postgres\")\n\n\n# class that implements the DataStore interface for Postgres Datastore provider\nclass PostgresDataStore(PgVectorDataStore):\n    def create_db_client(self):\n        return PostgresClient()\n\n\nclass PostgresClient(PGClient):\n    def __init__(self) -> None:\n        super().__init__()\n        self.client = connect(\n            dbname=PG_DB, user=PG_USER, password=PG_PASSWORD, host=PG_HOST, port=PG_PORT\n        )\n        register_vector(self.client)\n\n    def __del__(self):\n        # close the connection when the client is destroyed\n        self.client.close()\n\n    async def upsert(self, table: str, json: dict[str, Any]):\n        \"\"\"\n        Takes in a single document chunk as a dict and inserts it into the table, updating the existing row if the id already exists.\n        \"\"\"\n        with self.client.cursor() as cur:\n            if not json.get(\"created_at\"):\n                json[\"created_at\"] = datetime.now()\n            json[\"embedding\"] = np.array(json[\"embedding\"])\n            cur.execute(\n                f\"INSERT INTO {table} (id, content, embedding, document_id, source, source_id, url, author, created_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO UPDATE SET content = %s, embedding = %s, document_id = %s, source = %s, source_id = %s, url = %s, author = %s, created_at = %s\",\n                
(\n                    json[\"id\"],\n                    json[\"content\"],\n                    json[\"embedding\"],\n                    json[\"document_id\"],\n                    json[\"source\"],\n                    json[\"source_id\"],\n                    json[\"url\"],\n                    json[\"author\"],\n                    json[\"created_at\"],\n                    json[\"content\"],\n                    json[\"embedding\"],\n                    json[\"document_id\"],\n                    json[\"source\"],\n                    json[\"source_id\"],\n                    json[\"url\"],\n                    json[\"author\"],\n                    json[\"created_at\"],\n                ),\n            )\n            self.client.commit()\n\n    async def rpc(self, function_name: str, params: dict[str, Any]):\n        \"\"\"\n        Calls a stored procedure in the database with the given parameters.\n        \"\"\"\n        data = []\n        params[\"in_embedding\"] = np.array(params[\"in_embedding\"])\n        with self.client.cursor(cursor_factory=DictCursor) as cur:\n            cur.callproc(function_name, params)\n            rows = cur.fetchall()\n            self.client.commit()\n            for row in rows:\n                row[\"created_at\"] = to_unix_timestamp(row[\"created_at\"])\n                data.append(dict(row))\n        return data\n\n    async def delete_like(self, table: str, column: str, pattern: str):\n        \"\"\"\n        Deletes rows in the table that match the pattern.\n        \"\"\"\n        with self.client.cursor() as cur:\n            cur.execute(\n                f\"DELETE FROM {table} WHERE {column} LIKE %s\",\n                (f\"%{pattern}%\",),\n            )\n            self.client.commit()\n\n    async def delete_in(self, table: str, column: str, ids: List[str]):\n        \"\"\"\n        Deletes rows in the table that match the ids.\n        \"\"\"\n        with self.client.cursor() as cur:\n            
cur.execute(\n                f\"DELETE FROM {table} WHERE {column} IN %s\",\n                (tuple(ids),),\n            )\n            self.client.commit()\n\n    async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter):\n        \"\"\"\n        Deletes rows in the table that match the filter.\n        \"\"\"\n\n        filters = \"WHERE\"\n        if filter.document_id:\n            filters += f\" document_id = '{filter.document_id}' AND\"\n        if filter.source:\n            filters += f\" source = '{filter.source}' AND\"\n        if filter.source_id:\n            filters += f\" source_id = '{filter.source_id}' AND\"\n        if filter.author:\n            filters += f\" author = '{filter.author}' AND\"\n        if filter.start_date:\n            filters += f\" created_at >= '{filter.start_date}' AND\"\n        if filter.end_date:\n            filters += f\" created_at <= '{filter.end_date}' AND\"\n        filters = filters[:-4]\n\n        with self.client.cursor() as cur:\n            cur.execute(f\"DELETE FROM {table} {filters}\")\n            self.client.commit()\n"
  },
  {
    "path": "datastore/providers/qdrant_datastore.py",
    "content": "import os\nimport uuid\nfrom typing import Dict, List, Optional\n\nfrom grpc._channel import _InactiveRpcError\nfrom qdrant_client.http.exceptions import UnexpectedResponse\nfrom qdrant_client.http.models import PayloadSchemaType\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    DocumentChunkWithScore,\n)\nfrom qdrant_client.http import models as rest\n\nimport qdrant_client\n\nfrom services.date import to_unix_timestamp\n\nQDRANT_URL = os.environ.get(\"QDRANT_URL\", \"http://localhost\")\nQDRANT_PORT = os.environ.get(\"QDRANT_PORT\", \"6333\")\nQDRANT_GRPC_PORT = os.environ.get(\"QDRANT_GRPC_PORT\", \"6334\")\nQDRANT_API_KEY = os.environ.get(\"QDRANT_API_KEY\")\nQDRANT_COLLECTION = os.environ.get(\"QDRANT_COLLECTION\", \"document_chunks\")\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\nclass QdrantDataStore(DataStore):\n    UUID_NAMESPACE = uuid.UUID(\"3896d314-1e95-4a3a-b45a-945f9f0b541d\")\n\n    def __init__(\n        self,\n        collection_name: Optional[str] = None,\n        vector_size: int = EMBEDDING_DIMENSION,\n        distance: str = \"Cosine\",\n        recreate_collection: bool = False,\n    ):\n        \"\"\"\n        Args:\n            collection_name: Name of the collection to be used\n            vector_size: Size of the embedding stored in a collection\n            distance:\n                Any of \"Cosine\" / \"Euclid\" / \"Dot\". 
Distance function to measure\n                similarity\n        \"\"\"\n        self.client = qdrant_client.QdrantClient(\n            url=QDRANT_URL,\n            port=int(QDRANT_PORT),\n            grpc_port=int(QDRANT_GRPC_PORT),\n            api_key=QDRANT_API_KEY,\n            prefer_grpc=True,\n            timeout=10,\n        )\n        self.collection_name = collection_name or QDRANT_COLLECTION\n\n        # Set up the collection so the points might be inserted or queried\n        self._set_up_collection(vector_size, distance, recreate_collection)\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        points = [\n            self._convert_document_chunk_to_point(chunk)\n            for _, chunks in chunks.items()\n            for chunk in chunks\n        ]\n        self.client.upsert(\n            collection_name=self.collection_name,\n            points=points,  # type: ignore\n            wait=True,\n        )\n        return list(chunks.keys())\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        search_requests = [\n            self._convert_query_to_search_request(query) for query in queries\n        ]\n        results = self.client.search_batch(\n            collection_name=self.collection_name,\n            requests=search_requests,\n        )\n        return [\n            QueryResult(\n                query=query.query,\n                results=[\n                    self._convert_scored_point_to_document_chunk_with_score(point)\n                    for point in result\n                ],\n            )\n            for 
query, result in zip(queries, results)\n        ]\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n        if ids is None and filter is None and delete_all is None:\n            raise ValueError(\n                \"Please provide one of the parameters: ids, filter or delete_all.\"\n            )\n\n        if delete_all:\n            points_selector = rest.Filter()\n        else:\n            points_selector = self._convert_metadata_filter_to_qdrant_filter(\n                filter, ids\n            )\n\n        response = self.client.delete(\n            collection_name=self.collection_name,\n            points_selector=points_selector,  # type: ignore\n        )\n        return \"COMPLETED\" == response.status\n\n    def _convert_document_chunk_to_point(\n        self, document_chunk: DocumentChunk\n    ) -> rest.PointStruct:\n        created_at = (\n            to_unix_timestamp(document_chunk.metadata.created_at)\n            if document_chunk.metadata.created_at is not None\n            else None\n        )\n        return rest.PointStruct(\n            id=self._create_document_chunk_id(document_chunk.id),\n            vector=document_chunk.embedding,  # type: ignore\n            payload={\n                \"id\": document_chunk.id,\n                \"text\": document_chunk.text,\n                \"metadata\": document_chunk.metadata.dict(),\n                \"created_at\": created_at,\n            },\n        )\n\n    def _create_document_chunk_id(self, external_id: Optional[str]) -> str:\n        if external_id is None:\n            return uuid.uuid4().hex\n        return uuid.uuid5(self.UUID_NAMESPACE, external_id).hex\n\n    def 
_convert_query_to_search_request(\n        self, query: QueryWithEmbedding\n    ) -> rest.SearchRequest:\n        return rest.SearchRequest(\n            vector=query.embedding,\n            filter=self._convert_metadata_filter_to_qdrant_filter(query.filter),\n            limit=query.top_k,  # type: ignore\n            with_payload=True,\n            with_vector=False,\n        )\n\n    def _convert_metadata_filter_to_qdrant_filter(\n        self,\n        metadata_filter: Optional[DocumentMetadataFilter] = None,\n        ids: Optional[List[str]] = None,\n    ) -> Optional[rest.Filter]:\n        if metadata_filter is None and ids is None:\n            return None\n\n        must_conditions, should_conditions = [], []\n\n        # Filtering by document ids\n        if ids and len(ids) > 0:\n            for document_id in ids:\n                should_conditions.append(\n                    rest.FieldCondition(\n                        key=\"metadata.document_id\",\n                        match=rest.MatchValue(value=document_id),\n                    )\n                )\n\n        # Equality filters for the payload attributes\n        if metadata_filter:\n            meta_attributes_keys = {\n                \"document_id\": \"metadata.document_id\",\n                \"source\": \"metadata.source\",\n                \"source_id\": \"metadata.source_id\",\n                \"author\": \"metadata.author\",\n            }\n\n            for meta_attr_name, payload_key in meta_attributes_keys.items():\n                attr_value = getattr(metadata_filter, meta_attr_name)\n                if attr_value is None:\n                    continue\n\n                must_conditions.append(\n                    rest.FieldCondition(\n                        key=payload_key, match=rest.MatchValue(value=attr_value)\n                    )\n                )\n\n            # Date filters use range filtering\n            start_date = metadata_filter.start_date\n            end_date = 
metadata_filter.end_date\n            if start_date or end_date:\n                gte_filter = (\n                    to_unix_timestamp(start_date) if start_date is not None else None\n                )\n                lte_filter = (\n                    to_unix_timestamp(end_date) if end_date is not None else None\n                )\n                must_conditions.append(\n                    rest.FieldCondition(\n                        key=\"created_at\",\n                        range=rest.Range(\n                            gte=gte_filter,\n                            lte=lte_filter,\n                        ),\n                    )\n                )\n\n        if 0 == len(must_conditions) and 0 == len(should_conditions):\n            return None\n\n        return rest.Filter(must=must_conditions, should=should_conditions)\n\n    def _convert_scored_point_to_document_chunk_with_score(\n        self, scored_point: rest.ScoredPoint\n    ) -> DocumentChunkWithScore:\n        payload = scored_point.payload or {}\n        return DocumentChunkWithScore(\n            id=payload.get(\"id\"),\n            text=scored_point.payload.get(\"text\"),  # type: ignore\n            metadata=scored_point.payload.get(\"metadata\"),  # type: ignore\n            embedding=scored_point.vector,  # type: ignore\n            score=scored_point.score,\n        )\n\n    def _set_up_collection(\n        self, vector_size: int, distance: str, recreate_collection: bool\n    ):\n        distance = rest.Distance[distance.upper()]\n\n        if recreate_collection:\n            self._recreate_collection(distance, vector_size)\n\n        try:\n            collection_info = self.client.get_collection(self.collection_name)\n            current_distance = collection_info.config.params.vectors.distance  # type: ignore\n            current_vector_size = collection_info.config.params.vectors.size  # type: ignore\n\n            if current_distance != distance:\n                raise ValueError(\n 
                   f\"Collection '{self.collection_name}' already exists in Qdrant, \"\n                    f\"but it is configured with a similarity '{current_distance.name}'. \"\n                    f\"If you want to use that collection, but with a different \"\n                    f\"similarity, please set `recreate_collection=True` argument.\"\n                )\n\n            if current_vector_size != vector_size:\n                raise ValueError(\n                    f\"Collection '{self.collection_name}' already exists in Qdrant, \"\n                    f\"but it is configured with a vector size '{current_vector_size}'. \"\n                    f\"If you want to use that collection, but with a different \"\n                    f\"vector size, please set `recreate_collection=True` argument.\"\n                )\n        except (UnexpectedResponse, _InactiveRpcError):\n            self._recreate_collection(distance, vector_size)\n\n    def _recreate_collection(self, distance: rest.Distance, vector_size: int):\n        self.client.recreate_collection(\n            self.collection_name,\n            vectors_config=rest.VectorParams(\n                size=vector_size,\n                distance=distance,\n            ),\n        )\n\n        # Create the payload index for the document_id metadata attribute, as it is\n        # used to delete the document related entries\n        self.client.create_payload_index(\n            self.collection_name,\n            field_name=\"metadata.document_id\",\n            field_type=PayloadSchemaType.KEYWORD,\n        )\n\n        # Create the payload index for the created_at attribute, to make the lookup\n        # by range filters faster\n        self.client.create_payload_index(\n            self.collection_name,\n            field_name=\"created_at\",\n            field_schema=PayloadSchemaType.INTEGER,\n        )\n"
  },
  {
    "path": "datastore/providers/redis_datastore.py",
    "content": "import asyncio\nimport os\nimport re\nimport json\nimport redis.asyncio as redis\nimport numpy as np\n\nfrom redis.commands.search.query import Query as RediSearchQuery\nfrom redis.commands.search.indexDefinition import IndexDefinition, IndexType\nfrom redis.commands.search.field import (\n    TagField,\n    TextField,\n    NumericField,\n    VectorField,\n)\nfrom loguru import logger\nfrom typing import Dict, List, Optional\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentMetadataFilter,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n)\nfrom services.date import to_unix_timestamp\n\n# Read environment variables for Redis\nREDIS_HOST = os.environ.get(\"REDIS_HOST\", \"localhost\")\nREDIS_PORT = int(os.environ.get(\"REDIS_PORT\", 6379))\nREDIS_PASSWORD = os.environ.get(\"REDIS_PASSWORD\")\nREDIS_INDEX_NAME = os.environ.get(\"REDIS_INDEX_NAME\", \"index\")\nREDIS_DOC_PREFIX = os.environ.get(\"REDIS_DOC_PREFIX\", \"doc\")\nREDIS_DISTANCE_METRIC = os.environ.get(\"REDIS_DISTANCE_METRIC\", \"COSINE\")\nREDIS_INDEX_TYPE = os.environ.get(\"REDIS_INDEX_TYPE\", \"FLAT\")\nassert REDIS_INDEX_TYPE in (\"FLAT\", \"HNSW\")\n\n# OpenAI Embeddings Dimension\nVECTOR_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\n# RediSearch constants\nREDIS_REQUIRED_MODULES = [\n    {\"name\": \"search\", \"ver\": 20600},\n    {\"name\": \"ReJSON\", \"ver\": 20404},\n]\n\nREDIS_DEFAULT_ESCAPED_CHARS = re.compile(r\"[,.<>{}\\[\\]\\\\\\\"\\':;!@#$%^&()\\-+=~\\/ ]\")\n\n\n# Helper functions\ndef unpack_schema(d: dict):\n    for v in d.values():\n        if isinstance(v, dict):\n            yield from unpack_schema(v)\n        else:\n            yield v\n\n\nasync def _check_redis_module_exist(client: redis.Redis, modules: List[dict]):\n    installed_modules = (await client.info()).get(\"modules\", [])\n    installed_modules = {module[\"name\"]: module for 
module in installed_modules}\n    for module in modules:\n        if module[\"name\"] not in installed_modules or int(\n            installed_modules[module[\"name\"]][\"ver\"]\n        ) < int(module[\"ver\"]):\n            error_message = (\n                \"You must add the RediSearch (>= 2.6) and ReJSON (>= 2.4) modules from Redis Stack. \"\n                \"Please refer to Redis Stack docs: https://redis.io/docs/stack/\"\n            )\n            logger.error(error_message)\n            raise AttributeError(error_message)\n\n\nclass RedisDataStore(DataStore):\n    def __init__(self, client: redis.Redis, redisearch_schema: dict):\n        self.client = client\n        self._schema = redisearch_schema\n        # Init default metadata with sentinel values in case the document written has no metadata\n        self._default_metadata = {\n            field: (0 if field == \"created_at\" else \"_null_\")\n            for field in redisearch_schema[\"metadata\"]\n        }\n\n    ### Redis Helper Methods ###\n\n    @classmethod\n    async def init(cls, **kwargs):\n        \"\"\"\n        Setup the index if it does not exist.\n        \"\"\"\n        try:\n            # Connect to the Redis Client\n            logger.info(\"Connecting to Redis\")\n            client = redis.Redis(\n                host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD\n            )\n        except Exception as e:\n            logger.error(f\"Error setting up Redis: {e}\")\n            raise e\n\n        await _check_redis_module_exist(client, modules=REDIS_REQUIRED_MODULES)\n\n        dim = kwargs.get(\"dim\", VECTOR_DIMENSION)\n        redisearch_schema = {\n            \"metadata\": {\n                \"document_id\": TagField(\n                    \"$.metadata.document_id\", as_name=\"document_id\"\n                ),\n                \"source_id\": TagField(\"$.metadata.source_id\", as_name=\"source_id\"),\n                \"source\": TagField(\"$.metadata.source\", 
as_name=\"source\"),\n                \"author\": TextField(\"$.metadata.author\", as_name=\"author\"),\n                \"created_at\": NumericField(\n                    \"$.metadata.created_at\", as_name=\"created_at\"\n                ),\n            },\n            \"embedding\": VectorField(\n                \"$.embedding\",\n                REDIS_INDEX_TYPE,\n                {\n                    \"TYPE\": \"FLOAT64\",\n                    \"DIM\": dim,\n                    \"DISTANCE_METRIC\": REDIS_DISTANCE_METRIC,\n                },\n                as_name=\"embedding\",\n            ),\n        }\n        try:\n            # Check for existence of RediSearch Index\n            await client.ft(REDIS_INDEX_NAME).info()\n            logger.info(f\"RediSearch index {REDIS_INDEX_NAME} already exists\")\n        except:\n            # Create the RediSearch Index\n            logger.info(f\"Creating new RediSearch index {REDIS_INDEX_NAME}\")\n            definition = IndexDefinition(\n                prefix=[REDIS_DOC_PREFIX], index_type=IndexType.JSON\n            )\n            fields = list(unpack_schema(redisearch_schema))\n            logger.info(f\"Creating index with fields: {fields}\")\n            await client.ft(REDIS_INDEX_NAME).create_index(\n                fields=fields, definition=definition\n            )\n        return cls(client, redisearch_schema)\n\n    @staticmethod\n    def _redis_key(document_id: str, chunk_id: str) -> str:\n        \"\"\"\n        Create the JSON key for document chunks in Redis.\n\n        Args:\n            document_id (str): Document Identifier\n            chunk_id (str): Chunk Identifier\n\n        Returns:\n            str: JSON key string.\n        \"\"\"\n        return f\"doc:{document_id}:chunk:{chunk_id}\"\n\n    @staticmethod\n    def _escape(value: str) -> str:\n        \"\"\"\n        Escape filter value.\n\n        Args:\n            value (str): Value to escape.\n\n        Returns:\n            str: 
Escaped filter value for RediSearch.\n        \"\"\"\n\n        def escape_symbol(match) -> str:\n            value = match.group(0)\n            return f\"\\\\{value}\"\n\n        return REDIS_DEFAULT_ESCAPED_CHARS.sub(escape_symbol, value)\n\n    def _get_redis_chunk(self, chunk: DocumentChunk) -> dict:\n        \"\"\"\n        Convert DocumentChunk into a JSON object for storage\n        in Redis.\n\n        Args:\n            chunk (DocumentChunk): Chunk of a Document.\n\n        Returns:\n            dict: JSON object for storage in Redis.\n        \"\"\"\n        # Convert chunk -> dict\n        data = chunk.__dict__\n        metadata = chunk.metadata.__dict__\n        data[\"chunk_id\"] = data.pop(\"id\")\n\n        # Prep Redis Metadata\n        redis_metadata = dict(self._default_metadata)\n        if metadata:\n            for field, value in metadata.items():\n                if value:\n                    if field == \"created_at\":\n                        redis_metadata[field] = to_unix_timestamp(value)  # type: ignore\n                    else:\n                        redis_metadata[field] = value\n        data[\"metadata\"] = redis_metadata\n        return data\n\n    def _get_redis_query(self, query: QueryWithEmbedding) -> RediSearchQuery:\n        \"\"\"\n        Convert a QueryWithEmbedding into a RediSearchQuery.\n\n        Args:\n            query (QueryWithEmbedding): Search query.\n\n        Returns:\n            RediSearchQuery: Query for RediSearch.\n        \"\"\"\n        filter_str: str = \"\"\n\n        # RediSearch field type to query string\n        def _typ_to_str(typ, field, value) -> str:  # type: ignore\n            if isinstance(typ, TagField):\n                return f\"@{field}:{{{self._escape(value)}}} \"\n            elif isinstance(typ, TextField):\n                return f\"@{field}:{value} \"\n            elif isinstance(typ, NumericField):\n                num = to_unix_timestamp(value)\n                match field:\n    
                case \"start_date\":\n                        return f\"@{field}:[{num} +inf] \"\n                    case \"end_date\":\n                        return f\"@{field}:[-inf {num}] \"\n\n        # Build filter\n        if query.filter:\n            redisearch_schema = self._schema\n            for field, value in query.filter.__dict__.items():\n                if not value:\n                    continue\n                if field in redisearch_schema:\n                    filter_str += _typ_to_str(redisearch_schema[field], field, value)\n                elif field in redisearch_schema[\"metadata\"]:\n                    if field == \"source\":  # handle the enum\n                        value = value.value\n                    filter_str += _typ_to_str(\n                        redisearch_schema[\"metadata\"][field], field, value\n                    )\n                elif field in [\"start_date\", \"end_date\"]:\n                    filter_str += _typ_to_str(\n                        redisearch_schema[\"metadata\"][\"created_at\"], field, value\n                    )\n\n        # Postprocess filter string\n        filter_str = filter_str.strip()\n        filter_str = filter_str if filter_str else \"*\"\n\n        # Prepare query string\n        query_str = (\n            f\"({filter_str})=>[KNN {query.top_k} @embedding $embedding as score]\"\n        )\n        return (\n            RediSearchQuery(query_str)\n            .sort_by(\"score\")\n            .paging(0, query.top_k)\n            .dialect(2)\n        )\n\n    async def _redis_delete(self, keys: List[str]):\n        \"\"\"\n        Delete a list of keys from Redis.\n\n        Args:\n            keys (List[str]): List of keys to delete.\n        \"\"\"\n        # Delete the keys\n        await asyncio.gather(*[self.client.delete(key) for key in keys])\n\n    #######\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of 
list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        # Initialize a list of ids to return\n        doc_ids: List[str] = []\n\n        # Loop through the dict items\n        for doc_id, chunk_list in chunks.items():\n            # Append the id to the ids list\n            doc_ids.append(doc_id)\n\n            # Write chunks in a pipelines\n            async with self.client.pipeline(transaction=False) as pipe:\n                for chunk in chunk_list:\n                    key = self._redis_key(doc_id, chunk.id)\n                    data = self._get_redis_chunk(chunk)\n                    await pipe.json().set(key, \"$\", data)\n                await pipe.execute()\n\n        return doc_ids\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and\n        returns a list of query results with matching document chunks and scores.\n        \"\"\"\n        # Prepare query responses and results object\n        results: List[QueryResult] = []\n\n        # Gather query results in a pipeline\n        logger.info(f\"Gathering {len(queries)} query results\")\n        for query in queries:\n            logger.debug(f\"Query: {query.query}\")\n            query_results: List[DocumentChunkWithScore] = []\n\n            # Extract Redis query\n            redis_query: RediSearchQuery = self._get_redis_query(query)\n            embedding = np.array(query.embedding, dtype=np.float64).tobytes()\n\n            # Perform vector search\n            query_response = await self.client.ft(REDIS_INDEX_NAME).search(\n                redis_query, {\"embedding\": embedding}\n            )\n\n            # Iterate through the most similar documents\n            for doc in query_response.docs:\n                # Load JSON data\n                doc_json = json.loads(doc.json)\n            
    # Create document chunk object with score\n                result = DocumentChunkWithScore(\n                    id=doc_json[\"metadata\"][\"document_id\"],\n                    score=doc.score,\n                    text=doc_json[\"text\"],\n                    metadata=doc_json[\"metadata\"],\n                )\n                query_results.append(result)\n\n            # Add to overall results\n            results.append(QueryResult(query=query.query, results=query_results))\n\n        return results\n\n    async def _find_keys(self, pattern: str) -> List[str]:\n        return [key async for key in self.client.scan_iter(pattern)]\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n        # Delete all vectors from the index if delete_all is True\n        if delete_all:\n            try:\n                logger.info(f\"Deleting all documents from index\")\n                await self.client.ft(REDIS_INDEX_NAME).dropindex(True)\n                logger.info(f\"Deleted all documents successfully\")\n                return True\n            except Exception as e:\n                logger.error(f\"Error deleting all documents: {e}\")\n                raise e\n\n        # Delete by filter\n        if filter:\n            # TODO - extend this to work with other metadata filters?\n            if filter.document_id:\n                try:\n                    keys = await self._find_keys(\n                        f\"{REDIS_DOC_PREFIX}:{filter.document_id}:*\"\n                    )\n                    await self._redis_delete(keys)\n                    logger.info(f\"Deleted document {filter.document_id} successfully\")\n                except Exception as e:\n       
             logger.error(f\"Error deleting document {filter.document_id}: {e}\")\n                    raise e\n\n        # Delete by explicit ids (Redis keys)\n        if ids:\n            try:\n                logger.info(f\"Deleting document ids {ids}\")\n                keys = []\n                # find all keys associated with the document ids\n                for document_id in ids:\n                    doc_keys = await self._find_keys(\n                        pattern=f\"{REDIS_DOC_PREFIX}:{document_id}:*\"\n                    )\n                    keys.extend(doc_keys)\n                # delete all keys\n                logger.info(f\"Deleting {len(keys)} keys from Redis\")\n                await self._redis_delete(keys)\n            except Exception as e:\n                logger.error(f\"Error deleting ids: {e}\")\n                raise e\n\n        return True\n"
  },
  {
    "path": "datastore/providers/supabase_datastore.py",
    "content": "import os\nfrom typing import Any, List\nfrom datetime import datetime\n\nfrom supabase import Client\n\nfrom datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore\nfrom models.models import (\n    DocumentMetadataFilter,\n)\n\nSUPABASE_URL = os.environ.get(\"SUPABASE_URL\")\nassert SUPABASE_URL is not None, \"SUPABASE_URL is not set\"\nSUPABASE_ANON_KEY = os.environ.get(\"SUPABASE_ANON_KEY\")\n# use service role key if you want this app to be able to bypass your Row Level Security policies\nSUPABASE_SERVICE_ROLE_KEY = os.environ.get(\"SUPABASE_SERVICE_ROLE_KEY\")\nassert (\n    SUPABASE_ANON_KEY is not None or SUPABASE_SERVICE_ROLE_KEY is not None\n), \"SUPABASE_ANON_KEY or SUPABASE_SERVICE_ROLE_KEY must be set\"\n\n\n# class that implements the DataStore interface for Supabase Datastore provider\nclass SupabaseDataStore(PgVectorDataStore):\n    def create_db_client(self):\n        return SupabaseClient()\n\n\nclass SupabaseClient(PGClient):\n    def __init__(self) -> None:\n        super().__init__()\n        if not SUPABASE_SERVICE_ROLE_KEY:\n            self.client = Client(SUPABASE_URL, SUPABASE_ANON_KEY)\n        else:\n            self.client = Client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)\n\n    async def upsert(self, table: str, json: dict[str, Any]):\n        \"\"\"\n        Takes in a list of documents and inserts them into the table.\n        \"\"\"\n        if \"created_at\" in json:\n            json[\"created_at\"] = json[\"created_at\"][0].isoformat()\n\n        self.client.table(table).upsert(json).execute()\n\n    async def rpc(self, function_name: str, params: dict[str, Any]):\n        \"\"\"\n        Calls a stored procedure in the database with the given parameters.\n        \"\"\"\n        if \"in_start_date\" in params:\n            params[\"in_start_date\"] = params[\"in_start_date\"].isoformat()\n        if \"in_end_date\" in params:\n            params[\"in_end_date\"] = 
params[\"in_end_date\"].isoformat()\n\n        response = self.client.rpc(function_name, params=params).execute()\n        return response.data\n\n    async def delete_like(self, table: str, column: str, pattern: str):\n        \"\"\"\n        Deletes rows in the table that match the pattern.\n        \"\"\"\n        self.client.table(table).delete().like(column, pattern).execute()\n\n    async def delete_in(self, table: str, column: str, ids: List[str]):\n        \"\"\"\n        Deletes rows in the table that match the ids.\n        \"\"\"\n        self.client.table(table).delete().in_(column, ids).execute()\n\n    async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter):\n        \"\"\"\n        Deletes rows in the table that match the filter.\n        \"\"\"\n        builder = self.client.table(table).delete()\n        if filter.document_id:\n            builder = builder.eq(\n                \"document_id\",\n                filter.document_id,\n            )\n        if filter.source:\n            builder = builder.eq(\"source\", filter.source)\n        if filter.source_id:\n            builder = builder.eq(\"source_id\", filter.source_id)\n        if filter.author:\n            builder = builder.eq(\"author\", filter.author)\n        if filter.start_date:\n            builder = builder.gte(\n                \"created_at\",\n                filter.start_date[0].isoformat(),\n            )\n        if filter.end_date:\n            builder = builder.lte(\n                \"created_at\",\n                filter.end_date[0].isoformat(),\n            )\n        builder.execute()\n"
  },
  {
    "path": "datastore/providers/weaviate_datastore.py",
    "content": "import asyncio\nimport os\nimport re\nimport uuid\nfrom typing import Dict, List, Optional\n\nimport weaviate\nfrom loguru import logger\nfrom weaviate import Client\nfrom weaviate.util import generate_uuid5\n\nfrom datastore.datastore import DataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentChunkWithScore,\n    DocumentMetadataFilter,\n    QueryResult,\n    QueryWithEmbedding,\n    Source,\n)\n\nWEAVIATE_URL_DEFAULT = \"http://localhost:8080\"\nWEAVIATE_CLASS = os.environ.get(\"WEAVIATE_CLASS\", \"OpenAIDocument\")\n\nWEAVIATE_BATCH_SIZE = int(os.environ.get(\"WEAVIATE_BATCH_SIZE\", 20))\nWEAVIATE_BATCH_DYNAMIC = os.environ.get(\"WEAVIATE_BATCH_DYNAMIC\", False)\nWEAVIATE_BATCH_TIMEOUT_RETRIES = int(os.environ.get(\"WEAVIATE_TIMEOUT_RETRIES\", 3))\nWEAVIATE_BATCH_NUM_WORKERS = int(os.environ.get(\"WEAVIATE_BATCH_NUM_WORKERS\", 1))\n\nSCHEMA = {\n    \"class\": WEAVIATE_CLASS,\n    \"description\": \"The main class\",\n    \"properties\": [\n        {\n            \"name\": \"chunk_id\",\n            \"dataType\": [\"string\"],\n            \"description\": \"The chunk id\",\n        },\n        {\n            \"name\": \"document_id\",\n            \"dataType\": [\"string\"],\n            \"description\": \"The document id\",\n        },\n        {\n            \"name\": \"text\",\n            \"dataType\": [\"text\"],\n            \"description\": \"The chunk's text\",\n        },\n        {\n            \"name\": \"source\",\n            \"dataType\": [\"string\"],\n            \"description\": \"The source of the data\",\n        },\n        {\n            \"name\": \"source_id\",\n            \"dataType\": [\"string\"],\n            \"description\": \"The source id\",\n        },\n        {\n            \"name\": \"url\",\n            \"dataType\": [\"string\"],\n            \"description\": \"The source url\",\n        },\n        {\n            \"name\": \"created_at\",\n            
\"dataType\": [\"date\"],\n            \"description\": \"Creation date of document\",\n        },\n        {\n            \"name\": \"author\",\n            \"dataType\": [\"string\"],\n            \"description\": \"Document author\",\n        },\n    ],\n}\n\n\ndef extract_schema_properties(schema):\n    properties = schema[\"properties\"]\n\n    return {property[\"name\"] for property in properties}\n\n\nclass WeaviateDataStore(DataStore):\n    def handle_errors(self, results: Optional[List[dict]]) -> List[str]:\n        if not self or not results:\n            return []\n\n        error_messages = []\n        for result in results:\n            if (\n                \"result\" not in result\n                or \"errors\" not in result[\"result\"]\n                or \"error\" not in result[\"result\"][\"errors\"]\n            ):\n                continue\n            for message in result[\"result\"][\"errors\"][\"error\"]:\n                error_messages.append(message[\"message\"])\n                logger.error(message[\"message\"])\n\n        return error_messages\n\n    def __init__(self):\n        auth_credentials = self._build_auth_credentials()\n\n        url = os.environ.get(\"WEAVIATE_URL\", WEAVIATE_URL_DEFAULT)\n\n        logger.debug(\n            f\"Connecting to weaviate instance at {url} with credential type {type(auth_credentials).__name__}\"\n        )\n        self.client = Client(url, auth_client_secret=auth_credentials)\n        self.client.batch.configure(\n            batch_size=WEAVIATE_BATCH_SIZE,\n            dynamic=WEAVIATE_BATCH_DYNAMIC,  # type: ignore\n            callback=self.handle_errors,  # type: ignore\n            timeout_retries=WEAVIATE_BATCH_TIMEOUT_RETRIES,\n            num_workers=WEAVIATE_BATCH_NUM_WORKERS,\n        )\n\n        if self.client.schema.contains(SCHEMA):\n            current_schema = self.client.schema.get(WEAVIATE_CLASS)\n            current_schema_properties = 
extract_schema_properties(current_schema)\n\n            logger.debug(\n                f\"Found index {WEAVIATE_CLASS} with properties {current_schema_properties}\"\n            )\n            logger.debug(\"Will reuse this schema\")\n        else:\n            new_schema_properties = extract_schema_properties(SCHEMA)\n            logger.debug(\n                f\"Creating collection {WEAVIATE_CLASS} with properties {new_schema_properties}\"\n            )\n            self.client.schema.create_class(SCHEMA)\n\n    @staticmethod\n    def _build_auth_credentials():\n        url = os.environ.get(\"WEAVIATE_URL\", WEAVIATE_URL_DEFAULT)\n\n        if WeaviateDataStore._is_wcs_domain(url):\n            api_key = os.environ.get(\"WEAVIATE_API_KEY\")\n            if api_key is not None:\n                return weaviate.auth.AuthApiKey(api_key=api_key)\n            else:\n                raise ValueError(\"WEAVIATE_API_KEY environment variable is not set\")\n        else:\n            return None\n\n    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:\n        \"\"\"\n        Takes in a list of list of document chunks and inserts them into the database.\n        Return a list of document ids.\n        \"\"\"\n        doc_ids = []\n\n        with self.client.batch as batch:\n            for doc_id, doc_chunks in chunks.items():\n                logger.debug(f\"Upserting {doc_id} with {len(doc_chunks)} chunks\")\n                for doc_chunk in doc_chunks:\n                    # we generate a uuid regardless of the format of the document_id because\n                    # weaviate needs a uuid to store each document chunk and\n                    # a document chunk cannot share the same uuid\n                    doc_uuid = generate_uuid5(doc_chunk, WEAVIATE_CLASS)\n                    metadata = doc_chunk.metadata\n                    doc_chunk_dict = doc_chunk.dict()\n                    doc_chunk_dict.pop(\"metadata\")\n                    for 
key, value in metadata.dict().items():\n                        doc_chunk_dict[key] = value\n                    doc_chunk_dict[\"chunk_id\"] = doc_chunk_dict.pop(\"id\")\n                    doc_chunk_dict[\"source\"] = (\n                        doc_chunk_dict.pop(\"source\").value\n                        if doc_chunk_dict[\"source\"]\n                        else None\n                    )\n                    embedding = doc_chunk_dict.pop(\"embedding\")\n\n                    batch.add_data_object(\n                        uuid=doc_uuid,\n                        data_object=doc_chunk_dict,\n                        class_name=WEAVIATE_CLASS,\n                        vector=embedding,\n                    )\n\n                doc_ids.append(doc_id)\n            batch.flush()\n        return doc_ids\n\n    async def _query(\n        self,\n        queries: List[QueryWithEmbedding],\n    ) -> List[QueryResult]:\n        \"\"\"\n        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.\n        \"\"\"\n\n        async def _single_query(query: QueryWithEmbedding) -> QueryResult:\n            logger.debug(f\"Query: {query.query}\")\n            if not hasattr(query, \"filter\") or not query.filter:\n                result = (\n                    self.client.query.get(\n                        WEAVIATE_CLASS,\n                        [\n                            \"chunk_id\",\n                            \"document_id\",\n                            \"text\",\n                            \"source\",\n                            \"source_id\",\n                            \"url\",\n                            \"created_at\",\n                            \"author\",\n                        ],\n                    )\n                    .with_hybrid(query=query.query, alpha=0.5, vector=query.embedding)\n                    .with_limit(query.top_k)  # type: ignore\n                    
.with_additional([\"score\", \"vector\"])\n                    .do()\n                )\n            else:\n                filters_ = self.build_filters(query.filter)\n                result = (\n                    self.client.query.get(\n                        WEAVIATE_CLASS,\n                        [\n                            \"chunk_id\",\n                            \"document_id\",\n                            \"text\",\n                            \"source\",\n                            \"source_id\",\n                            \"url\",\n                            \"created_at\",\n                            \"author\",\n                        ],\n                    )\n                    .with_hybrid(query=query.query, alpha=0.5, vector=query.embedding)\n                    .with_where(filters_)\n                    .with_limit(query.top_k)  # type: ignore\n                    .with_additional([\"score\", \"vector\"])\n                    .do()\n                )\n\n            query_results: List[DocumentChunkWithScore] = []\n            response = result[\"data\"][\"Get\"][WEAVIATE_CLASS]\n\n            for resp in response:\n                result = DocumentChunkWithScore(\n                    id=resp[\"chunk_id\"],\n                    text=resp[\"text\"],\n                    # embedding=resp[\"_additional\"][\"vector\"],\n                    score=resp[\"_additional\"][\"score\"],\n                    metadata=DocumentChunkMetadata(\n                        document_id=resp[\"document_id\"] if resp[\"document_id\"] else \"\",\n                        source=Source(resp[\"source\"]) if resp[\"source\"] else None,\n                        source_id=resp[\"source_id\"],\n                        url=resp[\"url\"],\n                        created_at=resp[\"created_at\"],\n                        author=resp[\"author\"],\n                    ),\n                )\n                query_results.append(result)\n            return 
QueryResult(query=query.query, results=query_results)\n\n        return await asyncio.gather(*[_single_query(query) for query in queries])\n\n    async def delete(\n        self,\n        ids: Optional[List[str]] = None,\n        filter: Optional[DocumentMetadataFilter] = None,\n        delete_all: Optional[bool] = None,\n    ) -> bool:\n        # TODO\n        \"\"\"\n        Removes vectors by ids, filter, or everything in the datastore.\n        Returns whether the operation was successful.\n        \"\"\"\n        if delete_all:\n            logger.debug(f\"Deleting all vectors in index {WEAVIATE_CLASS}\")\n            self.client.schema.delete_all()\n            return True\n\n        if ids:\n            operands = [\n                {\"path\": [\"document_id\"], \"operator\": \"Equal\", \"valueString\": id}\n                for id in ids\n            ]\n\n            where_clause = {\"operator\": \"Or\", \"operands\": operands}\n\n            logger.debug(f\"Deleting vectors from index {WEAVIATE_CLASS} with ids {ids}\")\n            result = self.client.batch.delete_objects(\n                class_name=WEAVIATE_CLASS, where=where_clause, output=\"verbose\"\n            )\n\n            if not bool(result[\"results\"][\"successful\"]):\n                logger.debug(\n                    f\"Failed to delete the following objects: {result['results']['objects']}\"\n                )\n\n        if filter:\n            where_clause = self.build_filters(filter)\n\n            logger.debug(\n                f\"Deleting vectors from index {WEAVIATE_CLASS} with filter {where_clause}\"\n            )\n            result = self.client.batch.delete_objects(\n                class_name=WEAVIATE_CLASS, where=where_clause\n            )\n\n            if not bool(result[\"results\"][\"successful\"]):\n                logger.debug(\n                    f\"Failed to delete the following objects: {result['results']['objects']}\"\n                )\n\n        return True\n\n    
@staticmethod\n    def build_filters(filter):\n        if filter.source:\n            filter.source = filter.source.value\n\n        operands = []\n        filter_conditions = {\n            \"source\": {\n                \"operator\": \"Equal\",\n                \"value\": \"query.filter.source.value\",\n                \"value_key\": \"valueString\",\n            },\n            \"start_date\": {\"operator\": \"GreaterThanEqual\", \"value_key\": \"valueDate\"},\n            \"end_date\": {\"operator\": \"LessThanEqual\", \"value_key\": \"valueDate\"},\n            \"default\": {\"operator\": \"Equal\", \"value_key\": \"valueString\"},\n        }\n\n        for attr, value in filter.__dict__.items():\n            if value is not None:\n                filter_condition = filter_conditions.get(\n                    attr, filter_conditions[\"default\"]\n                )\n                value_key = filter_condition[\"value_key\"]\n\n                operand = {\n                    \"path\": [\n                        attr\n                        if not (attr == \"start_date\" or attr == \"end_date\")\n                        else \"created_at\"\n                    ],\n                    \"operator\": filter_condition[\"operator\"],\n                    value_key: value,\n                }\n\n                operands.append(operand)\n\n        return {\"operator\": \"And\", \"operands\": operands}\n\n    @staticmethod\n    def _is_valid_weaviate_id(candidate_id: str) -> bool:\n        \"\"\"\n        Check if candidate_id is a valid UUID for weaviate's use\n\n        Weaviate supports UUIDs of version 3, 4 and 5. 
This function checks if the candidate_id is a valid UUID of one of these versions.\n        See https://weaviate.io/developers/weaviate/more-resources/faq#q-are-there-restrictions-on-uuid-formatting-do-i-have-to-adhere-to-any-standards\n        for more information.\n        \"\"\"\n        acceptable_version = [3, 4, 5]\n\n        try:\n            result = uuid.UUID(candidate_id)\n            if result.version not in acceptable_version:\n                return False\n            else:\n                return True\n        except ValueError:\n            return False\n\n    @staticmethod\n    def _is_wcs_domain(url: str) -> bool:\n        \"\"\"\n        Check if the given URL ends with \".weaviate.network\" or \".weaviate.network/\".\n\n        Args:\n            url (str): The URL to check.\n\n        Returns:\n            bool: True if the URL ends with the specified strings, False otherwise.\n        \"\"\"\n        pattern = r\"\\.(weaviate\\.cloud|weaviate\\.network)(/)?$\"\n        return bool(re.search(pattern, url))\n"
  },
  {
    "path": "datastore/providers/zilliz_datastore.py",
    "content": "import os\n\nfrom loguru import logger\nfrom typing import Optional\nfrom pymilvus import (\n    connections,\n)\nfrom uuid import uuid4\n\nfrom datastore.providers.milvus_datastore import (\n    MilvusDataStore,\n)\n\n\nZILLIZ_COLLECTION = os.environ.get(\"ZILLIZ_COLLECTION\") or \"c\" + uuid4().hex\nZILLIZ_URI = os.environ.get(\"ZILLIZ_URI\")\nZILLIZ_USER = os.environ.get(\"ZILLIZ_USER\")\nZILLIZ_PASSWORD = os.environ.get(\"ZILLIZ_PASSWORD\")\nZILLIZ_USE_SECURITY = False if ZILLIZ_PASSWORD is None else True\n\nZILLIZ_CONSISTENCY_LEVEL = os.environ.get(\"ZILLIZ_CONSISTENCY_LEVEL\")\n\n\nclass ZillizDataStore(MilvusDataStore):\n    def __init__(self, create_new: Optional[bool] = False):\n        \"\"\"Create a Zilliz DataStore.\n\n        The Zilliz Datastore allows for storing your indexes and metadata within a Zilliz Cloud instance.\n\n        Args:\n            create_new (Optional[bool], optional): Whether to overwrite if collection already exists. Defaults to True.\n        \"\"\"\n        # Overwrite the default consistency level by MILVUS_CONSISTENCY_LEVEL\n        self._consistency_level = ZILLIZ_CONSISTENCY_LEVEL or \"Bounded\"\n        self._create_connection()\n\n        self._create_collection(ZILLIZ_COLLECTION, create_new)  # type: ignore\n        self._create_index()\n\n    def _create_connection(self):\n        # Check if the connection already exists\n        try:\n            i = [\n                connections.get_connection_addr(x[0])\n                for x in connections.list_connections()\n            ].index({\"address\": ZILLIZ_URI, \"user\": ZILLIZ_USER})\n            self.alias = connections.list_connections()[i][0]\n        except ValueError:\n            # Connect to the Zilliz instance using the passed in Environment variables\n            self.alias = uuid4().hex\n            connections.connect(alias=self.alias, uri=ZILLIZ_URI, user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, secure=ZILLIZ_USE_SECURITY)  # type: ignore\n      
      logger.info(\"Connect to zilliz cloud server\")\n\n    def _create_index(self):\n        try:\n            # If no index on the collection, create one\n            if len(self.col.indexes) == 0:\n                self.index_params = {\n                    \"metric_type\": \"IP\",\n                    \"index_type\": \"AUTOINDEX\",\n                    \"params\": {},\n                }\n                self.col.create_index(\"embedding\", index_params=self.index_params)\n\n            self.col.load()\n            self.search_params = {\"metric_type\": \"IP\", \"params\": {}}\n        except Exception as e:\n            logger.error(\"Failed to create index, error: {}\".format(e))\n"
  },
  {
    "path": "docs/deployment/flyio.md",
    "content": "# Deploying to Fly.io\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider.\n\nFind the packages you can remove for each vector database provider [here](removing-unused-dependencies.md).\n\nAfter removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.\n\n## Deployment\n\nTo deploy the Docker container from this repository to Fly.io, follow\nthese steps:\n\n[Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed.\n\nInstall the [Fly.io CLI](https://fly.io/docs/getting-started/installing-flyctl/) on your local machine.\n\nClone the repository from GitHub:\n\n```\ngit clone https://github.com/openai/chatgpt-retrieval-plugin.git\n```\n\nNavigate to the cloned repository directory:\n\n```\ncd path/to/chatgpt-retrieval-plugin\n```\n\nLog in to the Fly.io CLI:\n\n```\nflyctl auth login\n```\n\nCreate and launch your Fly.io app:\n\n```\nflyctl launch\n```\n\nFollow the instructions in your terminal:\n\n- Choose your app name\n- Choose your app region\n- Don't add any databases\n- Don't deploy yet (if you do, the first deploy might fail as the environment variables are not yet set)\n\nSet the required environment variables:\n\n```\nflyctl secrets set DATASTORE=your_datastore \\\nOPENAI_API_KEY=your_openai_api_key \\\nBEARER_TOKEN=your_bearer_token \\\n<Add the environment variables for your chosen vector DB here>\n```\n\nAlternatively, you could set environment variables in the [Fly.io 
Console](https://fly.io/dashboard).\n\nAt this point, you can change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml) to the url for your Fly.io app, which will be `https://your-app-name.fly.dev`.\n\nDeploy your app with:\n\n```\nflyctl deploy\n```\n\nAfter completing these steps, your Docker container should be deployed to Fly.io and running with the necessary environment variables set. You can view your app by running:\n\n```\nflyctl open\n```\n\nwhich will open your app url. You should be able to find the OpenAPI schema at `<your_app_url>/.well-known/openapi.yaml` and the manifest at `<your_app_url>/.well-known/ai-plugin.json`.\n\nTo view your app logs:\n\n```\nflyctl logs\n```\n\nNow, make sure you have changed the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `flyctl deploy`. This url will be `https://<your-app-name>.fly.dev`.\n\n**Debugging tips:**\nFly.io uses port 8080 by default.\n\nIf your app fails to deploy, check if the environment variables are set correctly, and then check if your port is configured correctly. You could also try using the [`-e` flag](https://fly.io/docs/flyctl/launch/) with the `flyctl launch` command to set the environment variables at launch.\n"
  },
  {
    "path": "docs/deployment/heroku.md",
    "content": "# Deploying to Heroku\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider.\n\nFind the packages you can remove for each vector database provider [here](removing-unused-dependencies.md).\n\nAfter removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.\n\n## Deployment\n\nTo deploy the Docker container from this repository to Heroku and set the required environment variables, follow these steps:\n\n[Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed.\n\nInstall the [Heroku CLI](https://devcenter.heroku.com/articles/heroku-cli) on your local machine.\n\nClone the repository from GitHub:\n\n```\ngit clone https://github.com/openai/chatgpt-retrieval-plugin.git\n```\n\nNavigate to the cloned repository directory:\n\n```\ncd path/to/chatgpt-retrieval-plugin\n```\n\nLog in to the Heroku CLI:\n\n```\nheroku login\n```\n\nCreate a Heroku app:\n\n```\nheroku create [app-name]\n```\n\nLog in to the Heroku Container Registry:\n\n```\nheroku container:login\n```\n\nAlternatively, you can use a command from the Makefile to log in to the Heroku Container Registry by running:\n\n```\nmake heroku-login\n```\n\nBuild the Docker image using the Dockerfile:\n\n```\ndocker buildx build --platform linux/amd64 -t [image-name] .\n```\n\n(Replace `[image-name]` with the name you want to give your Docker image)\n\nPush the Docker image to the Heroku Container Registry, and release the newly pushed image 
to your Heroku app.\n\n```\ndocker tag [image-name] registry.heroku.com/[app-name]/web\ndocker push registry.heroku.com/[app-name]/web\nheroku container:release web -a [app-name]\n```\n\nAlternatively, you can use a command from the Makefile to push the Docker image to the Heroku Container Registry by running:\n\n```\nmake heroku-push\n```\n\n**Note:** You will need to edit the Makefile and replace `<your app name>` with your actual app name.\n\nSet the required environment variables for your Heroku app:\n\n```\nheroku config:set DATASTORE=your_datastore \\\nOPENAI_API_KEY=your_openai_api_key \\\nBEARER_TOKEN=your_bearer_token \\\n<Add the environment variables for your chosen vector DB here> \\\n-a [app-name]\n```\n\nYou could also set environment variables in the [Heroku Console](https://dashboard.heroku.com/apps).\n\nAfter completing these steps, your Docker container should be deployed to Heroku and running with the necessary environment variables set. You can view your app by running:\n\n```\nheroku open -a [app-name]\n```\n\nwhich will open your app url. You should be able to find the OpenAPI schema at `<your_app_url>/.well-known/openapi.yaml` and the manifest at `<your_app_url>/.well-known/ai-plugin.json`.\n\nTo view your app logs:\n\n```\nheroku logs --tail -a [app-name]\n```\n\nNow make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `make heroku-push`. This url will be `https://your-app-name.herokuapp.com`.\n"
  },
  {
    "path": "docs/deployment/other-options.md",
    "content": "# Other Deployment Options\n\nSome possible other options for deploying the app are:\n\n- **Azure Container Apps**: This is a cloud platform that allows you to deploy and manage web apps using Docker containers. You can use the Azure CLI or the Azure Portal to create and configure your app service, and then push your Docker image to a container registry and deploy it to your app service. You can also set environment variables and scale your app using the Azure Portal. Learn more [here](https://learn.microsoft.com/en-us/azure/container-apps/get-started-existing-container-image-portal?pivots=container-apps-private-registry).\n- **Google Cloud Run**: This is a serverless platform that allows you to run stateless web apps using Docker containers. You can use the Google Cloud Console or the gcloud command-line tool to create and deploy your Cloud Run service, and then push your Docker image to the Google Container Registry and deploy it to your service. You can also set environment variables and scale your app using the Google Cloud Console. Learn more [here](https://cloud.google.com/run/docs/quickstarts/build-and-deploy).\n- **AWS Elastic Container Service**: This is a cloud platform that allows you to run and manage web apps using Docker containers. You can use the AWS CLI or the AWS Management Console to create and configure your ECS cluster, and then push your Docker image to the Amazon Elastic Container Registry and deploy it to your cluster. You can also set environment variables and scale your app using the AWS Management Console. 
Learn more [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/docker-basics.html).\n\nAfter you create your app, make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy.\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider.\n\nFind the packages you can remove for each vector database provider [here](removing-unused-dependencies.md).\n\nAfter removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.\n"
  },
  {
    "path": "docs/deployment/removing-unused-dependencies.md",
    "content": "# Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider.\n\nHere are the packages you can remove for each vector database provider:\n\n- **Pinecone:** Remove `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity`, `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Weaviate:** Remove `pinecone-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, `psycopg2cffi`.\n- **Zilliz:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Milvus:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Qdrant:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Redis:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **LlamaIndex:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Chroma:**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, 
`qdrant-client`, `llama-index`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Azure Cognitive Search**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `llama-index`, `redis` and `chromadb`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Supabase:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `psycopg2`+`pgvector`, and `psycopg2cffi`.\n- **Postgres:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2cffi`.\n- **AnalyticDB:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2`+`pgvector`.\n\nAfter removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.\n"
  },
  {
    "path": "docs/deployment/render.md",
    "content": "# Deploying to Render\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider.\n\nFind the packages you can remove for each vector database provider [here](removing-unused-dependencies.md).\n\nAfter removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command.\n\n## Deployment\n\nRender maintains a [fork](https://github.com/render-examples/chatgpt-retrieval-plugin/) of this repository with a few small changes that facilitate easy deployment. The source code is unchanged. To deploy both the Docker container from this repository and a self-hosted Weaviate database to back it, just click the button below. Enter your OpenAI API key when prompted.\n\n[<img src=\"https://render.com/images/deploy-to-render-button.svg\" alt=\"Deploy to Render\" />](https://render.com/deploy?repo=https://github.com/render-examples/chatgpt-retrieval-plugin/tree/main)\n\nThe bearer token will be randomly generated for you. You can view it in the \"Environment\" tab on the [Render dashboard](https://dashboard.render.com) page for your server. For more guidance, consult the [README in Render's fork](https://github.com/render-examples/chatgpt-retrieval-plugin/blob/main/README.md), [Render's documentation](https://render.com/docs), or the screen recording linked below.\n\n[![Deploy to Render screen recording](render-thumbnail.png)](https://vimeo.com/823610578)\n"
  },
  {
    "path": "docs/deprecated/plugins.md",
    "content": "## Plugins (deprecated)\n\nPlugins are chat extensions designed specifically for language models like ChatGPT, enabling them to access up-to-date information, run computations, or interact with third-party services in response to a user's request. They unlock a wide range of potential use cases and enhance the capabilities of language models.\n\nDevelopers can create a plugin by exposing an API through their website and providing a standardized manifest file that describes the API. ChatGPT consumes these files and allows the AI models to make calls to the API defined by the developer.\n\nA plugin consists of:\n\n- An API\n- An API schema (OpenAPI JSON or YAML format)\n- A manifest (JSON file) that defines relevant metadata for the plugin\n\nThe Retrieval Plugin already contains all of these components. Read the Chat Plugins blogpost [here](https://openai.com/blog/chatgpt-plugins), and find the docs [here](https://platform.openai.com/docs/plugins/introduction).\n\nTo access the plugins model, navigate [here](https://chat.openai.com/?model=gpt-4-plugins).\n\n### Testing a Localhost Plugin in ChatGPT\n\nTo test a localhost plugin in ChatGPT, use the provided [`local_server/main.py`](/local_server/main.py) file, which is specifically configured for localhost testing with CORS settings, no authentication and routes for the manifest, OpenAPI schema and logo.\n\nFollow these steps to test your localhost plugin:\n\n1. Run the localhost server using the `poetry run dev` command. This starts the server at the default address (e.g. `localhost:3333`).\n\n2. Visit [ChatGPT](https://chat.openai.com/), select \"Plugins\" from the model picker, click on the plugins picker, and click on \"Plugin store\" at the bottom of the list.\n\n3. Choose \"Develop your own plugin\" and enter your localhost URL (e.g. `localhost:3333`) when prompted.\n\n4. 
Your localhost plugin is now enabled for your ChatGPT session.\n\nFor more information, refer to the [OpenAI documentation](https://platform.openai.com/docs/plugins/getting-started/openapi-definition).\n\n## Installing a Developer Plugin\n\nTo install a developer plugin, follow the steps below:\n\n- First, create your developer plugin by deploying it to your preferred hosting platform (e.g. Fly.io, Heroku, etc.) and updating the plugin URL in the manifest file and OpenAPI schema.\n\n- Go to [ChatGPT](https://chat.openai.com/) and select \"Plugins\" from the model picker.\n\n- From the plugins picker, scroll to the bottom and click on \"Plugin store.\"\n\n- Go to \"Develop your own plugin\" and follow the instructions provided. You will need to enter the domain where your plugin is deployed.\n\n- Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses Service Level HTTP, you will have to paste in your access token, then paste the new access token you receive from the plugin flow into your [ai-plugin.json](/.well-known/ai-plugin.json) file and redeploy your app).\n\n- Next, you must add your plugin. Go to the \"Plugin store\" again and click on \"Install an unverified plugin.\"\n\n- Follow the instructions provided, which will require you to enter the domain where your plugin is deployed.\n\n- Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses User Level HTTP, you will have to paste in your bearer token).\n\nAfter completing these steps, your developer plugin should be installed and ready to use in ChatGPT.\n"
  },
  {
    "path": "docs/providers/analyticdb/setup.md",
    "content": "# AnalyticDB\n\n[AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is a high-performance vector database that is fully compatible with PostgreSQL syntax, making it easy to use. Managed by Alibaba Cloud, AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing a wide range of features, including indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. Additionally, it offers full OLAP database functionality and an SLA commitment for production use.\n\n## Install Requirements\n\nRun the following command to install the required packages, including the `psycopg2cffi` package:\n\n```\npoetry install --extras \"postgresql\"\n```\n\nIf you encounter the `Error: pg_config executable not found.` issue, you need to install the PostgreSQL development package on your system. Follow the instructions for your specific Linux distribution:\n\n1. Debian-based systems (e.g., Ubuntu):\n\n```bash\nsudo apt-get update\nsudo apt-get install libpq-dev\n```\n\n2. RHEL-based systems (e.g., CentOS, Fedora):\n\n```bash\nsudo yum install postgresql-devel\n```\n\n3. Arch-based systems (e.g., Manjaro, Arch Linux):\n\n```bash\nsudo pacman -S postgresql-libs\n```\n\n4. macOS:\n\n```bash\nbrew install postgresql\n```\n\nAfter installing the required package, try to install `psycopg2cffi` again. If the `pg_config` executable is still not found, add its location to your system's `PATH` variable. You can typically find the `pg_config` executable in the `bin` directory of your PostgreSQL installation, for example `/usr/pgsql-13/bin/pg_config`. 
To add it to your `PATH` variable, use the following command (replace the path with the correct one for your system):\n\n```bash\nexport PATH=$PATH:/usr/pgsql-13/bin\n```\n\nNow, try installing `psycopg2cffi` again using Poetry.\n\n**Environment Variables:**\n\n| Name             | Required | Description                         | Default           |\n| ---------------- | -------- | ----------------------------------- | ----------------- |\n| `DATASTORE`      | Yes      | Datastore name, set to `analyticdb` |                   |\n| `BEARER_TOKEN`   | Yes      | Secret token                        |                   |\n| `OPENAI_API_KEY` | Yes      | OpenAI API key                      |                   |\n| `PG_HOST`        | Yes      | AnalyticDB instance URL             | `localhost`       |\n| `PG_USER`        | Yes      | Database user                       | `user`            |\n| `PG_PASSWORD`    | Yes      | Database password                   | `password`        |\n| `PG_PORT`        | Optional | Port for AnalyticDB communication   | `5432`            |\n| `PG_DATABASE`    | Optional | Database name                       | `postgres`        |\n| `PG_COLLECTION`  | Optional | AnalyticDB relation name            | `document_chunks` |\n\n## AnalyticDB Cloud\n\nFor a hosted [AnalyticDB Cloud](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) version, provide the AnalyticDB instance URL:\n\n**Example:**\n\n```bash\nPG_HOST=\"https://YOUR-CLUSTER-URL.gpdb.rds.aliyuncs.com\"\nPG_USER=\"YOUR-USER-NAME\"\nPG_PASSWORD=\"YOUR-PASSWORD\"\n```\n\nThe other parameters are optional and can be changed if needed.\n\n## Running AnalyticDB Integration Tests\n\nA suite of integration tests verifies the AnalyticDB integration. Launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/analyticdb/test_analyticdb_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/azurecosmosdb/setup.md",
    "content": "# Azure Cosmos DB\n\n[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.\n\nLearn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/).\n\n## Environment variables\n\n| Name                         | Required | Description                                                             | Default             |\n| ---------------------------- | -------- |-------------------------------------------------------------------------| ------------------- |\n| `DATASTORE`                  | Yes      | Datastore name, set to `azurecosmosdb`                                  |                     |\n| `BEARER_TOKEN`               | Yes      | Secret token                                                            |                     |\n| `OPENAI_API_KEY`             | Yes      | OpenAI API key                                                          |                     |\n| `AZCOSMOS_API`               | Yes      | Name of the API you're connecting to. Currently supported `mongo-vcore` |                     |\n| `AZCOSMOS_CONNSTR`           | Yes      | The connection string to your account.                                  
|                     |\n| `AZCOSMOS_DATABASE_NAME`     | Yes      | The database where the data is stored/queried                           |                     |\n| `AZCOSMOS_CONTAINER_NAME`    | Yes      | The container where the data is stored/queried                          |                     |\n\n## Indexing\nOn first insert, the datastore will create the collection and index if necessary on the field `embedding`. Currently hybrid search is not yet supported.\n"
  },
  {
    "path": "docs/providers/azuresearch/setup.md",
    "content": "# Azure Cognitive Search\n\n[Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). Azure Cognitive Search also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality.\n\nYou can find the Azure Cognitive Search documentation [here](https://learn.microsoft.com/azure/search/search-what-is-azure-search). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/).\n\n## Signing up for vector search\n\nAzure Cognitive Search supports searching using pure vectors, pure text, or hybrid mode where both are combined. For the vector-based cases, you'll need to sign up for vector search private preview. To sign up, please fill in this form: https://aka.ms/VectorSearchSignUp\n\n## Environment variables\n\n| Name                          | Required | Description                                                                                                                                                                        | Default               |\n| ----------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------- |\n| `DATASTORE`                   | Yes      | Datastore name, set to `azuresearch`                                                                                                                                               |                       |\n| `BEARER_TOKEN`                | Yes      | Secret token                                                                                                                                                                       
|                       |\n| `OPENAI_API_KEY`              | Yes      | OpenAI API key                                                                                                                                                                     |                       |\n| `AZURESEARCH_SERVICE`         | Yes      | Name of your search service                                                                                                                                                        |                       |\n| `AZURESEARCH_INDEX`           | Yes      | Name of your search index                                                                                                                                                          |                       |\n| `AZURESEARCH_API_KEY`         | No       | Your API key, if using key-based auth instead of Azure managed identity                                                                                                            | Uses managed identity |\n| `AZURESEARCH_DISABLE_HYBRID`  | No       | Disable hybrid search and only use vector similarity                                                                                                                               | Use hybrid search     |\n| `AZURESEARCH_SEMANTIC_CONFIG` | No       | Enable L2 re-ranking with this configuration name [see re-ranking below](#re-ranking)                                                                                              | L2 not enabled        |\n| `AZURESEARCH_LANGUAGE`        | No       | If using L2 re-ranking, language for queries/documents (valid values [listed here](https://learn.microsoft.com/rest/api/searchservice/preview-api/search-documents#queryLanguage)) | `en-us`               |\n| `AZURESEARCH_DIMENSIONS`      | No       | Vector size for embeddings                                                                                                                                                 
        | 256, or other         |\n\n## Authentication Options\n\n- API key: this is enabled by default; you can obtain the key in the Azure Portal or using the Azure CLI.\n- Managed identity: If the plugin is running in Azure, you can enable managed identity for the host and give that identity access to the service, without having to manage keys (avoiding secret storage, rotation, etc.). More details [here](https://learn.microsoft.com/azure/search/search-security-rbac).\n\n## Re-ranking\n\nAzure Cognitive Search offers the option to enable a second (L2) ranking step after retrieval to further improve results quality. This only applies when using text or hybrid search. Since it has latency and cost implications, if you want to try this option you need to explicitly [enable \"semantic search\"](https://learn.microsoft.com/azure/search/semantic-search-overview#enable-semantic-search) in your Cognitive Search service, and [create a semantic search configuration](https://learn.microsoft.com/azure/search/semantic-how-to-query-request#2---create-a-semantic-configuration) for your index.\n\n## Using existing search indexes\n\nIf an existing index has fields that align with what's needed by the retrieval plugin but just differ in names, you can map your fields to the plugin fields using the following environment variables:\n\n| Plugin field name | Environment variable to override it |\n| ----------------- | ----------------------------------- |\n| id                | AZURESEARCH_FIELDS_ID               |\n| text              | AZURESEARCH_FIELDS_TEXT             |\n| embedding         | AZURESEARCH_FIELDS_EMBEDDING        |\n| document_id       | AZURESEARCH_FIELDS_DOCUMENT_ID      |\n| source            | AZURESEARCH_FIELDS_SOURCE           |\n| source_id         | AZURESEARCH_FIELDS_SOURCE_ID        |\n| url               | AZURESEARCH_FIELDS_URL              |\n| created_at        | AZURESEARCH_FIELDS_CREATED_AT       |\n| author            | AZURESEARCH_FIELDS_AUTHOR   
        |\n"
  },
  {
    "path": "docs/providers/chroma/setup.md",
    "content": "[Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make it easy to work with embeddings. Chroma runs in-memory, or in a client-server setup.\n\nInstall Chroma by running `pip install chromadb`. Once installed, the core API consists of four essential commands for creating collections, adding embeddings, documents, and metadata, and querying embeddings to find similar documents. Get started with Chroma by visiting the [Getting Started](https://docs.trychroma.com) page on their documentation website, or explore the open-source code on their [GitHub repository](https://github.com/chroma-core/chroma).\n\n**Chroma Environment Variables**\n\nTo set up Chroma and start using it as your vector database provider, you need to define some environment variables to connect to your Chroma instance.\n\n**Chroma Datastore Environment Variables**\n\nChroma runs _in-memory_ by default, with local persistence. It can also run in [self-hosted](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) client-server mode, with a fully managed hosted version coming soon.\n\n| Name                     | Required | Description                                                                                        | Default          |\n| ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ---------------- |\n| `DATASTORE`              | Yes      | Datastore name. 
Set this to `chroma`                                                               |                  |\n| `BEARER_TOKEN`           | Yes      | Your secret token for authenticating requests to the API                                           |                  |\n| `OPENAI_API_KEY`         | Yes      | Your OpenAI API key for generating embeddings                                                      |                  |\n| `CHROMA_COLLECTION`      | Optional | Your chosen Chroma collection name to store your embeddings                                        | openaiembeddings |\n| `CHROMA_IN_MEMORY`       | Optional | If set to `True`, ignore `CHROMA_HOST` and `CHROMA_PORT` and just use an in-memory Chroma instance | `True`           |\n| `CHROMA_PERSISTENCE_DIR` | Optional | If set, and `CHROMA_IN_MEMORY` is set, persist to and load from this directory.                    | `openai`         |\n\nTo run Chroma in self-hosted client-server mode, set the following variables:\n\n| Name          | Required | Description                                         | Default            |\n| ------------- | -------- | --------------------------------------------------- | ------------------ |\n| `CHROMA_HOST` | Optional | Your Chroma instance host address (see notes below) | `http://127.0.0.1` |\n| `CHROMA_PORT` | Optional | Your Chroma port number                             | `8000`             |\n\n> For **self-hosted instances**, if your instance is not at 127.0.0.1:8000, set `CHROMA_HOST` and `CHROMA_PORT` accordingly. For example: `CHROMA_HOST=http://localhost/` and `CHROMA_PORT=8080`.\n"
  },
  {
    "path": "docs/providers/elasticsearch/setup.md",
    "content": "# Elasticsearch\n\nElasticsearch is a search engine based on the Lucene library. It provides a distributed, full-text and vector search engine with an HTTP web interface and schema-free JSON documents. To use Elasticsearch as your vector database, start by [installing Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) or signing up for a free trial of [Elastic Cloud](https://www.elastic.co/cloud/).\n\nThe app will create an Elasticsearch index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable.\n\n**Environment Variables:**\n\n| Name                  | Required | Description                                                                                                          |\n| --------------------- | -------- | -------------------------------------------------------------------------------------------------------------------- |\n| `DATASTORE`           | Yes      | Datastore name, set this to `elasticsearch`                                                                          |\n| `BEARER_TOKEN`        | Yes      | Your secret token for authenticating requests to the API                                                             |\n| `OPENAI_API_KEY`      | Yes      | Your OpenAI API key for generating embeddings with the OpenAI embeddings model                                       |\n| `ELASTICSEARCH_INDEX` | Yes      | Your chosen Elasticsearch index name. **Note:** Index name must consist of lower case alphanumeric characters or '-' |\n\n**Connection Evironment Variables:**\nDepending on your Elasticsearch setup, you may need to set one of the following environment variables to connect to your Elasticsearch instance. If you are using Elastic Cloud, you can connect via `ELASTICSEARCH_CLOUD_ID`. 
If you are using a local instance of Elasticsearch, you will need to set `ELASTICSEARCH_URL`.\n\nYou can authenticate to Elasticsearch using either `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`. If you are using Elastic Cloud, you can find this in Kibana.\n\n| Name                     | Required | Description                                                                                      |\n| ------------------------ | -------- | ------------------------------------------------------------------------------------------------ |\n| `ELASTICSEARCH_URL`      | Yes      | Your Elasticsearch URL. If installed locally, this would be https://localhost:9200               |\n| `ELASTICSEARCH_CLOUD_ID` | Yes      | Your cloud id, linked to your deployment. This can be found in the deployment's console          |\n| `ELASTICSEARCH_USERNAME` | Yes      | Your username for authenticating requests to the API. Commonly 'elastic'.                        |\n| `ELASTICSEARCH_PASSWORD` | Yes      | Your password for authenticating requests to the API                                             |\n| `ELASTICSEARCH_API_KEY`  | Yes      | Alternatively you can authenticate using api-key. This can be created in Kibana stack management |\n\n## Running Elasticsearch Integration Tests\n\nA suite of integration tests is available to verify the Elasticsearch integration. To run the tests, run the docker compose found in the `examples/docker/elasticsearch` folder with `docker-compose up`. This will start Elasticsearch in single node, security off mode, listening on `http://localhost:9200`.\n\nThen, launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/llama/setup.md",
    "content": "\n# LlamaIndex\n\n[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLM's with external data.\nIt provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT.\nUnlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use-cases.\nIt is light-weight, easy-to-use, and requires no additional deployment.\nAll you need to do is specifying a few environment variables (optionally point to an existing saved Index json file).\nNote that metadata filters in queries are not yet supported.\n\n## Setup\nCurrently, LlamaIndex requires no additional deployment\nand runs as a part of the Retrieval Plugin.\nIt is super easy to setup and great for quick prototyping\nwith ChatGPT and your external data.\n\n**Retrieval App Environment Variables**\n\n| Name             | Required | Description                         |\n|------------------|----------|-------------------------------------|\n| `DATASTORE`      | Yes      | Datastore name. 
Set this to `llama` |\n| `BEARER_TOKEN`   | Yes      | Your secret token                   |\n| `OPENAI_API_KEY` | Yes      | Your OpenAI API key                 |\n\n**Llama Datastore Environment Variables**\n\n| Name                           | Required | Description                          | Default       |\n|--------------------------------|----------|--------------------------------------|---------------|\n| `LLAMA_INDEX_TYPE`             | Optional | Index type (see below for details)   | `simple_dict` |\n| `LLAMA_INDEX_JSON_PATH`        | Optional | Path to saved Index json file        | None          |\n| `LLAMA_QUERY_KWARGS_JSON_PATH` | Optional | Path to saved query kwargs json file | None          |\n| `LLAMA_RESPONSE_MODE`          | Optional | Response mode for query              | `no_text`     | \n\n\n**Different Index Types**\nBy default, we use a `GPTVectorStoreIndex` to store document chunks in memory, \nand retrieve top-k nodes by embedding similarity.\nDifferent index types are optimized for different data and query use-cases.\nSee this guide on [How Each Index Works](https://gpt-index.readthedocs.io/en/latest/guides/primer/index_guide.html) to learn more.\nYou can configure the index type via the `LLAMA_INDEX_TYPE`, see [here](https://gpt-index.readthedocs.io/en/latest/reference/indices/composability_query.html#gpt_index.data_structs.struct_type.IndexStructType) for the full list of accepted index type identifiers.\n\n\nRead more details on [readthedocs](https://gpt-index.readthedocs.io/en/latest/), \nand engage with the community on [discord](https://discord.com/invite/dGcwcsnxhU).\n\n## Running Tests\nYou can launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/llama/test_llama_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/milvus/setup.md",
    "content": "# Milvus\n\n[Milvus](https://milvus.io/) is the open-source, cloud-native vector database that scales to billions of vectors. It's the open-source version of Zilliz. It supports:\n\n- Various indexing algorithms and distance metrics\n- Scalar filtering and time travel searches\n- Rollback and snapshots\n- Multi-language SDKs\n- Storage and compute separation\n- Cloud scalability\n- A developer-first community with multi-language support\n\nVisit the [Github](https://github.com/milvus-io/milvus) to learn more.\n\n## Deploying the Database\n\nYou can deploy and manage Milvus using Docker Compose, Helm, K8's Operator, or Ansible. Follow the instructions [here](https://milvus.io/docs) to get started.\n\n**Environment Variables:**\n\n| Name                       | Required | Description                                                                                                                                  |\n|----------------------------| -------- |----------------------------------------------------------------------------------------------------------------------------------------------|\n| `DATASTORE`                | Yes      | Datastore name, set to `milvus`                                                                                                              |\n| `BEARER_TOKEN`             | Yes      | Your bearer token                                                                                                                            |\n| `OPENAI_API_KEY`           | Yes      | Your OpenAI API key                                                                                                                          |\n| `MILVUS_COLLECTION`        | Optional | Milvus collection name, defaults to a random UUID                                                                                            |\n| `MILVUS_HOST`              | Optional | Milvus host IP, defaults to `localhost`                                                
                                                      |\n| `MILVUS_PORT`              | Optional | Milvus port, defaults to `19530`                                                                                                             |\n| `MILVUS_USER`              | Optional | Milvus username if RBAC is enabled, defaults to `None`                                                                                       |\n| `MILVUS_PASSWORD`          | Optional | Milvus password if required, defaults to `None`                                                                                              |\n| `MILVUS_INDEX_PARAMS`      | Optional | Custom index options for the collection, defaults to `{\"metric_type\": \"IP\", \"index_type\": \"HNSW\", \"params\": {\"M\": 8, \"efConstruction\": 64}}` |\n| `MILVUS_SEARCH_PARAMS`     | Optional | Custom search options for the collection, defaults to `{\"metric_type\": \"IP\", \"params\": {\"ef\": 10}}`                                          |\n| `MILVUS_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded`                                                                             |\n\n## Running Milvus Integration Tests\n\nA suite of integration tests is available to verify the Milvus integration. To run the tests, run the milvus docker compose found in the examples folder.\n\nThen, launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/milvus/test_milvus_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/mongodb/setup.md",
    "content": "# Setting up MongoDB Atlas as the Datastore Provider\n\nMongoDB Atlas is a multi-cloud database service made by the same people that build MongoDB. \nAtlas simplifies deploying and managing your databases while offering the versatility you need \nto build resilient and performant global applications on the cloud providers of your choice.\n\nYou can perform semantic search on data in your Atlas cluster running MongoDB v6.0.11, v7.0.2, \nor later using Atlas Vector Search. You can store vector embeddings for any kind of data along \nwith other data in your collection on the Atlas cluster.\n\nIn the section, we set up a cluster, a database, test it, and finally create an Atlas Vector Search Index.\n\n### Deploy a Cluster\n\nFollow the [Getting-Started](https://www.mongodb.com/basics/mongodb-atlas-tutorial) documentation \nto create an account, deploy an Atlas cluster, and connect to a database.\n\n\n### Retrieve the URI used by Python to connect to the Cluster\n\nWhen you deploy the ChatGPT Retrieval App, this will be stored as the environment variable: `MONGODB_URI`  \nIt will look something like the following. The username and password, if not provided,\ncan be configured in *Database Access* under Security in the left panel. \n\n```\nexport MONGODB_URI=\"mongodb+srv://<username>:<password>@chatgpt-retrieval-plugin.zeatahb.mongodb.net/?retryWrites=true&w=majority\"\n```\n\nThere are a number of ways to navigate the Atlas UI. Keep your eye out for \"Connect\" and \"driver\".\n\nOn the left panel, navigate and click 'Database' under DEPLOYMENT. \nClick the Connect button that appears, then Drivers. Select Python.\n(Have no concern for the version. This is the PyMongo, not Python, version.)\nOnce you have got the Connect Window open, you will see an instruction to `pip install pymongo`.\nYou will also see a **connection string**. 
\nThis is the `uri` that a `pymongo.MongoClient` uses to connect to the Database.\n\n\n### Test the connection\n\nAtlas provides a simple check. Once you have your `uri` and `pymongo` installed, \ntry the following in a python console.\n\n```python\nfrom pymongo.mongo_client import MongoClient\nclient = MongoClient(uri)  # Create a new client and connect to the server\ntry:\n    client.admin.command('ping')  # Send a ping to confirm a successful connection\n    print(\"Pinged your deployment. You successfully connected to MongoDB!\")\nexcept Exception as e:\n    print(e)\n```\n\n**Troubleshooting**\n* You can edit a Database's users and passwords on the 'Database Access' page, under Security.\n* Remember to add your IP address. (Try `curl -4 ifconfig.co`)\n\n### Create a Database and Collection\n\nAs mentioned, Vector Databases provide two functions. In addition to being the data store,\nthey provide very efficient search based on natural language queries.\nWith Vector Search, one will index and query data with a powerful vector search algorithm\nusing Hierarchical Navigable Small World (HNSW) graphs to find vector similarity.\n\nThe indexing runs beside the data as a separate service asynchronously.\nThe Search index monitors changes to the Collection that it applies to.\nConsequently, one need not upload the data first. \nWe will create an empty collection now, which will simplify setup in the example notebook.\n\nBack in the UI, navigate to the Database Deployments page by clicking Database on the left panel.\nClick the \"Browse Collections\" and then \"+ Create Database\" buttons. \nThis will open a window where you choose Database and Collection names. (No additional preferences.)\nRemember these values as they will be used as the environment variables, \n`MONGODB_DATABASE` and `MONGODB_COLLECTION`. 
Though arbitrary, we suggest \"SQUAD\" and \"Beyonce\"\nas these describe the data that we will use in our example Jupyter Notebook.\n\n\n### Set Datastore Environment Variables\n\nTo establish a connection to the MongoDB Cluster, Database, and Collection, plus create a Vector Search Index,\ndefine the following environment variables.\nYou can confirm that the required ones have been set like this:  `assert \"MONGODB_URI\" in os.environ`\n\n**IMPORTANT** It is crucial that the choices are consistent between setup in Atlas and Python environment(s).\n\n| Name                  | Description                 | Example                                                                          |\n|-----------------------|-----------------------------|----------------------------------------------------------------------------------|\n| `MONGODB_URI`         | Connection String           | mongodb+srv://`<user>`:`<password>`@chatgpt-retrieval-plugin.zeatahb.mongodb.net |\n| `MONGODB_DATABASE`    | Database name               | SQUAD                                                                            |\n| `MONGODB_COLLECTION`  | Collection name             | Beyonce                                                                          |\n| `MONGODB_INDEX`       | Search index name           | vector_index                                                                     |\n| `DATASTORE`           | Datastore name              | [must be] mongodb                                                                |\n| `EMBEDDING_MODEL`     | OpenAI Embedding Model      | text-embedding-3-small                                                           |\n| `EMBEDDING_DIMENSION` | Length of Embedding Vectors | 1536                                                                             |\n\nThe following will also be required to authenticate with OpenAI and Plugin APIs.\n\n| Name             | Description                                                     
|\n|------------------|-----------------------------------------------------------------|\n| `OPENAI_API_KEY` | OpenAI token created at https://platform.openai.com/api-keys    |\n| `BEARER_TOKEN`   | Secret string passed in HTTP request header that server expects |\n\n### Create an Atlas Vector Search Index\n\nThe final step to configure MongoDB as the Datastore is to create a Vector Search Index.\nThe procedure is described [here](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure).\n\nUnder Services on the left panel, choose Atlas Search > Create Search Index > \nAtlas Vector Search JSON Editor.\n\nThe Plugin expects an index definition like the following.\nTo begin, choose `numDimensions: 1536` along with the suggested EMBEDDING variables above.\nYou can experiment with these later.\n\n```json\n{\n  \"fields\": [\n    {\n      \"numDimensions\": 1536,\n      \"path\": \"embedding\",\n      \"similarity\": \"cosine\",\n      \"type\": \"vector\"\n    }\n  ]\n}\n```\n\n\n### Running MongoDB Integration Tests\n\nIn addition to the Jupyter Notebooks in `examples/`, \na suite of integration tests is available to verify the MongoDB integration. \nThe test suite needs the cluster up and running, and the environment variables defined above.\n\nThen, launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/mongodb_atlas/test_mongodb_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/pinecone/setup.md",
    "content": "# Pinecone\n\n[Pinecone](https://www.pinecone.io) is a managed vector database built for speed, scale, and shipping to production sooner. To use Pinecone as your vector database provider, first get an API key by [signing up for an account](https://app.pinecone.io/). You can access your API key from the \"API Keys\" section in the sidebar of your dashboard. Pinecone also supports hybrid search and at the time of writing is the only datastore to support SPLADE sparse vectors natively.\n\nA full Jupyter notebook walkthrough for the Pinecone flavor of the retrieval plugin can be found [here](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/examples/providers/pinecone/semantic-search.ipynb). There is also a [video walkthrough here](https://youtu.be/hpePPqKxNq8).\n\nThe app will create a Pinecone index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable.\n\n**Environment Variables:**\n\n| Name                   | Required | Description                                                                                                                      |\n| ---------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------- |\n| `DATASTORE`            | Yes      | Datastore name, set this to `pinecone`                                                                                           |\n| `BEARER_TOKEN`         | Yes      | Your secret token for authenticating requests to the API                                                                         |\n| `OPENAI_API_KEY`       | Yes      | Your OpenAI API key for generating embeddings with one of the OpenAI embeddings models                                           |\n| `PINECONE_API_KEY`     | Yes      | Your Pinecone API key, found in the [Pinecone console](https://app.pinecone.io/)                                  
               |\n| `PINECONE_ENVIRONMENT` | Yes      | Your Pinecone environment, found in the [Pinecone console](https://app.pinecone.io/), e.g. `us-west1-gcp`, `us-east-1-aws`, etc. |\n| `PINECONE_INDEX`       | Yes      | Your chosen Pinecone index name. **Note:** Index name must consist of lower case alphanumeric characters or '-'                  |\n\nIf you want to create your own index with custom configurations, you can do so using the Pinecone SDK, API, or web interface ([see docs](https://docs.pinecone.io/docs/manage-indexes)). Make sure to use a dimensionality of 256 (or another dimension) for the embeddings and avoid indexing on the text field in the metadata, as this will reduce the performance significantly.\n\n```python\n# Creating index with Pinecone SDK - use only if you wish to create the index manually.\n\nimport os, pinecone\n\npinecone.init(api_key=os.environ['PINECONE_API_KEY'],\n              environment=os.environ['PINECONE_ENVIRONMENT'])\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\npinecone.create_index(name=os.environ['PINECONE_INDEX'],\n                      dimension=EMBEDDING_DIMENSION,\n                      metric='cosine',\n                      metadata_config={\n                          \"indexed\": ['source', 'source_id', 'url', 'created_at', 'author', 'document_id']})\n```\n"
  },
  {
    "path": "docs/providers/postgres/setup.md",
    "content": "# Postgres\n\nPostgres Database offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension. To use pgvector, you will need to set up a PostgreSQL database with the pgvector extension enabled or use a managed solution that provides pgvector. For a hosted/managed solution, you can use any of the cloud vendors which support [pgvector](https://github.com/pgvector/pgvector#hosted-postgres).\n\n- The database needs the `pgvector` extension.\n- To apply required migrations you may use any tool you are more familiar with like [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or `psql` cli.\n\n**Retrieval App Environment Variables**\n\n| Name             | Required | Description                            |\n| ---------------- | -------- | -------------------------------------- |\n| `DATASTORE`      | Yes      | Datastore name. Set this to `postgres` |\n| `BEARER_TOKEN`   | Yes      | Your secret token                      |\n| `OPENAI_API_KEY` | Yes      | Your OpenAI API key                    |\n\n**Postgres Datastore Environment Variables**\n\n| Name          | Required | Description       | Default    |\n| ------------- | -------- | ----------------- | ---------- |\n| `PG_HOST`     | Optional | Postgres host     | localhost  |\n| `PG_PORT`     | Optional | Postgres port     | `5432`     |\n| `PG_PASSWORD` | Optional | Postgres password | `postgres` |\n| `PG_USER`     | Optional | Postgres username | `postgres` |\n| `PG_DB`       | Optional | Postgres database | `postgres` |\n\n## Postgres Datastore local development & testing\n\nIn order to test your changes to the Postgres Datastore, you can run the following:\n\n1. 
You can run a local or self-hosted instance of PostgreSQL with `pgvector` enabled using Docker.\n\n```bash\ndocker pull ankane/pgvector\n```\n\n```bash\ndocker run --name pgvector -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 -d ankane/pgvector\n```\n\nCheck PostgreSQL [official docker image](https://github.com/docker-library/docs/blob/master/postgres/README.md) for more options.\n\n2. Apply migrations using any tool you like most [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or `psql` cli.\n\n```bash\n# apply migrations using psql cli\npsql -h localhost -p 5432 -U postgres -d postgres -f examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql\n```\n\n3. Export environment variables required for the Postgres Datastore\n\n```bash\nexport PG_HOST=localhost\nexport PG_PORT=5432\nexport PG_PASSWORD=mysecretpassword\n```\n\n4. Run the Postgres datastore tests from the project's root directory\n\n```bash\n# Run the Postgres datastore tests\n# go to project's root directory and run\npoetry run pytest -s ./tests/datastore/providers/postgres/test_postgres_datastore.py\n```\n\n5. When going to prod don't forget to set the password for the `postgres` user to something more secure and apply migrations.\n\n6. You may want to remove RLS (Row Level Security) from the `documents` table. If you are not using RLS, it is not required in this setup. But it may be useful if you want to separate documents by user or group of users, or if you want to give permissions to insert or query documents to different users. And RLS is especially important if you are willing to use PostgREST. To do so you can just remove the following statement from the `20230414142107_init_pg_vector.sql` migration file: `alter table documents enable row level security;`.\n\n## Indexes for Postgres\n\nBy default, pgvector performs exact nearest neighbor search. 
To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted.\n\nAs the datastore uses inner product for similarity search, you can add an index as follows:\n\n```sql\ncreate index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100);\n```\n\nTo choose the `lists` constant, a good place to start is records / 1000 for up to 1M records and sqrt(records) for over 1M records.\n\nFor more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing).\n"
  },
  {
    "path": "docs/providers/qdrant/setup.md",
    "content": "# Qdrant\n\n[Qdrant](https://qdrant.tech/) is a vector database that can store documents and vector embeddings. It can run as a self-hosted version or a managed [Qdrant Cloud](https://cloud.qdrant.io/)\nsolution. The configuration is almost identical for both options, except for the API key that [Qdrant Cloud](https://cloud.qdrant.io/) provides.\n\n**Environment Variables:**\n\n| Name                | Required | Description                                                 | Default            |\n| ------------------- | -------- | ----------------------------------------------------------- | ------------------ |\n| `DATASTORE`         | Yes      | Datastore name, set to `qdrant`                             |                    |\n| `BEARER_TOKEN`      | Yes      | Secret token                                                |                    |\n| `OPENAI_API_KEY`    | Yes      | OpenAI API key                                              |                    |\n| `QDRANT_URL`        | Yes      | Qdrant instance URL                                         | `http://localhost` |\n| `QDRANT_PORT`       | Optional | TCP port for Qdrant HTTP communication                      | `6333`             |\n| `QDRANT_GRPC_PORT`  | Optional | TCP port for Qdrant GRPC communication                      | `6334`             |\n| `QDRANT_API_KEY`    | Optional | Qdrant API key for [Qdrant Cloud](https://cloud.qdrant.io/) |                    |\n| `QDRANT_COLLECTION` | Optional | Qdrant collection name                                      | `document_chunks`  |\n\n## Qdrant Cloud\n\nFor a hosted [Qdrant Cloud](https://cloud.qdrant.io/) version, provide the Qdrant instance\nURL and the API key from the [Qdrant Cloud UI](https://cloud.qdrant.io/).\n\n**Example:**\n\n```bash\nQDRANT_URL=\"https://YOUR-CLUSTER-URL.aws.cloud.qdrant.io\"\nQDRANT_API_KEY=\"<YOUR_QDRANT_CLOUD_CLUSTER_API_KEY>\"\n```\n\nThe other parameters are optional and can be changed if needed.\n\n## 
Self-hosted Qdrant Instance\n\nFor a self-hosted version, use Docker containers or the official Helm chart for deployment. The only\nrequired parameter is the `QDRANT_URL` that points to the Qdrant server URL.\n\n**Example:**\n\n```bash\nQDRANT_URL=\"http://YOUR_HOST.example.com:6333\"\n```\n\nThe other parameters are optional and can be changed if needed.\n\n## Running Qdrant Integration Tests\n\nA suite of integration tests verifies the Qdrant integration. To run it, start a local Qdrant instance in a Docker container.\n\n```bash\ndocker run -p \"6333:6333\" -p \"6334:6334\" qdrant/qdrant:v1.0.3\n```\n\nThen, launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/qdrant/test_qdrant_datastore.py\n```\n"
  },
  {
    "path": "docs/providers/redis/setup.md",
    "content": "# Redis\n\n[Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform that supports a variety of use cases for everyday applications as well as AI/ML workloads. Use Redis as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, try [Redis Cloud](https://app.redislabs.com/#/). See more helpful examples of Redis as a vector database [here](https://github.com/RedisVentures/redis-ai-resources).\n\n- The database **needs the RediSearch module (>=v2.6) and RedisJSON**, which are included in the self-hosted docker compose above.\n- Run the App with the Redis docker image: `docker compose up -d` in [this dir](/examples/docker/redis/).\n- The app automatically creates a Redis vector search index on the first run. Optionally, create a custom index with a specific name and set it as an environment variable (see below).\n- To enable more hybrid searching capabilities, adjust the document schema [here](/datastore/providers/redis_datastore.py).\n\n**Environment Variables:**\n\n| Name                    | Required | Description                                                                                                            | Default     |\n| ----------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- |\n| `DATASTORE`             | Yes      | Datastore name, set to `redis`                                                                                         |             |\n| `BEARER_TOKEN`          | Yes      | Secret token                                                                                                           |             |\n| `OPENAI_API_KEY`        | Yes      | OpenAI API key                                                                                                     
    |             |\n| `REDIS_HOST`            | Optional | Redis host url                                                                                                         | `localhost` |\n| `REDIS_PORT`            | Optional | Redis port                                                                                                             | `6379`      |\n| `REDIS_PASSWORD`        | Optional | Redis password                                                                                                         | none        |\n| `REDIS_INDEX_NAME`      | Optional | Redis vector index name                                                                                                | `index`     |\n| `REDIS_DOC_PREFIX`      | Optional | Redis key prefix for the index                                                                                         | `doc`       |\n| `REDIS_DISTANCE_METRIC` | Optional | Vector similarity distance metric                                                                                      | `COSINE`    |\n| `REDIS_INDEX_TYPE`      | Optional | [Vector index algorithm type](https://redis.io/docs/stack/search/reference/vectors/#creation-attributes-per-algorithm) | `FLAT`      |\n\n\n## Redis Datastore development & testing\nIn order to test your changes to the Redis Datastore, you can run the following commands:\n\n```bash\n# Run the Redis stack docker image\ndocker run -it --rm -p 6379:6379 redis/redis-stack-server:latest\n```\n    \n```bash\n# Run the Redis datastore tests\npoetry run pytest -s ./tests/datastore/providers/redis/test_redis_datastore.py\n```"
  },
  {
    "path": "docs/providers/supabase/setup.md",
    "content": "# Supabase\n\n[Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. See more helpful examples of Supabase & pgvector as a vector database [here](https://github.com/supabase-community/nextjs-openai-doc-search).\n\n- The database needs the `pgvector` extension, which is included in [Supabase distribution of Postgres](https://github.com/supabase/postgres).\n- It is possible to provide a Postgres connection string and an app will add `documents` table, query Postgres function, and `pgvector` extension automatically.\n- But it is recommended to separate the migration process from an app. And execute the migration script in a different pipeline by using SQL statements from `_init_db()` function in [Supabase datastore provider](/datastore/providers/supabase_datastore.py).\n\n**Retrieval App Environment Variables**\n\n| Name             | Required | Description                            |\n| ---------------- | -------- | -------------------------------------- |\n| `DATASTORE`      | Yes      | Datastore name. 
Set this to `supabase` |\n| `BEARER_TOKEN`   | Yes      | Your secret token                      |\n| `OPENAI_API_KEY` | Yes      | Your OpenAI API key                    |\n\n**Supabase Datastore Environment Variables**\n\n| Name                        | Required | Description                                                                    | Default |\n| --------------------------- | -------- | ------------------------------------------------------------------------------ | ------- |\n| `SUPABASE_URL`              | Yes      | Supabase Project URL                                                           |         |\n| `SUPABASE_ANON_KEY`         | Optional | Supabase Project API anon key                                                  |         |\n| `SUPABASE_SERVICE_ROLE_KEY` | Optional | Supabase Project API service key, will be used if provided instead of anon key |         |\n\n## Supabase Datastore local development & testing\n\nIn order to test your changes to the Supabase Datastore, you can run the following commands:\n\n1. Install [Supabase CLI](https://github.com/supabase/cli) and [Docker](https://docs.docker.com/get-docker/)\n\n2. Run the Supabase `start` command from `examples/providers` directory. Config for Supabase local setup is available in `examples/providers/supabase` directory with required migrations.\n\n```bash\n# Run the Supabase stack using cli in docker\n# go to examples/providers and run supabase start\ncd examples/providers\nsupabase start\n```\n\n3. Supabase `start` will download docker images and launch Supabase stack locally. 
You will see similar output:\n\n```bash\nApplying migration 20230414142107_init_pg_vector.sql...\nSeeding data supabase/seed.sql...\nStarted supabase local development setup.\n\n         API URL: http://localhost:54321\n          DB URL: postgresql://postgres:postgres@localhost:54322/postgres\n      Studio URL: http://localhost:54323\n    Inbucket URL: http://localhost:54324\n      JWT secret: super-secret-jwt-token-with-at-least-32-characters-long\n        anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0\nservice_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU\n```\n\n4. Export environment variables required for the Supabase Datastore\n\n```bash\nexport SUPABASE_URL=http://localhost:54321\nexport SUPABASE_SERVICE_ROLE_KEY='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU'\n```\n\n5. Run the Supabase datastore tests from the project's root directory\n\n```bash\n# Run the Supabase datastore tests\n# go to project's root directory and run\npoetry run pytest -s ./tests/datastore/providers/supabase/test_supabase_datastore.py\n```\n\n6. When you go to prod (if cloud hosted) it is recommended to link your supabase project with the local setup from `examples/providers/supabase`. All migrations will be synced with the cloud project after you run `supabase db push`. Or you can manually apply migrations from `examples/providers/supabase/migrations` directory.\n\n7. You might want to add RLS policies to the `documents` table. Or you can just continue using it on the server side only with the service role key. 
But you should never use the service role key on the client side.\n\n## Indexes for Postgres\n\nBy default, pgvector performs exact nearest neighbor search. To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted.\n\nAs the datastore uses inner product for similarity search, you can add an index as follows:\n\n```sql\ncreate index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100);\n```\n\nTo choose the `lists` constant, a good place to start is `records / 1000` for up to 1M records and `sqrt(records)` for over 1M records.\n\nFor more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing).\n"
  },
  {
    "path": "docs/providers/weaviate/setup.md",
    "content": "# Weaviate\n\n## Set up a Weaviate Instance\n\n[Weaviate](https://weaviate.io/) is an open-source vector search engine designed to scale seamlessly into billions of data objects. This implementation supports hybrid search out-of-the-box (meaning it will perform better for keyword searches).\n\nYou can run Weaviate in 4 ways:\n\n- **SaaS** – with [Weaviate Cloud Services (WCS)](https://weaviate.io/pricing).\n\n  WCS is a fully managed service that takes care of hosting, scaling, and updating your Weaviate instance. You can try it out for free with a sandbox that lasts for 30 days.\n\n  To set up a SaaS Weaviate instance with WCS:\n\n  1.  Navigate to [Weaviate Cloud Console](https://console.weaviate.io/).\n  2.  Register or sign in to your WCS account.\n  3.  Create a new cluster with the following settings:\n      - `Name` – a unique name for your cluster. The name will become part of the URL used to access this instance.\n      - `Subscription Tier` – Sandbox for a free trial, or contact [hello@weaviate.io](mailto:hello@weaviate.io) for other options.\n      - `Weaviate Version` - The latest version by default.\n      - `OIDC Authentication` – Enabled by default. This requires a username and password to access your instance.\n  4.  Wait for a few minutes until your cluster is ready. You will see a green tick ✔️ when it's done. Copy your cluster URL.\n\n- **Hybrid SaaS**\n\n  > If you need to keep your data on-premise for security or compliance reasons, Weaviate also offers a Hybrid SaaS option: Weaviate runs within your cloud instances, but the cluster is managed remotely by Weaviate. This gives you the benefits of a managed service without sending data to an external party.\n\n  The Weaviate Hybrid SaaS is a custom solution. If you are interested in this option, please reach out to [hello@weaviate.io](mailto:hello@weaviate.io).\n\n- **Self-hosted** – with a Docker container\n\n  To set up a Weaviate instance with Docker:\n\n  1. 
[Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed.\n  2. [Install the Docker Compose Plugin](https://docs.docker.com/compose/install/)\n  3. Download a `docker-compose.yml` file with this `curl` command:\n\n     ```\n     curl -o docker-compose.yml \"https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v1.18.0\"\n     ```\n\n     Alternatively, you can use Weaviate's docker compose [configuration tool](https://weaviate.io/developers/weaviate/installation/docker-compose) to generate your own `docker-compose.yml` file.\n\n  4. Run `docker compose up -d` to spin up a Weaviate instance.\n\n     > To shut it down, run `docker compose down`.\n\n- **Self-hosted** – with a Kubernetes cluster\n\n  To configure a self-hosted instance with Kubernetes, follow Weaviate's [documentation](https://weaviate.io/developers/weaviate/installation/kubernetes).\n\n## Configure Weaviate Environment Variables\n\nYou need to set some environment variables to connect to your Weaviate instance.\n\n**Retrieval App Environment Variables**\n\n| Name             | Required | Description                                                                          |\n| ---------------- | -------- |--------------------------------------------------------------------------------------|\n| `DATASTORE`      | Yes      | Datastore name. 
Set this to `weaviate`                                               |\n| `BEARER_TOKEN`   | Yes      | Your [secret token](/README.md#general-environment-variables) (not the Weaviate one) |\n| `OPENAI_API_KEY` | Yes      | Your OpenAI API key                                                                  |\n\n**Weaviate Datastore Environment Variables**\n\n| Name             | Required | Description                                                        | Default            |\n|------------------| -------- | ------------------------------------------------------------------ | ------------------ |\n| `WEAVIATE_URL`   | Optional | Your Weaviate instance's URL/WCS endpoint                          | `http://localhost:8080` |\n| `WEAVIATE_CLASS` | Optional | Your chosen Weaviate class/collection name to store your documents | OpenAIDocument     |\n\n**Weaviate Auth Environment Variables**\n\nIf using WCS instances, set the following environment variables:\n\n| Name                | Required | Description                    |\n| ------------------- | -------- | ------------------------------ |\n| `WEAVIATE_API_KEY`  | Yes      | Your WCS API key               |\n\nLearn more about accessing your [WCS API key](https://weaviate.io/developers/wcs/guides/authentication#access-api-keys)."
  },
  {
    "path": "docs/providers/zilliz/setup.md",
    "content": "# Zilliz\n\n[Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for the billion scale. Zilliz offers many key features, such as:\n\n- Multiple indexing algorithms\n- Multiple distance metrics\n- Scalar filtering\n- Time travel searches\n- Rollback and with snapshots\n- Full RBAC\n- 99.9% uptime\n- Separated storage and compute\n- Multi-language SDK's\n\nFind more information [here](https://zilliz.com).\n\n**Self Hosted vs SaaS**\n\nZilliz is a SaaS database, but offers an open-source solution, Milvus. Both options offer fast searches at the billion scale, but Zilliz handles data management for you. It automatically scales compute and storage resources and creates optimal indexes for your data. See the comparison [here](https://zilliz.com/doc/about_zilliz_cloud).\n\n## Deploying the Database\n\nZilliz Cloud is deployable in a few simple steps. First, create an account [here](https://cloud.zilliz.com/signup). Once you have an account set up, follow the guide [here](https://zilliz.com/doc/quick_start) to set up a database and get the parameters needed for this application.\n\nEnvironment Variables:\n\n| Name                       | Required | Description                                                      |\n|----------------------------| -------- |------------------------------------------------------------------|\n| `DATASTORE`                | Yes      | Datastore name, set to `zilliz`                                  |\n| `BEARER_TOKEN`             | Yes      | Your secret token                                                |\n| `OPENAI_API_KEY`           | Yes      | Your OpenAI API key                                              |\n| `ZILLIZ_COLLECTION`        | Optional | Zilliz collection name. 
Defaults to a random UUID                |\n| `ZILLIZ_URI`               | Yes      | URI for the Zilliz instance                                      |\n| `ZILLIZ_USER`              | Yes      | Zilliz username                                                  |\n| `ZILLIZ_PASSWORD`          | Yes      | Zilliz password                                                  |\n| `ZILLIZ_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded` |\n\n## Running Zilliz Integration Tests\n\nA suite of integration tests is available to verify the Zilliz integration. To run the tests, create a Zilliz database and update the environment variables.\n\nThen, launch the test suite with this command:\n\n```bash\npytest ./tests/datastore/providers/zilliz/test_zilliz_datastore.py\n```\n"
  },
  {
    "path": "examples/authentication-methods/no-auth/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\": {\n    \"type\": \"none\"\n  },\n  \"api\": {\n    \"type\": \"openapi\",\n    \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\"\n  },\n  \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"hello@legal.com\"\n}\n\n"
  },
  {
    "path": "examples/authentication-methods/no-auth/main.py",
    "content": "# This is a version of the main.py file found in ../../../server/main.py without authentication.\n# Copy and paste this into the main file at ../../../server/main.py if you choose to use no authentication for your retrieval plugin.\nfrom typing import Optional\nimport uvicorn\nfrom fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile\nfrom fastapi.staticfiles import StaticFiles\nfrom loguru import logger\n\nfrom models.api import (\n    DeleteRequest,\n    DeleteResponse,\n    QueryRequest,\n    QueryResponse,\n    UpsertRequest,\n    UpsertResponse,\n)\nfrom datastore.factory import get_datastore\nfrom services.file import get_document_from_file\n\nfrom models.models import DocumentMetadata, Source\n\n\napp = FastAPI()\napp.mount(\"/.well-known\", StaticFiles(directory=\".well-known\"), name=\"static\")\n\n# Create a sub-application, in order to access just the query endpoints in the OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally\nsub_app = FastAPI(\n    title=\"Retrieval Plugin API\",\n    description=\"A retrieval API for querying and filtering documents based on natural language queries and metadata\",\n    version=\"1.0.0\",\n    servers=[{\"url\": \"https://your-app-url.com\"}],\n)\napp.mount(\"/sub\", sub_app)\n\n\n@app.post(\n    \"/upsert-file\",\n    response_model=UpsertResponse,\n)\nasync def upsert_file(\n    file: UploadFile = File(...),\n    metadata: Optional[str] = Form(None),\n):\n    try:\n        metadata_obj = (\n            DocumentMetadata.parse_raw(metadata)\n            if metadata\n            else DocumentMetadata(source=Source.file)\n        )\n    except:\n        metadata_obj = DocumentMetadata(source=Source.file)\n\n    document = await get_document_from_file(file, metadata_obj)\n\n    try:\n        ids = await datastore.upsert([document])\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise 
HTTPException(status_code=500, detail=f\"str({e})\")\n\n\n@app.post(\n    \"/upsert\",\n    response_model=UpsertResponse,\n)\nasync def upsert(\n    request: UpsertRequest = Body(...),\n):\n    try:\n        ids = await datastore.upsert(request.documents)\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.post(\n    \"/query\",\n    response_model=QueryResponse,\n)\nasync def query_main(\n    request: QueryRequest = Body(...),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@sub_app.post(\n    \"/query\",\n    response_model=QueryResponse,\n    description=\"Accepts search query objects with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\",\n)\nasync def query(\n    request: QueryRequest = Body(...),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.delete(\n    \"/delete\",\n    response_model=DeleteResponse,\n)\nasync def delete(\n    request: DeleteRequest = Body(...),\n):\n    if not (request.ids or request.filter or request.delete_all):\n        raise HTTPException(\n            status_code=400,\n            detail=\"One of ids, filter, or delete_all is required\",\n        )\n    try:\n        success = await datastore.delete(\n            ids=request.ids,\n            filter=request.filter,\n            delete_all=request.delete_all,\n        )\n        return DeleteResponse(success=success)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.on_event(\"startup\")\nasync def startup():\n    global datastore\n    datastore = await get_datastore()\n\n\ndef start():\n    uvicorn.run(\"server.main:app\", host=\"0.0.0.0\", port=8000, reload=True)\n"
  },
  {
    "path": "examples/authentication-methods/oauth/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\" : {\n    \"type\":\"oauth\",\n    \"client_url\":\"e.g. https://<your domain>/oauth/v2/authorize\",\n    \"authorization_url\":\"e.g. https://<your domain>/api/oauth.v2.access\",\n    \"scope\":\"search:read\",\n    \"authorization_content_type\":\"application/x-www-form-urlencoded\",\n    \"verification_tokens\":{\n\t\t\t\"openai\":\"<token from add plugin flow from the ChatGPT UI>\"\n    }\n  },\n\t\"api\":{\n    \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\",\n\t\t\"has_user_authentication\":true,\n\t\t\"type\":\"openapi\"\n\t},\n  \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"hello@legal.com\"\n}\n"
  },
  {
    "path": "examples/authentication-methods/service-http/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\":{\n\t\t\"type\":\"service_http\",\n\t\t\"authorization_type\":\"bearer\",\n\t\t\"verification_tokens\":{\n\t\t\t\"openai\":\"<token from add plugin flow from the ChatGPT UI>\"\n\t\t}\n\t},\n\t\"api\":{\n    \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\",\n\t\t\"has_user_authentication\":false,\n\t\t\"type\":\"openapi\"\n\t},\n  \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"hello@legal.com\"\n}\n"
  },
  {
    "path": "examples/authentication-methods/user-http/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\": {\n    \"type\": \"user_http\",\n    \"authorization_type\": \"bearer\"\n  },\n  \"api\": {\n    \"type\": \"openapi\",\n    \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\",\n    \"has_user_authentication\": false\n  },\n  \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"hello@legal.com\"\n}"
  },
  {
    "path": "examples/docker/elasticsearch/README.md",
    "content": "## Running Elasticsearch\n\n```bash\ndocker-compose up -d\n```\n\nElasticsearch should now be running at http://localhost:9200\n"
  },
  {
    "path": "examples/docker/elasticsearch/docker-compose.yaml",
    "content": "version: \"3.7\"\n\nservices:\n  elasticsearch:\n    image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2\n    container_name: elasticsearch\n    environment:\n      - discovery.type=single-node\n      - node.name=elasticsearch\n      - xpack.security.enabled=false\n    ulimits:\n      memlock:\n        soft: -1\n        hard: -1\n    ports:\n      - \"9200:9200\"\n    networks:\n      - esnet\n    volumes:\n      - esdata:/usr/share/elasticsearch/data\n\nnetworks:\n  esnet:\n\nvolumes:\n  esdata:\n    driver: local\n"
  },
  {
    "path": "examples/docker/milvus/docker-compose.yaml",
    "content": "version: '3.5'\n\nservices:\n  etcd:\n    container_name: milvus-etcd\n    image: quay.io/coreos/etcd:v3.5.0\n    environment:\n      - ETCD_AUTO_COMPACTION_MODE=revision\n      - ETCD_AUTO_COMPACTION_RETENTION=1000\n      - ETCD_QUOTA_BACKEND_BYTES=4294967296\n      - ETCD_SNAPSHOT_COUNT=50000\n    volumes:\n      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd\n    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd\n\n  minio:\n    container_name: milvus-minio\n    image: minio/minio:RELEASE.2023-03-20T20-16-18Z\n    environment:\n      MINIO_ACCESS_KEY: minioadmin\n      MINIO_SECRET_KEY: minioadmin\n    volumes:\n      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data\n    command: minio server /minio_data\n    healthcheck:\n      test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:9000/minio/health/live\"]\n      interval: 30s\n      timeout: 20s\n      retries: 3\n\n  standalone:\n    container_name: milvus-standalone\n    image: milvusdb/milvus:v2.2.5\n    command: [\"milvus\", \"run\", \"standalone\"]\n    environment:\n      ETCD_ENDPOINTS: etcd:2379\n      MINIO_ADDRESS: minio:9000\n    volumes:\n      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus\n    ports:\n      - \"19530:19530\"\n      - \"9091:9091\"\n    depends_on:\n      - \"etcd\"\n      - \"minio\"\n\nnetworks:\n  default:\n    name: milvus"
  },
  {
    "path": "examples/docker/qdrant/README.md",
    "content": "# Running the Retrieval Plugin with Qdrant in Docker Containers\n\nTo set up the ChatGPT retrieval plugin with a single instance of a Qdrant vector database, follow these steps:\n\n## Set Environment Variables\n\nSet the following environment variables:\n\n```bash\n# Provide your own OpenAI API key in order to start.\nexport OPENAI_API_KEY=\"<your_OpenAI_API_key>\"\n# This is an example of a minimal token generated by https://jwt.io/\nexport BEARER_TOKEN=\"<your_bearer_token>\"\n```\n\n## Run Qdrant and the Retrieval Plugin in Docker Containers\n\nBoth Docker containers might be launched with docker-compose:\n\n```bash\ndocker-compose up -d\n```\n\n## Store the Documents\n\nStore an initial batch of documents by calling the `/upsert` endpoint:\n\n```bash\ncurl -X POST \\\n  -H \"Content-type: application/json\" \\\n  -H \"Authorization: Bearer $BEARER_TOKEN\" \\\n  --data-binary '@documents.json' \\\n  \"http://localhost:80/upsert\"\n```\n\n## Send a Test Query\n\nYou can query Qdrant to find relevant document chunks by calling the `/query` endpoint:\n\n```bash\ncurl -X POST \\\n  -H \"Content-type: application/json\" \\\n  -H \"Authorization: Bearer $BEARER_TOKEN\" \\\n  --data-binary '@queries.json' \\\n  \"http://localhost:80/query\"\n```\n"
  },
  {
    "path": "examples/docker/qdrant/docker-compose.yaml",
    "content": "services:\n  retrieval-app:\n    build:\n      context: ../../../\n      dockerfile: Dockerfile\n    image: openai/chatgpt-retrieval-plugin\n    ports:\n      - \"80:80\"\n    depends_on:\n      - qdrant\n    environment:\n      DATASTORE: \"qdrant\"\n      QDRANT_URL: \"http://qdrant\"\n      BEARER_TOKEN: \"${BEARER_TOKEN}\"\n      OPENAI_API_KEY: \"${OPENAI_API_KEY}\"\n  qdrant:\n    image: qdrant/qdrant:v1.0.3"
  },
  {
    "path": "examples/docker/qdrant/documents.json",
    "content": "{\n  \"documents\": [\n    {\n      \"id\": \"openai\",\n      \"text\": \"OpenAI is an AI research and deployment company. Our mission is to ensure that artificial general intelligence benefits all of humanity.\",\n      \"metadata\": {\n        \"created_at\": \"2023-03-14\"\n      }\n    },\n    {\n      \"id\": \"chatgpt\",\n      \"text\": \"ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. The dialogue format makes it possible for ChatGPT to answer followup questions, admit its mistakes, challenge incorrect premises, and reject inappropriate requests.\"\n    },\n    {\n      \"id\": \"qdrant\",\n      \"text\": \"Qdrant is a vector similarity engine & vector database. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!\",\n      \"metadata\": {\n        \"created_at\": \"2023-03-14\",\n        \"author\": \"Kacper Łukawski\"\n      }\n    }\n  ]\n}"
  },
  {
    "path": "examples/docker/qdrant/queries.json",
    "content": "{\n  \"queries\": [\n    {\n      \"query\": \"What vector database should I use?\"\n    }\n  ]\n}"
  },
  {
    "path": "examples/docker/redis/docker-compose.yml",
    "content": "version: \"3.9\"\n\nservices:\n  redis:\n    image: redis/redis-stack-server:latest\n    ports:\n      - \"6379:6379\"\n    volumes:\n        - redis_data:/data\n    healthcheck:\n      test: [\"CMD\", \"redis-cli\", \"-h\", \"localhost\", \"-p\", \"6379\", \"ping\"]\n      interval: 2s\n      timeout: 1m30s\n      retries: 5\n      start_period: 5s\n\nvolumes:\n  redis_data:"
  },
  {
    "path": "examples/function-calling/README.md",
    "content": "# Retrieval Plugin Function Calling Guide\n\nThis guide provides an overview of how to use the Retrieval Plugin with function calling in both the [Chat Completions API](https://platform.openai.com/docs/guides/function-calling) and the [Assistants API](https://platform.openai.com/docs/assistants/overview). This allows the model to decide when to use your functions (query, fetch, upsert) based on the conversation context.\n\n## Table of Contents\n\n- [Function Calling with Chat Completions](#function-calling-with-chat-completions)\n- [Function Calling with Assistants API](#function-calling-with-assistants-api)\n- [Tool Definitions](#tool-definitions)\n- [Chat Completions Example](#chat-completions-example)\n- [Assistants API Example](#assistants-api-example)\n\n## Function Calling with Chat Completions\n\nIn a call to the chat completions API, you can describe functions and have the model generate a JSON object containing arguments to call one or many functions. The latest models (gpt-3.5-turbo-0125 and gpt-4-turbo-preview) have been trained to detect when a function should be called and to respond with JSON that adheres to the function signature.\n\nYou can define the functions for the Retrieval Plugin endpoints and pass them in as tools when you use the Chat Completions API with one of the latest models. The model will then intelligently call the functions. You can use function calling to write queries to your APIs, call the endpoint on the backend, and return the response as a tool message to the model to continue the conversation. The function definitions/schemas and an example can be found [here](#chat-completions-example).\n\n## Function Calling with Assistants API\n\nYou can use the same function definitions with the OpenAI [Assistants API](https://platform.openai.com/docs/assistants/overview), specifically the [function calling in tool use](https://platform.openai.com/docs/assistants/tools/function-calling). 
The Assistants API allows you to build AI assistants within your own applications, leveraging models, tools, and knowledge to respond to user queries. The function definitions/schemas and an example can be found [here](/examples/function-calling/). The Assistants API natively supports retrieval from uploaded files, so you should use the Retrieval Plugin with function calling only if you want more granular control of your retrieval system (e.g. embedding chunk length, embedding model / size, etc.).\n\nParallel function calling is supported for both the Chat Completions API and the Assistants API. This means you can perform multiple tasks, such as querying something and saving something back to the vector database, in the same message.\n\nRead more about function calling with the Retrieval Plugin [here](#assistants-api-example).\n\n## Tool Definitions\n\nHere is the tool definition for the `query` function:\n\n```python\ntools = [\n    {\n        \"type\": \"function\",\n        \"function\": {\n            \"name\": \"query\",\n            \"description\": \"Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\",\n            \"parameters\": {\n                \"type\": \"object\",\n                \"properties\": {\n                    \"queries\": {\n                        \"type\": \"array\",\n                        \"items\": {\n                            \"type\": \"object\",\n                            \"properties\": {\n                                \"query\": {\n                                    \"type\": \"string\",\n                                    \"title\": \"Query\"\n                                },\n                                \"filter\": {\n                                    \"type\": \"object\",\n                                    \"properties\": {\n                                        \"document_id\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Document Id\"\n                                        },\n                                        \"source\": {\n                                            \"type\": \"string\",\n                                            \"enum\": [\"email\", \"file\", \"chat\"],\n                                        },\n                                        \"source_id\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Source Id\"\n                                        },\n                                        \"author\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Author\"\n                                        },\n                                        \"start_date\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Start Date\"\n                                        },\n                                        \"end_date\": 
{\n                                            \"type\": \"string\",\n                                            \"title\": \"End Date\"\n                                        }\n                                    }\n                                },\n                                \"top_k\": {\n                                    \"type\": \"integer\",\n                                    \"title\": \"Top K\",\n                                    \"default\": 3\n                                }\n                            },\n                            \"required\": [\"query\"]\n                        },\n                        \"description\": \"Array of queries to be processed\",\n                    },\n                },\n                \"required\": [\"queries\"],\n            },\n        }\n    }\n]\n```\n\nIf using memory, as defined [here](/examples/memory/), then tools would include both the `query` and `upsert` functions:\n\n```python\ntools = [\n    {\n        \"type\": \"function\",\n        \"function\": {\n            \"name\": \"query\",\n            \"description\": \"Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\",\n            \"parameters\": {\n                \"type\": \"object\",\n                \"properties\": {\n                    \"queries\": {\n                        \"type\": \"array\",\n                        \"items\": {\n                            \"type\": \"object\",\n                            \"properties\": {\n                                \"query\": {\n                                    \"type\": \"string\",\n                                    \"title\": \"Query\"\n                                },\n                                \"filter\": {\n                                    \"type\": \"object\",\n                                    \"properties\": {\n                                        \"document_id\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Document Id\"\n                                        },\n                                        \"source\": {\n                                            \"type\": \"string\",\n                                            \"enum\": [\"email\", \"file\", \"chat\"],\n                                        },\n                                        \"source_id\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Source Id\"\n                                        },\n                                        \"author\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Author\"\n                                        },\n                                        \"start_date\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Start Date\"\n                                        },\n                                        \"end_date\": 
{\n                                            \"type\": \"string\",\n                                            \"title\": \"End Date\"\n                                        }\n                                    }\n                                },\n                                \"top_k\": {\n                                    \"type\": \"integer\",\n                                    \"title\": \"Top K\",\n                                    \"default\": 3\n                                }\n                            },\n                            \"required\": [\"query\"]\n                        },\n                        \"description\": \"Array of queries to be processed\",\n                    },\n                },\n                \"required\": [\"queries\"],\n            },\n        }\n    },\n    {\n        \"type\": \"function\",\n        \"function\": {\n            \"name\": \"upsert\",\n            \"description\": \"Save chat information. Accepts an array of documents with text (potential questions + conversation text), metadata (source 'chat' and timestamp, no ID as this will be generated). 
Confirm with the user before saving, ask for more details/context.\",\n            \"parameters\": {\n                \"type\": \"object\",\n                \"properties\": {\n                    \"documents\": {\n                        \"type\": \"array\",\n                        \"items\": {\n                            \"type\": \"object\",\n                            \"properties\": {\n                                \"id\": {\n                                    \"type\": \"string\",\n                                    \"title\": \"Id\"\n                                },\n                                \"text\": {\n                                    \"type\": \"string\",\n                                    \"title\": \"Text\"\n                                },\n                                \"metadata\": {\n                                    \"type\": \"object\",\n                                    \"properties\": {\n                                        \"source\": {\n                                            \"type\": \"string\",\n                                            \"enum\": [\"email\", \"file\", \"chat\"],\n                                        },\n                                        \"source_id\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Source Id\"\n                                        },\n                                        \"url\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Url\"\n                                        },\n                                        \"created_at\": {\n                                            \"type\": \"string\",\n                                            \"title\": \"Created At\"\n                                        },\n                                        \"author\": {\n                                   
         \"type\": \"string\",\n                                            \"title\": \"Author\"\n                                        }\n                                    }\n                                }\n                            },\n                            \"required\": [\"text\"]\n                        },\n                        \"description\": \"Array of documents to be upserted\",\n                    },\n                },\n                \"required\": [\"documents\"],\n            },\n        }\n    },\n]\n```\n\n## Chat Completions Example\n\nHere is an example of how to use the [Chat Completions API with function calling](https://platform.openai.com/docs/guides/function-calling):\n\n```python\n# Step 1: send the conversation and available functions to the model\nmessages = [{\"role\": \"user\", \"content\": \"What's the weather like in San Francisco, Tokyo, and Paris?\"}]\ntools = tools # as above\nresponse = client.chat.completions.create(\n    model=\"gpt-3.5-turbo-0125\",\n    messages=messages,\n    tools=tools,\n    tool_choice=\"auto\",  # auto is default, but we'll be explicit\n)\nresponse_message = response.choices[0].message\ntool_calls = response_message.tool_calls\n# Step 2: check if the model wanted to call a function\nif tool_calls:\n    # Step 3: call the function\n    # Note: the JSON response may not always be valid; be sure to handle errors\n    available_functions = {\n        \"query\": query,\n    }  # only one function in this example, but you can have multiple\n    messages.append(response_message)  # extend conversation with assistant's reply\n    # Step 4: send the info for each function call and function response to the model\n    for tool_call in tool_calls:\n        function_name = tool_call.function.name\n        function_to_call = available_functions[function_name]\n        function_args = json.loads(tool_call.function.arguments)\n        function_response = function_to_call(\n            
queries=function_args.get(\"query\"),\n            filter=function_args.get(\"filter\"),\n        )\n        messages.append(\n            {\n                \"tool_call_id\": tool_call.id,\n                \"role\": \"tool\",\n                \"name\": function_name,\n                \"content\": function_response,\n            }\n        )  # extend conversation with function response\n    second_response = client.chat.completions.create(\n        model=\"gpt-3.5-turbo-0125\",\n        messages=messages,\n    )  # get a new response from the model where it can see the function response\n    print(second_response)\n```\n\n## Assistants API Example\n\nFor more information on how to use the Assistants API with function calling, refer to the [official documentation](https://platform.openai.com/docs/assistants/tools/function-calling). As mentioned above, the Assistants API natively supports retrieval from uploaded files, so you should use the Retrieval Plugin with function calling only if you want more granular control of your retrieval system (e.g. embedding chunk length, embedding model / size, etc.). Here is a brief example:\n\nFirst, define your functions when creating an Assistant:\n\n```python\nassistant = client.beta.assistants.create(\n  instructions=\"You are a personal assistant with access to all of the user's personal documents.\",\n  model=\"gpt-4-turbo-preview\",\n  tools=tools # as defined above\n)\n```\n\nWhen you initiate a Run with a user Message that triggers the function, the Run will enter a pending status. After it processes, the Run will enter a `requires_action` state which you can verify by retrieving the Run. 
The model can provide multiple functions to call at once using parallel function calling:\n\n```python\n{\n  \"id\": \"run_abc123\",\n  \"object\": \"thread.run\",\n  \"assistant_id\": \"asst_abc123\",\n  \"thread_id\": \"thread_abc123\",\n  \"status\": \"requires_action\",\n  \"required_action\": {\n    \"type\": \"submit_tool_outputs\",\n    \"submit_tool_outputs\": {\n      \"tool_calls\": [\n        {\n          \"id\": \"call_abc123\",\n          \"type\": \"function\",\n          \"function\": {\n            \"name\": \"query\",\n            \"arguments\": \"{ \\\"queries\\\": <queries go here> }\"\n          }\n        },\n        {\n          \"id\": \"call_abc456\",\n          \"type\": \"function\",\n          \"function\": {\n            \"name\": \"upsert\",\n            \"arguments\": \"{ \\\"text\\\": <text goes here> }\"\n          }\n        }\n      ]\n    }\n  },\n...\n```\n\nYou can then complete the Run by [submitting the tool output](https://platform.openai.com/docs/api-reference/runs/submitToolOutputs) from the function(s) you call. Pass the tool_call_id referenced in the required_action object above to match output to each function call.\n\n```python\nrun = client.beta.threads.runs.submit_tool_outputs(\n  thread_id=thread.id,\n  run_id=run.id,\n  tool_outputs=[\n      {\n        \"tool_call_id\": call_ids[0],\n        \"output\": [\"\"\"\"list of results here\"\"\"],\n      },\n      {\n        \"tool_call_id\": call_ids[1],\n        \"output\": [\"\"\"\"list of results here\"\"\"],\n      },\n    ]\n)\n```\n"
  },
  {
    "path": "examples/memory/README.md",
    "content": "# ChatGPT Retrieval Plugin with Memory\n\nThis example demonstrates how to give ChatGPT the ability to remember information from conversations and store it in the retrieval plugin for later use. By allowing the model to access the `/upsert` endpoint, it can save snippets from the conversation to the vector database and retrieve them when needed.\n\n## Setup\n\nTo enable ChatGPT to save information from conversations, follow these steps:\n\n- Navigate to the \"Configure\" tab in the [create GPT page](https://chat.openai.com/gpts/editor), and copy the contents of [openapi.yaml](openapi.yaml) into the custom actions section. This will give the custom GPT access to both the Retrieval Plugin's query and upsert endpoints.\n\n**Optional:** If you make any changes to the plugin instructions or metadata models, you can also copy the contents of [main.py](main.py) into the main [main.py](../../server/main.py) file. This will allow you to access the openapi.json at `http://0.0.0.0:8000/sub/openapi.json` when you run the app locally. You can convert from JSON to YAML format with [Swagger Editor](https://editor.swagger.io/). Alternatively, you can replace the openapi.yaml file with an openapi.json file.\n\nAfter completing these steps, your custom GPT will be able to access your plugin's `/upsert` endpoint and save snippets from the conversation to the vector database. This enables the model to remember information from previous conversations and retrieve it when needed.\n"
  },
  {
    "path": "examples/memory/ai-plugin.json",
    "content": "{\n    \"schema_version\": \"v1\",\n    \"name_for_model\": \"retrieval\",\n    \"name_for_human\": \"Retrieval Plugin\",\n    \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information, or asks you to save information for later.\",\n    \"description_for_human\": \"Search through your documents.\",\n    \"auth\": {\n      \"type\": \"user_http\",\n      \"authorization_type\": \"bearer\"\n    },\n    \"api\": {\n      \"type\": \"openapi\",\n      \"url\": \"https://your-app-url.com/.well-known/openapi.yaml\",\n      \"has_user_authentication\": false\n    },\n    \"logo_url\": \"https://your-app-url.com/.well-known/logo.png\",\n    \"contact_email\": \"hello@contact.com\", \n    \"legal_info_url\": \"hello@legal.com\"\n  }"
  },
  {
    "path": "examples/memory/main.py",
    "content": "# This is a version of the main.py file found in ../../server/main.py that also gives ChatGPT access to the upsert endpoint\n# (allowing it to save information from the chat back to the vector) database.\n# Copy and paste this into the main file at ../../server/main.py if you choose to give the model access to the upsert endpoint\n# and want to access the openapi.json when you run the app locally at http://0.0.0.0:8000/sub/openapi.json.\nimport os\nfrom typing import Optional\nimport uvicorn\nfrom fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile\nfrom fastapi.security import HTTPBearer, HTTPAuthorizationCredentials\nfrom fastapi.staticfiles import StaticFiles\nfrom loguru import logger\n\nfrom models.api import (\n    DeleteRequest,\n    DeleteResponse,\n    QueryRequest,\n    QueryResponse,\n    UpsertRequest,\n    UpsertResponse,\n)\nfrom datastore.factory import get_datastore\nfrom services.file import get_document_from_file\n\nfrom models.models import DocumentMetadata, Source\n\n\nbearer_scheme = HTTPBearer()\nBEARER_TOKEN = os.environ.get(\"BEARER_TOKEN\")\nassert BEARER_TOKEN is not None\n\n\ndef validate_token(credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme)):\n    if credentials.scheme != \"Bearer\" or credentials.credentials != BEARER_TOKEN:\n        raise HTTPException(status_code=401, detail=\"Invalid or missing token\")\n    return credentials\n\n\napp = FastAPI()\napp.mount(\"/.well-known\", StaticFiles(directory=\".well-known\"), name=\"static\")\n\n# Create a sub-application, in order to access just the upsert and query endpoints in the OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally\nsub_app = FastAPI(\n    title=\"Retrieval Plugin API\",\n    description=\"A retrieval API for querying and filtering documents based on natural language queries and metadata\",\n    version=\"1.0.0\",\n    servers=[{\"url\": \"https://your-app-url.com\"}],\n    
dependencies=[Depends(validate_token)],\n)\napp.mount(\"/sub\", sub_app)\n\n\n@app.post(\n    \"/upsert-file\",\n    response_model=UpsertResponse,\n)\nasync def upsert_file(\n    file: UploadFile = File(...),\n    metadata: Optional[str] = Form(None),\n):\n    try:\n        metadata_obj = (\n            DocumentMetadata.parse_raw(metadata)\n            if metadata\n            else DocumentMetadata(source=Source.file)\n        )\n    except:\n        metadata_obj = DocumentMetadata(source=Source.file)\n\n    document = await get_document_from_file(file, metadata_obj)\n\n    try:\n        ids = await datastore.upsert([document])\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=f\"str({e})\")\n\n\n@app.post(\n    \"/upsert\",\n    response_model=UpsertResponse,\n)\nasync def upsert_main(\n    request: UpsertRequest = Body(...),\n    token: HTTPAuthorizationCredentials = Depends(validate_token),\n):\n    try:\n        ids = await datastore.upsert(request.documents)\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@sub_app.post(\n    \"/upsert\",\n    response_model=UpsertResponse,\n    # NOTE: We are describing the shape of the API endpoint input due to a current limitation in parsing arrays of objects from OpenAPI schemas. This will not be necessary in the future.\n    description=\"Save chat information. Accepts an array of documents with text (potential questions + conversation text), metadata (source 'chat' and timestamp, no ID as this will be generated). 
Confirm with the user before saving, ask for more details/context.\",\n)\nasync def upsert(\n    request: UpsertRequest = Body(...),\n    token: HTTPAuthorizationCredentials = Depends(validate_token),\n):\n    try:\n        ids = await datastore.upsert(request.documents)\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.post(\n    \"/query\",\n    response_model=QueryResponse,\n)\nasync def query_main(\n    request: QueryRequest = Body(...),\n    token: HTTPAuthorizationCredentials = Depends(validate_token),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@sub_app.post(\n    \"/query\",\n    response_model=QueryResponse,\n    # NOTE: We are describing the shape of the API endpoint input due to a current limitation in parsing arrays of objects from OpenAPI schemas. This will not be necessary in the future.\n    description=\"Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\",\n)\nasync def query(\n    request: QueryRequest = Body(...),\n    token: HTTPAuthorizationCredentials = Depends(validate_token),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.delete(\n    \"/delete\",\n    response_model=DeleteResponse,\n)\nasync def delete(\n    request: DeleteRequest = Body(...),\n    token: HTTPAuthorizationCredentials = Depends(validate_token),\n):\n    if not (request.ids or request.filter or request.delete_all):\n        raise HTTPException(\n            status_code=400,\n            detail=\"One of ids, filter, or delete_all is required\",\n        )\n    try:\n        success = await datastore.delete(\n            ids=request.ids,\n            filter=request.filter,\n            delete_all=request.delete_all,\n        )\n        return DeleteResponse(success=success)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.on_event(\"startup\")\nasync def startup():\n    global datastore\n    datastore = await get_datastore()\n\n\ndef start():\n    uvicorn.run(\"server.main:app\", host=\"0.0.0.0\", port=8000, reload=True)\n"
  },
  {
    "path": "examples/memory/openapi.yaml",
    "content": "openapi: 3.0.2\ninfo:\n  title: Retrieval Plugin API\n  description: A retrieval API for querying and filtering documents based on natural language queries and metadata\n  version: 1.0.0\nservers:\n  - url: https://your-app-url.com\npaths:\n  /upsert:\n    post:\n      summary: Upsert\n      description: Save chat information. Accepts an array of documents with text (potential questions + conversation text), metadata (source 'chat' and timestamp, no ID as this will be generated). Confirm with the user before saving, ask for more details/context.\n      operationId: upsert_upsert_post\n      requestBody:\n        content:\n          application/json:\n            schema:\n              $ref: \"#/components/schemas/UpsertRequest\"\n        required: true\n      responses:\n        \"200\":\n          description: Successful Response\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/UpsertResponse\"\n        \"422\":\n          description: Validation Error\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/HTTPValidationError\"\n      security:\n        - HTTPBearer: []\n  /query:\n    post:\n      summary: Query\n      description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\n      operationId: query_query_post\n      requestBody:\n        content:\n          application/json:\n            schema:\n              $ref: \"#/components/schemas/QueryRequest\"\n        required: true\n      responses:\n        \"200\":\n          description: Successful Response\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/QueryResponse\"\n        \"422\":\n          description: Validation Error\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/HTTPValidationError\"\n      security:\n        - HTTPBearer: []\ncomponents:\n  schemas:\n    Document:\n      title: Document\n      required:\n        - text\n      type: object\n      properties:\n        id:\n          title: Id\n          type: string\n        text:\n          title: Text\n          type: string\n        metadata:\n          $ref: \"#/components/schemas/DocumentMetadata\"\n    DocumentChunkMetadata:\n      title: DocumentChunkMetadata\n      type: object\n      properties:\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        url:\n          title: Url\n          type: string\n        created_at:\n          title: Created At\n          type: string\n        author:\n          title: Author\n          type: string\n        document_id:\n          title: Document Id\n          type: string\n    DocumentChunkWithScore:\n      title: DocumentChunkWithScore\n      required:\n        - text\n        - metadata\n        - score\n      type: object\n      properties:\n        id:\n          title: Id\n          type: string\n        text:\n          title: Text\n          type: string\n        metadata:\n          $ref: \"#/components/schemas/DocumentChunkMetadata\"\n        embedding:\n          title: Embedding\n        
  type: array\n          items:\n            type: number\n        score:\n          title: Score\n          type: number\n    DocumentMetadata:\n      title: DocumentMetadata\n      type: object\n      properties:\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        url:\n          title: Url\n          type: string\n        created_at:\n          title: Created At\n          type: string\n        author:\n          title: Author\n          type: string\n    DocumentMetadataFilter:\n      title: DocumentMetadataFilter\n      type: object\n      properties:\n        document_id:\n          title: Document Id\n          type: string\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        author:\n          title: Author\n          type: string\n        start_date:\n          title: Start Date\n          type: string\n        end_date:\n          title: End Date\n          type: string\n    HTTPValidationError:\n      title: HTTPValidationError\n      type: object\n      properties:\n        detail:\n          title: Detail\n          type: array\n          items:\n            $ref: \"#/components/schemas/ValidationError\"\n    Query:\n      title: Query\n      required:\n        - query\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        filter:\n          $ref: \"#/components/schemas/DocumentMetadataFilter\"\n        top_k:\n          title: Top K\n          type: integer\n          default: 3\n    QueryRequest:\n      title: QueryRequest\n      required:\n        - queries\n      type: object\n      properties:\n        queries:\n          title: Queries\n          type: array\n          items:\n            $ref: \"#/components/schemas/Query\"\n    QueryResponse:\n      title: QueryResponse\n      required:\n        - 
results\n      type: object\n      properties:\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/QueryResult\"\n    QueryResult:\n      title: QueryResult\n      required:\n        - query\n        - results\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/DocumentChunkWithScore\"\n    Source:\n      title: Source\n      enum:\n        - email\n        - file\n        - chat\n      type: string\n      description: An enumeration.\n    UpsertRequest:\n      title: UpsertRequest\n      required:\n        - documents\n      type: object\n      properties:\n        documents:\n          title: Documents\n          type: array\n          items:\n            $ref: \"#/components/schemas/Document\"\n    UpsertResponse:\n      title: UpsertResponse\n      required:\n        - ids\n      type: object\n      properties:\n        ids:\n          title: Ids\n          type: array\n          items:\n            type: string\n    ValidationError:\n      title: ValidationError\n      required:\n        - loc\n        - msg\n        - type\n      type: object\n      properties:\n        loc:\n          title: Location\n          type: array\n          items:\n            anyOf:\n              - type: string\n              - type: integer\n        msg:\n          title: Message\n          type: string\n        type:\n          title: Error Type\n          type: string\n  securitySchemes:\n    HTTPBearer:\n      type: http\n      scheme: bearer\n"
  },
  {
    "path": "examples/providers/azurecosmosdb/semantic-search.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"de02cdc9\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import requests\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"7e5d60e1\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Document retrieval: upsert and basic query usage\\n\",\n    \"\\n\",\n    \"In this walkthrough we will go over the Retrieval API with a Azure CosmosDB Mongo vCore datastore for semantic search.\\n\",\n    \"\\n\",\n    \"Before running the notebook please initialize the retrieval API and have it running locally somewhere. Please follow the instructions to start the Retreival API provided [here](https://github.com/openai/chatgpt-retrieval-plugin#quickstart). \\n\",\n    \"\\n\",\n    \"[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) Azure Cosmos DB is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service.\\n\",\n    \"\\n\",\n    \"Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"80988348\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Document\\n\",\n    \"\\n\",\n    \"First we will create a list of documents. From the perspective of the retrieval plugin, a [document](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/models/models.py) consists of an \\\"id\\\", \\\"text\\\", \\\"embedding\\\"(optional) and a collection of \\\"metadata\\\". 
The \\\"metadata\\\" has \\\"source\\\", \\\"source_id\\\", \\\"created_at\\\", \\\"url\\\" and \\\"author\\\" fields. Query metadata does not expose the \\\"url\\\" field.\\n\",\n    \"\\n\",\n    \"For this example we have taken some data about a few dog breeds. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"id\": \"52829ffc\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"document_1 = {\\n\",\n    \"    \\\"id\\\": \\\"Siberian Husky\\\",\\n\",\n    \"    \\\"text\\\": \\\"Siberian Huskies are strikingly beautiful and energetic Arctic breed dogs known for their captivating blue eyes and remarkable endurance in cold climates.\\\"\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"document_2 = {\\n\",\n    \"    \\\"id\\\": \\\"Alaskan Malamute\\\",\\n\",\n    \"    \\\"text\\\": \\\"The Alaskan Malamute is a powerful and friendly Arctic sled dog breed known for its strength, endurance, and affectionate nature.\\\"\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"document_3 = {\\n\",\n    \"    \\\"id\\\": \\\"Samoyed\\\",\\n\",\n    \"    \\\"text\\\": \\\"The Samoyed is a cheerful and fluffy Arctic breed, renowned for its smile and gentle disposition, originally used for herding reindeer and pulling sleds in Siberia.\\\"\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"6af96f59\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Indexing the Docs\\n\",\n    \"\\n\",\n    \"On the first insert, the datastore will create the collection and index if necessary on the field `embedding`. Currently hybrid search is not yet supported. \\n\",\n    \"\\n\",\n    \"To make these requests to the retrieval app API, we will need to provide authorization in the form of the BEARER_TOKEN we set earlier. 
We do this below:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"id\": \"d68e796e\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"BEARER_TOKEN_HERE = \\\"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkFheXVzaCBLYXRhcmlhIiwiaWF0IjoxNTE2MjM5MDIyfQ.VHEVK_IdThXZJr8aQsfjVQ-_n4raepdpqsC5gYDsubE\\\"\\n\",\n    \"endpoint_url = 'http://0.0.0.0:8000'\\n\",\n    \"headers = {\\n\",\n    \"    \\\"Authorization\\\": f\\\"Bearer {BEARER_TOKEN_HERE}\\\"\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"id\": \"954a09da\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'ids': ['doc:Siberian Husky:chunk:Siberian Husky_0',\\n\",\n       \"  'doc:Alaskan Malamute:chunk:Alaskan Malamute_0',\\n\",\n       \"  'doc:Samoyed:chunk:Samoyed_0']}\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/upsert\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\\"documents\\\": [document_1, document_2, document_3]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"431a8616\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Querying the datastore\\n\",\n    \"Let's query the data store for dogs based on the place of their origin.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"23441d46\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"queries = [\\n\",\n    \"    {\\n\",\n    \"        \\\"query\\\":\\\"I want dog breeds from Siberia.\\\",\\n\",\n    \"        \\\"top_k\\\":2\\n\",\n    \"    },\\n\",\n    \"    {\\n\",\n    \"        \\\"query\\\":\\\"I want dog breed from 
Alaska.\\\",\\n\",\n    \"        \\\"top_k\\\":1\\n\",\n    \"    }\\n\",\n    \"]\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\\"queries\\\":queries}\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"705181ee\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Deleting the data from the datastore\\n\",\n    \"You can either delete all the data, or provide a list of docIds to delete\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"b15513ca\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"response = requests.delete(\\n\",\n    \"    f\\\"{endpoint_url}/delete\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\\"ids\\\":[\\\"doc:SiberianHusky:chunk:SiberianHusky_0\\\"]}\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"cc748e50\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"response = requests.delete(\\n\",\n    \"    f\\\"{endpoint_url}/delete\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\\"delete_all\\\":True}\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"19531965\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": 
\"ipython3\",\n   \"version\": \"3.9.12\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "examples/providers/elasticsearch/search.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Using Elasticsearch as a datastore\\n\",\n    \"\\n\",\n    \"In this walkthrough we will see how to use the retrieval API with a Elasticsearch datastore for *search / question-answering*.\\n\",\n    \"\\n\",\n    \"Before running this notebook you should have already initialized the retrieval API and have it running locally or elsewhere. See readme for instructions on how to do this.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## App Quickstart\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"1. Install Python 3.10 if not already installed.\\n\",\n    \"\\n\",\n    \"2. Clone the `retrieval-app` repository:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"git clone git@github.com:openai/retrieval-app.git\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"3. Navigate to the app directory:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"cd /path/to/retrieval-app\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"4. Install `poetry`:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"pip install poetry\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"5. Create a new virtual environment:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry env use python3.10\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"6. Install the `retrieval-app` dependencies:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry install\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"7. Set app environment variables:\\n\",\n    \"\\n\",\n    \"* `BEARER_TOKEN`: Secret token used by the app to authorize incoming requests. We will later include this in the request `headers`. 
The token can be generated however you prefer, such as using [jwt.io](https://jwt.io/).\\n\",\n    \"\\n\",\n    \"* `OPENAI_API_KEY`: The OpenAI API key used for generating embeddings with the OpenAI embeddings model. [Get an API key here](https://platform.openai.com/account/api-keys)!\\n\",\n    \"\\n\",\n    \"8. Set Elasticsearch-specific environment variables:\\n\",\n    \"\\n\",\n    \"* `DATASTORE`: set to `elasticsearch`.\\n\",\n    \"\\n\",\n    \"9. Set the Elasticsearch connection-specific environment variables. Either set `ELASTICSEARCH_CLOUD_ID` or `ELASTICSEARCH_URL`.\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_CLOUD_ID`: Set to your deployment cloud id. You can find this in the [Elasticsearch console](https://cloud.elastic.co).\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_URL`: Set to your Elasticsearch URL, which looks like `https://<username>:<password>@<host>:<port>`. You can find this in the [Elasticsearch console](https://cloud.elastic.co).\\n\",\n    \"\\n\",\n    \"10. Set the Elasticsearch authentication-specific environment variables. Either set `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`.\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_USERNAME`: Set to your Elasticsearch username. You can find this in the [Elasticsearch console](https://cloud.elastic.co). Typically this is set to `elastic`.\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_PASSWORD`: Set to your Elasticsearch password. You can find this in the [Elasticsearch console](https://cloud.elastic.co) in security.\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_API_KEY`: Set to your Elasticsearch API key. You can set one up on the Kibana Stack Management page.\\n\",\n    \"\\n\",\n    \"11. Set the Elasticsearch index-specific environment variables.\\n\",\n    \"\\n\",\n    \"* `ELASTICSEARCH_INDEX`: Set to the name of the Elasticsearch index you want to use.\\n\",\n    \"\\n\",\n    \"12. 
Run the app with:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry run start\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"If running the app locally you should see something like:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"INFO:     Uvicorn running on http://0.0.0.0:8000\\n\",\n    \"INFO:     Application startup complete.\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"In that case, the app is automatically connected to our index (specified by `ELASTICSEARCH_INDEX`), if no index with that name existed beforehand, the app creates one for us.\\n\",\n    \"\\n\",\n    \"Now we're ready to move on to populating our index with some data.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Required Libraries\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"There are a few Python libraries we must `pip install` for this notebook to run, those are:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 39,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"\\u001b[1m[\\u001b[0m\\u001b[34;49mnotice\\u001b[0m\\u001b[1;39;49m]\\u001b[0m\\u001b[39;49m A new release of pip is available: \\u001b[0m\\u001b[31;49m23.2\\u001b[0m\\u001b[39;49m -> \\u001b[0m\\u001b[32;49m23.2.1\\u001b[0m\\n\",\n      \"\\u001b[1m[\\u001b[0m\\u001b[34;49mnotice\\u001b[0m\\u001b[1;39;49m]\\u001b[0m\\u001b[39;49m To update, run: \\u001b[0m\\u001b[32;49mpip install --upgrade pip\\u001b[0m\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install -qU datasets pandas tqdm\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Preparing Data\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    
\"In this example, we will use the **S**tanford **Qu**estion **A**nswering **D**ataset (SQuAD2), which we download from Hugging Face Datasets.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from datasets import load_dataset\\n\",\n    \"\\n\",\n    \"data = load_dataset(\\\"squad_v2\\\", split=\\\"train\\\")\\n\",\n    \"data\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Transform the data into a Pandas dataframe for simpler preprocessing.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 41,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>title</th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>56be85543aeaaa14008c9063</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>When did Beyonce start becoming 
popular?</td>\\n\",\n       \"      <td>{'text': ['in the late 1990s'], 'answer_start'...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>56be85543aeaaa14008c9065</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>What areas did Beyonce compete in when she was...</td>\\n\",\n       \"      <td>{'text': ['singing and dancing'], 'answer_star...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>56be85543aeaaa14008c9066</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>When did Beyonce leave Destiny's Child and bec...</td>\\n\",\n       \"      <td>{'text': ['2003'], 'answer_start': [526]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>56bf6b0f3aeaaa14008c9601</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>In what city and state did Beyonce  grow up?</td>\\n\",\n       \"      <td>{'text': ['Houston, Texas'], 'answer_start': [...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>56bf6b0f3aeaaa14008c9602</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>In which decade did Beyonce become famous?</td>\\n\",\n       \"      <td>{'text': ['late 1990s'], 'answer_start': [276]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                         id    title  \\\\\\n\",\n       \"0  56be85543aeaaa14008c9063  
Beyoncé   \\n\",\n       \"1  56be85543aeaaa14008c9065  Beyoncé   \\n\",\n       \"2  56be85543aeaaa14008c9066  Beyoncé   \\n\",\n       \"3  56bf6b0f3aeaaa14008c9601  Beyoncé   \\n\",\n       \"4  56bf6b0f3aeaaa14008c9602  Beyoncé   \\n\",\n       \"\\n\",\n       \"                                             context  \\\\\\n\",\n       \"0  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"1  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"2  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"3  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"4  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"\\n\",\n       \"                                            question  \\\\\\n\",\n       \"0           When did Beyonce start becoming popular?   \\n\",\n       \"1  What areas did Beyonce compete in when she was...   \\n\",\n       \"2  When did Beyonce leave Destiny's Child and bec...   \\n\",\n       \"3      In what city and state did Beyonce  grow up?    \\n\",\n       \"4         In which decade did Beyonce become famous?   \\n\",\n       \"\\n\",\n       \"                                             answers  \\n\",\n       \"0  {'text': ['in the late 1990s'], 'answer_start'...  \\n\",\n       \"1  {'text': ['singing and dancing'], 'answer_star...  \\n\",\n       \"2          {'text': ['2003'], 'answer_start': [526]}  \\n\",\n       \"3  {'text': ['Houston, Texas'], 'answer_start': [...  
\\n\",\n       \"4    {'text': ['late 1990s'], 'answer_start': [276]}  \"\n      ]\n     },\n     \"execution_count\": 41,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"data = data.to_pandas()\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset contains a lot of duplicate `context` paragraphs, this is because each `context` can have many relevant questions. We don't want these duplicates so we remove like so:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 42,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"19029\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>title</th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>56be85543aeaaa14008c9063</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ 
b...</td>\\n\",\n       \"      <td>When did Beyonce start becoming popular?</td>\\n\",\n       \"      <td>{'text': ['in the late 1990s'], 'answer_start'...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>15</th>\\n\",\n       \"      <td>56be86cf3aeaaa14008c9076</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Following the disbandment of Destiny's Child i...</td>\\n\",\n       \"      <td>After her second solo album, what other entert...</td>\\n\",\n       \"      <td>{'text': ['acting'], 'answer_start': [207]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>27</th>\\n\",\n       \"      <td>56be88473aeaaa14008c9080</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>A self-described \\\"modern-day feminist\\\", Beyonc...</td>\\n\",\n       \"      <td>In her music, what are some recurring elements...</td>\\n\",\n       \"      <td>{'text': ['love, relationships, and monogamy']...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>39</th>\\n\",\n       \"      <td>56be892d3aeaaa14008c908b</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles was born in Houston, T...</td>\\n\",\n       \"      <td>Beyonce's younger sibling also sang with her i...</td>\\n\",\n       \"      <td>{'text': ['Destiny's Child'], 'answer_start': ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>52</th>\\n\",\n       \"      <td>56be8a583aeaaa14008c9094</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé attended St. 
Mary's Elementary School ...</td>\\n\",\n       \"      <td>What town did Beyonce go to school in?</td>\\n\",\n       \"      <td>{'text': ['Fredericksburg'], 'answer_start': [...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                          id    title  \\\\\\n\",\n       \"0   56be85543aeaaa14008c9063  Beyoncé   \\n\",\n       \"15  56be86cf3aeaaa14008c9076  Beyoncé   \\n\",\n       \"27  56be88473aeaaa14008c9080  Beyoncé   \\n\",\n       \"39  56be892d3aeaaa14008c908b  Beyoncé   \\n\",\n       \"52  56be8a583aeaaa14008c9094  Beyoncé   \\n\",\n       \"\\n\",\n       \"                                              context  \\\\\\n\",\n       \"0   Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"15  Following the disbandment of Destiny's Child i...   \\n\",\n       \"27  A self-described \\\"modern-day feminist\\\", Beyonc...   \\n\",\n       \"39  Beyoncé Giselle Knowles was born in Houston, T...   \\n\",\n       \"52  Beyoncé attended St. Mary's Elementary School ...   \\n\",\n       \"\\n\",\n       \"                                             question  \\\\\\n\",\n       \"0            When did Beyonce start becoming popular?   \\n\",\n       \"15  After her second solo album, what other entert...   \\n\",\n       \"27  In her music, what are some recurring elements...   \\n\",\n       \"39  Beyonce's younger sibling also sang with her i...   \\n\",\n       \"52             What town did Beyonce go to school in?   \\n\",\n       \"\\n\",\n       \"                                              answers  \\n\",\n       \"0   {'text': ['in the late 1990s'], 'answer_start'...  \\n\",\n       \"15        {'text': ['acting'], 'answer_start': [207]}  \\n\",\n       \"27  {'text': ['love, relationships, and monogamy']...  \\n\",\n       \"39  {'text': ['Destiny's Child'], 'answer_start': ...  
\\n\",\n       \"52  {'text': ['Fredericksburg'], 'answer_start': [...  \"\n      ]\n     },\n     \"execution_count\": 42,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"data = data.drop_duplicates(subset=[\\\"context\\\"])\\n\",\n    \"print(len(data))\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The format required by the apps `upsert` function is a list of documents like:\\n\",\n    \"\\n\",\n    \"```json\\n\",\n    \"[\\n\",\n    \"    {\\n\",\n    \"        \\\"id\\\": \\\"abc\\\",\\n\",\n    \"        \\\"text\\\": \\\"some important document text\\\",\\n\",\n    \"        \\\"metadata\\\": {\\n\",\n    \"            \\\"field1\\\": \\\"optional metadata goes here\\\",\\n\",\n    \"            \\\"field2\\\": 54\\n\",\n    \"        }\\n\",\n    \"    },\\n\",\n    \"    {\\n\",\n    \"        \\\"id\\\": \\\"123\\\",\\n\",\n    \"        \\\"text\\\": \\\"some other important text\\\",\\n\",\n    \"        \\\"metadata\\\": {\\n\",\n    \"            \\\"field1\\\": \\\"another metadata\\\",\\n\",\n    \"            \\\"field2\\\": 71,\\n\",\n    \"            \\\"field3\\\": \\\"not all metadatas need the same structure\\\"\\n\",\n    \"        }\\n\",\n    \"    }\\n\",\n    \"    ...\\n\",\n    \"]\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Every document *must* have a `\\\"text\\\"` field. 
The `\\\"id\\\"` and `\\\"metadata\\\"` fields are optional.\\n\",\n    \"\\n\",\n    \"To create this format for our SQuAD data we do:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"documents = [\\n\",\n    \"    {\\n\",\n    \"        'id': r['id'],\\n\",\n    \"        'text': r['context'],\\n\",\n    \"        'metadata': {\\n\",\n    \"            'title': r['title']\\n\",\n    \"        }\\n\",\n    \"    } for r in data.to_dict(orient='records')\\n\",\n    \"]\\n\",\n    \"documents[:3]\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Indexing the Docs\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now, it's time to initiate the indexing process, also known as upserting, for our documents. To perform these requests to the retrieval app API, we must provide authorization using the BEARER_TOKEN we defined earlier. 
Below is how we accomplish this:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 44,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"\\n\",\n    \"BEARER_TOKEN = os.environ.get(\\\"BEARER_TOKEN\\\") or \\\"BEARER_TOKEN_HERE\\\"\\n\",\n    \"\\n\",\n    \"headers = {\\n\",\n    \"    \\\"Authorization\\\": f\\\"Bearer {BEARER_TOKEN}\\\"\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we will execute bulk inserts in batches set by the `batch_size`. Note that, to keep the walkthrough quick, the loop below only upserts the first batch of documents; replace the `10` in `range(0, 10, batch_size)` with `len(documents)` to index the full dataset.\\n\",\n    \"\\n\",\n    \"Once the records have been successfully indexed, we can proceed with the querying phase.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 46,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|██████████| 1/1 [00:16<00:00, 16.88s/it]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from tqdm.auto import tqdm\\n\",\n    \"import requests\\n\",\n    \"from requests.adapters import HTTPAdapter, Retry\\n\",\n    \"\\n\",\n    \"batch_size = 100\\n\",\n    \"endpoint_url = \\\"http://localhost:8000\\\"\\n\",\n    \"s = requests.Session()\\n\",\n    \"\\n\",\n    \"# we setup a retry strategy to retry on 5xx errors\\n\",\n    \"retries = Retry(\\n\",\n    \"    total=5,  # number of retries before raising error\\n\",\n    \"    backoff_factor=0.1,\\n\",\n    \"    status_forcelist=[500, 502, 503, 504]\\n\",\n    \")\\n\",\n    \"s.mount('http://', HTTPAdapter(max_retries=retries))\\n\",\n    \"\\n\",\n    \"for i in tqdm(range(0, 10, batch_size)):\\n\",\n    \"    i_end = min(len(documents), i+batch_size)\\n\",\n    \"    # make post request that allows up to 5 retries\\n\",\n    \"    res = s.post(\\n\",\n    \"        f\\\"{endpoint_url}/upsert\\\",\\n\",\n    \"        headers=headers,\\n\",\n    \"        
json={\\n\",\n    \"            \\\"documents\\\": documents[i:i_end]\\n\",\n    \"        }\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Making Queries\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"By passing one or more queries to the /query endpoint, we can easily conduct a query on the datastore. For this task, we can utilize a few questions from SQuAD2.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 47,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"19029\"\n      ]\n     },\n     \"execution_count\": 47,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"queries = data['question'].tolist()\\n\",\n    \"# format into the structure needed by the /query endpoint\\n\",\n    \"queries = [{'query': queries[i]} for i in range(len(queries))]\\n\",\n    \"len(queries)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 49,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"<Response [200]>\"\n      ]\n     },\n     \"execution_count\": 49,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"res = requests.post(\\n\",\n    \"    \\\"http://0.0.0.0:8000/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        'queries': queries[:3]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"res\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"At this point, we have the ability to iterate through the responses and observe the outcomes obtained for each query:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 50,\n   
\"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"----------------------------------------------------------------------\\n\",\n      \"When did Beyonce start becoming popular?\\n\",\n      \"\\n\",\n      \"0.93: On December 13, 2013, Beyoncé unexpectedly released her eponymous fifth studio album on the iTunes Store without any prior announcement or promotion. The album debuted atop the Billboard 200 chart, giving Beyoncé her fifth consecutive number-one album in the US. This made her the first woman in the chart's history to have her first five studio albums debut at number one. Beyoncé received critical acclaim and commercial success, selling one million digital copies worldwide in six days; The New York Times noted the album's unconventional, unexpected release as significant. Musically an electro-R&B album, it concerns darker themes previously unexplored in her work, such as \\\"bulimia, postnatal depression [and] the fears and insecurities of marriage and motherhood\\\". The single \\\"Drunk in Love\\\", featuring Jay Z, peaked at number two on the Billboard Hot 100 chart.\\n\",\n      \"0.93: Beyoncé's first solo recording was a feature on Jay Z's \\\"'03 Bonnie & Clyde\\\" that was released in October 2002, peaking at number four on the U.S. Billboard Hot 100 chart. Her first solo album Dangerously in Love was released on June 24, 2003, after Michelle Williams and Kelly Rowland had released their solo efforts. The album sold 317,000 copies in its first week, debuted atop the Billboard 200, and has since sold 11 million copies worldwide. The album's lead single, \\\"Crazy in Love\\\", featuring Jay Z, became Beyoncé's first number-one single as a solo artist in the US. 
The single \\\"Baby Boy\\\" also reached number one, and singles, \\\"Me, Myself and I\\\" and \\\"Naughty Girl\\\", both reached the top-five.\\n\",\n      \"0.93: Beyoncé is believed to have first started a relationship with Jay Z after a collaboration on \\\"'03 Bonnie & Clyde\\\", which appeared on his seventh album The Blueprint 2: The Gift & The Curse (2002). Beyoncé appeared as Jay Z's girlfriend in the music video for the song, which would further fuel speculation of their relationship. On April 4, 2008, Beyoncé and Jay Z were married without publicity. As of April 2014, the couple have sold a combined 300 million records together. The couple are known for their private relationship, although they have appeared to become more relaxed in recent years. Beyoncé suffered a miscarriage in 2010 or 2011, describing it as \\\"the saddest thing\\\" she had ever endured. She returned to the studio and wrote music in order to cope with the loss.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"After her second solo album, what other entertainment venture did Beyonce explore?\\n\",\n      \"\\n\",\n      \"0.93: Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits \\\"Déjà Vu\\\", \\\"Irreplaceable\\\", and \\\"Beautiful Liar\\\". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... 
Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for \\\"Single Ladies (Put a Ring on It)\\\".\\n\",\n      \"0.92: Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, \\\"to live life, to be inspired by things again\\\". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances.\\n\",\n      \"0.92: Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"In her music, what are some recurring elements in them?\\n\",\n      \"\\n\",\n      \"0.91: Beyoncé's music is generally R&B, but she also incorporates pop, soul and funk into her songs. 4 demonstrated Beyoncé's exploration of 90s-style R&B, as well as further use of soul and hip hop than compared to previous releases. While she almost exclusively releases English songs, Beyoncé recorded several Spanish songs for Irreemplazable (re-recordings of songs from B'Day for a Spanish-language audience), and the re-release of B'Day. 
To record these, Beyoncé was coached phonetically by American record producer Rudy Perez.\\n\",\n      \"0.9: The feminism and female empowerment themes on Beyoncé's second solo album B'Day were inspired by her role in Dreamgirls and by singer Josephine Baker. Beyoncé paid homage to Baker by performing \\\"Déjà Vu\\\" at the 2006 Fashion Rocks concert wearing Baker's trademark mini-hula skirt embellished with fake bananas. Beyoncé's third solo album I Am... Sasha Fierce was inspired by Jay Z and especially by Etta James, whose \\\"boldness\\\" inspired Beyoncé to explore other musical genres and styles. Her fourth solo album, 4, was inspired by Fela Kuti, 1990s R&B, Earth, Wind & Fire, DeBarge, Lionel Richie, Teena Marie with additional influences by The Jackson 5, New Edition, Adele, Florence and the Machine, and Prince.\\n\",\n      \"0.9: She has received co-writing credits for most of the songs recorded with Destiny's Child and her solo efforts. Her early songs were personally driven and female-empowerment themed compositions like \\\"Independent Women\\\" and \\\"Survivor\\\", but after the start of her relationship with Jay Z she transitioned to more man-tending anthems such as \\\"Cater 2 U\\\". Beyoncé has also received co-producing credits for most of the records in which she has been involved, especially during her solo efforts. 
However, she does not formulate beats herself, but typically comes up with melodies and ideas during production, sharing them with producers.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"for query_result in res.json()['results']:\\n\",\n    \"    query = query_result['query']\\n\",\n    \"    answers = []\\n\",\n    \"    scores = []\\n\",\n    \"    for result in query_result['results']:\\n\",\n    \"        answers.append(result['text'])\\n\",\n    \"        scores.append(round(result['score'], 2))\\n\",\n    \"    print(\\\"-\\\"*70+\\\"\\\\n\\\"+query+\\\"\\\\n\\\\n\\\"+\\\"\\\\n\\\".join([f\\\"{s}: {a}\\\" for a, s in zip(answers, scores)])+\\\"\\\\n\\\"+\\\"-\\\"*70+\\\"\\\\n\\\\n\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The top results are all relevant as we would have hoped. We can see that the `score` is a measure of how relevant the document is to the query. The higher the score the more relevant the document is to the query.\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"chatgpt-retrieval-plugin-S7h-2AWq-py3.10\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.3\"\n  },\n  \"orig_nbformat\": 4,\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"1979a773a5778de9a5fa593a629dff0ab3c80c2563810d3e6a8dfb123dc01c7d\"\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/providers/mongodb/semantic-search.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"735ae737-86be-4497-a8e9-38525e422380\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Semantic Search of one's own data with OpenAI Embedding Model and MongoDB Atlas Vector Search\\n\",\n    \"\\n\",\n    \"It is often a valuable exercise, when developing and documenting, to consider [User Stories](https://www.atlassian.com/agile/project-management/user-stories). We have a number of different personas interested in the ChatGPT Retrieval Plugin.\\n\",\n    \"\\n\",\n    \"1. The End User, who wishes to extract information from her organization's or personal data.\\n\",\n    \"2. The Data Scientist, who curates the data.\\n\",\n    \"3. The Application Engineer, who sets up and maintains the application.\\n\",\n    \"\\n\",\n    \"### Application Setup\\n\",\n    \"\\n\",\n    \"**The Application Engineer** has a number of tasks to complete in order to provide service to her two users.\\n\",\n    \"\\n\",\n    \"1. Set up the DataStore.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"    * Create a MongoDB Atlas cluster.\\n\",\n    \"    * Add a Vector Index Search to it.<br><br>\\n\",\n    \"\\n\",\n    \"    Begin by following the detailed steps in **[setup.md](https://github.com/caseyclements/chatgpt-retrieval-plugin/blob/mongodb/docs/providers/mongodb/setup.md)**.\\n\",\n    \"    Once completed, you will have a running Cluster, with a Database, a Collection, and a Vector Search Index attached to it.\\n\",\n    \"\\n\",\n    \"    You will also have a number of required environment variables. These need to be available to run this example.\\n\",\n    \"    We will check for them below, and suggest how to set them up with an `.env` file if that is your preference.\\n\",\n    \"\\n\",\n    \"  \\n\",\n    \"2. 
Create and Serve the ChatGPT Retrieval Plugin.\\n\",\n    \"    * Provide an API for the Data Scientist to insert, update, and delete data.\\n\",\n    \"    * Provide an API for the End User to query the data using natural language.<br><br>\\n\",\n    \"    \\n\",\n    \"    Start the service in another terminal as described in the repo's **[QuickStart](https://github.com/openai/chatgpt-retrieval-plugin#quickstart)**. \\n\",\n    \"\\n\",\n    \"   **IMPORTANT** Make sure the environment variables are set in the terminal before `poetry run start`.\\n\",\n    \"\\n\",\n    \"### Application Usage\\n\",\n    \"\\n\",\n    \"This notebook tells a story of a **Data Scientist** and an **End User** as they interact with the service.\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"We begin by collecting and filtering an example dataset, the Stanford Question Answering Dataset ([SQuAD](https://huggingface.co/datasets/squad)).\\n\",\n    \"We upsert the data into a MongoDB Collection via the `upsert` endpoint of the Plugin API. \\n\",\n    \"Upon doing this, Atlas begins to automatically index the data in preparation for Semantic Search. 
\\n\",\n    \"\\n\",\n    \"We close by asking a question of the data, searching not for a particular text string, but using common language.\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"42f2c141-2643-4bff-b431-532916dfedf9\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 1) Application Engineering\\n\",\n    \"\\n\",\n    \"Of course, we cannot begin until we test that our environment is set up.\\n\",\n    \"\\n\",\n    \"### Check environment variables\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"02c045c9-39c8-47e4-a726-7b2a4c1cef21\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pwd\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"id\": \"143c3e90-cf24-45dc-af65-646bcf89b071\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!which python\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"8cfe7dc4-820c-4117-bdb7-debf3f5ec5ff\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"required_vars = {'BEARER_TOKEN', 'OPENAI_API_KEY', 'DATASTORE', 'EMBEDDING_DIMENSION', 'EMBEDDING_MODEL',\\n\",\n    \"                 'MONGODB_COLLECTION', 'MONGODB_DATABASE', 'MONGODB_INDEX', 'MONGODB_URI'}\\n\",\n    \"assert os.environ[\\\"DATASTORE\\\"] == 'mongodb'\\n\",\n    \"missing = required_vars - set(os.environ)\\n\",\n    \"if missing:\\n\",\n    \"    print(f\\\"It is strongly recommended to set these additional environment variables. 
{missing}=\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"id\": \"88f07920-3616-4663-bb59-68179c97933e\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# If you keep the environment variables in a .env file, like that .env.example, do this:\\n\",\n    \"if missing:\\n\",\n    \"    from dotenv import dotenv_values\\n\",\n    \"    from pathlib import Path\\n\",\n    \"    import os\\n\",\n    \"    config = dotenv_values(Path('../.env'))\\n\",\n    \"    os.environ.update(config)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"c0e152ba-4cb4-4703-ac30-035afbc84e67\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Check MongoDB Atlas Datastore connection\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"id\": \"1aa90750-3f23-4671-8419-53e867649a6b\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Pinged your deployment. You successfully connected to MongoDB!\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from pymongo import MongoClient\\n\",\n    \"client = MongoClient(os.environ[\\\"MONGODB_URI\\\"])\\n\",\n    \"# Send a ping to confirm a successful connection\\n\",\n    \"try:\\n\",\n    \"    client.admin.command('ping')\\n\",\n    \"    print(\\\"Pinged your deployment. 
You successfully connected to MongoDB!\\\")\\n\",\n    \"except Exception as e:\\n\",\n    \"    print(e)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"id\": \"52480c9d-fcf0-4cd2-8031-94999a0f87cc\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'Beyonce'\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"db = client[os.environ[\\\"MONGODB_DATABASE\\\"]]\\n\",\n    \"clxn = db[os.environ[\\\"MONGODB_COLLECTION\\\"]]\\n\",\n    \"clxn.name\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"5e334b80-babe-414d-adb1-6c4b8baff137\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Check OpenAI Connection\\n\",\n    \"\\n\",\n    \"These tests require the environment variables: `OPENAI_API_KEY, EMBEDDING_MODEL`\\n\",\n    \"\\n\",\n    \"We set the api_key, then query the API for its available models. 
We then loop over this list to find which can provide text embeddings, and their natural, full, default dimensions.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"id\": \"8435ce2e-ed38-48e8-a9eb-4595d9c8eee3\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"\\\"model_dimensions={'text-embedding-3-small': 1536, 'text-embedding-ada-002': 1536, 'text-embedding-3-large': 3072}\\\"\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"import openai\\n\",\n    \"openai.api_key = os.environ[\\\"OPENAI_API_KEY\\\"]\\n\",\n    \"models = openai.Model.list()\\n\",\n    \"model_names = [model[\\\"id\\\"] for model in models['data']]\\n\",\n    \"model_dimensions = {}\\n\",\n    \"for model_name in model_names:\\n\",\n    \"    try:\\n\",\n    \"        response = openai.Embedding.create(input=[\\\"Some input text\\\"], model=model_name)\\n\",\n    \"        model_dimensions[model_name] = len(response['data'][0]['embedding'])\\n\",\n    \"    except:\\n\",\n    \"        pass\\n\",\n    \"f\\\"{model_dimensions=}\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"bcddf850-3a7c-4164-8862-88553d7b3970\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2) Data Engineering\\n\",\n    \"\\n\",\n    \"### Prepare personal or organizational dataset\\n\",\n    \"\\n\",\n    \"The ChatGPT Retrieval Plugin provides semantic search of your own data using OpenAI's Embedding Models and MongoDB's Vector Datastore and Semantic Search.\\n\",\n    \"\\n\",\n    \"In this example, we will use the **S**tanford **Qu**estion **A**nswering **D**ataset (SQuAD), which we download from Hugging Face Datasets.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"id\": \"6833a874-e6f8-4f7b-9889-9b5184f458aa\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     
\"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"len(data)=19029\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>title</th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>56be85543aeaaa14008c9063</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...</td>\\n\",\n       \"      <td>When did Beyonce start becoming popular?</td>\\n\",\n       \"      <td>{'text': ['in the late 1990s'], 'answer_start'...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>15</th>\\n\",\n       \"      <td>56be86cf3aeaaa14008c9076</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Following the disbandment of Destiny's Child i...</td>\\n\",\n       \"      <td>After her second solo album, what other entert...</td>\\n\",\n       \"      <td>{'text': ['acting'], 'answer_start': [207]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>27</th>\\n\",\n       \"      
<td>56be88473aeaaa14008c9080</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>A self-described \\\"modern-day feminist\\\", Beyonc...</td>\\n\",\n       \"      <td>In her music, what are some recurring elements...</td>\\n\",\n       \"      <td>{'text': ['love, relationships, and monogamy']...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>39</th>\\n\",\n       \"      <td>56be892d3aeaaa14008c908b</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé Giselle Knowles was born in Houston, T...</td>\\n\",\n       \"      <td>Beyonce's younger sibling also sang with her i...</td>\\n\",\n       \"      <td>{'text': ['Destiny's Child'], 'answer_start': ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>52</th>\\n\",\n       \"      <td>56be8a583aeaaa14008c9094</td>\\n\",\n       \"      <td>Beyoncé</td>\\n\",\n       \"      <td>Beyoncé attended St. Mary's Elementary School ...</td>\\n\",\n       \"      <td>What town did Beyonce go to school in?</td>\\n\",\n       \"      <td>{'text': ['Fredericksburg'], 'answer_start': [...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                          id    title  \\\\\\n\",\n       \"0   56be85543aeaaa14008c9063  Beyoncé   \\n\",\n       \"15  56be86cf3aeaaa14008c9076  Beyoncé   \\n\",\n       \"27  56be88473aeaaa14008c9080  Beyoncé   \\n\",\n       \"39  56be892d3aeaaa14008c908b  Beyoncé   \\n\",\n       \"52  56be8a583aeaaa14008c9094  Beyoncé   \\n\",\n       \"\\n\",\n       \"                                              context  \\\\\\n\",\n       \"0   Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   \\n\",\n       \"15  Following the disbandment of Destiny's Child i...   \\n\",\n       \"27  A self-described \\\"modern-day feminist\\\", Beyonc...   
\\n\",\n       \"39  Beyoncé Giselle Knowles was born in Houston, T...   \\n\",\n       \"52  Beyoncé attended St. Mary's Elementary School ...   \\n\",\n       \"\\n\",\n       \"                                             question  \\\\\\n\",\n       \"0            When did Beyonce start becoming popular?   \\n\",\n       \"15  After her second solo album, what other entert...   \\n\",\n       \"27  In her music, what are some recurring elements...   \\n\",\n       \"39  Beyonce's younger sibling also sang with her i...   \\n\",\n       \"52             What town did Beyonce go to school in?   \\n\",\n       \"\\n\",\n       \"                                              answers  \\n\",\n       \"0   {'text': ['in the late 1990s'], 'answer_start'...  \\n\",\n       \"15        {'text': ['acting'], 'answer_start': [207]}  \\n\",\n       \"27  {'text': ['love, relationships, and monogamy']...  \\n\",\n       \"39  {'text': ['Destiny's Child'], 'answer_start': ...  \\n\",\n       \"52  {'text': ['Fredericksburg'], 'answer_start': [...  
\"\n      ]\n     },\n     \"execution_count\": 6,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"import pandas as pd\\n\",\n    \"from datasets import load_dataset\\n\",\n    \"data = load_dataset(\\\"squad_v2\\\", split=\\\"train\\\")\\n\",\n    \"data = data.to_pandas().drop_duplicates(subset=[\\\"context\\\"])\\n\",\n    \"print(f'{len(data)=}')\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"29c69543-51d7-49f9-b936-b7b99804818c\",\n   \"metadata\": {},\n   \"source\": [\n    \"To speed up our example, let's focus specifically on questions about Beyoncé\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"id\": \"b444b0f8-a341-4f65-ab26-0793300d275f\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data = data.loc[data['title']=='Beyoncé']\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"id\": \"2136e910-9944-422d-aaa5-50f1d3d7a5ed\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'id': '56be85543aeaaa14008c9063',\\n\",\n       \" 'text': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\\\\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\\\\'s best-selling girl groups of all time. 
Their hiatus saw the release of Beyoncé\\\\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \\\"Crazy in Love\\\" and \\\"Baby Boy\\\".',\\n\",\n       \" 'metadata': {'title': 'Beyoncé'}}\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"documents = [\\n\",\n    \"    {\\n\",\n    \"        'id': r['id'],\\n\",\n    \"        'text': r['context'],\\n\",\n    \"        'metadata': {\\n\",\n    \"            'title': r['title']\\n\",\n    \"        }\\n\",\n    \"    } for r in data.to_dict(orient='records')\\n\",\n    \"]\\n\",\n    \"documents[0]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"98388d9a-bb33-4eea-8d43-e890517f829a\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Upsert and Index data via the Plugin API\\n\",\n    \"\\n\",\n    \"Posting an `upsert` request to the ChatGPT Retrieval Plugin API performs two tasks on the backend. First, it inserts into (or updates) your data in the MONGODB_COLLECTION in the MongoDB Cluster that you setup. Second, Atlas asynchronously begins populating a Vector Search Index on the embedding key. \\n\",\n    \"\\n\",\n    \"If you have already created the Collection and a Vector Search Index through the Atlas UI while Setting up MongoDB Atlas Cluster in [setup.md](https://github.com/caseyclements/chatgpt-retrieval-plugin/blob/main/docs/providers/mongodb/setup.md), then indexing will begin immediately.\\n\",\n    \"\\n\",\n    \"If you haven't set up the Atlas Vector Search yet, no problem. `upsert` will insert the data. To start indexing, simply go back to the Atlas UI and add a Search Index. This will trigger indexing. Once complete, we can begin semantic queries!\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"The front end of the Plugin is a FastAPI web server. 
Its API accepts simple `http` requests. We will need to provide authorization in the form of the BEARER_TOKEN we set earlier. We do this below:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"id\": \"8e2891fb-4485-4f97-b73f-e2f4186238bd\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"endpoint_url = 'http://0.0.0.0:8000'\\n\",\n    \"headers = {\\\"Authorization\\\": f\\\"Bearer {os.environ['BEARER_TOKEN']}\\\"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"5f5cc603-88f1-402b-a0a7-52cabb9f7d9d\",\n   \"metadata\": {},\n   \"source\": [\n    \"Although our sample data is not large, and the service and datastore are responsive, we follow best practice and execute bulk upserts in batches with retries.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"id\": \"1c0e2eb7-58b5-4f36-af2d-d29aee18e08d\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"cb28c6a61b464858806fc8f3404e7a19\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"  0%|          | 0/1 [00:00<?, ?it/s]\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(i,i_end) =(0, 66)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from tqdm.auto import tqdm\\n\",\n    \"import requests\\n\",\n    \"from requests.adapters import HTTPAdapter, Retry\\n\",\n    \"\\n\",\n    \"# Setup request parameters to batch requests and retry on 5xx errors\\n\",\n    \"batch_size = 100\\n\",\n    \"retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])\\n\",\n    \"session = requests.Session()\\n\",\n    \"session.mount('http://', HTTPAdapter(max_retries=retries))\\n\",\n    \"n_docs = len(documents)\\n\",\n  
  \"for i in tqdm(range(0, n_docs, batch_size)):\\n\",\n    \"    i_end = min(n_docs, i+batch_size)\\n\",\n    \"    print(f'{(i,i_end) =}') \\n\",\n    \"    # make post request that allows up to 5 retries\\n\",\n    \"    res = session.post(\\n\",\n    \"        f\\\"{endpoint_url}/upsert\\\",\\n\",\n    \"        headers=headers,\\n\",\n    \"        json={\\\"documents\\\": documents[i:i_end]}\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"id\": \"6f193886-f14e-4d97-9fc8-6c269b67602f\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"if res.status_code != 200:\\n\",\n    \"    res.text, res.reason\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"938f4f7c-5a67-4f54-9596-cf27a79e05e8\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3) Answering Questions\\n\",\n    \"\\n\",\n    \"Now would be a good time to go back to the Atlas UI, navigate to your collection's search index. Once all our SQuAD records have been successfully indexed, we can proceed with the querying phase. By passing one or more queries to the /query endpoint, we can easily conduct a query on the datastore. 
For this task, we can utilize a few questions from SQuAD2.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"id\": \"1f0a9973-1326-4d16-9c47-c5245675ea44\",\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def format_results(results):\\n\",\n    \"    for query_result in results.json()['results']:\\n\",\n    \"        query = query_result['query']\\n\",\n    \"        answers = []\\n\",\n    \"        scores = []\\n\",\n    \"        for result in query_result['results']:\\n\",\n    \"            answers.append(result['text'])\\n\",\n    \"            scores.append(round(result['score'], 2))\\n\",\n    \"        print(\\\"-\\\"*70+\\\"\\\\n\\\"+query+\\\"\\\\n\\\\n\\\"+\\\"\\\\n\\\".join([f\\\"{s}: {a}\\\" for a, s in zip(answers, scores)])+\\\"\\\\n\\\"+\\\"-\\\"*70+\\\"\\\\n\\\\n\\\")    \\n\",\n    \"\\n\",\n    \"def ask(question: str):\\n\",\n    \"    res = requests.post(\\n\",\n    \"        f\\\"{endpoint_url}/query\\\",\\n\",\n    \"        headers=headers,\\n\",\n    \"        json={'queries': [{\\\"query\\\": question}]}\\n\",\n    \"    )\\n\",\n    \"    format_results(res)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"id\": \"b257e50b-f888-49c8-9e12-c98b8726aaab\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"----------------------------------------------------------------------\\n\",\n      \"Who is Beyonce?\\n\",\n      \"\\n\",\n      \"0.83: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. 
Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \\\"Crazy in Love\\\" and \\\"Baby Boy\\\".\\n\",\n      \"0.82: A self-described \\\"modern-day feminist\\\", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny's Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award's history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011.\\n\",\n      \"0.79: When Beyoncé does an album, when Beyoncé sings a song, when Beyoncé does anything, it's an event, and it's broadly influential. 
Right now, she is the heir-apparent diva of the USA — the reigning national voice.\\\" In 2014, Beyoncé was listed again on the Time 100 and also featured on the cover of the issue.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ask(\\\"Who is Beyonce?\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"id\": \"c36d54ae-5bac-4ceb-b98d-6067867f954c\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"----------------------------------------------------------------------\\n\",\n      \"Who is Beyonce married to?\\n\",\n      \"\\n\",\n      \"0.78: On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on October 22, 2008. I Am... Sasha Fierce was released on November 18, 2008 in the United States. The album formally introduces Beyoncé's alter ego Sasha Fierce, conceived during the making of her 2003 single \\\"Crazy in Love\\\", selling 482,000 copies in its first week, debuting atop the Billboard 200, and giving Beyoncé her third consecutive number-one album in the US. The album featured the number-one song \\\"Single Ladies (Put a Ring on It)\\\" and the top-five songs \\\"If I Were a Boy\\\" and \\\"Halo\\\".\\n\",\n      \"0.77: Beyoncé is believed to have first started a relationship with Jay Z after a collaboration on \\\"'03 Bonnie & Clyde\\\", which appeared on his seventh album The Blueprint 2: The Gift & The Curse (2002). Beyoncé appeared as Jay Z's girlfriend in the music video for the song, which would further fuel speculation of their relationship. On April 4, 2008, Beyoncé and Jay Z were married without publicity. 
As of April 2014, the couple have sold a combined 300 million records together. The couple are known for their private relationship, although they have appeared to become more relaxed in recent years. Beyoncé suffered a miscarriage in 2010 or 2011, describing it as \\\"the saddest thing\\\" she had ever endured. She returned to the studio and wrote music in order to cope with the loss.\\n\",\n      \"0.76: In the same year, Beyoncé and Jay Z placed at number one on the \\\"World's Highest-Paid Celebrity Couples\\\", for collectively earning $78 million. The couple made it into the previous year's Guinness World Records as the \\\"highest-earning power couple\\\" for collectively earning $122 million in 2009. For the years 2009 to 2011, Beyoncé earned an average of $70 million per year, and earned $40 million in 2012. In 2013, Beyoncé's endorsements of Pepsi and H&M made her and Jay Z the world's first billion dollar couple in the music industry. That year, Beyoncé was published as the fourth most-powerful celebrity in the Forbes rankings. 
MTV estimated that by the end of 2014, Beyoncé would become the highest-paid black musician in history; she succeeded to do so in April 2014.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"ask(\\\"Who is Beyonce married to?\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"4231d114-6341-433d-9b82-a1527d64775c\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4) Clean up\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"id\": \"9d12f4d6-9205-4c2e-bf17-f84956065672\",\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'success': True}\"\n      ]\n     },\n     \"execution_count\": 15,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"response = requests.delete(\\n\",\n    \"    f\\\"{endpoint_url}/delete\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\\"delete_all\\\":True}\\n\",\n    \")\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.0\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 5\n}\n"
  },
  {
    "path": "examples/providers/pinecone/semantic-search.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Using the Pinecone Retrieval App\\n\",\n    \"\\n\",\n    \"In this walkthrough we will see how to use the retrieval API with a Pinecone datastore for *semantic search / question-answering*.\\n\",\n    \"\\n\",\n    \"Before running this notebook you should have already initialized the retrieval API and have it running locally or elsewhere. The full instructions for doing this are found in the [project README]().\\n\",\n    \"\\n\",\n    \"We will summarize the instructions (specific to the Pinecone datastore) before moving on to the walkthrough.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## App Quickstart\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"1. Install Python 3.10 if not already installed.\\n\",\n    \"\\n\",\n    \"2. Clone the `retrieval-app` repository:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"git clone git@github.com:openai/retrieval-app.git\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"3. Navigate to the app directory:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"cd /path/to/retrieval-app\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"4. Install `poetry`:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"pip install poetry\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"5. Create a new virtual environment:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry env use python3.10\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"6. Install the `retrieval-app` dependencies:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry install\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"7. Set app environment variables:\\n\",\n    \"\\n\",\n    \"* `BEARER_TOKEN`: Secret token used by the app to authorize incoming requests. We will later include this in the request `headers`. 
The token can be generated however you prefer, such as using [jwt.io](https://jwt.io/).\\n\",\n    \"\\n\",\n    \"* `OPENAI_API_KEY`: The OpenAI API key used for generating embeddings with the OpenAI embeddings model. [Get an API key here](https://platform.openai.com/account/api-keys)!\\n\",\n    \"\\n\",\n    \"8. Set Pinecone-specific environment variables:\\n\",\n    \"\\n\",\n    \"* `DATASTORE`: Set to `pinecone`.\\n\",\n    \"\\n\",\n    \"* `PINECONE_API_KEY`: Set to your Pinecone API key. This requires a free Pinecone account and can be [found in the Pinecone console](https://app.pinecone.io/).\\n\",\n    \"\\n\",\n    \"* `PINECONE_ENVIRONMENT`: Set to your Pinecone environment, which looks like `us-east1-gcp` or `us-west1-aws` and can be found next to your API key in the [Pinecone console](https://app.pinecone.io/).\\n\",\n    \"\\n\",\n    \"* `PINECONE_INDEX`: Set this to your chosen index name. The name is up to you; we just recommend setting it to something descriptive like `\\\"openai-retrieval-app\\\"`. *Note that index names are restricted to alphanumeric characters, `\\\"-\\\"`, and can contain a maximum of 45 characters.*\\n\",\n    \"\\n\",\n    \"9. 
Run the app with:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"poetry run start\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"If running the app locally you should see something like:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"INFO:     Uvicorn running on http://0.0.0.0:8000\\n\",\n    \"INFO:     Application startup complete.\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"In that case, the app is automatically connected to our index (specified by `PINECONE_INDEX`). If no index with that name existed beforehand, the app creates one for us.\\n\",\n    \"\\n\",\n    \"Now we're ready to move on to populating our index with some data.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Required Libraries\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"There are a few Python libraries we must `pip install` for this notebook to run:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip install -qU datasets pandas tqdm\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Preparing Data\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this example, we will use the **S**tanford **Qu**estion **A**nswering **D**ataset (SQuAD), which we download from Hugging Face Datasets.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Found cached dataset squad (/Users/jamesbriggs/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)\\n\"\n     ]\n    },\n    {\n     
\"data\": {\n      \"text/plain\": [\n       \"Dataset({\\n\",\n       \"    features: ['id', 'title', 'context', 'question', 'answers'],\\n\",\n       \"    num_rows: 87599\\n\",\n       \"})\"\n      ]\n     },\n     \"execution_count\": 1,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"from datasets import load_dataset\\n\",\n    \"\\n\",\n    \"data = load_dataset(\\\"squad\\\", split=\\\"train\\\")\\n\",\n    \"data\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Convert to Pandas dataframe for easier preprocessing steps.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>title</th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5733be284776f41900661182</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       
\"      <td>To whom did the Virgin Mary allegedly appear i...</td>\\n\",\n       \"      <td>{'text': ['Saint Bernadette Soubirous'], 'answ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>5733be284776f4190066117f</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What is in front of the Notre Dame Main Building?</td>\\n\",\n       \"      <td>{'text': ['a copper statue of Christ'], 'answe...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>5733be284776f41900661180</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>The Basilica of the Sacred heart at Notre Dame...</td>\\n\",\n       \"      <td>{'text': ['the Main Building'], 'answer_start'...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>5733be284776f41900661181</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What is the Grotto at Notre Dame?</td>\\n\",\n       \"      <td>{'text': ['a Marian place of prayer and reflec...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>5733be284776f4190066117e</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>What sits on top of the Main Building at Notre...</td>\\n\",\n       \"      <td>{'text': ['a golden statue of the Virgin Mary'...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      
],\n      \"text/plain\": [\n       \"                         id                     title  \\\\\\n\",\n       \"0  5733be284776f41900661182  University_of_Notre_Dame   \\n\",\n       \"1  5733be284776f4190066117f  University_of_Notre_Dame   \\n\",\n       \"2  5733be284776f41900661180  University_of_Notre_Dame   \\n\",\n       \"3  5733be284776f41900661181  University_of_Notre_Dame   \\n\",\n       \"4  5733be284776f4190066117e  University_of_Notre_Dame   \\n\",\n       \"\\n\",\n       \"                                             context  \\\\\\n\",\n       \"0  Architecturally, the school has a Catholic cha...   \\n\",\n       \"1  Architecturally, the school has a Catholic cha...   \\n\",\n       \"2  Architecturally, the school has a Catholic cha...   \\n\",\n       \"3  Architecturally, the school has a Catholic cha...   \\n\",\n       \"4  Architecturally, the school has a Catholic cha...   \\n\",\n       \"\\n\",\n       \"                                            question  \\\\\\n\",\n       \"0  To whom did the Virgin Mary allegedly appear i...   \\n\",\n       \"1  What is in front of the Notre Dame Main Building?   \\n\",\n       \"2  The Basilica of the Sacred heart at Notre Dame...   \\n\",\n       \"3                  What is the Grotto at Notre Dame?   \\n\",\n       \"4  What sits on top of the Main Building at Notre...   \\n\",\n       \"\\n\",\n       \"                                             answers  \\n\",\n       \"0  {'text': ['Saint Bernadette Soubirous'], 'answ...  \\n\",\n       \"1  {'text': ['a copper statue of Christ'], 'answe...  \\n\",\n       \"2  {'text': ['the Main Building'], 'answer_start'...  \\n\",\n       \"3  {'text': ['a Marian place of prayer and reflec...  \\n\",\n       \"4  {'text': ['a golden statue of the Virgin Mary'...  
\"\n      ]\n     },\n     \"execution_count\": 2,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"data = data.to_pandas()\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset contains a lot of duplicate `context` paragraphs, this is because each `context` can have many relevant questions. We don't want these duplicates so we remove like so:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"18891\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>title</th>\\n\",\n       \"      <th>context</th>\\n\",\n       \"      <th>question</th>\\n\",\n       \"      <th>answers</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>5733be284776f41900661182</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>Architecturally, the school has a Catholic cha...</td>\\n\",\n       \"      <td>To whom did the 
Virgin Mary allegedly appear i...</td>\\n\",\n       \"      <td>{'text': ['Saint Bernadette Soubirous'], 'answ...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>5733bf84d058e614000b61be</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>As at most other universities, Notre Dame's st...</td>\\n\",\n       \"      <td>When did the Scholastic Magazine of Notre dame...</td>\\n\",\n       \"      <td>{'text': ['September 1876'], 'answer_start': [...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>10</th>\\n\",\n       \"      <td>5733bed24776f41900661188</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>The university is the major seat of the Congre...</td>\\n\",\n       \"      <td>Where is the headquarters of the Congregation ...</td>\\n\",\n       \"      <td>{'text': ['Rome'], 'answer_start': [119]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>15</th>\\n\",\n       \"      <td>5733a6424776f41900660f51</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>The College of Engineering was established in ...</td>\\n\",\n       \"      <td>How many BS level degrees are offered in the C...</td>\\n\",\n       \"      <td>{'text': ['eight'], 'answer_start': [487]}</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>20</th>\\n\",\n       \"      <td>5733a70c4776f41900660f64</td>\\n\",\n       \"      <td>University_of_Notre_Dame</td>\\n\",\n       \"      <td>All of Notre Dame's undergraduate students are...</td>\\n\",\n       \"      <td>What entity provides help with the management ...</td>\\n\",\n       \"      <td>{'text': ['Learning Resource Center'], 'answer...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": 
[\n       \"                          id                     title  \\\\\\n\",\n       \"0   5733be284776f41900661182  University_of_Notre_Dame   \\n\",\n       \"5   5733bf84d058e614000b61be  University_of_Notre_Dame   \\n\",\n       \"10  5733bed24776f41900661188  University_of_Notre_Dame   \\n\",\n       \"15  5733a6424776f41900660f51  University_of_Notre_Dame   \\n\",\n       \"20  5733a70c4776f41900660f64  University_of_Notre_Dame   \\n\",\n       \"\\n\",\n       \"                                              context  \\\\\\n\",\n       \"0   Architecturally, the school has a Catholic cha...   \\n\",\n       \"5   As at most other universities, Notre Dame's st...   \\n\",\n       \"10  The university is the major seat of the Congre...   \\n\",\n       \"15  The College of Engineering was established in ...   \\n\",\n       \"20  All of Notre Dame's undergraduate students are...   \\n\",\n       \"\\n\",\n       \"                                             question  \\\\\\n\",\n       \"0   To whom did the Virgin Mary allegedly appear i...   \\n\",\n       \"5   When did the Scholastic Magazine of Notre dame...   \\n\",\n       \"10  Where is the headquarters of the Congregation ...   \\n\",\n       \"15  How many BS level degrees are offered in the C...   \\n\",\n       \"20  What entity provides help with the management ...   \\n\",\n       \"\\n\",\n       \"                                              answers  \\n\",\n       \"0   {'text': ['Saint Bernadette Soubirous'], 'answ...  \\n\",\n       \"5   {'text': ['September 1876'], 'answer_start': [...  \\n\",\n       \"10          {'text': ['Rome'], 'answer_start': [119]}  \\n\",\n       \"15         {'text': ['eight'], 'answer_start': [487]}  \\n\",\n       \"20  {'text': ['Learning Resource Center'], 'answer...  
\"\n      ]\n     },\n     \"execution_count\": 3,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"data = data.drop_duplicates(subset=[\\\"context\\\"])\\n\",\n    \"print(len(data))\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The format required by the apps `upsert` function is a list of documents like:\\n\",\n    \"\\n\",\n    \"```json\\n\",\n    \"[\\n\",\n    \"    {\\n\",\n    \"        \\\"id\\\": \\\"abc\\\",\\n\",\n    \"        \\\"text\\\": \\\"some important document text\\\",\\n\",\n    \"        \\\"metadata\\\": {\\n\",\n    \"            \\\"field1\\\": \\\"optional metadata goes here\\\",\\n\",\n    \"            \\\"field2\\\": 54\\n\",\n    \"        }\\n\",\n    \"    },\\n\",\n    \"    {\\n\",\n    \"        \\\"id\\\": \\\"123\\\",\\n\",\n    \"        \\\"text\\\": \\\"some other important text\\\",\\n\",\n    \"        \\\"metadata\\\": {\\n\",\n    \"            \\\"field1\\\": \\\"another metadata\\\",\\n\",\n    \"            \\\"field2\\\": 71,\\n\",\n    \"            \\\"field3\\\": \\\"not all metadatas need the same structure\\\"\\n\",\n    \"        }\\n\",\n    \"    }\\n\",\n    \"    ...\\n\",\n    \"]\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Every document *must* have a `\\\"text\\\"` field. The `\\\"id\\\"` and `\\\"metadata\\\"` fields are optional.\\n\",\n    \"\\n\",\n    \"To create this format for our SQuAD data we do:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[{'id': '5733be284776f41900661182',\\n\",\n       \"  'text': 'Architecturally, the school has a Catholic character. Atop the Main Building\\\\'s gold dome is a golden statue of the Virgin Mary. 
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \\\"Venite Ad Me Omnes\\\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',\\n\",\n       \"  'metadata': {'title': 'University_of_Notre_Dame'}},\\n\",\n       \" {'id': '5733bf84d058e614000b61be',\\n\",\n       \"  'text': \\\"As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to show a conservative bias, a liberal newspaper, Common Sense was published. Likewise, in 2003, when other students believed that the paper showed a liberal bias, the conservative paper Irish Rover went into production. 
Neither paper is published as often as The Observer; however, all three are distributed to all students. Finally, in Spring 2008 an undergraduate journal for political science research, Beyond Politics, made its debut.\\\",\\n\",\n       \"  'metadata': {'title': 'University_of_Notre_Dame'}},\\n\",\n       \" {'id': '5733bed24776f41900661188',\\n\",\n       \"  'text': 'The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.',\\n\",\n       \"  'metadata': {'title': 'University_of_Notre_Dame'}}]\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"documents = [\\n\",\n    \"    {\\n\",\n    \"        'id': r['id'],\\n\",\n    \"        'text': r['context'],\\n\",\n    \"        'metadata': {\\n\",\n    \"            'title': r['title']\\n\",\n    \"        }\\n\",\n    \"    } for r in data.to_dict(orient='records')\\n\",\n    \"]\\n\",\n    \"documents[:3]\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Indexing the Docs\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We're now ready to begin indexing (or *upserting*) our `documents`. 
To make these requests to the retrieval app API, we will need to provide authorization in the form of the `BEARER_TOKEN` we set earlier. We do this below:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"\\n\",\n    \"BEARER_TOKEN = os.environ.get(\\\"BEARER_TOKEN\\\") or \\\"BEARER_TOKEN_HERE\\\"\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Use the `BEARER_TOKEN` to create our authorization `headers`:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"headers = {\\n\",\n    \"    \\\"Authorization\\\": f\\\"Bearer {BEARER_TOKEN}\\\"\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We'll perform the upsert in batches of `batch_size`. 
Make sure that the `endpoint_url` variable is set to the correct location for your running *retrieval-app* API.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"application/vnd.jupyter.widget-view+json\": {\n       \"model_id\": \"8694da67455d4bb78cc778e49f69a872\",\n       \"version_major\": 2,\n       \"version_minor\": 0\n      },\n      \"text/plain\": [\n       \"  0%|          | 0/10 [00:00<?, ?it/s]\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"from tqdm.auto import tqdm\\n\",\n    \"import requests\\n\",\n    \"from requests.adapters import HTTPAdapter, Retry\\n\",\n    \"\\n\",\n    \"batch_size = 100\\n\",\n    \"endpoint_url = \\\"http://localhost:8000\\\"\\n\",\n    \"s = requests.Session()\\n\",\n    \"\\n\",\n    \"# we setup a retry strategy to retry on 5xx errors\\n\",\n    \"retries = Retry(\\n\",\n    \"    total=5,  # number of retries before raising error\\n\",\n    \"    backoff_factor=0.1,\\n\",\n    \"    status_forcelist=[500, 502, 503, 504]\\n\",\n    \")\\n\",\n    \"s.mount('http://', HTTPAdapter(max_retries=retries))\\n\",\n    \"\\n\",\n    \"for i in tqdm(range(0, len(documents), batch_size)):\\n\",\n    \"    i_end = min(len(documents), i+batch_size)\\n\",\n    \"    # make post request that allows up to 5 retries\\n\",\n    \"    res = s.post(\\n\",\n    \"        f\\\"{endpoint_url}/upsert\\\",\\n\",\n    \"        headers=headers,\\n\",\n    \"        json={\\n\",\n    \"            \\\"documents\\\": documents[i:i_end]\\n\",\n    \"        }\\n\",\n    \"    )\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"With that our SQuAD records have all been indexed and we can move on to querying.\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": 
{},\n   \"source\": [\n    \"### Making Queries\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"To query the datastore all we need to do is pass one or more queries to the `/query` endpoint. We can take a few questions from SQuAD:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"18891\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"queries = data['question'].tolist()\\n\",\n    \"# format into the structure needed by the /query endpoint\\n\",\n    \"queries = [{'query': queries[i]} for i in range(len(queries))]\\n\",\n    \"len(queries)\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We will use just the first *three* questions:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"[{'query': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'},\\n\",\n       \" {'query': 'When did the Scholastic Magazine of Notre dame begin publishing?'},\\n\",\n       \" {'query': 'Where is the headquarters of the Congregation of the Holy Cross?'}]\"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"queries[:3]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"<Response [200]>\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    
\"res = requests.post(\\n\",\n    \"    \\\"http://0.0.0.0:8000/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        'queries': queries[:3]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"res\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now we can loop through the responses and see the results returned for each query:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"----------------------------------------------------------------------\\n\",\n      \"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?\\n\",\n      \"\\n\",\n      \"0.83: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \\\"Venite Ad Me Omnes\\\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.\\n\",\n      \"0.81: Within the white inescutcheon, the five quinas (small blue shields) with their five white bezants representing the five wounds of Christ (Portuguese: Cinco Chagas) when crucified and are popularly associated with the \\\"Miracle of Ourique\\\". 
The story associated with this miracle tells that before the Battle of Ourique (25 July 1139), an old hermit appeared before Count Afonso Henriques (future Afonso I) as a divine messenger. He foretold Afonso's victory and assured him that God was watching over him and his peers. The messenger advised him to walk away from his camp, alone, if he heard a nearby chapel bell tolling, in the following night. In doing so, he witnessed an apparition of Jesus on the cross. Ecstatic, Afonso heard Jesus promising victories for the coming battles, as well as God's wish to act through Afonso, and his descendants, in order to create an empire which would carry His name to unknown lands, thus choosing the Portuguese to perform great tasks.\\n\",\n      \"0.79: In 1842, the Bishop of Vincennes, Célestine Guynemer de la Hailandière, offered land to Father Edward Sorin of the Congregation of the Holy Cross, on the condition that he build a college in two years. Fr. Sorin arrived on the site with eight Holy Cross brothers from France and Ireland on November 26, 1842, and began the school using Father Stephen Badin's old log chapel. He soon erected additional buildings, including Old College, the first church, and the first main building. They immediately acquired two students and set about building additions to the campus.\\n\",\n      \"0.79: Because of its Catholic identity, a number of religious buildings stand on campus. The Old College building has become one of two seminaries on campus run by the Congregation of Holy Cross. The current Basilica of the Sacred Heart is located on the spot of Fr. Sorin's original church, which became too small for the growing college. It is built in French Revival style and it is decorated by stained glass windows imported directly from France. The interior was painted by Luigi Gregori, an Italian painter invited by Fr. Sorin to be artist in residence. The Basilica also features a bell tower with a carillon. 
Inside the church there are also sculptures by Ivan Mestrovic. The Grotto of Our Lady of Lourdes, which was built in 1896, is a replica of the original in Lourdes, France. It is very popular among students and alumni as a place of prayer and meditation, and it is considered one of the most beloved spots on campus.\\n\",\n      \"0.78: The funeral, held at the Church of the Madeleine in Paris, was delayed almost two weeks, until 30 October. Entrance was restricted to ticket holders as many people were expected to attend. Over 3,000 people arrived without invitations, from as far as London, Berlin and Vienna, and were excluded.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"When did the Scholastic Magazine of Notre dame begin publishing?\\n\",\n      \"\\n\",\n      \"0.88: As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the United States. The other magazine, The Juggler, is released twice a year and focuses on student literature and artwork. The Dome yearbook is published annually. The newspapers have varying publication interests, with The Observer published daily and mainly reporting university and other news, and staffed by students from both Notre Dame and Saint Mary's College. Unlike Scholastic and The Dome, The Observer is an independent publication and does not have a faculty advisor or any editorial oversight from the University. In 1987, when some students believed that The Observer began to show a conservative bias, a liberal newspaper, Common Sense was published. 
Likewise, in 2003, when other students believed that the paper showed a liberal bias, the conservative paper Irish Rover went into production. Neither paper is published as often as The Observer; however, all three are distributed to all students.\\n\",\n      \"0.83: In 1919 Father James Burns became president of Notre Dame, and in three years he produced an academic revolution that brought the school up to national standards by adopting the elective system and moving away from the university's traditional scholastic and classical emphasis. By contrast, the Jesuit colleges, bastions of academic conservatism, were reluctant to move to a system of electives. Their graduates were shut out of Harvard Law School for that reason. Notre Dame continued to grow over the years, adding more colleges, programs, and sports teams. By 1921, with the addition of the College of Commerce, Notre Dame had grown from a small college to a university with five colleges and a professional law school. The university continued to expand and add new residence halls and buildings with each subsequent president.\\n\",\n      \"0.83: The rise of Hitler and other dictators in the 1930s forced numerous Catholic intellectuals to flee Europe; president John O'Hara brought many to Notre Dame. From Germany came Anton-Hermann Chroust (1907–1982) in classics and law, and Waldemar Gurian a German Catholic intellectual of Jewish descent. Positivism dominated American intellectual life in the 1920s onward but in marked contrast, Gurian received a German Catholic education and wrote his doctoral dissertation under Max Scheler. Ivan Meštrović (1883–1962), a renowned sculptor, brought Croatian culture to campus, 1955–62. 
Yves Simon (1903–61), brought to ND in the 1940s the insights of French studies in the Aristotelian-Thomistic tradition of philosophy; his own teacher Jacques Maritain (1882–73) was a frequent visitor to campus.\\n\",\n      \"0.82: In the 18 years under the presidency of Edward Malloy, C.S.C., (1987–2005), there was a rapid growth in the school's reputation, faculty, and resources. He increased the faculty by more than 500 professors; the academic quality of the student body has improved dramatically, with the average SAT score rising from 1240 to 1360; the number of minority students more than doubled; the endowment grew from $350 million to more than $3 billion; the annual operating budget rose from $177 million to more than $650 million; and annual research funding improved from $15 million to more than $70 million. Notre Dame's most recent[when?] capital campaign raised $1.1 billion, far exceeding its goal of $767 million, and is the largest in the history of Catholic higher education.\\n\",\n      \"0.82: The Rev. John J. Cavanaugh, C.S.C. served as president from 1946 to 1952. Cavanaugh's legacy at Notre Dame in the post-war years was devoted to raising academic standards and reshaping the university administration to suit it to an enlarged educational mission and an expanded student body and stressing advanced studies and research at a time when Notre Dame quadrupled in student census, undergraduate enrollment increased by more than half, and graduate student enrollment grew fivefold. Cavanaugh also established the Lobund Institute for Animal Studies and Notre Dame's Medieval Institute. Cavanaugh also presided over the construction of the Nieuwland Science Hall, Fisher Hall, and the Morris Inn, as well as the Hall of Liberal Arts (now O'Shaughnessy Hall), made possible by a donation from I.A. 
O'Shaughnessy, at the time the largest ever made to an American Catholic university.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"Where is the headquarters of the Congregation of the Holy Cross?\\n\",\n      \"\\n\",\n      \"0.88: The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.\\n\",\n      \"0.84: In 1842, the Bishop of Vincennes, Célestine Guynemer de la Hailandière, offered land to Father Edward Sorin of the Congregation of the Holy Cross, on the condition that he build a college in two years. Fr. Sorin arrived on the site with eight Holy Cross brothers from France and Ireland on November 26, 1842, and began the school using Father Stephen Badin's old log chapel. He soon erected additional buildings, including Old College, the first church, and the first main building. They immediately acquired two students and set about building additions to the campus.\\n\",\n      \"0.84: Because of its Catholic identity, a number of religious buildings stand on campus. The Old College building has become one of two seminaries on campus run by the Congregation of Holy Cross. The current Basilica of the Sacred Heart is located on the spot of Fr. 
Sorin's original church, which became too small for the growing college. It is built in French Revival style and it is decorated by stained glass windows imported directly from France. The interior was painted by Luigi Gregori, an Italian painter invited by Fr. Sorin to be artist in residence. The Basilica also features a bell tower with a carillon. Inside the church there are also sculptures by Ivan Mestrovic. The Grotto of Our Lady of Lourdes, which was built in 1896, is a replica of the original in Lourdes, France. It is very popular among students and alumni as a place of prayer and meditation, and it is considered one of the most beloved spots on campus.\\n\",\n      \"0.84: The university is affiliated with the Congregation of Holy Cross (Latin: Congregatio a Sancta Cruce, abbreviated postnominals: \\\"CSC\\\"). While religious affiliation is not a criterion for admission, more than 93% of students identify as Christian, with over 80% of the total being Catholic. Collectively, Catholic Mass is celebrated over 100 times per week on campus, and a large campus ministry program provides for the faith needs of the community. There are multitudes of religious statues and artwork around campus, most prominent of which are the statue of Mary on the Main Building, the Notre Dame Grotto, and the Word of Life mural on Hesburgh Library depicting Christ as a teacher. Additionally, every classroom displays a crucifix. There are many religious clubs (catholic and non-Catholic) at the school, including Council #1477 of the Knights of Columbus (KOC), Baptist Collegiate Ministry (BCM), Jewish Club, Muslim Student Association, Orthodox Christian Fellowship, The Mormon Club, and many more. 
The Notre Dame KofC are known for being the first collegiate council of KofC, operating a charitable concession stand during every home football game and owning their own building on campus which can be used as a cigar lounge.\\n\",\n      \"0.83: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \\\"Venite Ad Me Omnes\\\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.\\n\",\n      \"----------------------------------------------------------------------\\n\",\n      \"\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"for query_result in res.json()['results']:\\n\",\n    \"    query = query_result['query']\\n\",\n    \"    answers = []\\n\",\n    \"    scores = []\\n\",\n    \"    for result in query_result['results']:\\n\",\n    \"        answers.append(result['text'])\\n\",\n    \"        scores.append(round(result['score'], 2))\\n\",\n    \"    print(\\\"-\\\"*70+\\\"\\\\n\\\"+query+\\\"\\\\n\\\\n\\\"+\\\"\\\\n\\\".join([f\\\"{s}: {a}\\\" for a, s in zip(answers, scores)])+\\\"\\\\n\\\"+\\\"-\\\"*70+\\\"\\\\n\\\\n\\\")\"\n   ]\n  },\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The top results are all relevant as we would have hoped. With that we've finished. 
The retrieval app API can be shut down, and to save resources the Pinecone index can be deleted within the [Pinecone console](https://app.pinecone.io/).\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"chatgpt-retrieval-plugin-S7h-2AWq-py3.10\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.8\"\n  },\n  \"orig_nbformat\": 4,\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"1979a773a5778de9a5fa593a629dff0ab3c80c2563810d3e6a8dfb123dc01c7d\"\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/providers/redis/semantic-search-and-filter.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"import requests\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Document retrieval: upsert and query basic usage\\n\",\n    \"\\n\",\n    \"In this walkthrough we will see how to use the retrieval API with a Redis datastore for *semantic search / question-answering*. We will also provide a basic demo showing how to use the \\\"filter\\\" function.\\n\",\n    \"\\n\",\n    \"Before running this notebook you should have already initialized the retrieval API and have it running locally or elsewhere. The full instructions for doing this are found on the chatgpt-retrieval-plugin [quickstart page](https://github.com/openai/chatgpt-retrieval-plugin#quickstart). Please follow the instructions to start the app with the redis datastore.\\n\",\n    \"\\n\",\n    \"Additional examples using the search features can be found [here](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/examples/providers/pinecone/semantic-search.ipynb).\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Document\\n\",\n    \"\\n\",\n    \"First we will prepare a collection of documents. From the perspective of the retrieval plugin, a [document](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/models/models.py) consists\\n\",\n    \"of an \\\"id\\\", \\\"text\\\" and a collection of \\\"metadata\\\".\\n\",\n    \"\\n\",\n    \"The \\\"metadata\\\" has \\\"source\\\", \\\"source_id\\\", \\\"created_at\\\", \\\"url\\\" and \\\"author\\\" fields. 
Query metadata does not expose the \\\"url\\\" field.\\n\",\n    \"\\n\",\n    \"The \\\"source\\\" field is an Enum and can only be one of (\\\"file\\\", \\\"email\\\" or \\\"chat\\\").\\n\",\n    \"\\n\",\n    \"Text is taken from company SEC 10-K filings which are in the public domain.\\n\",\n    \"\\n\",\n    \"For demonstration, we will insert some **fake** authors for the documents, see the respective links for the original sources. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"document_1 = {\\n\",\n    \"    \\\"id\\\": \\\"twtr\\\",\\n\",\n    \"    \\\"text\\\": \\\"\\\"\\\"Postponements, suspensions or cancellations of major events, such as sporting events\\n\",\n    \"                and music festivals, may lead to people perceiving the content on Twitter as less\\n\",\n    \"                relevant or useful or of lower quality, which could negatively affect mDAU growth,\\n\",\n    \"                or may reduce monetization opportunities in connection with such events.\\\"\\\"\\\",\\n\",\n    \"    \\\"metadata\\\" : {\\n\",\n    \"        \\\"source\\\" : \\\"file\\\",\\n\",\n    \"        \\\"source_id\\\" : \\\"test:twtr10k\\\",\\n\",\n    \"        \\\"created_at\\\": \\\"2020-12-31\\\",\\n\",\n    \"        \\\"url\\\": \\\"https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm\\\",\\n\",\n    \"        \\\"author\\\": 'Elvis Tusk Sr.'        
\\n\",\n    \"    }\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"document_2 = {\\n\",\n    \"    \\\"id\\\": \\\"tsla\\\",\\n\",\n    \"    \\\"text\\\": \\\"\\\"\\\"Because we do not have independent dealer networks, we are responsible for delivering\\n\",\n    \"               all of our vehicles to our customers.\\\"\\\"\\\",\\n\",\n    \"    \\\"metadata\\\" : {\\n\",\n    \"        \\\"source\\\" : \\\"file\\\",\\n\",\n    \"        \\\"source_id\\\" : \\\"test:tesla10k\\\",\\n\",\n    \"        \\\"created_at\\\": \\\"2021-12-31\\\",\\n\",\n    \"        \\\"url\\\": \\\"https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm\\\",\\n\",\n    \"        \\\"author\\\": 'Elvis Tusk Jr.'        \\n\",\n    \"    }     \\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"document_3 = {\\n\",\n    \"    \\\"id\\\": \\\"xom\\\",\\n\",\n    \"    \\\"text\\\": \\\"\\\"\\\"All practical and economically-viable energy sources will need to be pursued to continue\\n\",\n    \"               meeting global energy demand, recognizing the scale and variety of worldwide energy needs\\n\",\n    \"               as well as the importance of expanding access to modern energy to promote better standards\\n\",\n    \"               of living for billions of people.\\\"\\\"\\\",\\n\",\n    \"    \\\"metadata\\\" : {\\n\",\n    \"        \\\"source\\\" : \\\"file\\\",\\n\",\n    \"        \\\"source_id\\\" : \\\"test:xom10k\\\",\\n\",\n    \"        \\\"created_at\\\": \\\"2020-12-31\\\",\\n\",\n    \"        \\\"url\\\": \\\"https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm\\\",\\n\",\n    \"        \\\"author\\\": 'Vape Jordan'        \\n\",\n    \"    }     \\n\",\n    \"}\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Indexing the Docs\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We're now ready to begin indexing (or 
*upserting*) our `documents`. To make these requests to the retrieval app API, we will need to provide authorization in the form of the `BEARER_TOKEN` we set earlier. We do this below:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"BEARER_TOKEN = os.environ.get(\\\"BEARER_TOKEN\\\") or \\\"BEARER_TOKEN_HERE\\\"\\n\",\n    \"endpoint_url = 'http://0.0.0.0:8000'\\n\",\n    \"headers = {\\n\",\n    \"    \\\"Authorization\\\": f\\\"Bearer {BEARER_TOKEN}\\\"\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"With the authorization `headers` created, we can now upsert the three documents:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/upsert\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"documents\\\": [document_1, document_2, document_3]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Example filter syntax\\n\",\n    \"In our example data we have tagged each company's 10-K document with a source_id: test:twtr10k, test:tesla10k, and test:xom10k.\\n\",\n    \"And we have created **fake** authors of the documents, Elvis Tusk Jr., Elvis Tusk Sr. and Vape Jordan. We will then filter based on these fields.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"### TAG Fields\\n\",\n    \"\\n\",\n    \"source and source_id are \\\"TAG\\\" fields. Redis supports a limited [query syntax](https://redis.io/docs/stack/search/reference/tags/) on TAGS, which includes an \\\"or\\\" syntax, i.e. 
\\\"test:twtr10k|test:tesla10k\\\" or a ```*``` wildcard to match a prefix.\\n\",\n    \"\\n\",\n    \"In this example we have only two documents that match the filter so only two documents will show.\\n\",\n    \"\\n\",\n    \"Gotcha: There cannot be a space between the bar \\\"|\\\", i.e. \\\"test:twtr10k|test:tesla10k\\\" is valid, \\\"test:twtr10k | test:tesla10k\\\" is not.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'results': [{'query': 'How does Tesla deliver cars?',\\n\",\n       \"   'results': [{'id': 'tsla',\\n\",\n       \"     'text': 'Because we do not have independent dealer networks, we are responsible for delivering                all of our vehicles to our customers.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:tesla10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\\n\",\n       \"      'created_at': '1640908800',\\n\",\n       \"      'author': 'Elvis Tusk Jr.',\\n\",\n       \"      'document_id': 'tsla'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.185401830213},\\n\",\n       \"    {'id': 'twtr',\\n\",\n       \"     'text': 'Postponements, suspensions or cancellations of major events, such as sporting events                 and music festivals, may lead to people perceiving the content on Twitter as less                 relevant or useful or of lower quality, which could negatively affect mDAU growth,                 or may reduce monetization opportunities in connection with such events.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:twtr10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm',\\n\",\n       \"      
'created_at': '1609372800',\\n\",\n       \"      'author': 'Elvis Tusk Sr.',\\n\",\n       \"      'document_id': 'twtr'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.300053447242}]}]}\"\n      ]\n     },\n     \"execution_count\": 20,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"query = {\\n\",\n    \"    \\\"query\\\": \\\"How does Tesla deliver cars?\\\",\\n\",\n    \"    \\\"filter\\\": {\\\"source_id\\\": \\\"test:twtr10k|test:tesla10k\\\"},\\n\",\n    \"    \\\"top_k\\\": 3\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"queries\\\": [query]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"In this example we use a wildcard to filter by prefix. There are three documents matching this filter so three results will be printed.\\n\",\n    \"\\n\",\n    \"Gotcha: only prefix filtering is supported for Redis TAGS, i.e. 
\\\"test*\\\" is valid, whereas \\\"te\\\\*t\\\\*\\\" is not.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'results': [{'query': 'I want information related to car dealerships.',\\n\",\n       \"   'results': [{'id': 'tsla',\\n\",\n       \"     'text': 'Because we do not have independent dealer networks, we are responsible for delivering                all of our vehicles to our customers.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:tesla10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\\n\",\n       \"      'created_at': '1640908800',\\n\",\n       \"      'author': 'Elvis Tusk Jr.',\\n\",\n       \"      'document_id': 'tsla'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.204279193893},\\n\",\n       \"    {'id': 'twtr',\\n\",\n       \"     'text': 'Postponements, suspensions or cancellations of major events, such as sporting events                 and music festivals, may lead to people perceiving the content on Twitter as less                 relevant or useful or of lower quality, which could negatively affect mDAU growth,                 or may reduce monetization opportunities in connection with such events.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:twtr10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/1418091/000141809121000031/twtr-20201231.htm',\\n\",\n       \"      'created_at': '1609372800',\\n\",\n       \"      'author': 'Elvis Tusk Sr.',\\n\",\n       \"      'document_id': 'twtr'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.292188997496},\\n\",\n       \"    {'id': 'xom',\\n\",\n       \"     'text': 'All practical and 
economically-viable energy sources will need to be pursued to continue                meeting global energy demand, recognizing the scale and variety of worldwide energy needs                as well as the importance of expanding access to modern energy to promote better standards                of living for billions of people.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:xom10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\\n\",\n       \"      'created_at': '1609372800',\\n\",\n       \"      'author': 'Vape Jordan',\\n\",\n       \"      'document_id': 'xom'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.305264299269}]}]}\"\n      ]\n     },\n     \"execution_count\": 21,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"query = {\\n\",\n    \"    \\\"query\\\": \\\"I want information related to car dealerships.\\\",\\n\",\n    \"    \\\"filter\\\": {\\\"source_id\\\": \\\"test:*\\\"},\\n\",\n    \"    \\\"top_k\\\": 3\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"queries\\\": [query]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The last example we filter by the \\\"author\\\" field. 
The author field is a TextField, and so we have more options for filtering, \\n\",\n    \"see [here](https://redis.io/docs/stack/search/reference/query_syntax/) for a complete set of examples.\\n\",\n    \"\\n\",\n    \"We can select by a specific author, here we only expect to return a single result.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'results': [{'query': 'I want information related to car dealerships.',\\n\",\n       \"   'results': [{'id': 'xom',\\n\",\n       \"     'text': 'All practical and economically-viable energy sources will need to be pursued to continue                meeting global energy demand, recognizing the scale and variety of worldwide energy needs                as well as the importance of expanding access to modern energy to promote better standards                of living for billions of people.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:xom10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\\n\",\n       \"      'created_at': '1609372800',\\n\",\n       \"      'author': 'Vape Jordan',\\n\",\n       \"      'document_id': 'xom'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.305264299269}]}]}\"\n      ]\n     },\n     \"execution_count\": 22,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"query = {\\n\",\n    \"    \\\"query\\\": \\\"I want information related to car dealerships.\\\",\\n\",\n    \"    \\\"filter\\\": {\\\"source_id\\\": \\\"test:*\\\", \\\"author\\\": \\\"Vape Jordan\\\"},\\n\",\n    \"    \\\"top_k\\\": 3\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    
headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"queries\\\": [query]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here we use the negation \\\"-\\\" to select all documents, except those published by an author called Elvis\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'results': [{'query': 'I want information related to car dealerships.',\\n\",\n       \"   'results': [{'id': 'xom',\\n\",\n       \"     'text': 'All practical and economically-viable energy sources will need to be pursued to continue                meeting global energy demand, recognizing the scale and variety of worldwide energy needs                as well as the importance of expanding access to modern energy to promote better standards                of living for billions of people.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:xom10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\\n\",\n       \"      'created_at': '1609372800',\\n\",\n       \"      'author': 'Vape Jordan',\\n\",\n       \"      'document_id': 'xom'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.305264299269}]}]}\"\n      ]\n     },\n     \"execution_count\": 23,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"query = {\\n\",\n    \"    \\\"query\\\": \\\"I want information related to car dealerships.\\\",\\n\",\n    \"    \\\"filter\\\": {\\\"source_id\\\": \\\"test:*\\\", \\\"author\\\": \\\"-Elvis\\\"},\\n\",\n    \"    \\\"top_k\\\": 3\\n\",\n    \"}\\n\",\n    \"\\n\",\n    
\"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"queries\\\": [query]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Last example we filter two of the authors:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"{'results': [{'query': 'I want information related to car dealerships.',\\n\",\n       \"   'results': [{'id': 'tsla',\\n\",\n       \"     'text': 'Because we do not have independent dealer networks, we are responsible for delivering                all of our vehicles to our customers.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:tesla10k',\\n\",\n       \"      'url': 'https://www.sec.gov/Archives/edgar/data/1318605/000095017022000796/tsla-20211231.htm',\\n\",\n       \"      'created_at': '1640908800',\\n\",\n       \"      'author': 'Elvis Tusk Jr.',\\n\",\n       \"      'document_id': 'tsla'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.204279193893},\\n\",\n       \"    {'id': 'xom',\\n\",\n       \"     'text': 'All practical and economically-viable energy sources will need to be pursued to continue                meeting global energy demand, recognizing the scale and variety of worldwide energy needs                as well as the importance of expanding access to modern energy to promote better standards                of living for billions of people.',\\n\",\n       \"     'metadata': {'source': 'file',\\n\",\n       \"      'source_id': 'test:xom10k',\\n\",\n       \"      'url': 
'https://www.sec.gov/Archives/edgar/data/34088/000003408821000012/xom-20201231.htm',\\n\",\n       \"      'created_at': '1609372800',\\n\",\n       \"      'author': 'Vape Jordan',\\n\",\n       \"      'document_id': 'xom'},\\n\",\n       \"     'embedding': None,\\n\",\n       \"     'score': 0.305264299269}]}]}\"\n      ]\n     },\n     \"execution_count\": 24,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"query = {\\n\",\n    \"    \\\"query\\\": \\\"I want information related to car dealerships.\\\",\\n\",\n    \"    \\\"filter\\\": {\\\"source_id\\\": \\\"test:*\\\", \\\"author\\\": \\\"Elvis*Jr.|Vape\\\"},\\n\",\n    \"    \\\"top_k\\\": 3\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    f\\\"{endpoint_url}/query\\\",\\n\",\n    \"    headers=headers,\\n\",\n    \"    json={\\n\",\n    \"        \\\"queries\\\": [query]\\n\",\n    \"    }\\n\",\n    \")\\n\",\n    \"response.raise_for_status()\\n\",\n    \"\\n\",\n    \"response.json()\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3 (ipykernel)\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.11.1\"\n  },\n  \"vscode\": {\n   \"interpreter\": {\n    \"hash\": \"1979a773a5778de9a5fa593a629dff0ab3c80c2563810d3e6a8dfb123dc01c7d\"\n   }\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "examples/providers/supabase/.gitignore",
    "content": "# Supabase\n.branches\n.temp\n"
  },
  {
    "path": "examples/providers/supabase/config.toml",
    "content": "# A string used to distinguish different Supabase projects on the same host. Defaults to the working\n# directory name when running `supabase init`.\nproject_id = \"providers\"\n\n[api]\n# Port to use for the API URL.\nport = 54321\n# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API\n# endpoints. public and storage are always included.\nschemas = [\"public\", \"storage\", \"graphql_public\"]\n# Extra schemas to add to the search_path of every request. public is always included.\nextra_search_path = [\"public\", \"extensions\"]\n# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size\n# for accidental or malicious requests.\nmax_rows = 1000\n\n[db]\n# Port to use for the local database URL.\nport = 54322\n# The database major version to use. This has to be the same as your remote database's. Run `SHOW\n# server_version;` on the remote database to check.\nmajor_version = 15\n\n[studio]\n# Port to use for Supabase Studio.\nport = 54323\n\n# Email testing server. Emails sent with the local dev setup are not actually sent - rather, they\n# are monitored, and you can view the emails that would have been sent from the web interface.\n[inbucket]\n# Port to use for the email testing server web interface.\nport = 54324\nsmtp_port = 54325\npop3_port = 54326\n\n[storage]\n# The maximum file size allowed (e.g. \"5MB\", \"500KB\").\nfile_size_limit = \"50MiB\"\n\n[auth]\n# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used\n# in emails.\nsite_url = \"http://localhost:3000\"\n# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.\nadditional_redirect_urls = [\"https://localhost:3000\"]\n# How long tokens are valid for, in seconds. 
Defaults to 3600 (1 hour), maximum 604,800 seconds (one\n# week).\njwt_expiry = 3600\n# Allow/disallow new user signups to your project.\nenable_signup = true\n\n[auth.email]\n# Allow/disallow new user signups via email to your project.\nenable_signup = true\n# If enabled, a user will be required to confirm any email change on both the old, and new email\n# addresses. If disabled, only the new email is required to confirm.\ndouble_confirm_changes = true\n# If enabled, users need to confirm their email address before signing in.\nenable_confirmations = false\n\n# Use an external OAuth provider. The full list of providers are: `apple`, `azure`, `bitbucket`,\n# `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin`, `notion`, `twitch`,\n# `twitter`, `slack`, `spotify`, `workos`, `zoom`.\n[auth.external.apple]\nenabled = false\nclient_id = \"\"\nsecret = \"\"\n# Overrides the default auth redirectUrl.\nredirect_uri = \"\"\n# Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure,\n# or any other third-party OIDC providers.\nurl = \"\"\n"
  },
  {
    "path": "examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql",
    "content": "create extension vector;\n\ncreate table if not exists documents (\n    id text primary key default gen_random_uuid()::text,\n    source text,\n    source_id text,\n    content text,\n    document_id text,\n    author text,\n    url text,\n    created_at timestamptz default now(),\n    embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model\n);\n\ncreate index ix_documents_document_id on documents using btree ( document_id );\ncreate index ix_documents_source on documents using btree ( source );\ncreate index ix_documents_source_id on documents using btree ( source_id );\ncreate index ix_documents_author on documents using btree ( author );\ncreate index ix_documents_created_at on documents using brin ( created_at );\n\nalter table documents enable row level security;\n\ncreate or replace function match_page_sections(in_embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model\n                                            , in_match_count int default 3\n                                            , in_document_id text default '%%'\n                                            , in_source_id text default '%%'\n                                            , in_source text default '%%'\n                                            , in_author text default '%%'\n                                            , in_start_date timestamptz default '-infinity'\n                                            , in_end_date timestamptz default 'infinity')\nreturns table (id text\n            , source text\n            , source_id text\n            , document_id text\n            , url text\n            , created_at timestamptz\n            , author text\n            , content text\n            , embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model\n            , similarity 
float)\nlanguage plpgsql\nas $$\n#variable_conflict use_variable\nbegin\nreturn query\nselect\n    documents.id,\n    documents.source,\n    documents.source_id,\n    documents.document_id,\n    documents.url,\n    documents.created_at,\n    documents.author,\n    documents.content,\n    documents.embedding,\n    (documents.embedding <#> in_embedding) * -1 as similarity\nfrom documents\n\nwhere in_start_date <= documents.created_at and \n    documents.created_at <= in_end_date and\n    (documents.source_id like in_source_id or documents.source_id is null) and\n    (documents.source like in_source or documents.source is null) and\n    (documents.author like in_author or documents.author is null) and\n    (documents.document_id like in_document_id or documents.document_id is null)\n\norder by documents.embedding <#> in_embedding\n\nlimit in_match_count;\nend;\n$$;"
  },
  {
    "path": "examples/providers/supabase/seed.sql",
    "content": ""
  },
  {
    "path": "local_server/ai-plugin.json",
    "content": "{\n  \"schema_version\": \"v1\",\n  \"name_for_model\": \"retrieval\",\n  \"name_for_human\": \"Retrieval Plugin\",\n  \"description_for_model\": \"Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.\",\n  \"description_for_human\": \"Search through your documents.\",\n  \"auth\": {\n    \"type\": \"none\"\n  },\n  \"api\": {\n    \"type\": \"openapi\",\n    \"url\": \"http://localhost:3333/.well-known/openapi.yaml\"\n  },\n  \"logo_url\": \"http://localhost:3333/.well-known/logo.png\",\n  \"contact_email\": \"hello@contact.com\", \n  \"legal_info_url\": \"hello@legal.com\"\n}\n\n"
  },
  {
    "path": "local_server/main.py",
    "content": "# This is a version of the main.py file found in ../../../server/main.py for testing the plugin locally.\n# Use the command `poetry run dev` to run this.\nfrom typing import Optional\nimport uvicorn\nfrom fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile\nfrom loguru import logger\n\nfrom models.api import (\n    DeleteRequest,\n    DeleteResponse,\n    QueryRequest,\n    QueryResponse,\n    UpsertRequest,\n    UpsertResponse,\n)\nfrom datastore.factory import get_datastore\nfrom services.file import get_document_from_file\n\nfrom starlette.responses import FileResponse\n\nfrom models.models import DocumentMetadata, Source\nfrom fastapi.middleware.cors import CORSMiddleware\n\n\napp = FastAPI()\n\nPORT = 3333\n\norigins = [\n    f\"http://localhost:{PORT}\",\n    \"https://chat.openai.com\",\n]\n\napp.add_middleware(\n    CORSMiddleware,\n    allow_origins=origins,\n    allow_credentials=True,\n    allow_methods=[\"*\"],\n    allow_headers=[\"*\"],\n)\n\n\n@app.route(\"/.well-known/ai-plugin.json\")\nasync def get_manifest(request):\n    file_path = \"./local_server/ai-plugin.json\"\n    simple_headers = {}\n    simple_headers[\"Access-Control-Allow-Private-Network\"] = \"true\"\n    return FileResponse(file_path, media_type=\"text/json\", headers=simple_headers)\n\n\n@app.route(\"/.well-known/logo.png\")\nasync def get_logo(request):\n    file_path = \"./local_server/logo.png\"\n    return FileResponse(file_path, media_type=\"text/json\")\n\n\n@app.route(\"/.well-known/openapi.yaml\")\nasync def get_openapi(request):\n    file_path = \"./local_server/openapi.yaml\"\n    return FileResponse(file_path, media_type=\"text/json\")\n\n\n@app.post(\n    \"/upsert-file\",\n    response_model=UpsertResponse,\n)\nasync def upsert_file(\n    file: UploadFile = File(...),\n    metadata: Optional[str] = Form(None),\n):\n    try:\n        metadata_obj = (\n            DocumentMetadata.parse_raw(metadata)\n            if metadata\n            
else DocumentMetadata(source=Source.file)\n        )\n    except:\n        metadata_obj = DocumentMetadata(source=Source.file)\n\n    document = await get_document_from_file(file, metadata_obj)\n\n    try:\n        ids = await datastore.upsert([document])\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=f\"str({e})\")\n\n\n@app.post(\n    \"/upsert\",\n    response_model=UpsertResponse,\n)\nasync def upsert(\n    request: UpsertRequest = Body(...),\n):\n    try:\n        ids = await datastore.upsert(request.documents)\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.post(\"/query\", response_model=QueryResponse)\nasync def query_main(request: QueryRequest = Body(...)):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.delete(\n    \"/delete\",\n    response_model=DeleteResponse,\n)\nasync def delete(\n    request: DeleteRequest = Body(...),\n):\n    if not (request.ids or request.filter or request.delete_all):\n        raise HTTPException(\n            status_code=400,\n            detail=\"One of ids, filter, or delete_all is required\",\n        )\n    try:\n        success = await datastore.delete(\n            ids=request.ids,\n            filter=request.filter,\n            delete_all=request.delete_all,\n        )\n        return DeleteResponse(success=success)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.on_event(\"startup\")\nasync def startup():\n    global datastore\n    datastore = await 
get_datastore()\n\n\ndef start():\n    uvicorn.run(\"local_server.main:app\", host=\"localhost\", port=PORT, reload=True)\n"
  },
  {
    "path": "local_server/openapi.yaml",
    "content": "openapi: 3.0.2\ninfo:\n  title: Retrieval Plugin API\n  description: A retrieval API for querying and filtering documents based on natural language queries and metadata\n  version: 1.0.0\nservers:\n  - url: http://localhost:3333\npaths:\n  /query:\n    post:\n      summary: Query\n      description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs.\n      operationId: query_query_post\n      requestBody:\n        content:\n          application/json:\n            schema:\n              $ref: \"#/components/schemas/QueryRequest\"\n        required: true\n      responses:\n        \"200\":\n          description: Successful Response\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/QueryResponse\"\n        \"422\":\n          description: Validation Error\n          content:\n            application/json:\n              schema:\n                $ref: \"#/components/schemas/HTTPValidationError\"\ncomponents:\n  schemas:\n    DocumentChunkMetadata:\n      title: DocumentChunkMetadata\n      type: object\n      properties:\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        url:\n          title: Url\n          type: string\n        created_at:\n          title: Created At\n          type: string\n        author:\n          title: Author\n          type: string\n        document_id:\n          title: Document Id\n          type: string\n    DocumentChunkWithScore:\n      title: DocumentChunkWithScore\n      required:\n        - text\n        - metadata\n        - score\n      type: object\n      properties:\n        id:\n          title: Id\n          type: string\n        text:\n          title: Text\n       
   type: string\n        metadata:\n          $ref: \"#/components/schemas/DocumentChunkMetadata\"\n        embedding:\n          title: Embedding\n          type: array\n          items:\n            type: number\n        score:\n          title: Score\n          type: number\n    DocumentMetadataFilter:\n      title: DocumentMetadataFilter\n      type: object\n      properties:\n        document_id:\n          title: Document Id\n          type: string\n        source:\n          $ref: \"#/components/schemas/Source\"\n        source_id:\n          title: Source Id\n          type: string\n        author:\n          title: Author\n          type: string\n        start_date:\n          title: Start Date\n          type: string\n        end_date:\n          title: End Date\n          type: string\n    HTTPValidationError:\n      title: HTTPValidationError\n      type: object\n      properties:\n        detail:\n          title: Detail\n          type: array\n          items:\n            $ref: \"#/components/schemas/ValidationError\"\n    Query:\n      title: Query\n      required:\n        - query\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        filter:\n          $ref: \"#/components/schemas/DocumentMetadataFilter\"\n        top_k:\n          title: Top K\n          type: integer\n          default: 3\n    QueryRequest:\n      title: QueryRequest\n      required:\n        - queries\n      type: object\n      properties:\n        queries:\n          title: Queries\n          type: array\n          items:\n            $ref: \"#/components/schemas/Query\"\n    QueryResponse:\n      title: QueryResponse\n      required:\n        - results\n      type: object\n      properties:\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/QueryResult\"\n    QueryResult:\n      title: QueryResult\n      required:\n        - query\n        - 
results\n      type: object\n      properties:\n        query:\n          title: Query\n          type: string\n        results:\n          title: Results\n          type: array\n          items:\n            $ref: \"#/components/schemas/DocumentChunkWithScore\"\n    Source:\n      title: Source\n      enum:\n        - email\n        - file\n        - chat\n      type: string\n      description: An enumeration.\n    ValidationError:\n      title: ValidationError\n      required:\n        - loc\n        - msg\n        - type\n      type: object\n      properties:\n        loc:\n          title: Location\n          type: array\n          items:\n            anyOf:\n              - type: string\n              - type: integer\n        msg:\n          title: Message\n          type: string\n        type:\n          title: Error Type\n          type: string\n"
  },
  {
    "path": "models/api.py",
    "content": "from models.models import (\n    Document,\n    DocumentMetadataFilter,\n    Query,\n    QueryResult,\n)\nfrom pydantic import BaseModel\nfrom typing import List, Optional\n\n\nclass UpsertRequest(BaseModel):\n    documents: List[Document]\n\n\nclass UpsertResponse(BaseModel):\n    ids: List[str]\n\n\nclass QueryRequest(BaseModel):\n    queries: List[Query]\n\n\nclass QueryResponse(BaseModel):\n    results: List[QueryResult]\n\n\nclass DeleteRequest(BaseModel):\n    ids: Optional[List[str]] = None\n    filter: Optional[DocumentMetadataFilter] = None\n    delete_all: Optional[bool] = False\n\n\nclass DeleteResponse(BaseModel):\n    success: bool\n"
  },
  {
    "path": "models/models.py",
    "content": "from pydantic import BaseModel\nfrom typing import List, Optional\nfrom enum import Enum\n\n\nclass Source(str, Enum):\n    email = \"email\"\n    file = \"file\"\n    chat = \"chat\"\n\n\nclass DocumentMetadata(BaseModel):\n    source: Optional[Source] = None\n    source_id: Optional[str] = None\n    url: Optional[str] = None\n    created_at: Optional[str] = None\n    author: Optional[str] = None\n\n\nclass DocumentChunkMetadata(DocumentMetadata):\n    document_id: Optional[str] = None\n\n\nclass DocumentChunk(BaseModel):\n    id: Optional[str] = None\n    text: str\n    metadata: DocumentChunkMetadata\n    embedding: Optional[List[float]] = None\n\n\nclass DocumentChunkWithScore(DocumentChunk):\n    score: float\n\n\nclass Document(BaseModel):\n    id: Optional[str] = None\n    text: str\n    metadata: Optional[DocumentMetadata] = None\n\n\nclass DocumentWithChunks(Document):\n    chunks: List[DocumentChunk]\n\n\nclass DocumentMetadataFilter(BaseModel):\n    document_id: Optional[str] = None\n    source: Optional[Source] = None\n    source_id: Optional[str] = None\n    author: Optional[str] = None\n    start_date: Optional[str] = None  # any date string format\n    end_date: Optional[str] = None  # any date string format\n\n\nclass Query(BaseModel):\n    query: str\n    filter: Optional[DocumentMetadataFilter] = None\n    top_k: Optional[int] = 3\n\n\nclass QueryWithEmbedding(Query):\n    embedding: List[float]\n\n\nclass QueryResult(BaseModel):\n    query: str\n    results: List[DocumentChunkWithScore]\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.poetry]\nname = \"chatgpt-retrieval-plugin\"\nversion = \"0.1.0\"\ndescription = \"\"\nauthors = [\"isafulf <isabella@openai.com>\"]\nreadme = \"README.md\"\npackages = [{include = \"server\"}]\n\n[tool.poetry.dependencies]\npython = \"^3.10\"\nfastapi = \"^0.92.0\"\nuvicorn = \"^0.20.0\"\nopenai = \"^0.27.5\"\npython-dotenv = \"^0.21.1\"\npydantic = \"^1.10.5\"\ntenacity = \"^8.2.1\"\ntiktoken = \"^0.2.0\"\nnumpy = \"^1.24.2\"\ndocx2txt = \"^0.8\"\nPyPDF2 = \"^3.0.1\"\npython-pptx = \"^0.6.21\"\npython-multipart = \"^0.0.6\"\narrow = \"^1.2.3\"\nchromadb = \"^0.3.25\"\npinecone-client = \"^2.1.0\"\nweaviate-client = \"^3.12.0\"\npymilvus = \"^2.2.2\"\nqdrant-client = {version = \"^1.0.4\", python = \"<3.12\"}\nredis = \"4.5.4\"\nsupabase = \"^1.0.2\"\npsycopg2 = \"^2.9.5\"\nllama-index = \"0.5.4\"\nazure-identity = \"^1.12.0\"\nazure-search-documents = \"11.4.0b8\"\npgvector = \"^0.1.7\"\npsycopg2cffi = {version = \"^2.9.0\", optional = true}\nloguru = \"^0.7.0\"\nelasticsearch = \"8.8.2\"\npymongo = \"^4.3.3\"\nmotor = \"^3.3.2\"\n\n[tool.poetry.scripts]\nstart = \"server.main:start\"\ndev = \"local_server.main:start\"\n\n[tool.poetry.extras]\npostgresql = [\"psycopg2cffi\"]\n\n[tool.poetry.group.dev.dependencies]\nhttpx = \"^0.23.3\"\npytest = \"^7.2.1\"\npytest-cov = \"^4.0.0\"\npytest-asyncio = \"^0.20.3\"\n\n[build-system]\nrequires = [\"poetry-core\"]\nbuild-backend = \"poetry.core.masonry.api\"\n\n[tool.pytest.ini_options]\npythonpath = [\n  \".\"\n]\nasyncio_mode=\"auto\"\n"
  },
  {
    "path": "scripts/process_json/README.md",
    "content": "## Process a JSON File\n\nThis script is a utility to process a file dump of documents in a JSON format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case.\n\n## Usage\n\nTo run this script from the terminal, navigate to this folder and use the following command:\n\n```\npython process_json.py --filepath path/to/file_dump.json --custom_metadata '{\"source\": \"file\"}' --screen_for_pii True --extract_metadata True\n```\n\nwhere:\n\n- `path/to/file_dump.json` is the name or path to the file dump to be processed. The format of this JSON file should be a list of JSON objects, where each object represents a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document.\n- `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{\"source\": \"file\"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`).\n- `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. 
If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`.\n- `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is `False`.\n\nThe script will load the JSON file as a list of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors or PII detection.\n\nYou can use `python process_json.py -h` to get a summary of the options and their descriptions.\n\nTest the script with the example file, [example.json](example.json).\n"
  },
  {
    "path": "scripts/process_json/example.json",
    "content": "[\n    {\n      \"id\": \"123\",\n      \"text\": \"This is a document about something\",\n      \"source\": \"file\",\n      \"source_id\": \"https://example.com/doc1\",\n      \"url\": \"https://example.com/doc1\",\n      \"created_at\": \"2021-01-01T12:00:00Z\",\n      \"author\": \"Alice\"\n    },\n    {\n      \"text\": \"This is another document about something else\",\n      \"source\": \"file\",\n      \"source_id\": \"doc2.txt\",\n      \"author\": \"Bob\"\n    },\n    {\n      \"id\": \"456\",\n      \"text\": \"This is Alice's phone number: 123-456-7890\",\n      \"source\": \"email\",\n      \"source_id\": \"567\",\n      \"created_at\": \"2021-01-02T13:00:00Z\",\n      \"author\": \"Alice\"\n    }\n]"
  },
  {
    "path": "scripts/process_json/process_json.py",
    "content": "import uuid\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.models import Document, DocumentMetadata\nfrom datastore.datastore import DataStore\nfrom datastore.factory import get_datastore\nfrom services.extract_metadata import extract_metadata_from_document\nfrom services.pii_detection import screen_text_for_pii\n\nDOCUMENT_UPSERT_BATCH_SIZE = 50\n\n\nasync def process_json_dump(\n    filepath: str,\n    datastore: DataStore,\n    custom_metadata: dict,\n    screen_for_pii: bool,\n    extract_metadata: bool,\n):\n    # load the json file as a list of dictionaries\n    with open(filepath) as json_file:\n        data = json.load(json_file)\n\n    documents = []\n    skipped_items = []\n    # iterate over the data and create document objects\n    for item in data:\n        if len(documents) % 20 == 0:\n            logger.info(f\"Processed {len(documents)} documents\")\n\n        try:\n            # get the id, text, source, source_id, url, created_at and author from the item\n            # use default values if not specified\n            id = item.get(\"id\", None)\n            text = item.get(\"text\", None)\n            source = item.get(\"source\", None)\n            source_id = item.get(\"source_id\", None)\n            url = item.get(\"url\", None)\n            created_at = item.get(\"created_at\", None)\n            author = item.get(\"author\", None)\n\n            if not text:\n                logger.info(\"No document text, skipping...\")\n                continue\n\n            # create a metadata object with the source, source_id, url, created_at and author\n            metadata = DocumentMetadata(\n                source=source,\n                source_id=source_id,\n                url=url,\n                created_at=created_at,\n                author=author,\n            )\n            logger.info(\"metadata: \", str(metadata))\n\n            # update metadata with custom values\n            for 
key, value in custom_metadata.items():\n                if hasattr(metadata, key):\n                    setattr(metadata, key, value)\n\n            # screen for pii if requested\n            if screen_for_pii:\n                pii_detected = screen_text_for_pii(text)\n                # if pii detected, print a warning and skip the document\n                if pii_detected:\n                    logger.info(\"PII detected in document, skipping\")\n                    skipped_items.append(item)  # add the skipped item to the list\n                    continue\n\n            # extract metadata if requested\n            if extract_metadata:\n                # extract metadata from the document text\n                extracted_metadata = extract_metadata_from_document(\n                    f\"Text: {text}; Metadata: {str(metadata)}\"\n                )\n                # get a Metadata object from the extracted metadata\n                metadata = DocumentMetadata(**extracted_metadata)\n\n            # create a document object with the id or a random id, text and metadata\n            document = Document(\n                id=id or str(uuid.uuid4()),\n                text=text,\n                metadata=metadata,\n            )\n            documents.append(document)\n        except Exception as e:\n            # log the error and continue with the next item\n            logger.error(f\"Error processing {item}: {e}\")\n            skipped_items.append(item)  # add the skipped item to the list\n\n    # do this in batches, the upsert method already batches documents but this allows\n    # us to add more descriptive logging\n    for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE):\n        # Get the text of the chunks in the current batch\n        batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE]\n        logger.info(f\"Upserting batch of {len(batch_documents)} documents, batch {i}\")\n        logger.info(\"documents: \", documents)\n        await 
datastore.upsert(batch_documents)\n\n    # print the skipped items\n    logger.info(f\"Skipped {len(skipped_items)} items due to errors or PII detection\")\n    for item in skipped_items:\n        logger.info(item)\n\n\nasync def main():\n    # parse the command-line arguments\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--filepath\", required=True, help=\"The path to the json dump\")\n    parser.add_argument(\n        \"--custom_metadata\",\n        default=\"{}\",\n        help=\"A JSON string of key-value pairs to update the metadata of the documents\",\n    )\n    parser.add_argument(\n        \"--screen_for_pii\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try the PII detection function (using a language model)\",\n    )\n    parser.add_argument(\n        \"--extract_metadata\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try to extract metadata from the document (using a language model)\",\n    )\n    args = parser.parse_args()\n\n    # get the arguments\n    filepath = args.filepath\n    custom_metadata = json.loads(args.custom_metadata)\n    screen_for_pii = args.screen_for_pii\n    extract_metadata = args.extract_metadata\n\n    # initialize the db instance once as a global variable\n    datastore = await get_datastore()\n    # process the json dump\n    await process_json_dump(\n        filepath, datastore, custom_metadata, screen_for_pii, extract_metadata\n    )\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },
  {
    "path": "scripts/process_jsonl/README.md",
    "content": "## Process a JSONL File\n\nThis script is a utility to process a file dump of documents in a JSONL format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case.\n\n## Usage\n\nTo run this script from the terminal, navigate to this folder and use the following command:\n\n```\npython process_jsonl.py --filepath path/to/file_dump.jsonl --custom_metadata '{\"source\": \"email\"}' --screen_for_pii True --extract_metadata True\n```\n\nwhere:\n\n- `path/to/file_dump.jsonl` is the name or path to the file dump to be processed. The format of this JSONL file should be a newline-delimited JSON file, where each line is a valid JSON object representing a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document.\n- `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{\"source\": \"file\"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`).\n- `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. 
If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`.\n- `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is `False`.\n\nThe script will open the JSONL file as a generator of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors, PII detection, or metadata extraction issues.\n\nYou can use `python process_jsonl.py -h` to get a summary of the options and their descriptions.\n\nTest the script with the example file, [example.jsonl](example.jsonl).\n"
  },
  {
    "path": "scripts/process_jsonl/example.jsonl",
    "content": "{\"id\": \"4\", \"text\": \"This document only has an ID and text. The other fields are missing.\"}\n{\"text\": \"This document has no ID, but it has text and a source.\", \"source\": \"email\"}\n{\"id\": \"6\", \"text\": \"This document has an ID, text, and author, but no source information.\", \"author\": \"John Doe\"}\n{\"text\": \"This document has text, a source, and a URL, but no ID or author.\", \"source\": \"file\", \"url\": \"https://example.com/file/2\"}\n{\"id\": \"8\", \"text\": \"This document has an ID, text, source, and created_at timestamp, but no author or URL.\", \"source\": \"chat\", \"created_at\": \"2022-01-04T00:00:00\"}\n{\"id\": \"9\", \"text\": \"This document contains PII. John Smith's email address is john.smith@example.com and his phone number is +1 (555) 123-4567.\", \"source\": \"email\", \"source_id\": \"email_2\", \"url\": \"https://example.com/email/2\", \"created_at\": \"2022-01-05T00:00:00\", \"author\": \"John Smith\"}"
  },
  {
    "path": "scripts/process_jsonl/process_jsonl.py",
    "content": "import uuid\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.models import Document, DocumentMetadata\nfrom datastore.datastore import DataStore\nfrom datastore.factory import get_datastore\nfrom services.extract_metadata import extract_metadata_from_document\nfrom services.pii_detection import screen_text_for_pii\n\nDOCUMENT_UPSERT_BATCH_SIZE = 50\n\n\nasync def process_jsonl_dump(\n    filepath: str,\n    datastore: DataStore,\n    custom_metadata: dict,\n    screen_for_pii: bool,\n    extract_metadata: bool,\n):\n    # open the jsonl file as a generator of dictionaries\n    with open(filepath) as jsonl_file:\n        data = [json.loads(line) for line in jsonl_file]\n\n    documents = []\n    skipped_items = []\n    # iterate over the data and create document objects\n    for item in data:\n        if len(documents) % 20 == 0:\n            logger.info(f\"Processed {len(documents)} documents\")\n\n        try:\n            # get the id, text, source, source_id, url, created_at and author from the item\n            # use default values if not specified\n            id = item.get(\"id\", None)\n            text = item.get(\"text\", None)\n            source = item.get(\"source\", None)\n            source_id = item.get(\"source_id\", None)\n            url = item.get(\"url\", None)\n            created_at = item.get(\"created_at\", None)\n            author = item.get(\"author\", None)\n\n            if not text:\n                logger.info(\"No document text, skipping...\")\n                continue\n\n            # create a metadata object with the source, source_id, url, created_at and author\n            metadata = DocumentMetadata(\n                source=source,\n                source_id=source_id,\n                url=url,\n                created_at=created_at,\n                author=author,\n            )\n\n            # update metadata with custom values\n            for key, value in 
custom_metadata.items():\n                if hasattr(metadata, key):\n                    setattr(metadata, key, value)\n\n            # screen for pii if requested\n            if screen_for_pii:\n                pii_detected = screen_text_for_pii(text)\n                # if pii detected, print a warning and skip the document\n                if pii_detected:\n                    logger.info(\"PII detected in document, skipping\")\n                    skipped_items.append(item)  # add the skipped item to the list\n                    continue\n\n            # extract metadata if requested\n            if extract_metadata:\n                # extract metadata from the document text\n                extracted_metadata = extract_metadata_from_document(\n                    f\"Text: {text}; Metadata: {str(metadata)}\"\n                )\n                # get a Metadata object from the extracted metadata\n                metadata = DocumentMetadata(**extracted_metadata)\n\n            # create a document object with the id, text and metadata\n            document = Document(\n                id=id,\n                text=text,\n                metadata=metadata,\n            )\n            documents.append(document)\n        except Exception as e:\n            # log the error and continue with the next item\n            logger.error(f\"Error processing {item}: {e}\")\n            skipped_items.append(item)  # add the skipped item to the list\n\n    # do this in batches, the upsert method already batches documents but this allows\n    # us to add more descriptive logging\n    for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE):\n        # Get the text of the chunks in the current batch\n        batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE]\n        logger.info(f\"Upserting batch of {len(batch_documents)} documents, batch {i}\")\n        await datastore.upsert(batch_documents)\n\n    # print the skipped items\n    logger.info(f\"Skipped 
{len(skipped_items)} items due to errors or PII detection\")\n    for item in skipped_items:\n        logger.info(item)\n\n\nasync def main():\n    # parse the command-line arguments\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--filepath\", required=True, help=\"The path to the jsonl dump\")\n    parser.add_argument(\n        \"--custom_metadata\",\n        default=\"{}\",\n        help=\"A JSON string of key-value pairs to update the metadata of the documents\",\n    )\n    parser.add_argument(\n        \"--screen_for_pii\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try the PII detection function (using a language model)\",\n    )\n    parser.add_argument(\n        \"--extract_metadata\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try to extract metadata from the document (using a language model)\",\n    )\n    args = parser.parse_args()\n\n    # get the arguments\n    filepath = args.filepath\n    custom_metadata = json.loads(args.custom_metadata)\n    screen_for_pii = args.screen_for_pii\n    extract_metadata = args.extract_metadata\n\n    # initialize the db instance once as a global variable\n    datastore = await get_datastore()\n    # process the jsonl dump\n    await process_jsonl_dump(\n        filepath, datastore, custom_metadata, screen_for_pii, extract_metadata\n    )\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },
  {
    "path": "scripts/process_zip/README.md",
    "content": "## Process a ZIP File\n\nThis script is a utility to process a file dump of documents in a zip file and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case.\n\n## Usage\n\nTo run this script from the terminal, navigate to this folder and use the following command:\n\n```\npython process_zip.py --filepath path/to/file_dump.zip --custom_metadata '{\"source\": \"email\"}' --screen_for_pii True --extract_metadata True\n```\n\nwhere:\n\n- `path/to/file_dump.zip` is the name or path to the file dump to be processed. The format of this zip file should be a zip file containing of docx, pdf, txt, md and pptx files (any internal folder structure is acceptable).\n- `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{\"source\": \"file\"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`).\n- `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`.\n- `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. 
If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is `False`.\n\nThe script will extract the files from the zip file into a temporary directory named `dump`, process each file and store the document text and metadata in the database, and then delete the temporary directory and its contents. It will also print some progress messages and error messages if any.\n\nYou can use `python process_zip.py -h` to get a summary of the options and their descriptions.\n\nTest the script with the example file, [example.zip](example.zip).\n"
  },
  {
    "path": "scripts/process_zip/process_zip.py",
    "content": "import uuid\nimport zipfile\nimport os\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.models import Document, DocumentMetadata, Source\nfrom datastore.datastore import DataStore\nfrom datastore.factory import get_datastore\nfrom services.extract_metadata import extract_metadata_from_document\nfrom services.file import extract_text_from_filepath\nfrom services.pii_detection import screen_text_for_pii\n\nDOCUMENT_UPSERT_BATCH_SIZE = 50\n\n\nasync def process_file_dump(\n    filepath: str,\n    datastore: DataStore,\n    custom_metadata: dict,\n    screen_for_pii: bool,\n    extract_metadata: bool,\n):\n    # create a ZipFile object and extract all the files into a directory named 'dump'\n    with zipfile.ZipFile(filepath) as zip_file:\n        zip_file.extractall(\"dump\")\n\n    documents = []\n    skipped_files = []\n    # use os.walk to traverse the dump directory and its subdirectories\n    for root, dirs, files in os.walk(\"dump\"):\n        for filename in files:\n            if len(documents) % 20 == 0:\n                logger.info(f\"Processed {len(documents)} documents\")\n\n            filepath = os.path.join(root, filename)\n\n            try:\n                extracted_text = extract_text_from_filepath(filepath)\n                logger.info(f\"extracted_text from {filepath}\")\n\n                # create a metadata object with the source and source_id fields\n                metadata = DocumentMetadata(\n                    source=Source.file,\n                    source_id=filename,\n                )\n\n                # update metadata with custom values\n                for key, value in custom_metadata.items():\n                    if hasattr(metadata, key):\n                        setattr(metadata, key, value)\n\n                # screen for pii if requested\n                if screen_for_pii:\n                    pii_detected = screen_text_for_pii(extracted_text)\n                    # if 
pii detected, print a warning and skip the document\n                    if pii_detected:\n                        logger.info(\"PII detected in document, skipping\")\n                        skipped_files.append(\n                            filepath\n                        )  # add the skipped file to the list\n                        continue\n\n                # extract metadata if requested\n                if extract_metadata:\n                    # extract metadata from the document text\n                    extracted_metadata = extract_metadata_from_document(\n                        f\"Text: {extracted_text}; Metadata: {str(metadata)}\"\n                    )\n                    # get a Metadata object from the extracted metadata\n                    metadata = DocumentMetadata(**extracted_metadata)\n\n                # create a document object with a random id, text and metadata\n                document = Document(\n                    id=str(uuid.uuid4()),\n                    text=extracted_text,\n                    metadata=metadata,\n                )\n                documents.append(document)\n            except Exception as e:\n                # log the error and continue with the next file\n                logger.error(f\"Error processing {filepath}: {e}\")\n                skipped_files.append(filepath)  # add the skipped file to the list\n\n    # do this in batches, the upsert method already batches documents but this allows\n    # us to add more descriptive logging\n    for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE):\n        # Get the text of the chunks in the current batch\n        batch_documents = [doc for doc in documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE]]\n        logger.info(f\"Upserting batch of {len(batch_documents)} documents, batch {i}\")\n        logger.info(\"documents: \", documents)\n        await datastore.upsert(batch_documents)\n\n    # delete all files in the dump directory\n    for root, dirs, files in 
os.walk(\"dump\", topdown=False):\n        for filename in files:\n            filepath = os.path.join(root, filename)\n            os.remove(filepath)\n        for dirname in dirs:\n            dirpath = os.path.join(root, dirname)\n            os.rmdir(dirpath)\n\n    # delete the dump directory\n    os.rmdir(\"dump\")\n\n    # print the skipped files\n    logger.info(f\"Skipped {len(skipped_files)} files due to errors or PII detection\")\n    for file in skipped_files:\n        logger.info(file)\n\n\nasync def main():\n    # parse the command-line arguments\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--filepath\", required=True, help=\"The path to the file dump\")\n    parser.add_argument(\n        \"--custom_metadata\",\n        default=\"{}\",\n        help=\"A JSON string of key-value pairs to update the metadata of the documents\",\n    )\n    parser.add_argument(\n        \"--screen_for_pii\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try the PII detection function (using a language model)\",\n    )\n    parser.add_argument(\n        \"--extract_metadata\",\n        default=False,\n        type=bool,\n        help=\"A boolean flag to indicate whether to try to extract metadata from the document (using a language model)\",\n    )\n    args = parser.parse_args()\n\n    # get the arguments\n    filepath = args.filepath\n    custom_metadata = json.loads(args.custom_metadata)\n    screen_for_pii = args.screen_for_pii\n    extract_metadata = args.extract_metadata\n\n    # initialize the db instance once as a global variable\n    datastore = await get_datastore()\n    # process the file dump\n    await process_file_dump(\n        filepath, datastore, custom_metadata, screen_for_pii, extract_metadata\n    )\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },
  {
    "path": "server/main.py",
    "content": "import os\nfrom typing import Optional\nimport uvicorn\nfrom fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile\nfrom fastapi.security import HTTPBearer, HTTPAuthorizationCredentials\nfrom fastapi.staticfiles import StaticFiles\nfrom loguru import logger\n\nfrom models.api import (\n    DeleteRequest,\n    DeleteResponse,\n    QueryRequest,\n    QueryResponse,\n    UpsertRequest,\n    UpsertResponse,\n)\nfrom datastore.factory import get_datastore\nfrom services.file import get_document_from_file\n\nfrom models.models import DocumentMetadata, Source\n\nbearer_scheme = HTTPBearer()\nBEARER_TOKEN = os.environ.get(\"BEARER_TOKEN\")\nassert BEARER_TOKEN is not None\n\n\ndef validate_token(credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme)):\n    if credentials.scheme != \"Bearer\" or credentials.credentials != BEARER_TOKEN:\n        raise HTTPException(status_code=401, detail=\"Invalid or missing token\")\n    return credentials\n\n\napp = FastAPI(dependencies=[Depends(validate_token)])\napp.mount(\"/.well-known\", StaticFiles(directory=\".well-known\"), name=\"static\")\n\n# Create a sub-application, in order to access just the query endpoint in an OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally\nsub_app = FastAPI(\n    title=\"Retrieval Plugin API\",\n    description=\"A retrieval API for querying and filtering documents based on natural language queries and metadata\",\n    version=\"1.0.0\",\n    servers=[{\"url\": \"https://your-app-url.com\"}],\n    dependencies=[Depends(validate_token)],\n)\napp.mount(\"/sub\", sub_app)\n\n\n@app.post(\n    \"/upsert-file\",\n    response_model=UpsertResponse,\n)\nasync def upsert_file(\n    file: UploadFile = File(...),\n    metadata: Optional[str] = Form(None),\n):\n    try:\n        metadata_obj = (\n            DocumentMetadata.parse_raw(metadata)\n            if metadata\n            else 
DocumentMetadata(source=Source.file)\n        )\n    except:\n        metadata_obj = DocumentMetadata(source=Source.file)\n\n    document = await get_document_from_file(file, metadata_obj)\n\n    try:\n        ids = await datastore.upsert([document])\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=f\"str({e})\")\n\n\n@app.post(\n    \"/upsert\",\n    response_model=UpsertResponse,\n)\nasync def upsert(\n    request: UpsertRequest = Body(...),\n):\n    try:\n        ids = await datastore.upsert(request.documents)\n        return UpsertResponse(ids=ids)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.post(\n    \"/query\",\n    response_model=QueryResponse,\n)\nasync def query_main(\n    request: QueryRequest = Body(...),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@sub_app.post(\n    \"/query\",\n    response_model=QueryResponse,\n    # NOTE: We are describing the shape of the API endpoint input due to a current limitation in parsing arrays of objects from OpenAPI schemas. This will not be necessary in the future.\n    description=\"Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.\",\n)\nasync def query(\n    request: QueryRequest = Body(...),\n):\n    try:\n        results = await datastore.query(\n            request.queries,\n        )\n        return QueryResponse(results=results)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.delete(\n    \"/delete\",\n    response_model=DeleteResponse,\n)\nasync def delete(\n    request: DeleteRequest = Body(...),\n):\n    if not (request.ids or request.filter or request.delete_all):\n        raise HTTPException(\n            status_code=400,\n            detail=\"One of ids, filter, or delete_all is required\",\n        )\n    try:\n        success = await datastore.delete(\n            ids=request.ids,\n            filter=request.filter,\n            delete_all=request.delete_all,\n        )\n        return DeleteResponse(success=success)\n    except Exception as e:\n        logger.error(e)\n        raise HTTPException(status_code=500, detail=\"Internal Service Error\")\n\n\n@app.on_event(\"startup\")\nasync def startup():\n    global datastore\n    datastore = await get_datastore()\n\n\ndef start():\n    uvicorn.run(\"server.main:app\", host=\"0.0.0.0\", port=8000, reload=True)\n"
  },
  {
    "path": "services/chunks.py",
    "content": "from typing import Dict, List, Optional, Tuple\nimport uuid\nimport os\nfrom models.models import Document, DocumentChunk, DocumentChunkMetadata\n\nimport tiktoken\n\nfrom services.openai import get_embeddings\n\n# Global variables\ntokenizer = tiktoken.get_encoding(\n    \"cl100k_base\"\n)  # The encoding scheme to use for tokenization\n\n# Constants\nCHUNK_SIZE = 200  # The target size of each text chunk in tokens\nMIN_CHUNK_SIZE_CHARS = 350  # The minimum size of each text chunk in characters\nMIN_CHUNK_LENGTH_TO_EMBED = 5  # Discard chunks shorter than this\nEMBEDDINGS_BATCH_SIZE = int(\n    os.environ.get(\"OPENAI_EMBEDDING_BATCH_SIZE\", 128)\n)  # The number of embeddings to request at a time\nMAX_NUM_CHUNKS = 10000  # The maximum number of chunks to generate from a text\n\n\ndef get_text_chunks(text: str, chunk_token_size: Optional[int]) -> List[str]:\n    \"\"\"\n    Split a text into chunks of ~CHUNK_SIZE tokens, based on punctuation and newline boundaries.\n\n    Args:\n        text: The text to split into chunks.\n        chunk_token_size: The target size of each chunk in tokens, or None to use the default CHUNK_SIZE.\n\n    Returns:\n        A list of text chunks, each of which is a string of ~CHUNK_SIZE tokens.\n    \"\"\"\n    # Return an empty list if the text is empty or whitespace\n    if not text or text.isspace():\n        return []\n\n    # Tokenize the text\n    tokens = tokenizer.encode(text, disallowed_special=())\n\n    # Initialize an empty list of chunks\n    chunks = []\n\n    # Use the provided chunk token size or the default one\n    chunk_size = chunk_token_size or CHUNK_SIZE\n\n    # Initialize a counter for the number of chunks\n    num_chunks = 0\n\n    # Loop until all tokens are consumed\n    while tokens and num_chunks < MAX_NUM_CHUNKS:\n        # Take the first chunk_size tokens as a chunk\n        chunk = tokens[:chunk_size]\n\n        # Decode the chunk into text\n        chunk_text = 
tokenizer.decode(chunk)\n\n        # Skip the chunk if it is empty or whitespace\n        if not chunk_text or chunk_text.isspace():\n            # Remove the tokens corresponding to the chunk text from the remaining tokens\n            tokens = tokens[len(chunk) :]\n            # Continue to the next iteration of the loop\n            continue\n\n        # Find the last period or punctuation mark in the chunk\n        last_punctuation = max(\n            chunk_text.rfind(\".\"),\n            chunk_text.rfind(\"?\"),\n            chunk_text.rfind(\"!\"),\n            chunk_text.rfind(\"\\n\"),\n        )\n\n        # If there is a punctuation mark, and the last punctuation index is before MIN_CHUNK_SIZE_CHARS\n        if last_punctuation != -1 and last_punctuation > MIN_CHUNK_SIZE_CHARS:\n            # Truncate the chunk text at the punctuation mark\n            chunk_text = chunk_text[: last_punctuation + 1]\n\n        # Remove any newline characters and strip any leading or trailing whitespace\n        chunk_text_to_append = chunk_text.replace(\"\\n\", \" \").strip()\n\n        if len(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED:\n            # Append the chunk text to the list of chunks\n            chunks.append(chunk_text_to_append)\n\n        # Remove the tokens corresponding to the chunk text from the remaining tokens\n        tokens = tokens[len(tokenizer.encode(chunk_text, disallowed_special=())) :]\n\n        # Increment the number of chunks\n        num_chunks += 1\n\n    # Handle the remaining tokens\n    if tokens:\n        remaining_text = tokenizer.decode(tokens).replace(\"\\n\", \" \").strip()\n        if len(remaining_text) > MIN_CHUNK_LENGTH_TO_EMBED:\n            chunks.append(remaining_text)\n\n    return chunks\n\n\ndef create_document_chunks(\n    doc: Document, chunk_token_size: Optional[int]\n) -> Tuple[List[DocumentChunk], str]:\n    \"\"\"\n    Create a list of document chunks from a document object and return the document id.\n\n    
Args:\n        doc: The document object to create chunks from. It should have a text attribute and optionally an id and a metadata attribute.\n        chunk_token_size: The target size of each chunk in tokens, or None to use the default CHUNK_SIZE.\n\n    Returns:\n        A tuple of (doc_chunks, doc_id), where doc_chunks is a list of document chunks, each of which is a DocumentChunk object with an id, a document_id, a text, and a metadata attribute,\n        and doc_id is the id of the document object, generated if not provided. The id of each chunk is generated from the document id and a sequential number, and the metadata is copied from the document object.\n    \"\"\"\n    # Check if the document text is empty or whitespace\n    if not doc.text or doc.text.isspace():\n        return [], doc.id or str(uuid.uuid4())\n\n    # Generate a document id if not provided\n    doc_id = doc.id or str(uuid.uuid4())\n\n    # Split the document text into chunks\n    text_chunks = get_text_chunks(doc.text, chunk_token_size)\n\n    metadata = (\n        DocumentChunkMetadata(**doc.metadata.__dict__)\n        if doc.metadata is not None\n        else DocumentChunkMetadata()\n    )\n\n    metadata.document_id = doc_id\n\n    # Initialize an empty list of chunks for this document\n    doc_chunks = []\n\n    # Assign each chunk a sequential number and create a DocumentChunk object\n    for i, text_chunk in enumerate(text_chunks):\n        chunk_id = f\"{doc_id}_{i}\"\n        doc_chunk = DocumentChunk(\n            id=chunk_id,\n            text=text_chunk,\n            metadata=metadata,\n        )\n        # Append the chunk object to the list of chunks for this document\n        doc_chunks.append(doc_chunk)\n\n    # Return the list of chunks and the document id\n    return doc_chunks, doc_id\n\n\ndef get_document_chunks(\n    documents: List[Document], chunk_token_size: Optional[int]\n) -> Dict[str, List[DocumentChunk]]:\n    \"\"\"\n    Convert a list of documents into a 
dictionary from document id to list of document chunks.\n\n    Args:\n        documents: The list of documents to convert.\n        chunk_token_size: The target size of each chunk in tokens, or None to use the default CHUNK_SIZE.\n\n    Returns:\n        A dictionary mapping each document id to a list of document chunks, each of which is a DocumentChunk object\n        with text, metadata, and embedding attributes.\n    \"\"\"\n    # Initialize an empty dictionary of lists of chunks\n    chunks: Dict[str, List[DocumentChunk]] = {}\n\n    # Initialize an empty list of all chunks\n    all_chunks: List[DocumentChunk] = []\n\n    # Loop over each document and create chunks\n    for doc in documents:\n        doc_chunks, doc_id = create_document_chunks(doc, chunk_token_size)\n\n        # Append the chunks for this document to the list of all chunks\n        all_chunks.extend(doc_chunks)\n\n        # Add the list of chunks for this document to the dictionary with the document id as the key\n        chunks[doc_id] = doc_chunks\n\n    # Check if there are no chunks\n    if not all_chunks:\n        return {}\n\n    # Get all the embeddings for the document chunks in batches, using get_embeddings\n    embeddings: List[List[float]] = []\n    for i in range(0, len(all_chunks), EMBEDDINGS_BATCH_SIZE):\n        # Get the text of the chunks in the current batch\n        batch_texts = [\n            chunk.text for chunk in all_chunks[i : i + EMBEDDINGS_BATCH_SIZE]\n        ]\n\n        # Get the embeddings for the batch texts\n        batch_embeddings = get_embeddings(batch_texts)\n\n        # Append the batch embeddings to the embeddings list\n        embeddings.extend(batch_embeddings)\n\n    # Update the document chunk objects with the embeddings\n    for i, chunk in enumerate(all_chunks):\n        # Assign the embedding from the embeddings list to the chunk object\n        chunk.embedding = embeddings[i]\n\n    return chunks\n"
  },
  {
    "path": "services/date.py",
    "content": "import arrow\nfrom loguru import logger\n\n\ndef to_unix_timestamp(date_str: str) -> int:\n    \"\"\"\n    Convert a date string to a unix timestamp (seconds since epoch).\n\n    Args:\n        date_str: The date string to convert.\n\n    Returns:\n        The unix timestamp corresponding to the date string.\n\n    If the date string cannot be parsed as a valid date format, returns the current unix timestamp and prints a warning.\n    \"\"\"\n    # Try to parse the date string using arrow, which supports many common date formats\n    try:\n        date_obj = arrow.get(date_str)\n        return int(date_obj.timestamp())\n    except arrow.parser.ParserError:\n        # If the parsing fails, return the current unix timestamp and print a warning\n        logger.info(f\"Invalid date format: {date_str}\")\n        return int(arrow.now().timestamp())\n"
  },
  {
    "path": "services/extract_metadata.py",
    "content": "from models.models import Source\nfrom services.openai import get_chat_completion\nimport json\nfrom typing import Dict\nimport os\nfrom loguru import logger\n\n\ndef extract_metadata_from_document(text: str) -> Dict[str, str]:\n    sources = Source.__members__.keys()\n    sources_string = \", \".join(sources)\n    # This prompt is just an example, change it to fit your use case\n    messages = [\n        {\n            \"role\": \"system\",\n            \"content\": f\"\"\"\n            Given a document from a user, try to extract the following metadata:\n            - source: string, one of {sources_string}\n            - url: string or don't specify\n            - created_at: string or don't specify\n            - author: string or don't specify\n\n            Respond with a JSON containing the extracted metadata in key value pairs. If you don't find a metadata field, don't specify it.\n            \"\"\",\n        },\n        {\"role\": \"user\", \"content\": text},\n    ]\n\n    # NOTE: Azure Open AI requires deployment id\n    # Read environment variable - if not set - not used\n    completion = get_chat_completion(\n        messages,\n        \"gpt-4\",\n        # os.environ.get(\"OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID\")\n    )  # TODO: change to your preferred model name\n\n    logger.info(f\"completion: {completion}\")\n\n    try:\n        metadata = json.loads(completion)\n    except Exception as e:\n        logger.error(f\"Error parsing completion: {e}\")\n        metadata = {}\n\n    return metadata\n"
  },
  {
    "path": "services/file.py",
    "content": "import os\nfrom io import BufferedReader\nfrom typing import Optional\nfrom fastapi import UploadFile\nimport mimetypes\nfrom PyPDF2 import PdfReader\nimport docx2txt\nimport csv\nimport pptx\nfrom loguru import logger\n\nfrom models.models import Document, DocumentMetadata\n\n\nasync def get_document_from_file(\n    file: UploadFile, metadata: DocumentMetadata\n) -> Document:\n    extracted_text = await extract_text_from_form_file(file)\n\n    doc = Document(text=extracted_text, metadata=metadata)\n\n    return doc\n\n\ndef extract_text_from_filepath(filepath: str, mimetype: Optional[str] = None) -> str:\n    \"\"\"Return the text content of a file given its filepath.\"\"\"\n\n    if mimetype is None:\n        # Get the mimetype of the file based on its extension\n        mimetype, _ = mimetypes.guess_type(filepath)\n\n    if not mimetype:\n        if filepath.endswith(\".md\"):\n            mimetype = \"text/markdown\"\n        else:\n            raise Exception(\"Unsupported file type\")\n\n    try:\n        with open(filepath, \"rb\") as file:\n            extracted_text = extract_text_from_file(file, mimetype)\n    except Exception as e:\n        logger.error(e)\n        raise e\n\n    return extracted_text\n\n\ndef extract_text_from_file(file: BufferedReader, mimetype: str) -> str:\n    if mimetype == \"application/pdf\":\n        # Extract text from pdf using PyPDF2\n        reader = PdfReader(file)\n        extracted_text = \" \".join([page.extract_text() for page in reader.pages])\n    elif mimetype == \"text/plain\" or mimetype == \"text/markdown\":\n        # Read text from plain text file\n        extracted_text = file.read().decode(\"utf-8\")\n    elif (\n        mimetype\n        == \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\"\n    ):\n        # Extract text from docx using docx2txt\n        extracted_text = docx2txt.process(file)\n    elif mimetype == \"text/csv\":\n        # Extract text from csv using 
csv module\n        extracted_text = \"\"\n        decoded_buffer = (line.decode(\"utf-8\") for line in file)\n        reader = csv.reader(decoded_buffer)\n        for row in reader:\n            extracted_text += \" \".join(row) + \"\\n\"\n    elif (\n        mimetype\n        == \"application/vnd.openxmlformats-officedocument.presentationml.presentation\"\n    ):\n        # Extract text from pptx using python-pptx\n        extracted_text = \"\"\n        presentation = pptx.Presentation(file)\n        for slide in presentation.slides:\n            for shape in slide.shapes:\n                if shape.has_text_frame:\n                    for paragraph in shape.text_frame.paragraphs:\n                        for run in paragraph.runs:\n                            extracted_text += run.text + \" \"\n                    extracted_text += \"\\n\"\n    else:\n        # Unsupported file type\n        raise ValueError(\"Unsupported file type: {}\".format(mimetype))\n\n    return extracted_text\n\n\n# Extract text from a file based on its mimetype\nasync def extract_text_from_form_file(file: UploadFile):\n    \"\"\"Return the text content of a file.\"\"\"\n    # get the file body from the upload file object\n    mimetype = file.content_type\n    logger.info(f\"mimetype: {mimetype}\")\n    logger.info(f\"file.file: {file.file}\")\n    logger.info(\"file: \", file)\n\n    file_stream = await file.read()\n\n    temp_file_path = \"/tmp/temp_file\"\n\n    # write the file to a temporary location\n    with open(temp_file_path, \"wb\") as f:\n        f.write(file_stream)\n\n    try:\n        extracted_text = extract_text_from_filepath(temp_file_path, mimetype)\n    except Exception as e:\n        logger.error(e)\n        os.remove(temp_file_path)\n        raise e\n\n    # remove file from temp location\n    os.remove(temp_file_path)\n\n    return extracted_text\n"
  },
  {
    "path": "services/openai.py",
    "content": "from typing import List\nimport openai\nimport os\nfrom loguru import logger\n\nfrom tenacity import retry, wait_random_exponential, stop_after_attempt\n\nEMBEDDING_MODEL = os.environ.get(\"EMBEDDING_MODEL\", \"text-embedding-3-large\")\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\n@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\ndef get_embeddings(texts: List[str]) -> List[List[float]]:\n    \"\"\"\n    Embed texts using OpenAI's ada model.\n\n    Args:\n        texts: The list of texts to embed.\n\n    Returns:\n        A list of embeddings, each of which is a list of floats.\n\n    Raises:\n        Exception: If the OpenAI API call fails.\n    \"\"\"\n    # Call the OpenAI API to get the embeddings\n    # NOTE: Azure Open AI requires deployment id\n    deployment = os.environ.get(\"OPENAI_EMBEDDINGMODEL_DEPLOYMENTID\")\n\n    response = {}\n    if deployment is None:\n        response = openai.Embedding.create(input=texts, model=EMBEDDING_MODEL, dimensions=EMBEDDING_DIMENSION)\n    else:\n        response = openai.Embedding.create(input=texts, deployment_id=deployment)\n\n    # Extract the embedding data from the response\n    data = response[\"data\"]  # type: ignore\n\n    # Return the embeddings as a list of lists of floats\n    return [result[\"embedding\"] for result in data]\n\n\n@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))\ndef get_chat_completion(\n    messages,\n    model=\"gpt-3.5-turbo\",  # use \"gpt-4\" for better results\n    deployment_id=None,\n):\n    \"\"\"\n    Generate a chat completion using OpenAI's chat completion API.\n\n    Args:\n        messages: The list of messages in the chat history.\n        model: The name of the model to use for the completion. Default is gpt-3.5-turbo, which is a fast, cheap and versatile model. 
Use gpt-4 for higher quality but slower results.\n        deployment_id: The Azure OpenAI deployment id to call instead of model, or None to use model.\n\n    Returns:\n        A string containing the chat completion.\n\n    Raises:\n        Exception: If the OpenAI API call fails.\n    \"\"\"\n    # call the OpenAI chat completion API with the given messages\n    # Note: Azure Open AI requires deployment id\n    response = {}\n    if deployment_id == None:\n        response = openai.ChatCompletion.create(\n            model=model,\n            messages=messages,\n        )\n    else:\n        response = openai.ChatCompletion.create(\n            deployment_id=deployment_id,\n            messages=messages,\n        )\n\n    choices = response[\"choices\"]  # type: ignore\n    completion = choices[0].message.content.strip()\n    logger.info(f\"Completion: {completion}\")\n    return completion\n"
  },
  {
    "path": "services/pii_detection.py",
    "content": "import os\nfrom services.openai import get_chat_completion\n\n\ndef screen_text_for_pii(text: str) -> bool:\n    # This prompt is just an example, change it to fit your use case\n    messages = [\n        {\n            \"role\": \"system\",\n            \"content\": f\"\"\"\n            You can only respond with the word \"True\" or \"False\", where your answer indicates whether the text in the user's message contains PII.\n            Do not explain your answer, and do not use punctuation.\n            Your task is to identify whether the text extracted from your company files\n            contains sensitive PII information that should not be shared with the broader company. Here are some things to look out for:\n            - An email address that identifies a specific person in either the local-part or the domain\n            - The postal address of a private residence (must include at least a street name)\n            - The postal address of a public place (must include either a street name or business name)\n            - Notes about hiring decisions with mentioned names of candidates. The user will send a document for you to analyze.\n            \"\"\",\n        },\n        {\"role\": \"user\", \"content\": text},\n    ]\n\n    completion = get_chat_completion(\n        messages, deployment_id=os.environ.get(\"OPENAI_COMPLETIONMODEL_DEPLOYMENTID\")\n    )\n\n    if completion.startswith(\"True\"):\n        return True\n\n    return False\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/datastore/providers/analyticdb/test_analyticdb_datastore.py",
    "content": "import pytest\nfrom models.models import (\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    DocumentChunk,\n    QueryWithEmbedding,\n    Source,\n)\nfrom datastore.providers.analyticdb_datastore import (\n    OUTPUT_DIM,\n    AnalyticDBDataStore,\n)\n\n\n@pytest.fixture\ndef analyticdb_datastore():\n    return AnalyticDBDataStore()\n\n\n@pytest.fixture\ndef document_chunk_one():\n    doc_id = \"zerp\"\n    doc_chunks = []\n\n    ids = [\"abc_123\", \"def_456\", \"ghi_789\"]\n    texts = [\n        \"lorem ipsum dolor sit amet\",\n        \"consectetur adipiscing elit\",\n        \"sed do eiusmod tempor incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = [\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"2021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    embeddings = [[x] * OUTPUT_DIM for x in range(3)]\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id,\n                source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks.append(chunk)\n\n    return {doc_id: doc_chunks}\n\n\n@pytest.fixture\ndef document_chunk_two():\n    doc_id_1 = \"zerp\"\n    doc_chunks_1 = []\n\n    ids = [\"abc_123\", \"def_456\", \"ghi_789\"]\n    texts = [\n        \"1lorem ipsum dolor sit amet\",\n        \"2consectetur adipiscing elit\",\n        \"3sed do eiusmod tempor incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = 
[\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"3021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    embeddings = [[x] * OUTPUT_DIM for x in range(3)]\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id_1,\n                source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks_1.append(chunk)\n\n    doc_id_2 = \"merp\"\n    doc_chunks_2 = []\n\n    ids = [\"jkl_123\", \"lmn_456\", \"opq_789\"]\n    texts = [\n        \"3sdsc efac feas sit qweas\",\n        \"4wert sdfas fdsc\",\n        \"52dsc fdsf eiusmod asdasd incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = [\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        \"4929-10-28T09:30:00-05:00\",\n        \"5009-01-03T16:39:57-08:00\",\n        \"6021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    embeddings = [[x] * OUTPUT_DIM for x in range(3, 6)]\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id_2,\n                source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        
doc_chunks_2.append(chunk)\n\n    return {doc_id_1: doc_chunks_1, doc_id_2: doc_chunks_2}\n\n\n@pytest.mark.asyncio\nasync def test_upsert(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=10,\n        embedding=[0.5] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n    assert 3 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_reload(analyticdb_datastore, document_chunk_one, document_chunk_two):\n    await analyticdb_datastore.delete(delete_all=True)\n\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=10,\n        embedding=[0.5] * OUTPUT_DIM,\n    )\n\n    query_results = await analyticdb_datastore._query(queries=[query])\n    assert 3 == len(query_results[0].results)\n    new_store = AnalyticDBDataStore()\n    another_in = {i: document_chunk_two[i] for i in document_chunk_two if i != res[0]}\n    res = await new_store._upsert(another_in)\n\n    query_results = await analyticdb_datastore._query(queries=[query])\n    assert 1 == len(query_results)\n    assert 6 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_upsert_query_all(analyticdb_datastore, document_chunk_two):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_two)\n    assert res == list(document_chunk_two.keys())\n    # Num entities currently doesn't track deletes\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=10,\n        embedding=[0.5] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 
1 == len(query_results)\n    assert 6 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_query_accuracy(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=1,\n        embedding=[0] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert 0 == query_results[0].results[0].score\n    assert \"abc_123\" == query_results[0].results[0].id\n\n\n@pytest.mark.asyncio\nasync def test_query_filter(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=1,\n        embedding=[0] * OUTPUT_DIM,\n        filter=DocumentMetadataFilter(\n            start_date=\"2000-01-03T16:39:57-08:00\", end_date=\"2010-01-03T16:39:57-08:00\"\n        ),\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert 0 != query_results[0].results[0].score\n    assert \"def_456\" == query_results[0].results[0].id\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_date_filter(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    await analyticdb_datastore.delete(\n        filter=DocumentMetadataFilter(\n            end_date=\"2009-01-03T16:39:57-08:00\",\n        )\n    )\n\n    query = 
QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=[0] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert \"ghi_789\" == query_results[0].results[0].id\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_source_filter(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    await analyticdb_datastore.delete(\n        filter=DocumentMetadataFilter(\n            source=Source.email,\n        )\n    )\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=[0] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 2 == len(query_results[0].results)\n    assert \"def_456\" == query_results[0].results[0].id\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_document_id_filter(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    await analyticdb_datastore.delete(\n        filter=DocumentMetadataFilter(\n            document_id=res[0],\n        )\n    )\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=[0] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 0 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_document_id(analyticdb_datastore, document_chunk_one):\n    await analyticdb_datastore.delete(delete_all=True)\n    res = await 
analyticdb_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    await analyticdb_datastore.delete([res[0]])\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=[0] * OUTPUT_DIM,\n    )\n    query_results = await analyticdb_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 0 == len(query_results[0].results)\n\n\n# if __name__ == '__main__':\n#     import sys\n#     import pytest\n#     pytest.main(sys.argv)\n"
  },
  {
    "path": "tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py",
    "content": "import pytest\nfrom typing import Dict, List\nfrom dotenv import dotenv_values\n\nfrom datastore.datastore import DataStore\nfrom datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    QueryWithEmbedding,\n)\nimport os\n\nnum_lists = 1\nsimilarity = \"COS\"\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\ndef create_embedding(non_zero_pos: int) -> List[float]:\n    # create a vector with a single non-zero value of dimension EMBEDDING_DIMENSION\n    vector = [0.0] * EMBEDDING_DIMENSION\n    vector[non_zero_pos - 1] = 1.0\n    return vector\n\n\n@pytest.fixture\ndef azure_cosmos_db_settings_from_dot_env() -> dict:\n    \"\"\"\n    Reads the Azure CosmosDB environment variables for the .env file.\n\n    Returns:\n        dict: The Azure CosmosDB environment variables\n    \"\"\"\n    config = dotenv_values(\".env\")\n    env_variables = {\n        \"DATASTORE\": \"azurecosmosdb\",\n        \"AZCOSMOS_API\": config.get(\n            (\"AZCOSMOS_API\")\n        ),  # Right now CosmosDB only supports vector search in Mongo vCore.\n        \"AZCOSMOS_CONNSTR\": config.get(\"AZCOSMOS_CONNSTR\"),\n        \"AZCOSMOS_DATABASE_NAME\": config.get(\"AZCOSMOS_DATABASE_NAME\"),\n        \"AZCOSMOS_CONTAINER_NAME\": config.get(\"AZCOSMOS_CONTAINER_NAME\"),\n    }\n\n    return env_variables\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(i),\n        )\n        for i in range(4, 7)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef queries() -> List[QueryWithEmbedding]:\n    queries = [\n        QueryWithEmbedding(\n            query=\"Query 
1\",\n            top_k=1,\n            embedding=create_embedding(4),\n        ),\n        QueryWithEmbedding(\n            query=\"Query 2\",\n            top_k=2,\n            embedding=create_embedding(5),\n        ),\n    ]\n    return queries\n\n\n@pytest.fixture\nasync def azurecosmosdb_datastore() -> DataStore:\n    return await AzureCosmosDBDataStore.create(\n        num_lists=num_lists, similarity=similarity\n    )\n\n\n@pytest.mark.asyncio\nasync def test_upsert(\n    azurecosmosdb_datastore: AzureCosmosDBDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    \"\"\"Test basic upsert.\"\"\"\n    doc_ids = await azurecosmosdb_datastore._upsert(initial_document_chunks)\n    assert doc_ids == [\n        f\"doc:{doc_id}:chunk:{chunk.id}\"\n        for doc_id, chunk_list in initial_document_chunks.items()\n        for chunk in chunk_list\n    ]\n\n\n@pytest.mark.asyncio\nasync def test_query(\n    azurecosmosdb_datastore: AzureCosmosDBDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n    queries: List[QueryWithEmbedding],\n) -> None:\n    \"\"\"Test basic query.\"\"\"\n    await azurecosmosdb_datastore.delete(delete_all=True)\n    # insert to prepare for the test\n    await azurecosmosdb_datastore._upsert(initial_document_chunks)\n\n    query_results = await azurecosmosdb_datastore._query(queries)\n    assert len(query_results) == len(queries)\n\n    query_0_results = query_results[0].results\n    query_1_results = query_results[1].results\n\n    assert len(query_0_results) == 1\n    assert len(query_1_results) == 2\n\n    # NOTE: this is the correct behavior\n    assert query_0_results[0].id == \"doc:first-doc:chunk:first-doc-4\"\n    assert query_1_results[0].id == \"doc:first-doc:chunk:first-doc-5\"\n    assert query_1_results[1].id == \"doc:first-doc:chunk:first-doc-4\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:\n    await 
azurecosmosdb_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"deleteChunk1\",\n        text=\"delete text 1\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    chunk2 = DocumentChunk(\n        id=\"deleteChunk2\",\n        text=\"delete text 2\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    # insert to prepare for test\n    await azurecosmosdb_datastore._upsert(\n        {\"deleteDoc1\": [chunk1], \"deleteDoc2\": [chunk2]}\n    )\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query for delete\",\n        embedding=query_embedding,\n    )\n    results = await azurecosmosdb_datastore._query([query])\n\n    assert len(results[0].results) == 2\n    assert results[0].results[0].id == \"doc:deleteDoc1:chunk:deleteChunk1\"\n    assert results[0].results[1].id == \"doc:deleteDoc2:chunk:deleteChunk2\"\n\n    await azurecosmosdb_datastore.delete(ids=[\"doc:deleteDoc1:chunk:deleteChunk1\"])\n    results_after_delete = await azurecosmosdb_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 1\n    assert results_after_delete[0].results[0].id == \"doc:deleteDoc2:chunk:deleteChunk2\"\n\n\n@pytest.mark.asynio\nasync def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None:\n    await azurecosmosdb_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"deleteChunk\",\n        text=\"delete text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    await azurecosmosdb_datastore._upsert({\"deleteDoc\": [chunk]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"delete query\",\n        embedding=query_embedding,\n        top_k=1,\n    )\n    results = await azurecosmosdb_datastore._query([query])\n\n    assert len(results) == 1\n 
   assert len(results[0].results) == 1\n    assert results[0].results[0].id == \"doc:deleteDoc:chunk:deleteChunk\"\n\n    await azurecosmosdb_datastore.delete(delete_all=True)\n    results_after_delete = await azurecosmosdb_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 0\n"
  },
  {
    "path": "tests/datastore/providers/azuresearch/test_azuresearch_datastore.py",
    "content": "import pytest\nimport os\nimport time\nfrom typing import Union\nfrom azure.search.documents.indexes import SearchIndexClient\nfrom models.models import (\n    DocumentMetadataFilter,\n    Query,\n    Source,\n    Document,\n    DocumentMetadata,\n)\n\nAZURESEARCH_TEST_INDEX = \"testindex\"\nos.environ[\"AZURESEARCH_INDEX\"] = AZURESEARCH_TEST_INDEX\nif os.environ.get(\"AZURESEARCH_SERVICE\") == None:\n    os.environ[\n        \"AZURESEARCH_SERVICE\"\n    ] = \"invalid service name\"  # Will fail anyway if not set to a real service, but allows tests to be discovered\n\nimport datastore.providers.azuresearch_datastore\nfrom datastore.providers.azuresearch_datastore import AzureSearchDataStore\n\n\n@pytest.fixture(scope=\"module\")\ndef azuresearch_mgmt_client():\n    service = os.environ[\"AZURESEARCH_SERVICE\"]\n    return SearchIndexClient(\n        endpoint=f\"https://{service}.search.windows.net\",\n        credential=AzureSearchDataStore._create_credentials(False),\n    )\n\n\ndef test_translate_filter():\n    assert AzureSearchDataStore._translate_filter(DocumentMetadataFilter()) == None\n\n    for field in [\"document_id\", \"source\", \"source_id\", \"author\"]:\n        value = Source.file if field == \"source\" else f\"test_{field}\"\n        needs_escaping_value = None if field == \"source\" else f\"test'_{field}\"\n        assert (\n            AzureSearchDataStore._translate_filter(\n                DocumentMetadataFilter(**{field: value})\n            )\n            == f\"{field} eq '{value}'\"\n        )\n        if needs_escaping_value != None:\n            assert (\n                AzureSearchDataStore._translate_filter(\n                    DocumentMetadataFilter(**{field: needs_escaping_value})\n                )\n                == f\"{field} eq 'test''_{field}'\"\n            )\n\n    assert (\n        AzureSearchDataStore._translate_filter(\n            DocumentMetadataFilter(\n                document_id=\"test_document_id\",\n 
               source=Source.file,\n                source_id=\"test_source_id\",\n                author=\"test_author\",\n            )\n        )\n        == \"document_id eq 'test_document_id' and source eq 'file' and source_id eq 'test_source_id' and author eq 'test_author'\"\n    )\n\n    with pytest.raises(ValueError):\n        assert AzureSearchDataStore._translate_filter(\n            DocumentMetadataFilter(start_date=\"2023-01-01\")\n        )\n    with pytest.raises(ValueError):\n        assert AzureSearchDataStore._translate_filter(\n            DocumentMetadataFilter(end_date=\"2023-01-01\")\n        )\n\n    assert (\n        AzureSearchDataStore._translate_filter(\n            DocumentMetadataFilter(\n                start_date=\"2023-01-01T00:00:00Z\",\n                end_date=\"2023-01-02T00:00:00Z\",\n                document_id=\"test_document_id\",\n            )\n        )\n        == \"document_id eq 'test_document_id' and created_at ge 2023-01-01T00:00:00Z and created_at le 2023-01-02T00:00:00Z\"\n    )\n\n\n@pytest.mark.asyncio\nasync def test_lifecycle_hybrid(azuresearch_mgmt_client: SearchIndexClient):\n    datastore.providers.azuresearch_datastore.AZURESEARCH_DISABLE_HYBRID = None\n    datastore.providers.azuresearch_datastore.AZURESEARCH_SEMANTIC_CONFIG = None\n    await lifecycle(azuresearch_mgmt_client)\n\n\n@pytest.mark.asyncio\nasync def test_lifecycle_vectors_only(azuresearch_mgmt_client: SearchIndexClient):\n    datastore.providers.azuresearch_datastore.AZURESEARCH_DISABLE_HYBRID = \"1\"\n    datastore.providers.azuresearch_datastore.AZURESEARCH_SEMANTIC_CONFIG = None\n    await lifecycle(azuresearch_mgmt_client)\n\n\n@pytest.mark.asyncio\nasync def test_lifecycle_semantic(azuresearch_mgmt_client: SearchIndexClient):\n    datastore.providers.azuresearch_datastore.AZURESEARCH_DISABLE_HYBRID = None\n    datastore.providers.azuresearch_datastore.AZURESEARCH_SEMANTIC_CONFIG = (\n        \"testsemconfig\"\n    )\n    await 
lifecycle(azuresearch_mgmt_client)\n\n\nasync def lifecycle(azuresearch_mgmt_client: SearchIndexClient):\n    if AZURESEARCH_TEST_INDEX in azuresearch_mgmt_client.list_index_names():\n        azuresearch_mgmt_client.delete_index(AZURESEARCH_TEST_INDEX)\n    assert AZURESEARCH_TEST_INDEX not in azuresearch_mgmt_client.list_index_names()\n    try:\n        store = AzureSearchDataStore()\n        index = azuresearch_mgmt_client.get_index(AZURESEARCH_TEST_INDEX)\n        assert index is not None\n\n        result = await store.upsert(\n            [\n                Document(\n                    id=\"test_id_1\",\n                    text=\"test text\",\n                    metadata=DocumentMetadata(\n                        source=Source.file,\n                        source_id=\"test_source_id\",\n                        author=\"test_author\",\n                        created_at=\"2023-01-01T00:00:00Z\",\n                        url=\"http://some-test-url/path\",\n                    ),\n                ),\n                Document(\n                    id=\"test_id_2+\",\n                    text=\"different\",\n                    metadata=DocumentMetadata(\n                        source=Source.file,\n                        source_id=\"test_source_id\",\n                        author=\"test_author\",\n                        created_at=\"2023-01-01T00:00:00Z\",\n                        url=\"http://some-test-url/path\",\n                    ),\n                ),\n            ]\n        )\n        assert (\n            len(result) == 2 and result[0] == \"test_id_1\" and result[1] == \"test_id_2+\"\n        )\n\n        # query in a loop in case we need to retry since documents aren't searchable synchronosuly after updates\n        for _ in range(4):\n            time.sleep(0.25)\n            result = await store.query([Query(query=\"text\")])\n            if len(result) > 0 and len(result[0].results) > 0:\n                break\n        assert len(result) == 1 
and len(result[0].results) == 2\n        assert (\n            result[0].results[0].metadata.document_id == \"test_id_1\"\n            and result[0].results[1].metadata.document_id == \"test_id_2+\"\n        )\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    filter=DocumentMetadataFilter(source_id=\"test_source_id\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 2\n        assert (\n            result[0].results[0].metadata.document_id == \"test_id_1\"\n            and result[0].results[1].metadata.document_id == \"test_id_2+\"\n        )\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    filter=DocumentMetadataFilter(source_id=\"nonexisting_id\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 0\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    filter=DocumentMetadataFilter(start_date=\"2023-01-02T00:00:00Z\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 0\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    filter=DocumentMetadataFilter(start_date=\"2023-01-01T00:00:00Z\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 2\n        assert (\n            result[0].results[0].metadata.document_id == \"test_id_1\"\n            and result[0].results[1].metadata.document_id == \"test_id_2+\"\n        )\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    
filter=DocumentMetadataFilter(end_date=\"2022-12-31T00:00:00Z\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 0\n\n        result = await store.query(\n            [\n                Query(\n                    query=\"text\",\n                    filter=DocumentMetadataFilter(end_date=\"2023-01-02T00:00:00Z\"),\n                )\n            ]\n        )\n        assert len(result) == 1 and len(result[0].results) == 2\n        assert (\n            result[0].results[0].metadata.document_id == \"test_id_1\"\n            and result[0].results[1].metadata.document_id == \"test_id_2+\"\n        )\n\n        # query in a loop in case we need to retry since documents aren't searchable synchronously after updates\n        assert await store.delete([\"test_id_1\", \"test_id_2+\"])\n        for _ in range(4):\n            time.sleep(0.25)\n            result = await store.query([Query(query=\"text\")])\n            if len(result) > 0 and len(result[0].results) == 0:\n                break\n        assert len(result) == 1 and len(result[0].results) == 0\n    finally:\n        azuresearch_mgmt_client.delete_index(AZURESEARCH_TEST_INDEX)\n"
  },
  {
    "path": "tests/datastore/providers/chroma/test_chroma_datastore.py",
    "content": "import shutil\nfrom typing import Dict, List\nimport pytest\nimport random\n\nfrom datastore.providers.chroma_datastore import ChromaDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    QueryWithEmbedding,\n    Source,\n)\n\nTEST_PERSISTENCE_DIR = \"chroma_test_datastore\"\nCOLLECTION_NAME = \"documents\"\n\n\ndef ephemeral_chroma_datastore() -> ChromaDataStore:\n    # Initialize an ephemeral in-memory ChromaDB instance\n    return ChromaDataStore(\n        collection_name=COLLECTION_NAME, in_memory=True, persistence_dir=None\n    )\n\n\ndef persisted_chroma_datastore() -> ChromaDataStore:\n    # Initialize an in-memory ChromaDB instance with persistence\n    return ChromaDataStore(\n        collection_name=COLLECTION_NAME,\n        in_memory=True,\n        persistence_dir=TEST_PERSISTENCE_DIR,\n    )\n\n\ndef get_chroma_datastore() -> ChromaDataStore:\n    yield ephemeral_chroma_datastore()\n    yield persisted_chroma_datastore()\n    # Delete the persistence directory after the test\n\n\n@pytest.fixture(autouse=True)\ndef cleanup():\n    yield\n    shutil.rmtree(TEST_PERSISTENCE_DIR, ignore_errors=True)\n\n\n# Seed for deterministic testing\nrandom.seed(0)\n\n\ndef create_embedding(dim: int) -> List[float]:\n    return [random.random() for _ in range(dim)]\n\n\n# Data fixtures\nTEST_EMBEDDING_DIM = 5\nN_TEST_CHUNKS = 5\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(TEST_EMBEDDING_DIM),\n        )\n        for i in range(N_TEST_CHUNKS)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef document_chunks(initial_document_chunks) -> Dict[str, List[DocumentChunk]]:\n    doc_chunks = 
initial_document_chunks\n\n    for k, v in doc_chunks.items():\n        for chunk in v:\n            chunk.metadata = DocumentChunkMetadata(\n                source=Source.email, created_at=\"2023-04-03\", document_id=\"first-doc\"\n            )\n            chunk.embedding = create_embedding(TEST_EMBEDDING_DIM)\n\n    doc_chunks[\"second-doc\"] = [\n        DocumentChunk(\n            id=f\"second-doc-{i}\",\n            text=f\"Dolor sit amet {i}\",\n            metadata=DocumentChunkMetadata(\n                created_at=\"2023-04-04\", document_id=\"second-doc\"\n            ),\n            embedding=create_embedding(TEST_EMBEDDING_DIM),\n        )\n        for i in range(N_TEST_CHUNKS)\n    ]\n\n    return doc_chunks\n\n\n@pytest.mark.asyncio\nasync def test_add_chunks(document_chunks: Dict[str, List[DocumentChunk]]):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n        assert datastore._collection.count() == 0\n\n        print(document_chunks)\n\n        assert await datastore._upsert(document_chunks) == list(document_chunks.keys())\n        assert datastore._collection.count() == sum(\n            len(v) for v in document_chunks.values()\n        )\n\n\n@pytest.mark.asyncio\nasync def test_upsert(\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n    document_chunks: Dict[str, List[DocumentChunk]],\n):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        assert await datastore._upsert(initial_document_chunks) == list(\n            initial_document_chunks.keys()\n        )\n        assert datastore._collection.count() == sum(\n            len(v) for v in initial_document_chunks.values()\n        )\n\n        assert await datastore._upsert(document_chunks) == list(document_chunks.keys())\n        assert datastore._collection.count() == sum(\n            len(v) for v in document_chunks.values()\n        )\n\n\n@pytest.mark.asyncio\nasync def 
test_add_and_query_all(document_chunks):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        await datastore._upsert(document_chunks) == list(document_chunks.keys())\n\n        query = QueryWithEmbedding(\n            query=\"\",\n            embedding=create_embedding(TEST_EMBEDDING_DIM),\n            top_k=10,\n        )\n        query_results = await datastore._query(queries=[query])\n        assert 1 == len(query_results)\n        assert 10 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_query_accuracy(document_chunks):\n    for _, v in document_chunks.items():\n        for chunk in v:\n            print(f\"id: {chunk.id} emb: {chunk.embedding}\")\n\n    def add_noise_to_embedding(embedding: List[float], eps: float = 0) -> List[float]:\n        return [x + eps * (1.0 - 2 * random.random()) for x in embedding]\n\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        print(datastore._collection.get(include=[\"embeddings\"]))\n\n        res = await datastore._upsert(document_chunks)\n\n        res = datastore._collection.get(include=[\"embeddings\"])\n        for id, emb in zip(res[\"ids\"], res[\"embeddings\"]):\n            print(f\"id: {id} emb: {emb}\")\n\n        for _, v in document_chunks.items():\n            for chunk in v:\n                print(f\"chunk: {chunk}\")\n                query = QueryWithEmbedding(\n                    query=\"\",\n                    embedding=add_noise_to_embedding(chunk.embedding),\n                    top_k=1,\n                )\n                query_results = await datastore._query(queries=[query])\n                print(query_results)\n                assert query_results[0].results[0].id == chunk.id\n\n\n@pytest.mark.asyncio\nasync def test_query_filter_by_id(document_chunks):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        await 
datastore._upsert(document_chunks)\n\n        for doc_id, chunks in document_chunks.items():\n            query = QueryWithEmbedding(\n                query=\"\",\n                embedding=chunks[0].embedding,\n                top_k=N_TEST_CHUNKS,\n                filter=DocumentMetadataFilter(document_id=doc_id),\n            )\n            query_results = await datastore._query(queries=[query])\n            # Assert that all document chunks are returned\n            assert len(query_results[0].results) == len(chunks)\n            assert all(\n                [\n                    result.id in [chunk.id for chunk in chunks]\n                    for result in query_results[0].results\n                ]\n            )\n\n\n@pytest.mark.asyncio\nasync def test_query_filter_by_date(document_chunks):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        await datastore._upsert(document_chunks)\n\n        # Filter by dates for only the first document\n        query = QueryWithEmbedding(\n            query=\"\",\n            embedding=document_chunks[\"first-doc\"][0].embedding,\n            top_k=N_TEST_CHUNKS,\n            filter=DocumentMetadataFilter(\n                start_date=\"2023-04-03\", end_date=\"2023-04-03\"\n            ),\n        )\n\n        query_results = await datastore._query(queries=[query])\n\n        # Assert that only the first document is returned\n        assert len(query_results[0].results) == len(document_chunks[\"first-doc\"])\n        assert all(\n            [\n                result.id in [chunk.id for chunk in document_chunks[\"first-doc\"]]\n                for result in query_results[0].results\n            ]\n        )\n\n        # Filter for the entire date span\n        query = QueryWithEmbedding(\n            query=\"\",\n            embedding=document_chunks[\"first-doc\"][0].embedding,\n            top_k=N_TEST_CHUNKS * len(document_chunks),\n            
filter=DocumentMetadataFilter(\n                start_date=\"2023-04-03\", end_date=\"2023-04-04\"\n            ),\n        )\n\n        query_results = await datastore._query(queries=[query])\n\n        # Assert that both documents are returned\n        assert len(query_results[0].results) == len(document_chunks[\"first-doc\"]) + len(\n            document_chunks[\"second-doc\"]\n        )\n        assert all(\n            [\n                result.id\n                in [chunk.id for chunk in document_chunks[\"first-doc\"]]\n                + [chunk.id for chunk in document_chunks[\"second-doc\"]]\n                for result in query_results[0].results\n            ]\n        )\n\n\n@pytest.mark.asyncio\nasync def test_delete_by_id(document_chunks):\n    for datastore in get_chroma_datastore():\n        await datastore.delete(delete_all=True)\n\n        await datastore._upsert(document_chunks)\n\n        # Delete the first document\n        await datastore.delete(ids=[\"first-doc\"])\n\n        # Assert that the first document is deleted\n        query = QueryWithEmbedding(\n            query=\"\",\n            embedding=document_chunks[\"first-doc\"][0].embedding,\n            top_k=N_TEST_CHUNKS,\n        )\n        query_results = await datastore._query(queries=[query])\n\n        # Assert that only the second document is still there\n        query_results = await datastore._query(queries=[query])\n        assert len(query_results[0].results) == len(document_chunks[\"second-doc\"])\n\n        assert all(\n            [\n                result.id in [chunk.id for chunk in document_chunks[\"second-doc\"]]\n                for result in query_results[0].results\n            ]\n        )\n"
  },
  {
    "path": "tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py",
    "content": "import pytest\nfrom models.models import (\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    DocumentChunk,\n    QueryWithEmbedding,\n    Source,\n)\nfrom datastore.providers.elasticsearch_datastore import (\n    ElasticsearchDataStore,\n)\nimport time\nimport os\n\nDIM_SIZE = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\n@pytest.fixture\ndef elasticsearch_datastore():\n    return ElasticsearchDataStore()\n\n\ndef sample_embedding(one_element_poz: int):\n    embedding = [0] * DIM_SIZE\n    embedding[one_element_poz % DIM_SIZE] = 1\n    return embedding\n\n\ndef sample_embeddings(num: int, one_element_start: int = 0):\n    embeddings = []\n    for x in range(num):\n        embedding = [0] * DIM_SIZE\n        embedding[(x + one_element_start) % DIM_SIZE] = 1\n        embeddings.append(embedding)\n    return embeddings\n\n\n@pytest.fixture\ndef document_chunk_one():\n    doc_id = \"abc\"\n    doc_chunks = []\n\n    ids = [\"123\", \"456\", \"789\"]\n    texts = [\n        \"Aenean euismod bibendum laoreet\",\n        \"Vivamus non enim vitae tortor\",\n        \"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"2021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Fred Smith\", \"Bob Doe\", \"Appleton Doe\"]\n\n    embeddings = sample_embeddings(len(texts))\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id,\n                source=sources[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks.append(chunk)\n\n    return {doc_id: doc_chunks}\n\n\nasync 
def test_upsert(elasticsearch_datastore, document_chunk_one):\n    await elasticsearch_datastore.delete(delete_all=True)\n    res = await elasticsearch_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    time.sleep(1)\n\n    results = elasticsearch_datastore.client.search(\n        index=elasticsearch_datastore.index_name, query={\"match_all\": {}}\n    )\n    assert results[\"hits\"][\"total\"][\"value\"] == 3\n    elasticsearch_datastore.client.indices.delete(\n        index=elasticsearch_datastore.index_name\n    )\n\n\nasync def test_upsert_query_all(elasticsearch_datastore, document_chunk_one):\n    await elasticsearch_datastore.delete(delete_all=True)\n    res = await elasticsearch_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    time.sleep(1)\n\n    query = QueryWithEmbedding(\n        query=\"Aenean\",\n        top_k=10,\n        embedding=sample_embedding(0),  # type: ignore\n    )\n    query_results = await elasticsearch_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 3 == len(query_results[0].results)\n\n\nasync def test_delete_with_document_id(elasticsearch_datastore, document_chunk_one):\n    await elasticsearch_datastore.delete(delete_all=True)\n    res = await elasticsearch_datastore._upsert(document_chunk_one)\n    time.sleep(1)\n    assert res == list(document_chunk_one.keys())\n    await elasticsearch_datastore.delete([res[0]])\n    time.sleep(1)\n\n    query = QueryWithEmbedding(\n        query=\"Aenean\",\n        top_k=9,\n        embedding=sample_embedding(0),  # type: ignore\n    )\n    query_results = await elasticsearch_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 0 == len(query_results[0].results)\n\n    elasticsearch_datastore.client.indices.delete(\n        index=elasticsearch_datastore.index_name\n    )\n\n\nasync def test_delete_with_source_filter(elasticsearch_datastore, 
document_chunk_one):\n    await elasticsearch_datastore.delete(delete_all=True)\n    res = await elasticsearch_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    time.sleep(1)\n\n    await elasticsearch_datastore.delete(\n        filter=DocumentMetadataFilter(\n            source=Source.email,\n        )\n    )\n\n    time.sleep(1)\n\n    query = QueryWithEmbedding(\n        query=\"Aenean\",\n        top_k=9,\n        embedding=sample_embedding(0),  # type: ignore\n    )\n    query_results = await elasticsearch_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 2 == len(query_results[0].results)\n    assert \"456\" == query_results[0].results[0].id\n\n    elasticsearch_datastore.client.indices.delete(\n        index=elasticsearch_datastore.index_name\n    )\n"
  },
  {
    "path": "tests/datastore/providers/llama/test_llama_datastore.py",
    "content": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.llama_datastore import LlamaDataStore\nfrom models.models import DocumentChunk, DocumentChunkMetadata, QueryWithEmbedding\n\n\ndef create_embedding(non_zero_pos: int, size: int) -> List[float]:\n    vector = [0.0] * size\n    vector[non_zero_pos % size] = 1.0\n    return vector\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(i, 5),\n        )\n        for i in range(4, 7)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef queries() -> List[QueryWithEmbedding]:\n    queries = [\n        QueryWithEmbedding(\n            query=\"Query 1\",\n            top_k=1,\n            embedding=create_embedding(4, 5),\n        ),\n        QueryWithEmbedding(\n            query=\"Query 2\",\n            top_k=2,\n            embedding=create_embedding(5, 5),\n        ),\n    ]\n    return queries\n\n\n@pytest.fixture\ndef llama_datastore() -> LlamaDataStore:\n    return LlamaDataStore()\n\n\n@pytest.mark.asyncio\nasync def test_upsert(\n    llama_datastore: LlamaDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    \"\"\"Test basic upsert.\"\"\"\n    doc_ids = await llama_datastore._upsert(initial_document_chunks)\n    assert doc_ids == [doc_id for doc_id in initial_document_chunks]\n\n\n@pytest.mark.asyncio\nasync def test_query(\n    llama_datastore: LlamaDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n    queries: List[QueryWithEmbedding],\n) -> None:\n    \"\"\"Test basic query.\"\"\"\n    # insert to prepare for test\n    await llama_datastore._upsert(initial_document_chunks)\n\n    query_results = await llama_datastore._query(queries)\n 
   assert len(query_results) == len(queries)\n\n    query_0_results = query_results[0].results\n    query_1_results = query_results[1].results\n\n    assert len(query_0_results) == 1\n    assert len(query_1_results) == 2\n\n    # NOTE: this is the correct behavior\n    assert query_0_results[0].id == \"first-doc-4\"\n    assert query_1_results[0].id == \"first-doc-5\"\n    assert query_1_results[1].id == \"first-doc-4\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(\n    llama_datastore: LlamaDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    # insert to prepare for test\n    await llama_datastore._upsert(initial_document_chunks)\n\n    is_success = llama_datastore.delete([\"first-doc\"])\n    assert is_success\n"
  },
  {
    "path": "tests/datastore/providers/milvus/test_milvus_datastore.py",
    "content": "# from pathlib import Path\n# from dotenv import find_dotenv, load_dotenv\n# env_path = Path(\".\") / \"milvus.env\"\n# load_dotenv(dotenv_path=env_path, verbose=True)\n\nimport pytest\nfrom models.models import (\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    DocumentChunk,\n    QueryWithEmbedding,\n    Source,\n)\nfrom datastore.providers.milvus_datastore import (\n    OUTPUT_DIM,\n    MilvusDataStore,\n)\n\n\n@pytest.fixture\ndef milvus_datastore():\n    return MilvusDataStore(consistency_level=\"Strong\")\n\n\ndef sample_embedding(one_element_poz: int):\n    embedding = [0] * OUTPUT_DIM\n    embedding[one_element_poz % OUTPUT_DIM] = 1\n    return embedding\n\n\ndef sample_embeddings(num: int, one_element_start: int = 0):\n    # since the metric type is cosine, we create vectors containing a single 1 with all other elements 0\n    embeddings = []\n    for x in range(num):\n        embedding = [0] * OUTPUT_DIM\n        embedding[(x + one_element_start) % OUTPUT_DIM] = 1\n        embeddings.append(embedding)\n    return embeddings\n\n\n@pytest.fixture\ndef document_chunk_one():\n    doc_id = \"zerp\"\n    doc_chunks = []\n\n    ids = [\"abc_123\", \"def_456\", \"ghi_789\"]\n    texts = [\n        \"lorem ipsum dolor sit amet\",\n        \"consectetur adipiscing elit\",\n        \"sed do eiusmod tempor incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = [\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"2021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n\n    embeddings = sample_embeddings(len(texts))\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id,\n                
source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks.append(chunk)\n\n    return {doc_id: doc_chunks}\n\n\n@pytest.fixture\ndef document_chunk_two():\n    doc_id_1 = \"zerp\"\n    doc_chunks_1 = []\n\n    ids = [\"abc_123\", \"def_456\", \"ghi_789\"]\n    texts = [\n        \"1lorem ipsum dolor sit amet\",\n        \"2consectetur adipiscing elit\",\n        \"3sed do eiusmod tempor incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = [\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"3021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    embeddings = sample_embeddings(len(texts))\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id_1,\n                source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks_1.append(chunk)\n\n    doc_id_2 = \"merp\"\n    doc_chunks_2 = []\n\n    ids = [\"jkl_123\", \"lmn_456\", \"opq_789\"]\n    texts = [\n        \"3sdsc efac feas sit qweas\",\n        \"4wert sdfas fdsc\",\n        \"52dsc fdsf eiusmod asdasd incididunt\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    source_ids = [\"foo\", \"bar\", \"baz\"]\n    urls = [\"foo.com\", \"bar.net\", \"baz.org\"]\n    created_ats = [\n        
\"4929-10-28T09:30:00-05:00\",\n        \"5009-01-03T16:39:57-08:00\",\n        \"6021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    embeddings = sample_embeddings(len(texts), 3)\n\n    for i in range(3):\n        chunk = DocumentChunk(\n            id=ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=doc_id_2,\n                source=sources[i],\n                source_id=source_ids[i],\n                url=urls[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks_2.append(chunk)\n\n    return {doc_id_1: doc_chunks_1, doc_id_2: doc_chunks_2}\n\n\n@pytest.mark.asyncio\nasync def test_upsert(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    assert 3 == milvus_datastore.col.num_entities\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_reload(milvus_datastore, document_chunk_one, document_chunk_two):\n    await milvus_datastore.delete(delete_all=True)\n\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    assert 3 == milvus_datastore.col.num_entities\n\n    new_store = MilvusDataStore()\n    another_in = {i: document_chunk_two[i] for i in document_chunk_two if i != res[0]}\n    res = await new_store._upsert(another_in)\n    new_store.col.flush()\n    assert 6 == new_store.col.num_entities\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=10,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n    assert 1 == 
len(query_results)\n    new_store.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_upsert_query_all(milvus_datastore, document_chunk_two):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_two)\n    assert res == list(document_chunk_two.keys())\n    milvus_datastore.col.flush()\n\n    # Num entities currently doesn't track deletes\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=10,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 6 == len(query_results[0].results)\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_query_accuracy(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=1,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert 1.0 == query_results[0].results[0].score\n    assert \"abc_123\" == query_results[0].results[0].id\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_query_filter(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=1,\n        embedding=sample_embedding(0),\n        filter=DocumentMetadataFilter(\n            start_date=\"2000-01-03T16:39:57-08:00\", end_date=\"2010-01-03T16:39:57-08:00\"\n        ),\n    )\n    
query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert 1.0 != query_results[0].results[0].score\n    assert \"def_456\" == query_results[0].results[0].id\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_date_filter(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    await milvus_datastore.delete(\n        filter=DocumentMetadataFilter(\n            end_date=\"2009-01-03T16:39:57-08:00\",\n        )\n    )\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 1 == len(query_results[0].results)\n    assert \"ghi_789\" == query_results[0].results[0].id\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_source_filter(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    await milvus_datastore.delete(\n        filter=DocumentMetadataFilter(\n            source=Source.email,\n        )\n    )\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 2 == len(query_results[0].results)\n    assert \"def_456\" == query_results[0].results[0].id\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def 
test_delete_with_document_id_filter(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    await milvus_datastore.delete(\n        filter=DocumentMetadataFilter(\n            document_id=res[0],\n        )\n    )\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 0 == len(query_results[0].results)\n    milvus_datastore.col.drop()\n\n\n@pytest.mark.asyncio\nasync def test_delete_with_document_id(milvus_datastore, document_chunk_one):\n    await milvus_datastore.delete(delete_all=True)\n    res = await milvus_datastore._upsert(document_chunk_one)\n    assert res == list(document_chunk_one.keys())\n    milvus_datastore.col.flush()\n    await milvus_datastore.delete([res[0]])\n\n    query = QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=9,\n        embedding=sample_embedding(0),\n    )\n    query_results = await milvus_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 0 == len(query_results[0].results)\n    milvus_datastore.col.drop()\n\n\n# if __name__ == '__main__':\n#     import sys\n#     import pytest\n#     pytest.main(sys.argv)\n"
  },
  {
    "path": "tests/datastore/providers/mongodb_atlas/test_integration.py",
    "content": "\"\"\"Integration Tests of ChatGPT Retrieval Plugin\nwith MongoDB Atlas Vector Datastore and OPENAI Embedding model.\n\nAs described in docs/providers/mongodb/setup.md, to run this, one must\nhave a running MongoDB Atlas Cluster, and\nprovide a valid OPENAI_API_KEY.\n\"\"\"\n\nimport os\nfrom time import sleep\n\nimport openai\nimport pytest\nfrom fastapi.testclient import TestClient\nfrom httpx import Response\nfrom pymongo import MongoClient\n\nfrom server.main import app\n\n\n@pytest.fixture(scope=\"session\")\ndef documents():\n    \"\"\" List of documents represents data to be embedded in the datastore.\n    Minimum requirements for Documents in the /upsert endpoint's UpsertRequest.\n    \"\"\"\n    return [\n        {\"text\": \"The quick brown fox jumped over the slimy green toad.\"},\n        {\"text\": \"The big brown bear jumped over the lazy dog.\"},\n        {\"text\": \"Toads are frogs.\"},\n        {\"text\": \"Green toads are basically red frogs.\"},\n    ]\n\n\n@pytest.fixture(scope=\"session\", autouse=True)\ndef client():\n    \"\"\"TestClient makes requests to FastAPI service.\"\"\"\n    endpoint_url = \"http://127.0.0.1:8000\"\n    headers = {\"Authorization\": f\"Bearer {os.environ['BEARER_TOKEN']}\"}\n    with TestClient(app=app, base_url=endpoint_url, headers=headers) as client:\n        yield client\n\n\n@pytest.fixture(scope=\"session\")\ndef delete(client) -> bool:\n    \"\"\"Drop existing documents from the collection\"\"\"\n    response = client.request(\"DELETE\", \"/delete\", json={\"delete_all\": True})\n    sleep(2)\n    return response\n\n\n@pytest.fixture(scope=\"session\")\ndef upsert(delete, documents, client) -> bool:\n    \"\"\"Upload documents to the datastore via plugin's REST API.\"\"\"\n    response = client.post(\"/upsert\", json={\"documents\": documents})\n    sleep(2)  # At this point, the Vector Search Index is being built\n    return response\n\n\ndef test_delete(delete) -> None:\n    \"\"\"Simply 
confirm that delete fixture ran successfully\"\"\"\n    assert delete.status_code == 200\n    assert delete.json()['success']\n\n\ndef test_upsert(upsert) -> None:\n    \"\"\"Simply confirm that upsert fixture has run successfully\"\"\"\n    assert upsert.status_code == 200\n    assert len(upsert.json()['ids']) == 4\n\n\ndef test_query(upsert, client) -> None:  # upsert,\n    \"\"\"Test queries produce reasonable results,\n    now that datastore contains embedded data which has been indexed\n    \"\"\"\n    question = \"What did the fox jump over?\"\n    n_requested = 2  # top N results per query\n    got_response = False\n    retries = 5\n    query_result = {}\n    while retries and not got_response:\n        response = client.post(\"/query\", json={'queries': [{\"query\": question, \"top_k\": n_requested}]})\n        assert isinstance(response, Response)\n        assert response.status_code == 200\n        assert len(response.json()) == 1\n        query_result = response.json()['results'][0]\n        if len(query_result['results']) == n_requested:\n            got_response = True\n        else:\n            retries -= 1\n            sleep(5)\n\n    assert got_response  # we got n_requested responses\n    assert query_result['query'] == question\n    answers = []\n    scores = []\n    for result in query_result['results']:\n        answers.append(result['text'])\n        scores.append(round(result['score'], 2))\n    assert 0.8 < scores[0] < 0.9\n    assert answers[0] == \"The quick brown fox jumped over the slimy green toad.\"\n\n\ndef test_required_vars() -> None:\n    \"\"\"Confirm that the environment has all it needs\"\"\"\n    required_vars = {'BEARER_TOKEN', 'OPENAI_API_KEY', 'DATASTORE', 'EMBEDDING_DIMENSION', 'EMBEDDING_MODEL',\n                     'MONGODB_COLLECTION', 'MONGODB_DATABASE', 'MONGODB_INDEX', 'MONGODB_URI'}\n    assert os.environ[\"DATASTORE\"] == 'mongodb'\n    missing = required_vars - set(os.environ)\n    assert len(missing) == 0\n\n\ndef 
test_mongodb_connection() -> None:\n    \"\"\"Confirm that the connection to the datastore works.\"\"\"\n    client = MongoClient(os.environ[\"MONGODB_URI\"])\n    assert client.admin.command('ping')['ok']\n\n\ndef test_openai_connection() -> None:\n    \"\"\"Check that we can call OpenAI Embedding models.\"\"\"\n    openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n    models = openai.Model.list()\n    model_names = [model[\"id\"] for model in models['data']]\n    for model_name in model_names:\n        try:\n            response = openai.Embedding.create(input=[\"Some input text\"], model=model_name)\n            assert len(response['data'][0]['embedding']) >= int(os.environ['EMBEDDING_DIMENSION'])\n        except:\n            pass  # Not all models are for text embedding.\n"
  },
  {
    "path": "tests/datastore/providers/mongodb_atlas/test_mongodb_datastore.py",
    "content": "\"\"\"\nIntegration tests of MongoDB Atlas Datastore.\n\nThese tests require one to have a running Cluster, Database, Collection and Atlas Search Index\nas described in docs/providers/mongodb/setup.md.\n\nOne will also have to set the same environment variables. Although one CAN\nuse the same collection and index used in examples/providers/mongodb/semantic-search.ipynb,\nthese tests will make changes to the data, so you may wish to create another collection.\nIf you have run the example notebook, you can reuse it with the following settings.\n\nMONGODB_DATABASE=SQUAD\nMONGODB_COLLECTION=Beyonce\nMONGODB_INDEX=vector_index\nEMBEDDING_DIMENSION=1536\nMONGODB_URI=mongodb+srv://<username>:<password>@<cluster>/?retryWrites=true&w=majority\n\"\"\"\n\n\nfrom inspect import iscoroutinefunction\nimport pytest\nimport time\nfrom typing import Callable\nimport os\n\nfrom models.models import (\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    DocumentChunk,\n    QueryWithEmbedding,\n    Source,\n)\nfrom services.date import to_unix_timestamp\nfrom datetime import datetime\nfrom datastore.providers.mongodb_atlas_datastore import (\n    MongoDBAtlasDataStore,\n)\n\n\n\nasync def assert_when_ready(callable: Callable, tries: int = 5, interval: float = 1):\n\n    for _ in range(tries):\n        if iscoroutinefunction(callable):\n            print(\"starting async call\")\n            result = await callable()\n            print(f\"finished async call with {result=}\")\n        else:\n            result = callable()\n        if result:\n            return\n        time.sleep(interval)\n\n    raise AssertionError(\"Condition not met after multiple attempts\")\n\n\ndef collection_size_callback_factory(collection, num: int):\n\n    async def predicate():\n        num_documents = await collection.count_documents({})\n        return num_documents == num\n\n    return predicate\n\n\n@pytest.fixture\ndef _mongodb_datastore():\n    return 
MongoDBAtlasDataStore()\n\n\n@pytest.fixture\nasync def mongodb_datastore(_mongodb_datastore):\n    await _mongodb_datastore.delete(delete_all=True)\n    collection = _mongodb_datastore.client[_mongodb_datastore.database_name][_mongodb_datastore.collection_name]\n    await assert_when_ready(collection_size_callback_factory(collection, 0))\n    yield _mongodb_datastore\n    await _mongodb_datastore.delete(delete_all=True)\n    await assert_when_ready(collection_size_callback_factory(collection, 0))\n\n\ndef sample_embedding(one_element_poz: int):\n    n_dims = int(os.environ[\"EMBEDDING_DIMENSION\"])\n    embedding = [0] * n_dims\n    embedding[one_element_poz % n_dims] = 1\n    return embedding\n\n\ndef sample_embeddings(num: int, one_element_start: int = 0):\n    return [sample_embedding(x + one_element_start) for x in range(num)]\n\n\n@pytest.fixture\ndef document_id():\n    \"\"\"ID of an unchunked document\"\"\"\n    return \"a5991f75a315f755c3365ab2\"\n\n@pytest.fixture\ndef chunk_ids(document_id):\n    \"\"\"IDs of chunks\"\"\"\n    return [f\"{document_id}_{i}\" for i in range(3)]\n\n\n@pytest.fixture\ndef one_documents_chunks(document_id, chunk_ids):\n    \"\"\"Represents output of services.chunks.get_document_chunks\n    -> Dict[str, List[DocumentChunk]]\n    called on a list containing a single Document\n    \"\"\"\n\n    n_chunks = len(chunk_ids)\n\n    texts = [\n        \"Aenean euismod bibendum laoreet\",\n        \"Vivamus non enim vitae tortor\",\n        \"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae\",\n    ]\n    sources = [Source.email, Source.file, Source.chat]\n    created_ats = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"2021-01-21T10:00:00-02:00\",\n    ]\n    authors = [\"Fred Smith\", \"Bob Doe\", \"Appleton Doe\"]\n\n    embeddings = sample_embeddings(n_chunks)\n    doc_chunks = []\n    for i in range(n_chunks):\n        chunk = DocumentChunk(\n     
       id=chunk_ids[i],\n            text=texts[i],\n            metadata=DocumentChunkMetadata(\n                document_id=document_id,\n                source=sources[i],\n                created_at=created_ats[i],\n                author=authors[i],\n            ),\n            embedding=embeddings[i],  # type: ignore\n        )\n\n        doc_chunks.append(chunk)\n\n    return {document_id: doc_chunks}\n\n\nasync def test_upsert(mongodb_datastore: MongoDBAtlasDataStore, one_documents_chunks, chunk_ids):\n    \"\"\"This tests that data gets uploaded, but not that the search index is built.\"\"\"\n    res = await mongodb_datastore._upsert(one_documents_chunks)\n    assert res == chunk_ids\n\n    collection = mongodb_datastore.client[mongodb_datastore.database_name][mongodb_datastore.collection_name]\n    await assert_when_ready(collection_size_callback_factory(collection, 3))\n\n\nasync def test_upsert_query_all(mongodb_datastore, one_documents_chunks, chunk_ids):\n    \"\"\"By running _query, this also checks that the upserted data becomes searchable: all 3 chunks must be returned.\"\"\"\n    res = await mongodb_datastore._upsert(one_documents_chunks)\n    await assert_when_ready(lambda: res == chunk_ids)\n\n    query = QueryWithEmbedding(\n        query=\"Aenean\",\n        top_k=10,\n        embedding=sample_embedding(0),  # type: ignore\n    )\n\n    async def predicate():\n        query_results = await mongodb_datastore._query(queries=[query])\n        return 1 == len(query_results) and 3 == len(query_results[0].results)\n\n    await assert_when_ready(predicate, tries=12, interval=5)\n\n\nasync def test_delete_with_document_id(mongodb_datastore, one_documents_chunks, chunk_ids):\n    res = await mongodb_datastore._upsert(one_documents_chunks)\n    assert res == chunk_ids\n    collection = mongodb_datastore.client[mongodb_datastore.database_name][mongodb_datastore.collection_name]\n    first_id = str((await collection.find_one())[\"_id\"])\n    await mongodb_datastore.delete(ids=[first_id])\n\n    await 
assert_when_ready(collection_size_callback_factory(collection, 2))\n\n    all_documents = [doc async for doc in collection.find()]\n    for document in all_documents:\n        assert document[\"metadata\"][\"author\"] != \"Fred Smith\"\n\n\nasync def test_delete_with_source_filter(mongodb_datastore, one_documents_chunks, chunk_ids):\n    res = await mongodb_datastore._upsert(one_documents_chunks)\n    assert res == chunk_ids\n\n    await mongodb_datastore.delete(\n        filter=DocumentMetadataFilter(\n            source=Source.email,\n        )\n    )\n\n    query = QueryWithEmbedding(\n        query=\"Aenean\",\n        top_k=9,\n        embedding=sample_embedding(0),  # type: ignore\n    )\n\n    async def predicate():\n        query_results = await mongodb_datastore._query(queries=[query])\n        return 1 == len(query_results) and query_results[0].results\n\n    await assert_when_ready(predicate, tries=12, interval=5)\n    query_results = await mongodb_datastore._query(queries=[query])\n    for result in query_results[0].results:\n        assert result.text != \"Aenean euismod bibendum laoreet\"\n\n\n@pytest.fixture\ndef build_mongo_filter():\n    return MongoDBAtlasDataStore()._build_mongo_filter\n\n\nasync def test_build_mongo_filter_with_no_filter(build_mongo_filter):\n    result = build_mongo_filter()\n    assert result == {}\n\n\nasync def test_build_mongo_filter_with_start_date(build_mongo_filter):\n    date = datetime(2022, 1, 1).isoformat()\n    filter_data = {\"start_date\": date}\n    result = build_mongo_filter(DocumentMetadataFilter(**filter_data))\n\n    assert result == {\n        \"$and\": [\n            {\"created_at\": {\"$gte\": to_unix_timestamp(date)}}\n        ]\n    }\n\n\nasync def test_build_mongo_filter_with_end_date(build_mongo_filter):\n    date = datetime(2022, 1, 1).isoformat()\n    filter_data = {\"end_date\": date}\n    result = build_mongo_filter(DocumentMetadataFilter(**filter_data))\n\n    assert result == {\n        
\"$and\": [\n            {\"created_at\": {\"$lte\": to_unix_timestamp(date)}}\n        ]\n    }\n\n\nasync def test_build_mongo_filter_with_metadata_field(build_mongo_filter):\n    filter_data = {\"source\": \"email\"}\n    result = build_mongo_filter(DocumentMetadataFilter(**filter_data))\n\n    assert result == {\n        \"$and\": [\n            {\"metadata.source\": \"email\"}\n        ]\n    }\n"
  },
  {
    "path": "tests/datastore/providers/postgres/test_postgres_datastore.py",
    "content": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.postgres_datastore import PostgresDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    QueryWithEmbedding,\n)\nimport os\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\ndef create_embedding(non_zero_pos: int) -> List[float]:\n    # create a vector of dimension EMBEDDING_DIMENSION with a single non-zero value at index non_zero_pos - 1\n    vector = [0.0] * EMBEDDING_DIMENSION\n    vector[non_zero_pos - 1] = 1.0\n    return vector\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(i),\n        )\n        for i in range(4, 7)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef queries() -> List[QueryWithEmbedding]:\n    queries = [\n        QueryWithEmbedding(\n            query=\"Query 1\",\n            top_k=1,\n            embedding=create_embedding(4),\n        ),\n        QueryWithEmbedding(\n            query=\"Query 2\",\n            top_k=2,\n            embedding=create_embedding(5),\n        ),\n    ]\n    return queries\n\n\n@pytest.fixture\ndef postgres_datastore() -> PostgresDataStore:\n    return PostgresDataStore()\n\n\n@pytest.mark.asyncio\nasync def test_upsert(\n    postgres_datastore: PostgresDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    \"\"\"Test basic upsert.\"\"\"\n    doc_ids = await postgres_datastore._upsert(initial_document_chunks)\n    assert doc_ids == [doc_id for doc_id in initial_document_chunks]\n\n\n@pytest.mark.asyncio\nasync def test_query(\n    postgres_datastore: PostgresDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n    queries: 
List[QueryWithEmbedding],\n) -> None:\n    \"\"\"Test basic query.\"\"\"\n    # insert to prepare for test\n    await postgres_datastore._upsert(initial_document_chunks)\n\n    query_results = await postgres_datastore._query(queries)\n    assert len(query_results) == len(queries)\n\n    query_0_results = query_results[0].results\n    query_1_results = query_results[1].results\n\n    assert len(query_0_results) == 1\n    assert len(query_1_results) == 2\n\n    # NOTE: this is the correct behavior\n    assert query_0_results[0].id == \"first-doc-4\"\n    assert query_1_results[0].id == \"first-doc-5\"\n    assert query_1_results[1].id == \"first-doc-4\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(\n    postgres_datastore: PostgresDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    # insert to prepare for test\n    await postgres_datastore._upsert(initial_document_chunks)\n\n    is_success = await postgres_datastore.delete([\"first-doc\"])\n    assert is_success\n\n\n@pytest.mark.asyncio\nasync def test_upsert_new_chunk(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await postgres_datastore._upsert({\"doc1\": [chunk]})\n    assert len(ids) == 1\n\n\n@pytest.mark.asyncio\nasync def test_upsert_existing_chunk(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await postgres_datastore._upsert({\"doc1\": [chunk]})\n\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"New text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await 
postgres_datastore._upsert({\"doc1\": [chunk]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        top_k=1,\n    )\n    results = await postgres_datastore._query([query])\n\n    assert len(ids) == 1\n    assert len(results[0].results) == 1\n    assert results[0].results[0].id == \"chunk1\"\n    assert results[0].results[0].text == \"New text\"\n\n\n@pytest.mark.asyncio\nasync def test_query_score(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[-1 if i % 2 == 0 else 1 for i in range(EMBEDDING_DIMENSION)],\n        metadata=DocumentChunkMetadata(),\n    )\n    await postgres_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n    )\n    results = await postgres_datastore._query([query])\n\n    assert results[0].results[0].id == \"chunk1\"\n    assert int(results[0].results[0].score) == EMBEDDING_DIMENSION\n\n\n@pytest.mark.asyncio\nasync def test_query_filter(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(\n            source=\"email\", created_at=\"2021-01-01\", author=\"John\"\n        ),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(\n            source=\"chat\", 
created_at=\"2022-02-02\", author=\"Mike\"\n        ),\n    )\n    await postgres_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    # Test author filter -- string\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(author=\"John\"),\n    )\n    results = await postgres_datastore._query([query])\n    assert results[0].results[0].id == \"chunk1\"\n\n    # Test source filter -- enum\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(source=\"chat\"),\n    )\n    results = await postgres_datastore._query([query])\n    assert results[0].results[0].id == \"chunk2\"\n\n    # Test created_at filter -- date\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(start_date=\"2022-01-01\"),\n    )\n    results = await postgres_datastore._query([query])\n    assert results[0].results[0].id == \"chunk2\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    await postgres_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Another query\",\n        embedding=query_embedding,\n    )\n    results = await postgres_datastore._query([query])\n\n    
assert len(results[0].results) == 2\n    assert results[0].results[0].id == \"chunk1\"\n    assert results[0].results[1].id == \"chunk2\"\n\n    await postgres_datastore.delete(ids=[\"doc1\"])\n    results_after_delete = await postgres_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 1\n    assert results_after_delete[0].results[0].id == \"chunk2\"\n\n\n@pytest.mark.asyncio\nasync def test_delete_all(postgres_datastore):\n    await postgres_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    await postgres_datastore._upsert({\"doc\": [chunk]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Another query\",\n        embedding=query_embedding,\n        top_k=1,\n    )\n    results = await postgres_datastore._query([query])\n\n    assert len(results) == 1\n    assert len(results[0].results) == 1\n    assert results[0].results[0].id == \"chunk\"\n\n    await postgres_datastore.delete(delete_all=True)\n    results_after_delete = await postgres_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 0\n"
  },
  {
    "path": "tests/datastore/providers/qdrant/test_qdrant_datastore.py",
    "content": "from typing import Dict, List\n\nimport pytest\nimport qdrant_client\nfrom qdrant_client.http.models import PayloadSchemaType\n\nfrom datastore.providers.qdrant_datastore import QdrantDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    QueryWithEmbedding,\n    DocumentMetadataFilter,\n    Source,\n)\n\n\ndef create_embedding(non_zero_pos: int, size: int) -> List[float]:\n    vector = [0.0] * size\n    vector[non_zero_pos % size] = 1.0\n    return vector\n\n\n@pytest.fixture\ndef qdrant_datastore() -> QdrantDataStore:\n    return QdrantDataStore(\n        collection_name=\"documents\", vector_size=5, recreate_collection=True\n    )\n\n\n@pytest.fixture\ndef client() -> qdrant_client.QdrantClient:\n    return qdrant_client.QdrantClient()\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(i, 5),\n        )\n        for i in range(4, 7)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc_{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(\n                source=Source.email, created_at=\"2023-03-05\", document_id=\"first-doc\"\n            ),\n            embedding=create_embedding(i, 5),\n        )\n        for i in range(3)\n    ]\n    second_doc_chunks = [\n        DocumentChunk(\n            id=f\"second-doc_{i}\",\n            text=f\"Dolor sit amet {i}\",\n            metadata=DocumentChunkMetadata(\n                created_at=\"2023-03-04\", document_id=\"second-doc\"\n            ),\n            embedding=create_embedding(i + len(first_doc_chunks), 
5),\n        )\n        for i in range(2)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n        \"second-doc\": second_doc_chunks,\n    }\n\n\n@pytest.mark.asyncio\nasync def test_datastore_creates_payload_indexes(\n    qdrant_datastore,\n    client,\n):\n    collection_info = client.get_collection(collection_name=\"documents\")\n\n    assert 2 == len(collection_info.payload_schema)\n    assert \"created_at\" in collection_info.payload_schema\n    created_at = collection_info.payload_schema[\"created_at\"]\n    assert PayloadSchemaType.INTEGER == created_at.data_type\n    assert \"metadata.document_id\" in collection_info.payload_schema\n    document_id = collection_info.payload_schema[\"metadata.document_id\"]\n    assert PayloadSchemaType.KEYWORD == document_id.data_type\n\n\n@pytest.mark.asyncio\nasync def test_upsert_creates_all_points(\n    qdrant_datastore,\n    client,\n    document_chunks,\n):\n    document_ids = await qdrant_datastore._upsert(document_chunks)\n\n    assert 2 == len(document_ids)\n    assert 5 == client.count(collection_name=\"documents\").count\n\n\n@pytest.mark.asyncio\nasync def test_upsert_does_not_remove_existing_documents_but_store_new(\n    qdrant_datastore,\n    client,\n    initial_document_chunks,\n    document_chunks,\n):\n    \"\"\"\n    This test ensures calling ._upsert no longer removes the existing document chunks,\n    as they are currently removed in the .upsert method directly.\n    \"\"\"\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(initial_document_chunks)\n\n    await qdrant_datastore._upsert(document_chunks)\n\n    assert 8 == client.count(collection_name=\"documents\").count\n\n\n@pytest.mark.asyncio\nasync def test_query_returns_all_on_single_query(qdrant_datastore, document_chunks):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    query = 
QueryWithEmbedding(\n        query=\"lorem\",\n        top_k=5,\n        embedding=[0.5, 0.5, 0.5, 0.5, 0.5],\n    )\n    query_results = await qdrant_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert \"lorem\" == query_results[0].query\n    assert 5 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_query_returns_closest_entry(qdrant_datastore, document_chunks):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    query = QueryWithEmbedding(\n        query=\"ipsum\",\n        top_k=1,\n        embedding=[0.0, 0.0, 0.5, 0.0, 0.0],\n    )\n    query_results = await qdrant_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert \"ipsum\" == query_results[0].query\n    assert 1 == len(query_results[0].results)\n    first_document_chunk = query_results[0].results[0]\n    assert 0.0 <= first_document_chunk.score <= 1.0\n    assert Source.email == first_document_chunk.metadata.source\n    assert \"2023-03-05\" == first_document_chunk.metadata.created_at\n    assert \"first-doc\" == first_document_chunk.metadata.document_id\n\n\n@pytest.mark.asyncio\nasync def test_query_filter_by_document_id_returns_this_document_chunks(\n    qdrant_datastore, document_chunks\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    first_query = QueryWithEmbedding(\n        query=\"dolor\",\n        filter=DocumentMetadataFilter(document_id=\"first-doc\"),\n        top_k=5,\n        embedding=[0.0, 0.0, 0.5, 0.0, 0.0],\n    )\n    second_query = QueryWithEmbedding(\n        query=\"dolor\",\n        filter=DocumentMetadataFilter(document_id=\"second-doc\"),\n        top_k=5,\n        embedding=[0.0, 0.0, 0.5, 0.0, 0.0],\n    )\n    query_results = await qdrant_datastore._query(queries=[first_query, second_query])\n\n    assert 2 == 
len(query_results)\n    assert \"dolor\" == query_results[0].query\n    assert \"dolor\" == query_results[1].query\n    assert 3 == len(query_results[0].results)\n    assert 2 == len(query_results[1].results)\n\n\n@pytest.mark.asyncio\n@pytest.mark.parametrize(\"start_date\", [\"2023-03-05T00:00:00\", \"2023-03-05\"])\nasync def test_query_start_date_converts_datestring(\n    qdrant_datastore,\n    document_chunks,\n    start_date,\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    query = QueryWithEmbedding(\n        query=\"sit amet\",\n        filter=DocumentMetadataFilter(start_date=start_date),\n        top_k=5,\n        embedding=[0.0, 0.0, 0.5, 0.0, 0.0],\n    )\n    query_results = await qdrant_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 3 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\n@pytest.mark.parametrize(\"end_date\", [\"2023-03-04T00:00:00\", \"2023-03-04\"])\nasync def test_query_end_date_converts_datestring(\n    qdrant_datastore,\n    document_chunks,\n    end_date,\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    query = QueryWithEmbedding(\n        query=\"sit amet\",\n        filter=DocumentMetadataFilter(end_date=end_date),\n        top_k=5,\n        embedding=[0.0, 0.0, 0.5, 0.0, 0.0],\n    )\n    query_results = await qdrant_datastore._query(queries=[query])\n\n    assert 1 == len(query_results)\n    assert 2 == len(query_results[0].results)\n\n\n@pytest.mark.asyncio\nasync def test_delete_removes_by_ids(\n    qdrant_datastore,\n    client,\n    document_chunks,\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    await qdrant_datastore.delete(ids=[\"first-doc\"])\n\n    assert 2 == 
client.count(collection_name=\"documents\").count\n\n\n@pytest.mark.asyncio\nasync def test_delete_removes_by_document_id_filter(\n    qdrant_datastore,\n    client,\n    document_chunks,\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    await qdrant_datastore.delete(\n        filter=DocumentMetadataFilter(document_id=\"first-doc\")\n    )\n\n    assert 2 == client.count(collection_name=\"documents\").count\n\n\n@pytest.mark.asyncio\nasync def test_delete_removes_all(\n    qdrant_datastore,\n    client,\n    document_chunks,\n):\n    # Fill the database with document chunks before running the actual test\n    await qdrant_datastore._upsert(document_chunks)\n\n    await qdrant_datastore.delete(delete_all=True)\n\n    assert 0 == client.count(collection_name=\"documents\").count\n"
  },
  {
    "path": "tests/datastore/providers/redis/test_redis_datastore.py",
    "content": "from datastore.providers.redis_datastore import RedisDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    QueryWithEmbedding,\n    Source,\n    DocumentMetadataFilter,\n)\nimport pytest\nimport redis.asyncio as redis\nimport numpy as np\n\nNUM_TEST_DOCS = 10\n\n\n@pytest.fixture\nasync def redis_datastore():\n    return await RedisDataStore.init(dim=5)\n\n\ndef create_embedding(i, dim):\n    vec = np.array([0.1] * dim).astype(np.float64).tolist()\n    vec[dim - 1] = i + 1 / 10\n    return vec\n\n\ndef create_document_chunk(i, dim):\n    return DocumentChunk(\n        id=f\"first-doc_{i}\",\n        text=f\"Lorem ipsum {i}\",\n        embedding=create_embedding(i, dim),\n        metadata=DocumentChunkMetadata(\n            source=Source.file, created_at=\"1970-01-01\", document_id=\"docs\"\n        ),\n    )\n\n\ndef create_document_chunks(n, dim):\n    docs = [create_document_chunk(i, dim) for i in range(n)]\n    return {\"docs\": docs}\n\n\n@pytest.mark.asyncio\nasync def test_redis_upsert_query(redis_datastore):\n    docs = create_document_chunks(NUM_TEST_DOCS, 5)\n    await redis_datastore._upsert(docs)\n    query = QueryWithEmbedding(\n        query=\"Lorem ipsum 0\",\n        top_k=5,\n        embedding=create_embedding(0, 5),\n    )\n    query_results = await redis_datastore._query(queries=[query])\n    assert 1 == len(query_results)\n    for i in range(5):\n        assert f\"Lorem ipsum {i}\" == query_results[0].results[i].text\n        assert \"docs\" == query_results[0].results[i].id\n\n\n@pytest.mark.asyncio\nasync def test_redis_filter_query(redis_datastore):\n    query = QueryWithEmbedding(\n        query=\"Lorem ipsum 0\",\n        filter=DocumentMetadataFilter(document_id=\"docs\"),\n        top_k=5,\n        embedding=create_embedding(0, 5),\n    )\n    query_results = await redis_datastore._query(queries=[query])\n    print(query_results)\n    assert 1 == len(query_results)\n    assert \"docs\" 
== query_results[0].results[0].id\n\n\n@pytest.mark.asyncio\nasync def test_redis_delete_docs(redis_datastore):\n    res = await redis_datastore.delete(ids=[\"docs\"])\n    assert res\n"
  },
  {
    "path": "tests/datastore/providers/supabase/test_supabase_datastore.py",
    "content": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.supabase_datastore import SupabaseDataStore\nfrom models.models import (\n    DocumentChunk,\n    DocumentChunkMetadata,\n    DocumentMetadataFilter,\n    QueryWithEmbedding,\n)\nimport os\n\nEMBEDDING_DIMENSION = int(os.environ.get(\"EMBEDDING_DIMENSION\", 256))\n\n\ndef create_embedding(non_zero_pos: int) -> List[float]:\n    # create a vector of dimension EMBEDDING_DIMENSION with a single non-zero value\n    vector = [0.0] * EMBEDDING_DIMENSION\n    vector[non_zero_pos - 1] = 1.0\n    return vector\n\n\n@pytest.fixture\ndef initial_document_chunks() -> Dict[str, List[DocumentChunk]]:\n    first_doc_chunks = [\n        DocumentChunk(\n            id=f\"first-doc-{i}\",\n            text=f\"Lorem ipsum {i}\",\n            metadata=DocumentChunkMetadata(),\n            embedding=create_embedding(i),\n        )\n        for i in range(4, 7)\n    ]\n    return {\n        \"first-doc\": first_doc_chunks,\n    }\n\n\n@pytest.fixture\ndef queries() -> List[QueryWithEmbedding]:\n    queries = [\n        QueryWithEmbedding(\n            query=\"Query 1\",\n            top_k=1,\n            embedding=create_embedding(4),\n        ),\n        QueryWithEmbedding(\n            query=\"Query 2\",\n            top_k=2,\n            embedding=create_embedding(5),\n        ),\n    ]\n    return queries\n\n\n@pytest.fixture\ndef supabase_datastore() -> SupabaseDataStore:\n    return SupabaseDataStore()\n\n\n@pytest.mark.asyncio\nasync def test_upsert(\n    supabase_datastore: SupabaseDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    \"\"\"Test basic upsert.\"\"\"\n    doc_ids = await supabase_datastore._upsert(initial_document_chunks)\n    assert doc_ids == [doc_id for doc_id in initial_document_chunks]\n\n\n@pytest.mark.asyncio\nasync def test_query(\n    supabase_datastore: SupabaseDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n    queries: 
List[QueryWithEmbedding],\n) -> None:\n    \"\"\"Test basic query.\"\"\"\n    # insert to prepare for test\n    await supabase_datastore._upsert(initial_document_chunks)\n\n    query_results = await supabase_datastore._query(queries)\n    assert len(query_results) == len(queries)\n\n    query_0_results = query_results[0].results\n    query_1_results = query_results[1].results\n\n    assert len(query_0_results) == 1\n    assert len(query_1_results) == 2\n\n    # NOTE: this is the correct behavior\n    assert query_0_results[0].id == \"first-doc-4\"\n    assert query_1_results[0].id == \"first-doc-5\"\n    assert query_1_results[1].id == \"first-doc-4\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(\n    supabase_datastore: SupabaseDataStore,\n    initial_document_chunks: Dict[str, List[DocumentChunk]],\n) -> None:\n    # insert to prepare for test\n    await supabase_datastore._upsert(initial_document_chunks)\n\n    is_success = await supabase_datastore.delete([\"first-doc\"])\n    assert is_success\n\n\n@pytest.mark.asyncio\nasync def test_upsert_new_chunk(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await supabase_datastore._upsert({\"doc1\": [chunk]})\n    assert len(ids) == 1\n\n\n@pytest.mark.asyncio\nasync def test_upsert_existing_chunk(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await supabase_datastore._upsert({\"doc1\": [chunk]})\n\n    chunk = DocumentChunk(\n        id=\"chunk1\",\n        text=\"New text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    ids = await 
supabase_datastore._upsert({\"doc1\": [chunk]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        top_k=1,\n    )\n    results = await supabase_datastore._query([query])\n\n    assert len(ids) == 1\n    assert len(results[0].results) == 1\n    assert results[0].results[0].id == \"chunk1\"\n    assert results[0].results[0].text == \"New text\"\n\n\n@pytest.mark.asyncio\nasync def test_query_score(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[-1 if i % 2 == 0 else 1 for i in range(EMBEDDING_DIMENSION)],\n        metadata=DocumentChunkMetadata(),\n    )\n    await supabase_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n    )\n    results = await supabase_datastore._query([query])\n\n    assert results[0].results[0].id == \"chunk1\"\n    assert int(results[0].results[0].score) == EMBEDDING_DIMENSION\n\n\n@pytest.mark.asyncio\nasync def test_query_filter(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(\n            source=\"email\", created_at=\"2021-01-01\", author=\"John\"\n        ),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(\n            source=\"chat\", 
created_at=\"2022-02-02\", author=\"Mike\"\n        ),\n    )\n    await supabase_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    # Test author filter -- string\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(author=\"John\"),\n    )\n    results = await supabase_datastore._query([query])\n    assert results[0].results[0].id == \"chunk1\"\n\n    # Test source filter -- enum\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(source=\"chat\"),\n    )\n    results = await supabase_datastore._query([query])\n    assert results[0].results[0].id == \"chunk2\"\n\n    # Test created_at filter -- date\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Query\",\n        embedding=query_embedding,\n        filter=DocumentMetadataFilter(start_date=\"2022-01-01\"),\n    )\n    results = await supabase_datastore._query([query])\n    assert results[0].results[0].id == \"chunk2\"\n\n\n@pytest.mark.asyncio\nasync def test_delete(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk1 = DocumentChunk(\n        id=\"chunk1\",\n        text=\"Sample text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    chunk2 = DocumentChunk(\n        id=\"chunk2\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    await supabase_datastore._upsert({\"doc1\": [chunk1], \"doc2\": [chunk2]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Another query\",\n        embedding=query_embedding,\n    )\n    results = await supabase_datastore._query([query])\n\n    
assert len(results[0].results) == 2\n    assert results[0].results[0].id == \"chunk1\"\n    assert results[0].results[1].id == \"chunk2\"\n\n    await supabase_datastore.delete(ids=[\"doc1\"])\n    results_after_delete = await supabase_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 1\n    assert results_after_delete[0].results[0].id == \"chunk2\"\n\n\n@pytest.mark.asyncio\nasync def test_delete_all(supabase_datastore):\n    await supabase_datastore.delete(delete_all=True)\n    chunk = DocumentChunk(\n        id=\"chunk\",\n        text=\"Another text\",\n        embedding=[1] * EMBEDDING_DIMENSION,\n        metadata=DocumentChunkMetadata(),\n    )\n    await supabase_datastore._upsert({\"doc\": [chunk]})\n\n    query_embedding = [1] * EMBEDDING_DIMENSION\n    query = QueryWithEmbedding(\n        query=\"Another query\",\n        embedding=query_embedding,\n        top_k=1,\n    )\n    results = await supabase_datastore._query([query])\n\n    assert len(results) == 1\n    assert len(results[0].results) == 1\n    assert results[0].results[0].id == \"chunk\"\n\n    await supabase_datastore.delete(delete_all=True)\n    results_after_delete = await supabase_datastore._query([query])\n\n    assert len(results_after_delete[0].results) == 0\n"
  },
  {
    "path": "tests/datastore/providers/weaviate/docker-compose.yml",
    "content": "---\nversion: '3.4'\nservices:\n  weaviate:\n    command:\n    - --host\n    - 0.0.0.0\n    - --port\n    - '8080'\n    - --scheme\n    - http\n    image: semitechnologies/weaviate:1.18.0\n    ports:\n    - 8080:8080\n    restart: on-failure:0\n    environment:\n      QUERY_DEFAULTS_LIMIT: 25\n      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'\n      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'\n      DEFAULT_VECTORIZER_MODULE: 'none'\n      ENABLE_MODULES: ''\n      CLUSTER_HOSTNAME: 'node1'\n      LOG_LEVEL: debug\n      AUTOSCHEMA_ENABLED: 'false'\n..."
  },
  {
    "path": "tests/datastore/providers/weaviate/test_weaviate_datastore.py",
    "content": "import logging\nimport os\n\nimport pytest\nimport weaviate\nfrom _pytest.logging import LogCaptureFixture\nfrom fastapi.testclient import TestClient\nfrom loguru import logger\nfrom weaviate import Client\n\nfrom datastore.providers.weaviate_datastore import (\n    SCHEMA,\n    WeaviateDataStore,\n    extract_schema_properties,\n)\nfrom models.models import DocumentMetadataFilter, Source\nfrom server.main import app\n\nBEARER_TOKEN = os.getenv(\"BEARER_TOKEN\")\n\nclient = TestClient(app)\nclient.headers[\"Authorization\"] = f\"Bearer {BEARER_TOKEN}\"\n\n\n@pytest.fixture\ndef weaviate_client():\n    host = os.getenv(\"WEAVIATE_HOST\", \"http://localhost\")\n    port = os.getenv(\"WEAVIATE_PORT\", \"8080\")\n    client = Client(f\"{host}:{port}\")\n\n    yield client\n\n    client.schema.delete_all()\n\n\n@pytest.fixture\ndef test_db(weaviate_client, documents):\n    weaviate_client.schema.delete_all()\n    weaviate_client.schema.create_class(SCHEMA)\n\n    response = client.post(\"/upsert\", json={\"documents\": documents})\n\n    if response.status_code != 200:\n        raise Exception(\n            f\"Could not upsert to test client.\\nStatus Code: {response.status_code}\\nResponse:\\n{response.json()}\"\n        )\n\n    yield client\n\n\n@pytest.fixture\ndef documents():\n    documents = []\n\n    authors = [\"Max Mustermann\", \"John Doe\", \"Jane Doe\"]\n    texts = [\n        \"lorem ipsum dolor sit amet\",\n        \"consectetur adipiscing elit\",\n        \"sed do eiusmod tempor incididunt\",\n    ]\n    ids = [\"abc_123\", \"def_456\", \"ghi_789\"]\n    sources = [\"chat\", \"email\", \"email\"]\n    created_at = [\n        \"1929-10-28T09:30:00-05:00\",\n        \"2009-01-03T16:39:57-08:00\",\n        \"2021-01-21T10:00:00-02:00\",\n    ]\n\n    for i in range(3):\n        documents.append(\n            {\n                \"id\": ids[i],\n                \"text\": texts[i],\n                \"metadata\": {\n                    
\"source\": sources[i],\n                    \"source_id\": \"5325\",\n                    \"url\": \"http://example.com\",\n                    \"created_at\": created_at[i],\n                    \"author\": authors[i],\n                },\n            }\n        )\n\n    no_metadata_doc = {\n        \"id\": \"jkl_012\",\n        \"text\": \"no metadata\",\n    }\n\n    documents.append(no_metadata_doc)\n\n    partial_metadata_doc = {\n        \"id\": \"mno_345\",\n        \"text\": \"partial metadata\",\n        \"metadata\": {\n            \"source\": \"file\",\n        },\n    }\n\n    documents.append(partial_metadata_doc)\n\n    yield documents\n\n\n@pytest.fixture\ndef caplog(caplog: LogCaptureFixture):\n    handler_id = logger.add(caplog.handler, format=\"{message}\")\n    yield caplog\n    logger.remove(handler_id)\n\n\n@pytest.mark.parametrize(\n    \"document_id\", [(\"abc_123\"), (\"9a253e0b-d2df-5c2e-be6d-8e9b1f4ae345\")]\n)\ndef test_upsert(weaviate_client, document_id):\n    weaviate_client.schema.delete_all()\n    weaviate_client.schema.create_class(SCHEMA)\n\n    text = \"\"\"\n    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce in ipsum eget dolor malesuada fermentum at ac massa. \n    Aliquam erat volutpat. Sed eu velit est. Morbi semper quam id urna fringilla lacinia. Vivamus sit amet velit id lorem \n    pretium molestie. Nulla tincidunt sapien eu nulla consequat, a lacinia justo facilisis. Maecenas euismod urna sapien, \n    sit amet tincidunt est dapibus ac. Sed in lorem in nunc tincidunt bibendum. Nullam vel urna vitae nulla iaculis rutrum. \n    Suspendisse varius, massa a dignissim vehicula, urna ligula tincidunt orci, id fringilla velit tellus eu metus. Sed \n    vestibulum, nisl in malesuada tempor, nisi turpis facilisis nibh, nec dictum velit velit vel ex. Donec euismod, \n    leo ut sollicitudin tempor, dolor augue blandit nunc, eu lacinia ipsum turpis vitae nulla. Aenean bibendum \n    tincidunt magna in pulvinar. 
Sed tincidunt vel nisi ac maximus.\n    \"\"\"\n    source = \"email\"\n    source_id = \"5325\"\n    url = \"http://example.com\"\n    created_at = \"2022-12-16T08:00:00+01:00\"\n    author = \"Max Mustermann\"\n\n    documents = {\n        \"documents\": [\n            {\n                \"id\": document_id,\n                \"text\": text,\n                \"metadata\": {\n                    \"source\": source,\n                    \"source_id\": source_id,\n                    \"url\": url,\n                    \"created_at\": created_at,\n                    \"author\": author,\n                },\n            }\n        ]\n    }\n\n    response = client.post(\"/upsert\", json=documents)\n\n    assert response.status_code == 200\n    assert response.json() == {\"ids\": [document_id]}\n\n    properties = [\n        \"chunk_id\",\n        \"document_id\",\n        \"source\",\n        \"source_id\",\n        \"url\",\n        \"created_at\",\n        \"author\",\n    ]\n\n    where_filter = {\n        \"path\": [\"document_id\"],\n        \"operator\": \"Equal\",\n        \"valueString\": document_id,\n    }\n\n    weaviate_doc = (\n        weaviate_client.query.get(\"OpenAIDocument\", properties)\n        .with_additional(\"vector\")\n        .with_where(where_filter)\n        .with_sort({\"path\": [\"chunk_id\"], \"order\": \"asc\"})\n        .do()\n    )\n\n    weaviate_docs = weaviate_doc[\"data\"][\"Get\"][\"OpenAIDocument\"]\n\n    assert len(weaviate_docs) == 2\n\n    for i, weaviate_doc in enumerate(weaviate_docs):\n        assert weaviate_doc[\"chunk_id\"] == f\"{document_id}_{i}\"\n\n        assert weaviate_doc[\"document_id\"] == document_id\n\n        assert weaviate_doc[\"source\"] == source\n        assert weaviate_doc[\"source_id\"] == source_id\n        assert weaviate_doc[\"url\"] == url\n        assert weaviate_doc[\"created_at\"] == created_at\n        assert weaviate_doc[\"author\"] == author\n\n        assert 
weaviate_doc[\"_additional\"][\"vector\"]\n\n\ndef test_upsert_no_metadata(weaviate_client):\n    weaviate_client.schema.delete_all()\n    weaviate_client.schema.create_class(SCHEMA)\n\n    no_metadata_doc = {\n        \"id\": \"jkl_012\",\n        \"text\": \"no metadata\",\n    }\n\n    metadata_properties = [\n        \"source\",\n        \"source_id\",\n        \"url\",\n        \"created_at\",\n        \"author\",\n    ]\n\n    response = client.post(\"/upsert\", json={\"documents\": [no_metadata_doc]})\n\n    assert response.status_code == 200\n\n    weaviate_doc = weaviate_client.query.get(\"OpenAIDocument\", metadata_properties).do()\n\n    weaviate_doc = weaviate_doc[\"data\"][\"Get\"][\"OpenAIDocument\"][0]\n\n    for _, metadata_value in weaviate_doc.items():\n        assert metadata_value is None\n\n\n@pytest.mark.parametrize(\n    \"test_document, expected_status_code\",\n    [\n        ({\"id\": \"abc_123\", \"text\": \"some text\"}, 200),\n        ({\"id\": \"abc_123\"}, 422),\n        ({\"text\": \"some text\"}, 200),\n    ],\n)\ndef test_upsert_invalid_documents(weaviate_client, test_document, expected_status_code):\n    weaviate_client.schema.delete_all()\n    weaviate_client.schema.create_class(SCHEMA)\n\n    response = client.post(\"/upsert\", json={\"documents\": [test_document]})\n\n    assert response.status_code == expected_status_code\n\n\n@pytest.mark.parametrize(\n    \"query, expected_num_results\",\n    [\n        ({\"query\": \"consectetur adipiscing\", \"top_k\": 3}, 3),\n        ({\"query\": \"consectetur adipiscing elit\", \"filter\": {\"source\": \"email\"}}, 2),\n        (\n            {\n                \"query\": \"sed do eiusmod tempor\",\n                \"filter\": {\n                    \"start_date\": \"2020-01-01T00:00:00Z\",\n                    \"end_date\": \"2022-12-31T00:00:00Z\",\n                },\n            },\n            1,\n        ),\n        (\n            {\n                \"query\": \"some random 
query\",\n                \"filter\": {\"start_date\": \"2009-01-01T00:00:00Z\"},\n                \"top_k\": 3,\n            },\n            2,\n        ),\n        (\n            {\n                \"query\": \"another random query\",\n                \"filter\": {\"end_date\": \"1929-12-31T00:00:00Z\"},\n                \"top_k\": 3,\n            },\n            1,\n        ),\n    ],\n)\ndef test_query(test_db, query, expected_num_results):\n    queries = {\"queries\": [query]}\n\n    response = client.post(\"/query\", json=queries)\n    assert response.status_code == 200\n\n    num_docs = response.json()[\"results\"][0][\"results\"]\n    assert len(num_docs) == expected_num_results\n\n\ndef test_delete(test_db, weaviate_client, caplog):\n    caplog.set_level(logging.DEBUG)\n\n    delete_request = {\"ids\": [\"def_456\"]}\n\n    response = client.request(method=\"delete\", url=\"/delete\", json=delete_request)\n    assert response.status_code == 200\n    assert response.json()[\"success\"]\n    assert weaviate_client.data_object.get()[\"totalResults\"] == 4\n\n    client.request(method=\"delete\", url=\"/delete\", json=delete_request)\n    assert \"Failed to delete\" in caplog.text\n    caplog.clear()\n\n    delete_request = {\"filter\": {\"source\": \"email\"}}\n\n    response = client.request(method=\"delete\", url=\"/delete\", json=delete_request)\n    assert response.status_code == 200\n    assert response.json()[\"success\"]\n    assert weaviate_client.data_object.get()[\"totalResults\"] == 3\n\n    client.request(method=\"delete\", url=\"/delete\", json=delete_request)\n    assert \"Failed to delete\" in caplog.text\n\n    delete_request = {\"delete_all\": True}\n\n    response = client.request(method=\"delete\", url=\"/delete\", json=delete_request)\n    assert response.status_code == 200\n    assert response.json()[\"success\"]\n    assert not weaviate_client.data_object.get()[\"objects\"]\n\n\ndef test_build_auth_credentials(monkeypatch):\n    # Test 
when WEAVIATE_URL ends with weaviate.network and WEAVIATE_API_KEY is set\n    with monkeypatch.context() as m:\n        m.setenv(\"WEAVIATE_URL\", \"https://example.weaviate.network\")\n        m.setenv(\"WEAVIATE_API_KEY\", \"your_api_key\")\n        auth_credentials = WeaviateDataStore._build_auth_credentials()\n        assert auth_credentials is not None\n        assert isinstance(auth_credentials, weaviate.auth.AuthApiKey)\n        assert auth_credentials.api_key == \"your_api_key\"\n\n    # Test when WEAVIATE_URL ends with weaviate.network and WEAVIATE_API_KEY is not set\n    with monkeypatch.context() as m:\n        m.setenv(\"WEAVIATE_URL\", \"https://example.weaviate.network\")\n        m.delenv(\"WEAVIATE_API_KEY\", raising=False)\n        with pytest.raises(\n            ValueError, match=\"WEAVIATE_API_KEY environment variable is not set\"\n        ):\n            WeaviateDataStore._build_auth_credentials()\n\n    # Test when WEAVIATE_URL does not end with weaviate.network\n    with monkeypatch.context() as m:\n        m.setenv(\"WEAVIATE_URL\", \"https://example.notweaviate.network\")\n        m.setenv(\"WEAVIATE_API_KEY\", \"your_api_key\")\n        auth_credentials = WeaviateDataStore._build_auth_credentials()\n        assert auth_credentials is None\n\n    # Test when WEAVIATE_URL is not set\n    with monkeypatch.context() as m:\n        m.delenv(\"WEAVIATE_URL\", raising=False)\n        m.setenv(\"WEAVIATE_API_KEY\", \"your_api_key\")\n        auth_credentials = WeaviateDataStore._build_auth_credentials()\n        assert auth_credentials is None\n\n\ndef test_extract_schema_properties():\n    class_schema = {\n        \"class\": \"Question\",\n        \"description\": \"Information from a Jeopardy! 
question\",\n        \"properties\": [\n            {\n                \"dataType\": [\"text\"],\n                \"description\": \"The question\",\n                \"name\": \"question\",\n            },\n            {\n                \"dataType\": [\"text\"],\n                \"description\": \"The answer\",\n                \"name\": \"answer\",\n            },\n            {\n                \"dataType\": [\"text\"],\n                \"description\": \"The category\",\n                \"name\": \"category\",\n            },\n        ],\n        \"vectorizer\": \"text2vec-openai\",\n    }\n    results = extract_schema_properties(class_schema)\n    assert results == {\"question\", \"answer\", \"category\"}\n\n\ndef test_reuse_schema(weaviate_client, caplog):\n    caplog.set_level(logging.DEBUG)\n\n    weaviate_client.schema.delete_all()\n\n    WeaviateDataStore()\n    assert \"Creating index\" in caplog.text\n\n    WeaviateDataStore()\n    assert \"Will reuse this schema\" in caplog.text\n\n\ndef test_build_date_filters():\n    filter = DocumentMetadataFilter(\n        document_id=None,\n        source=None,\n        source_id=None,\n        author=None,\n        start_date=\"2020-01-01T00:00:00Z\",\n        end_date=\"2022-12-31T00:00:00Z\",\n    )\n    actual_result = WeaviateDataStore.build_filters(filter)\n    expected_result = {\n        \"operator\": \"And\",\n        \"operands\": [\n            {\n                \"path\": [\"created_at\"],\n                \"operator\": \"GreaterThanEqual\",\n                \"valueDate\": \"2020-01-01T00:00:00Z\",\n            },\n            {\n                \"path\": [\"created_at\"],\n                \"operator\": \"LessThanEqual\",\n                \"valueDate\": \"2022-12-31T00:00:00Z\",\n            },\n        ],\n    }\n\n    assert actual_result == expected_result\n\n\n@pytest.mark.parametrize(\n    \"test_input, expected_result\",\n    [\n        (\"abc_123\", False),\n        
(\"b2e4133c-c956-5684-bbf5-584e50ec3647\", True),  # version 5\n        (\"f6179953-11d8-4ee0-9af8-e51e00dbf727\", True),  # version 4\n        (\"16fe8165-3c08-348f-a015-a8bb31e26b5c\", True),  # version 3\n        (\"bda85f97-be72-11ed-9291-00000000000a\", False),  # version 1\n    ],\n)\ndef test_is_valid_weaviate_id(test_input, expected_result):\n    actual_result = WeaviateDataStore._is_valid_weaviate_id(test_input)\n    assert actual_result == expected_result\n\n\ndef test_upsert_same_docid(test_db, weaviate_client):\n    def get_doc_by_document_id(document_id):\n        properties = [\n            \"chunk_id\",\n            \"document_id\",\n            \"source\",\n            \"source_id\",\n            \"url\",\n            \"created_at\",\n            \"author\",\n        ]\n        where_filter = {\n            \"path\": [\"document_id\"],\n            \"operator\": \"Equal\",\n            \"valueString\": document_id,\n        }\n\n        results = (\n            weaviate_client.query.get(\"OpenAIDocument\", properties)\n            .with_additional(\"id\")\n            .with_where(where_filter)\n            .with_sort({\"path\": [\"chunk_id\"], \"order\": \"asc\"})\n            .do()\n        )\n\n        return results[\"data\"][\"Get\"][\"OpenAIDocument\"]\n\n    def build_upsert_payload(document):\n        return {\"documents\": [document]}\n\n    # upsert a new document\n    # this is a document that has 2 chunks and\n    # the source is email\n    doc_id = \"abc_123\"\n    text = \"\"\"\n    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce in ipsum eget dolor malesuada fermentum at ac massa. \n    Aliquam erat volutpat. Sed eu velit est. Morbi semper quam id urna fringilla lacinia. Vivamus sit amet velit id lorem \n    pretium molestie. Nulla tincidunt sapien eu nulla consequat, a lacinia justo facilisis. Maecenas euismod urna sapien, \n    sit amet tincidunt est dapibus ac. Sed in lorem in nunc tincidunt bibendum. 
Nullam vel urna vitae nulla iaculis rutrum. \n    Suspendisse varius, massa a dignissim vehicula, urna ligula tincidunt orci, id fringilla velit tellus eu metus. Sed \n    vestibulum, nisl in malesuada tempor, nisi turpis facilisis nibh, nec dictum velit velit vel ex. Donec euismod, \n    leo ut sollicitudin tempor, dolor augue blandit nunc, eu lacinia ipsum turpis vitae nulla. Aenean bibendum \n    tincidunt magna in pulvinar. Sed tincidunt vel nisi ac maximus.\n    \"\"\"\n\n    document = {\n        \"id\": doc_id,\n        \"text\": text,\n        \"metadata\": {\"source\": Source.email},\n    }\n\n    response = client.post(\"/upsert\", json=build_upsert_payload(document))\n    assert response.status_code == 200\n\n    weaviate_doc = get_doc_by_document_id(doc_id)\n    assert len(weaviate_doc) == 2\n    for chunk in weaviate_doc:\n        assert chunk[\"source\"] == Source.email\n\n    # now update the source to file\n    # user still has to specify the text\n    # because test is a required field\n    document[\"metadata\"][\"source\"] = Source.file\n    response = client.post(\"/upsert\", json=build_upsert_payload(document))\n    assert response.status_code == 200\n\n    weaviate_doc = get_doc_by_document_id(doc_id)\n    assert len(weaviate_doc) == 2\n    for chunk in weaviate_doc:\n        assert chunk[\"source\"] == \"file\"\n\n    # now update the text so that it is only 1 chunk\n    # user does not need to specify metadata\n    # since it is optional\n    document[\"text\"] = \"This is a short text\"\n    document.pop(\"metadata\")\n\n    response = client.post(\"/upsert\", json=build_upsert_payload(document))\n    assert response.status_code == 200\n    weaviate_doc = get_doc_by_document_id(doc_id)\n    assert len(weaviate_doc) == 1\n\n    # TODO: Implement update function\n    # but the source should still be file\n    # but it is None right now because an\n    # update function is out of scope\n    assert weaviate_doc[0][\"source\"] is 
None\n\n\n@pytest.mark.parametrize(\n    \"url, expected_result\",\n    [\n        (\"https://example.weaviate.network\", True),\n        (\"https://example.weaviate.network/\", True),\n        (\"https://example.weaviate.cloud\", True),\n        (\"https://example.weaviate.cloud/\", True),\n        (\"https://example.notweaviate.network\", False),\n        (\"https://weaviate.network.example.com\", False),\n        (\"https://example.weaviate.network/somepage\", False),\n        (\"\", False),\n    ],\n)\ndef test_is_wcs_domain(url, expected_result):\n    assert WeaviateDataStore._is_wcs_domain(url) == expected_result\n"
  },
  {
    "path": "tests/datastore/providers/zilliz/test_zilliz_datastore.py",
    "content": "# from pathlib import Path\n# from dotenv import find_dotenv, load_dotenv\n# env_path = Path(\".\") / \"zilliz.env\"\n# load_dotenv(dotenv_path=env_path, verbose=True)\n\nimport pytest\n\nfrom datastore.providers.zilliz_datastore import (\n    ZillizDataStore,\n)\n\nfrom datastore.providers.milvus_datastore import (\n    EMBEDDING_FIELD,\n)\n\n# Note: Only a basic test is done here, since ZillizDataStore is derived from MilvusDataStore.\n\n\n@pytest.fixture\ndef zilliz_datastore():\n    return ZillizDataStore()\n\n\n@pytest.mark.asyncio\nasync def test_zilliz(zilliz_datastore):\n    assert True == zilliz_datastore.col.has_index()\n    index_list = [x.to_dict() for x in zilliz_datastore.col.indexes]\n    for index in index_list:\n        if index[\"index_name\"] == EMBEDDING_FIELD:\n            assert \"AUTOINDEX\" == index[\"index_param\"][\"index_type\"]\n"
  }
]