Showing preview only (692K chars total). Download the full file or copy to clipboard to get everything.
Repository: openai/chatgpt-retrieval-plugin
Branch: main
Commit: b28ddce58474
Files: 116
Total size: 654.3 KB
Directory structure:
gitextract_txp1w2j2/
├── .dockerignore
├── .env.example
├── .github/
│ └── pull_request_template.md
├── .gitignore
├── .well-known/
│ ├── ai-plugin.json
│ └── openapi.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── datastore/
│ ├── __init__.py
│ ├── datastore.py
│ ├── factory.py
│ └── providers/
│ ├── __init__.py
│ ├── analyticdb_datastore.py
│ ├── azurecosmosdb_datastore.py
│ ├── azuresearch_datastore.py
│ ├── chroma_datastore.py
│ ├── elasticsearch_datastore.py
│ ├── llama_datastore.py
│ ├── milvus_datastore.py
│ ├── mongodb_atlas_datastore.py
│ ├── pgvector_datastore.py
│ ├── pinecone_datastore.py
│ ├── postgres_datastore.py
│ ├── qdrant_datastore.py
│ ├── redis_datastore.py
│ ├── supabase_datastore.py
│ ├── weaviate_datastore.py
│ └── zilliz_datastore.py
├── docs/
│ ├── deployment/
│ │ ├── flyio.md
│ │ ├── heroku.md
│ │ ├── other-options.md
│ │ ├── removing-unused-dependencies.md
│ │ └── render.md
│ ├── deprecated/
│ │ └── plugins.md
│ └── providers/
│ ├── analyticdb/
│ │ └── setup.md
│ ├── azurecosmosdb/
│ │ └── setup.md
│ ├── azuresearch/
│ │ └── setup.md
│ ├── chroma/
│ │ └── setup.md
│ ├── elasticsearch/
│ │ └── setup.md
│ ├── llama/
│ │ └── setup.md
│ ├── milvus/
│ │ └── setup.md
│ ├── mongodb/
│ │ └── setup.md
│ ├── pinecone/
│ │ └── setup.md
│ ├── postgres/
│ │ └── setup.md
│ ├── qdrant/
│ │ └── setup.md
│ ├── redis/
│ │ └── setup.md
│ ├── supabase/
│ │ └── setup.md
│ ├── weaviate/
│ │ └── setup.md
│ └── zilliz/
│ └── setup.md
├── examples/
│ ├── authentication-methods/
│ │ ├── no-auth/
│ │ │ ├── ai-plugin.json
│ │ │ └── main.py
│ │ ├── oauth/
│ │ │ └── ai-plugin.json
│ │ ├── service-http/
│ │ │ └── ai-plugin.json
│ │ └── user-http/
│ │ └── ai-plugin.json
│ ├── docker/
│ │ ├── elasticsearch/
│ │ │ ├── README.md
│ │ │ └── docker-compose.yaml
│ │ ├── milvus/
│ │ │ └── docker-compose.yaml
│ │ ├── qdrant/
│ │ │ ├── README.md
│ │ │ ├── docker-compose.yaml
│ │ │ ├── documents.json
│ │ │ └── queries.json
│ │ └── redis/
│ │ └── docker-compose.yml
│ ├── function-calling/
│ │ └── README.md
│ ├── memory/
│ │ ├── README.md
│ │ ├── ai-plugin.json
│ │ ├── main.py
│ │ └── openapi.yaml
│ └── providers/
│ ├── azurecosmosdb/
│ │ └── semantic-search.ipynb
│ ├── elasticsearch/
│ │ └── search.ipynb
│ ├── mongodb/
│ │ └── semantic-search.ipynb
│ ├── pinecone/
│ │ └── semantic-search.ipynb
│ ├── redis/
│ │ └── semantic-search-and-filter.ipynb
│ └── supabase/
│ ├── .gitignore
│ ├── config.toml
│ ├── migrations/
│ │ └── 20230414142107_init_pg_vector.sql
│ └── seed.sql
├── local_server/
│ ├── ai-plugin.json
│ ├── main.py
│ └── openapi.yaml
├── models/
│ ├── api.py
│ └── models.py
├── pyproject.toml
├── scripts/
│ ├── process_json/
│ │ ├── README.md
│ │ ├── example.json
│ │ └── process_json.py
│ ├── process_jsonl/
│ │ ├── README.md
│ │ ├── example.jsonl
│ │ └── process_jsonl.py
│ └── process_zip/
│ ├── README.md
│ └── process_zip.py
├── server/
│ └── main.py
├── services/
│ ├── chunks.py
│ ├── date.py
│ ├── extract_metadata.py
│ ├── file.py
│ ├── openai.py
│ └── pii_detection.py
└── tests/
├── __init__.py
└── datastore/
└── providers/
├── analyticdb/
│ └── test_analyticdb_datastore.py
├── azurecosmosdb/
│ └── test_azurecosmosdb_datastore.py
├── azuresearch/
│ └── test_azuresearch_datastore.py
├── chroma/
│ └── test_chroma_datastore.py
├── elasticsearch/
│ └── test_elasticsearch_datastore.py
├── llama/
│ └── test_llama_datastore.py
├── milvus/
│ └── test_milvus_datastore.py
├── mongodb_atlas/
│ ├── test_integration.py
│ └── test_mongodb_datastore.py
├── postgres/
│ └── test_postgres_datastore.py
├── qdrant/
│ └── test_qdrant_datastore.py
├── redis/
│ └── test_redis_datastore.py
├── supabase/
│ └── test_supabase_datastore.py
├── weaviate/
│ ├── docker-compose.yml
│ └── test_weaviate_datastore.py
└── zilliz/
└── test_zilliz_datastore.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Ignore files that are already ignored by git
.gitignore
scripts/
tests/
examples/
local_server/
assets/
*.md
*.pyc
.dockerignore
Dockerfile
================================================
FILE: .env.example
================================================
# Core environment variables
DATASTORE="<your_datastore>"
BEARER_TOKEN="<your_bearer_token>"
OPENAI_API_KEY="<your_openai_api_key>"
EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use
EMBEDDING_MODEL="text-embedding-3-large" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002
# Optional environment variables for Azure OpenAI
OPENAI_API_BASE="https://<AzureOpenAIName>.openai.azure.com/"
OPENAI_API_TYPE="azure"
OPENAI_EMBEDDINGMODEL_DEPLOYMENTID="<Name of embedding model deployment>"
OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID="<Name of deployment of model for metadata>"
OPENAI_COMPLETIONMODEL_DEPLOYMENTID="<Name of general model deployment used for completion>"
OPENAI_EMBEDDING_BATCH_SIZE="<Batch size of embedding; for Azure OpenAI, this value needs to be set to 1>"
# Pinecone configuration
PINECONE_API_KEY="<your_pinecone_api_key>"
PINECONE_ENVIRONMENT="<your_pinecone_environment>"
PINECONE_INDEX="<your_pinecone_index>"
# Weaviate configuration
WEAVIATE_URL="<your_weaviate_instance_url>"
WEAVIATE_API_KEY="<your_api_key_for_WCS>"
WEAVIATE_CLASS="<your_optional_weaviate_class>"
# Zilliz configuration
ZILLIZ_COLLECTION="<your_zilliz_collection>"
ZILLIZ_URI="<your_zilliz_uri>"
ZILLIZ_USER="<your_zilliz_username>"
ZILLIZ_PASSWORD="<your_zilliz_password>"
# Milvus configuration
MILVUS_COLLECTION="<your_milvus_collection>"
MILVUS_HOST="<your_milvus_host>"
MILVUS_PORT="<your_milvus_port>"
MILVUS_USER="<your_milvus_username>"
MILVUS_PASSWORD="<your_milvus_password>"
# Qdrant configuration
QDRANT_URL="<your_qdrant_url>"
QDRANT_PORT="<your_qdrant_port>"
QDRANT_GRPC_PORT="<your_qdrant_grpc_port>"
QDRANT_API_KEY="<your_qdrant_api_key>"
QDRANT_COLLECTION="<your_qdrant_collection>"
# AnalyticDB configuration
PG_HOST="<your_analyticdb_host>"
PG_PORT="<your_analyticdb_port>"
PG_USER="<your_analyticdb_username>"
PG_PASSWORD="<your_analyticdb_password>"
PG_DATABASE="<your_analyticdb_database>"
PG_COLLECTION="<your_analyticdb_collection>"
# Redis configuration
REDIS_HOST="<your_redis_host>"
REDIS_PORT="<your_redis_port>"
REDIS_PASSWORD="<your_redis_password>"
REDIS_INDEX_NAME="<your_redis_index_name>"
REDIS_DOC_PREFIX="<your_redis_doc_prefix>"
REDIS_DISTANCE_METRIC="<your_redis_distance_metric>"
REDIS_INDEX_TYPE="<your_redis_index_type>"
# Llama configuration
LLAMA_INDEX_TYPE="<gpt_vector_index_type>"
LLAMA_INDEX_JSON_PATH="<path_to_saved_index_json_file>"
LLAMA_QUERY_KWARGS_JSON_PATH="<path_to_saved_query_kwargs_json_file>"
LLAMA_RESPONSE_MODE="<response_mode_for_query>"
# Chroma configuration
CHROMA_COLLECTION="<your_chroma_collection>"
CHROMA_IN_MEMORY="<true_or_false>"
CHROMA_PERSISTENCE_DIR="<your_chroma_persistence_directory>"
CHROMA_HOST="<your_chroma_host>"
CHROMA_PORT="<your_chroma_port>"
# Azure Cognitive Search configuration
AZURESEARCH_SERVICE="<your_search_service_name>"
AZURESEARCH_INDEX="<your_search_index_name>"
AZURESEARCH_API_KEY="<your_api_key>" # (optional, uses key-free managed identity if not set)
# Azure CosmosDB Mongo vCore configuration
AZCOSMOS_API="<your azure cosmos db api, for now it only supports mongo>"
AZCOSMOS_CONNSTR="<your azure cosmos db mongo vcore connection string>"
AZCOSMOS_DATABASE_NAME="<your mongo database name>"
AZCOSMOS_CONTAINER_NAME="<your mongo container name>"
# Supabase configuration
SUPABASE_URL="<supabase_project_url>"
SUPABASE_ANON_KEY="<supabase_project_api_anon_key>"
# Postgres configuration
PG_HOST="<postgres_host>"
PG_PORT="<postgres_port>"
PG_USER="<postgres_user>"
PG_PASSWORD="<postgres_password>"
PG_DB="<postgres_database>"
# Elasticsearch configuration
ELASTICSEARCH_URL="<elasticsearch_host_and_port>" # (either specify host or cloud_id)
ELASTICSEARCH_CLOUD_ID="<elasticsearch_cloud_id>"
ELASTICSEARCH_USERNAME="<elasticsearch_username>"
ELASTICSEARCH_PASSWORD="<elasticsearch_password>"
ELASTICSEARCH_API_KEY="<elasticsearch_api_key>"
ELASTICSEARCH_INDEX="<elasticsearch_index_name>"
ELASTICSEARCH_REPLICAS="<elasticsearch_replicas>"
ELASTICSEARCH_SHARDS="<elasticsearch_shards>"
================================================
FILE: .github/pull_request_template.md
================================================
## Pull Request (PR) Checklist
If you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing!
1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`.
2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made.
3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456).
4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch.
5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed.
6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit.
7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR.
8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made.
9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository.
10. **Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. This will help reviewers to better understand your contribution and expedite the review process.
## Pull Request Naming Convention
Use the following naming convention for your PR branches:
```
<type>/<short-description>-<issue-number>
```
- `<type>`: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are ok and should appear as <type>, <type2>
- `<short-description>`: A brief description of the changes made, using hyphens to separate words.
- `<issue-number>`: The issue number associated with the changes made (if applicable).
Example:
```
feature/advanced-chunking-strategy-123
```
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# .vscode files
.vscode/*
# Pycharm
.idea/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
myvenv/
# Exception for .env.example
!.env.example
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# macOS .DS_Store files
.DS_Store
================================================
FILE: .well-known/ai-plugin.json
================================================
{
"schema_version": "v1",
"name_for_model": "retrieval",
"name_for_human": "Retrieval Plugin",
"description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.",
"description_for_human": "Search through your documents.",
"auth": {
"type": "user_http",
"authorization_type": "bearer"
},
"api": {
"type": "openapi",
"url": "https://your-app-url.com/.well-known/openapi.yaml",
"has_user_authentication": false
},
"logo_url": "https://your-app-url.com/.well-known/logo.png",
"contact_email": "hello@contact.com",
"legal_info_url": "http://example.com/legal-info"
}
================================================
FILE: .well-known/openapi.yaml
================================================
openapi: 3.0.2
info:
title: Retrieval Plugin API
description: A retrieval API for querying and filtering documents based on natural language queries and metadata
version: 1.0.0
servers:
- url: https://your-app-url.com
paths:
/query:
post:
summary: Query
description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs.
operationId: query_query_post
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/QueryRequest"
required: true
responses:
"200":
description: Successful Response
content:
application/json:
schema:
$ref: "#/components/schemas/QueryResponse"
"422":
description: Validation Error
content:
application/json:
schema:
$ref: "#/components/schemas/HTTPValidationError"
security:
- HTTPBearer: []
components:
schemas:
DocumentChunkMetadata:
title: DocumentChunkMetadata
type: object
properties:
source:
$ref: "#/components/schemas/Source"
source_id:
title: Source Id
type: string
url:
title: Url
type: string
created_at:
title: Created At
type: string
author:
title: Author
type: string
document_id:
title: Document Id
type: string
DocumentChunkWithScore:
title: DocumentChunkWithScore
required:
- text
- metadata
- score
type: object
properties:
id:
title: Id
type: string
text:
title: Text
type: string
metadata:
$ref: "#/components/schemas/DocumentChunkMetadata"
embedding:
title: Embedding
type: array
items:
type: number
score:
title: Score
type: number
DocumentMetadataFilter:
title: DocumentMetadataFilter
type: object
properties:
document_id:
title: Document Id
type: string
source:
$ref: "#/components/schemas/Source"
source_id:
title: Source Id
type: string
author:
title: Author
type: string
start_date:
title: Start Date
type: string
end_date:
title: End Date
type: string
HTTPValidationError:
title: HTTPValidationError
type: object
properties:
detail:
title: Detail
type: array
items:
$ref: "#/components/schemas/ValidationError"
Query:
title: Query
required:
- query
type: object
properties:
query:
title: Query
type: string
filter:
$ref: "#/components/schemas/DocumentMetadataFilter"
top_k:
title: Top K
type: integer
default: 3
QueryRequest:
title: QueryRequest
required:
- queries
type: object
properties:
queries:
title: Queries
type: array
items:
$ref: "#/components/schemas/Query"
QueryResponse:
title: QueryResponse
required:
- results
type: object
properties:
results:
title: Results
type: array
items:
$ref: "#/components/schemas/QueryResult"
QueryResult:
title: QueryResult
required:
- query
- results
type: object
properties:
query:
title: Query
type: string
results:
title: Results
type: array
items:
$ref: "#/components/schemas/DocumentChunkWithScore"
Source:
title: Source
enum:
- email
- file
- chat
type: string
description: An enumeration.
ValidationError:
title: ValidationError
required:
- loc
- msg
- type
type: object
properties:
loc:
title: Location
type: array
items:
anyOf:
- type: string
- type: integer
msg:
title: Message
type: string
type:
title: Error Type
type: string
securitySchemes:
HTTPBearer:
type: http
scheme: bearer
================================================
FILE: Dockerfile
================================================
FROM python:3.10 as requirements-stage
WORKDIR /tmp
RUN pip install poetry
COPY ./pyproject.toml ./poetry.lock* /tmp/
RUN poetry export -f requirements.txt --output requirements.txt --without-hashes
FROM python:3.10
WORKDIR /code
COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY . /code/
# Heroku uses PORT, Azure App Services uses WEBSITES_PORT, Fly.io uses 8080 by default
CMD ["sh", "-c", "uvicorn server.main:app --host 0.0.0.0 --port ${PORT:-${WEBSITES_PORT:-8080}}"]
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2023 OpenAI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
# Heroku
# make heroku-login
# make heroku-push
HEROKU_APP = <your app name>
heroku-push:
docker buildx build --platform linux/amd64 -t ${HEROKU_APP} .
docker tag ${HEROKU_APP} registry.heroku.com/${HEROKU_APP}/web
docker push registry.heroku.com/${HEROKU_APP}/web
heroku container:release web -a ${HEROKU_APP}
heroku-login:
heroku container:login
================================================
FILE: README.md
================================================
# ChatGPT Retrieval Plugin
Build Custom GPTs with a Retrieval Plugin backend to give ChatGPT access to personal documents.

## Introduction
The ChatGPT Retrieval Plugin repository provides a flexible solution for semantic search and retrieval of personal or organizational documents using natural language queries. It is a standalone retrieval backend, and can be used with [ChatGPT custom GPTs](https://chat.openai.com/gpts/discovery), [function calling](https://platform.openai.com/docs/guides/function-calling) with the [chat completions](https://platform.openai.com/docs/guides/text-generation) or [assistants APIs](https://platform.openai.com/docs/assistants/overview), or with the [ChatGPT plugins model (deprecated)](https://chat.openai.com/?model=gpt-4-plugins). ChatGPT and the Assistants API both natively support retrieval from uploaded files, so you should use the Retrieval Plugin as a backend only if you want more granular control of your retrieval system (e.g. document text chunk length, embedding model / size, etc.).
The repository is organized into several directories:
| Directory | Description |
| ------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
| [`datastore`](/datastore) | Contains the core logic for storing and querying document embeddings using various vector database providers. |
| [`docs`](/docs) | Includes documentation for setting up and using each vector database provider, webhooks, and removing unused dependencies. |
| [`examples`](/examples) | Provides example configurations, authentication methods, and provider-specific examples. |
| [`local_server`](/local_server) | Contains an implementation of the Retrieval Plugin configured for localhost testing. |
| [`models`](/models) | Contains the data models used by the plugin, such as document and metadata models. |
| [`scripts`](/scripts) | Offers scripts for processing and uploading documents from different data sources. |
| [`server`](/server) | Houses the main FastAPI server implementation. |
| [`services`](/services) | Contains utility services for tasks like chunking, metadata extraction, and PII detection. |
| [`tests`](/tests) | Includes integration tests for various vector database providers. |
| [`.well-known`](/.well-known) | Stores the plugin manifest file and OpenAPI schema, which define the plugin configuration and API specification. |
This README provides detailed information on how to set up, develop, and deploy the ChatGPT Retrieval Plugin (stand-alone retrieval backend).
## Table of Contents
- [Quickstart](#quickstart)
- [About](#about)
- [Retrieval Plugin](#retrieval-plugin)
- [Retrieval Plugin with custom GPTs](#retrieval-plugin-with-custom-gpts)
- [Retrieval Plugin with function calling](#retrieval-plugin-with-function-calling)
- [Retrieval Plugin with the plugins model (deprecated)](#chatgpt-plugins-model)
- [API Endpoints](#api-endpoints)
- [Memory Feature](#memory-feature)
- [Security](#security)
- [Choosing an Embeddings Model](#choosing-an-embeddings-model)
- [Development](#development)
- [Setup](#setup)
- [General Environment Variables](#general-environment-variables)
- [Choosing a Vector Database](#choosing-a-vector-database)
- [Pinecone](#pinecone)
- [Elasticsearch](#elasticsearch)
- [MongoDB Atlas](#mongodb-atlas)
- [Weaviate](#weaviate)
- [Zilliz](#zilliz)
- [Milvus](#milvus)
- [Qdrant](#qdrant)
- [Redis](#redis)
- [Llama Index](#llamaindex)
- [Chroma](#chroma)
- [Azure Cognitive Search](#azure-cognitive-search)
- [Azure CosmosDB Mongo vCore](#azure-cosmosdb-mongo-vcore)
- [Supabase](#supabase)
- [Postgres](#postgres)
- [AnalyticDB](#analyticdb)
- [Running the API Locally](#running-the-api-locally)
- [Personalization](#personalization)
- [Authentication Methods](#authentication-methods)
- [Deployment](#deployment)
- [Webhooks](#webhooks)
- [Scripts](#scripts)
- [Limitations](#limitations)
- [Contributors](#contributors)
- [Future Directions](#future-directions)
## Quickstart
Follow these steps to quickly set up and run the ChatGPT Retrieval Plugin:
1. Install Python 3.10, if not already installed.
2. Clone the repository: `git clone https://github.com/openai/chatgpt-retrieval-plugin.git`
3. Navigate to the cloned repository directory: `cd /path/to/chatgpt-retrieval-plugin`
4. Install poetry: `pip install poetry`
5. Create a new virtual environment with Python 3.10: `poetry env use python3.10`
6. Activate the virtual environment: `poetry shell`
7. Install app dependencies: `poetry install`
8. Create a [bearer token](#general-environment-variables)
9. Set the required environment variables:
```
export DATASTORE=<your_datastore>
export BEARER_TOKEN=<your_bearer_token>
export OPENAI_API_KEY=<your_openai_api_key>
export EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use
export EMBEDDING_MODEL=text-embedding-3-large # edit this based on your model preference, e.g. text-embedding-3-small, text-embedding-ada-002
# Optional environment variables used when running Azure OpenAI
export OPENAI_API_BASE=https://<AzureOpenAIName>.openai.azure.com/
export OPENAI_API_TYPE=azure
export OPENAI_EMBEDDINGMODEL_DEPLOYMENTID=<Name of embedding model deployment>
export OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID=<Name of deployment of model for metadata>
export OPENAI_COMPLETIONMODEL_DEPLOYMENTID=<Name of general model deployment used for completion>
export OPENAI_EMBEDDING_BATCH_SIZE=<Batch size of embedding; for Azure OpenAI, this value needs to be set to 1>
# Add the environment variables for your chosen vector DB.
# Some of these are optional; read the provider's setup docs in /docs/providers for more information.
# Pinecone
export PINECONE_API_KEY=<your_pinecone_api_key>
export PINECONE_ENVIRONMENT=<your_pinecone_environment>
export PINECONE_INDEX=<your_pinecone_index>
# Weaviate
export WEAVIATE_URL=<your_weaviate_instance_url>
export WEAVIATE_API_KEY=<your_api_key_for_WCS>
export WEAVIATE_CLASS=<your_optional_weaviate_class>
# Zilliz
export ZILLIZ_COLLECTION=<your_zilliz_collection>
export ZILLIZ_URI=<your_zilliz_uri>
export ZILLIZ_USER=<your_zilliz_username>
export ZILLIZ_PASSWORD=<your_zilliz_password>
# Milvus
export MILVUS_COLLECTION=<your_milvus_collection>
export MILVUS_HOST=<your_milvus_host>
export MILVUS_PORT=<your_milvus_port>
export MILVUS_USER=<your_milvus_username>
export MILVUS_PASSWORD=<your_milvus_password>
# Qdrant
export QDRANT_URL=<your_qdrant_url>
export QDRANT_PORT=<your_qdrant_port>
export QDRANT_GRPC_PORT=<your_qdrant_grpc_port>
export QDRANT_API_KEY=<your_qdrant_api_key>
export QDRANT_COLLECTION=<your_qdrant_collection>
# AnalyticDB
export PG_HOST=<your_analyticdb_host>
export PG_PORT=<your_analyticdb_port>
export PG_USER=<your_analyticdb_username>
export PG_PASSWORD=<your_analyticdb_password>
export PG_DATABASE=<your_analyticdb_database>
export PG_COLLECTION=<your_analyticdb_collection>
# Redis
export REDIS_HOST=<your_redis_host>
export REDIS_PORT=<your_redis_port>
export REDIS_PASSWORD=<your_redis_password>
export REDIS_INDEX_NAME=<your_redis_index_name>
export REDIS_DOC_PREFIX=<your_redis_doc_prefix>
export REDIS_DISTANCE_METRIC=<your_redis_distance_metric>
export REDIS_INDEX_TYPE=<your_redis_index_type>
# Llama
export LLAMA_INDEX_TYPE=<gpt_vector_index_type>
export LLAMA_INDEX_JSON_PATH=<path_to_saved_index_json_file>
export LLAMA_QUERY_KWARGS_JSON_PATH=<path_to_saved_query_kwargs_json_file>
export LLAMA_RESPONSE_MODE=<response_mode_for_query>
# Chroma
export CHROMA_COLLECTION=<your_chroma_collection>
export CHROMA_IN_MEMORY=<true_or_false>
export CHROMA_PERSISTENCE_DIR=<your_chroma_persistence_directory>
export CHROMA_HOST=<your_chroma_host>
export CHROMA_PORT=<your_chroma_port>
# Azure Cognitive Search
export AZURESEARCH_SERVICE=<your_search_service_name>
export AZURESEARCH_INDEX=<your_search_index_name>
export AZURESEARCH_API_KEY=<your_api_key> # (optional, uses key-free managed identity if not set)
# Azure CosmosDB Mongo vCore
export AZCOSMOS_API=<your azure cosmos db api, for now it only supports mongo>
export AZCOSMOS_CONNSTR=<your azure cosmos db mongo vcore connection string>
export AZCOSMOS_DATABASE_NAME=<your mongo database name>
export AZCOSMOS_CONTAINER_NAME=<your mongo container name>
# Supabase
export SUPABASE_URL=<supabase_project_url>
export SUPABASE_ANON_KEY=<supabase_project_api_anon_key>
# Postgres
export PG_HOST=<postgres_host>
export PG_PORT=<postgres_port>
export PG_USER=<postgres_user>
export PG_PASSWORD=<postgres_password>
export PG_DB=<postgres_database>
# Elasticsearch
export ELASTICSEARCH_URL=<elasticsearch_host_and_port> # (either specify host or cloud_id)
export ELASTICSEARCH_CLOUD_ID=<elasticsearch_cloud_id>
export ELASTICSEARCH_USERNAME=<elasticsearch_username>
export ELASTICSEARCH_PASSWORD=<elasticsearch_password>
export ELASTICSEARCH_API_KEY=<elasticsearch_api_key>
export ELASTICSEARCH_INDEX=<elasticsearch_index_name>
export ELASTICSEARCH_REPLICAS=<elasticsearch_replicas>
export ELASTICSEARCH_SHARDS=<elasticsearch_shards>
# MongoDB Atlas
export MONGODB_URI=<mongodb_uri>
export MONGODB_DATABASE=<mongodb_database>
export MONGODB_COLLECTION=<mongodb_collection>
export MONGODB_INDEX=<mongodb_index>
```
10. Run the API locally: `poetry run start`
11. Access the API documentation at `http://0.0.0.0:8000/docs` and test the API endpoints (make sure to add your bearer token).
## About
### Retrieval Plugin
This is a standalone retrieval backend that can be used with [ChatGPT custom GPTs](https://chat.openai.com/gpts/discovery), [function calling](https://platform.openai.com/docs/guides/function-calling) with the [chat completions](https://platform.openai.com/docs/guides/text-generation) or [assistants APIs](https://platform.openai.com/docs/assistants/overview), or with the [ChatGPT plugins model (deprecated)](https://chat.openai.com/?model=gpt-4-plugins).
It enables a model to carry out semantic search and retrieval of personal or organizational documents, and write answers informed by relevant retrieved context (sometimes referred to as "Retrieval-Augmented Generation" or "RAG"). It allows users to obtain the most relevant document snippets from their data sources, such as files, notes, or emails, by asking questions or expressing needs in natural language. Enterprises can make their internal documents available to their employees through ChatGPT using this plugin.
The plugin uses OpenAI's embeddings model (`text-embedding-3-large` 256 dimension embeddings by default) to generate embeddings of document chunks, and then stores and queries them using a vector database on the backend. As an open-source and self-hosted solution, developers can deploy their own Retrieval Plugin and register it with ChatGPT. The Retrieval Plugin supports several vector database providers, allowing developers to choose their preferred one from a list.
A FastAPI server exposes the plugin's endpoints for upserting, querying, and deleting documents. Users can refine their search results by using metadata filters by source, date, author, or other criteria. The plugin can be hosted on any cloud platform that supports Docker containers, such as Fly.io, Heroku, Render, or Azure Container Apps. To keep the vector database updated with the latest documents, the plugin can process and store documents from various data sources continuously, using incoming webhooks to the upsert and delete endpoints. Tools like [Zapier](https://zapier.com) or [Make](https://www.make.com) can help configure the webhooks based on events or schedules.
### Retrieval Plugin with Custom GPTs
To create a custom GPT that can use your Retrieval Plugin for semantic search and retrieval of your documents, and even store new information back to the database, you first need to have deployed a Retrieval Plugin. For detailed instructions on how to do this, please refer to the [Deployment section](#deployment). Once you have your app URL (e.g., `https://your-app-url.com`), take the following steps:
1. Navigate to the create GPT page at `https://chat.openai.com/gpts/editor`.
2. Follow the standard creation flow to set up your GPT.
3. Navigate to the "Configure" tab. Here, you can manually fill in fields such as name, description, and instructions, or use the smart creator for assistance.
4. Under the "Actions" section, click on "Create new action".
5. Choose an authentication method. The Retrieval Plugin supports None, API key (Basic or Bearer) and OAuth. For more information on these methods, refer to the [Authentication Methods Section](#authentication-methods).
6. Import the OpenAPI schema. You can either:
- Import directly from the OpenAPI schema hosted in your app at `https://your-app-url.com/.well-known/openapi.yaml`.
- Copy and paste the contents of [this file](/.well-known/openapi.yaml) into the Schema input area if you only want to expose the query endpoint to the GPT. Remember to change the URL under the `servers` section of the OpenAPI schema you paste in.
7. Optionally, you might want to add a fetch endpoint. This would involve editing the [`/server/main.py`](/server/main.py) file to add an endpoint and implement this for your chosen vector database. If you make this change, please consider contributing it back to the project by opening a pull request! Adding the fetch endpoint to the OpenAPI schema would allow the model to fetch more content from a document by ID if some text is cut off in the retrieved result. It might also be useful to pass in a string with the text from the retrieved result and an option to return a fixed length of context before and after the retrieved result.
8. If you want the GPT to be able to save information back to the vector database, you can give it access to the Retrieval Plugin's `/upsert` endpoint. To do this, copy the contents of [this file](/examples/memory/openapi.yaml) into the schema area. This allows the GPT to store new information it generates or learns during the conversation. More details on this feature can be found at [Memory Feature](#memory-feature) and [in the docs here](/examples/memory).
Remember: ChatGPT and custom GPTs natively support retrieval from uploaded files, so you should use the Retrieval Plugin as a backend only if you want more granular control of your retrieval system (e.g. self-hosting, embedding chunk length, embedding model / size, etc.).
### Retrieval Plugin with Function Calling
The Retrieval Plugin can be integrated with function calling in both the [Chat Completions API](https://platform.openai.com/docs/guides/function-calling) and the [Assistants API](https://platform.openai.com/docs/assistants/overview). This allows the model to decide when to use your functions (query, fetch, upsert) based on the conversation context.
#### Function Calling with Chat Completions
In a call to the chat completions API, you can describe functions and have the model generate a JSON object containing arguments to call one or many functions. The latest models (gpt-3.5-turbo-0125 and gpt-4-turbo-preview) have been trained to detect when a function should be called and to respond with JSON that adheres to the function signature.
You can define the functions for the Retrieval Plugin endpoints and pass them in as tools when you use the Chat Completions API with one of the latest models. The model will then intelligently call the functions. You can use function calling to write queries to your APIs, call the endpoint on the backend, and return the response as a tool message to the model to continue the conversation. The function definitions/schemas and an example can be found [here](/examples/function-calling/).
#### Function Calling with Assistants API
You can use the same function definitions with the OpenAI [Assistants API](https://platform.openai.com/docs/assistants/overview), specifically the [function calling in tool use](https://platform.openai.com/docs/assistants/tools/function-calling). The Assistants API allows you to build AI assistants within your own applications, leveraging models, tools, and knowledge to respond to user queries. The function definitions/schemas and an example can be found [here](/examples/function-calling/). The Assistants API natively supports retrieval from uploaded files, so you should use the Retrieval Plugin with function calling only if you want more granular control of your retrieval system (e.g. embedding chunk length, embedding model / size, etc.).
Parallel function calling is supported for both the Chat Completions API and the Assistants API. This means you can perform multiple tasks, such as querying something and saving something back to the vector database, in the same message.
Read more about function calling with the Retrieval Plugin [here](/examples/function-calling/).
### ChatGPT Plugins Model
(deprecated) We recommend using custom actions with GPTs to make use of the Retrieval Plugin through ChatGPT. Instructions for using retrieval with the deprecated plugins model can be found [here](/docs/deprecated/plugins.md).
### API Endpoints
The Retrieval Plugin is built using FastAPI, a web framework for building APIs with Python. FastAPI allows for easy development, validation, and documentation of API endpoints. Find the FastAPI documentation [here](https://fastapi.tiangolo.com/).
One of the benefits of using FastAPI is the automatic generation of interactive API documentation with Swagger UI. When the API is running locally, Swagger UI at `<local_host_url i.e. http://0.0.0.0:8000>/docs` can be used to interact with the API endpoints, test their functionality, and view the expected request and response models.
The plugin exposes the following endpoints for upserting, querying, and deleting documents from the vector database. All requests and responses are in JSON format, and require a valid bearer token as an authorization header.
- `/upsert`: This endpoint allows uploading one or more documents and storing their text and metadata in the vector database. The documents are split into chunks of around 200 tokens, each with a unique ID. The endpoint expects a list of documents in the request body, each with a `text` field, and optional `id` and `metadata` fields. The `metadata` field can contain the following optional subfields: `source`, `source_id`, `url`, `created_at`, and `author`. The endpoint returns a list of the IDs of the inserted documents (an ID is generated if not initially provided).
- `/upsert-file`: This endpoint allows uploading a single file (PDF, TXT, DOCX, PPTX, or MD) and storing its text and metadata in the vector database. The file is converted to plain text and split into chunks of around 200 tokens, each with a unique ID. The endpoint returns a list containing the generated id of the inserted file.
- `/query`: This endpoint allows querying the vector database using one or more natural language queries and optional metadata filters. The endpoint expects a list of queries in the request body, each with a `query` and optional `filter` and `top_k` fields. The `filter` field should contain a subset of the following subfields: `source`, `source_id`, `document_id`, `url`, `created_at`, and `author`. The `top_k` field specifies how many results to return for a given query, and the default value is 3. The endpoint returns a list of objects that each contain a list of the most relevant document chunks for the given query, along with their text, metadata and similarity scores.
- `/delete`: This endpoint allows deleting one or more documents from the vector database using their IDs, a metadata filter, or a delete_all flag. The endpoint expects at least one of the following parameters in the request body: `ids`, `filter`, or `delete_all`. The `ids` parameter should be a list of document IDs to delete; all document chunks for the documents with these IDs will be deleted. The `filter` parameter should contain a subset of the following subfields: `source`, `source_id`, `document_id`, `url`, `created_at`, and `author`. The `delete_all` parameter should be a boolean indicating whether to delete all documents from the vector database. The endpoint returns a boolean indicating whether the deletion was successful.
The detailed specifications and examples of the request and response models can be found by running the app locally and navigating to http://0.0.0.0:8000/openapi.json, or in the OpenAPI schema [here](/.well-known/openapi.yaml). Note that the OpenAPI schema only contains the `/query` endpoint, because that is the only function that ChatGPT needs to access. This way, ChatGPT can use the plugin only to retrieve relevant documents based on natural language queries or needs. However, if developers want to also give ChatGPT the ability to remember things for later, they can use the `/upsert` endpoint to save snippets from the conversation to the vector database. An example of a manifest and OpenAPI schema that gives ChatGPT access to the `/upsert` endpoint can be found [here](/examples/memory).
To include custom metadata fields, edit the `DocumentMetadata` and `DocumentMetadataFilter` data models [here](/models/models.py), and update the OpenAPI schema [here](/.well-known/openapi.yaml). You can update this easily by running the app locally, copying the JSON found at http://0.0.0.0:8000/sub/openapi.json, and converting it to YAML format with [Swagger Editor](https://editor.swagger.io/). Alternatively, you can replace the `openapi.yaml` file with an `openapi.json` file.
### Memory Feature
A notable feature of the Retrieval Plugin is its capacity to provide ChatGPT with memory. By using the plugin's upsert endpoint, ChatGPT can save snippets from the conversation to the vector database for later reference (only when prompted to do so by the user). This functionality contributes to a more context-aware chat experience by allowing ChatGPT to remember and retrieve information from previous conversations. Learn how to configure the Retrieval Plugin with memory [here](/examples/memory).
### Security
The Retrieval Plugin allows ChatGPT to search a vector database of content, and then add the best results into the ChatGPT session. This means it doesn’t have any external effects, and the main risk consideration is data authorization and privacy. Developers should only add content into their Retrieval Plugin that they have authorization for and that they are fine with appearing in users’ ChatGPT sessions. You can choose from a number of different authentication methods to secure the plugin (more information [here](#authentication-methods)).
### Choosing an Embeddings Model
The ChatGPT Retrieval Plugin uses OpenAI's embeddings models to generate embeddings of document chunks. The default model for the Retrieval Plugin is `text-embedding-3-large` with 256 dimensions. OpenAI offers two newer embeddings models, `text-embedding-3-small` and `text-embedding-3-large`, as well as an older model, `text-embedding-ada-002`.
The new models support shortening embeddings without significant loss of retrieval accuracy, allowing you to balance retrieval accuracy, cost, and speed.
Here's a comparison of the models:
| Model | Embedding Size | Average MTEB Score | Cost per 1k Tokens |
| ---------------------- | -------------- | ------------------ | ------------------ |
| text-embedding-3-large | 3072 | 64.6% | $0.00013 |
| text-embedding-3-large | 1024 | 64.1% | $0.00013 |
| text-embedding-3-large | 256 | 62.0% | $0.00013 |
| text-embedding-3-small | 1536 | 62.3% | $0.00002 |
| text-embedding-3-small | 512 | 61.6% | $0.00002 |
| text-embedding-ada-002 | 1536 | 61.0% | $0.0001 |
When choosing a model, consider:
1. **Retrieval Accuracy vs Cost**: `text-embedding-3-large` offers the highest accuracy but at a higher cost. `text-embedding-3-small` is more cost-effective with competitive accuracy. The older `text-embedding-ada-002` model has the lowest accuracy.
2. **Embedding Size**: Larger embeddings provide better accuracy but consume more storage and could be slower to query. You can adjust the size of the embeddings to balance these factors.
For example, if your vector database supports up to 1024 dimensions, you can use `text-embedding-3-large` and set the dimensions API parameter to 1024. This shortens the embedding from 3072 dimensions, trading off some accuracy for lower storage and query costs.
To change your chosen embeddings model and size, edit the following environment variables:
```
EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use
EMBEDDING_MODEL="text-embedding-3-large" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002
```
## Development
### Setup
This app uses Python 3.10, and [poetry](https://python-poetry.org/) for dependency management.
Install Python 3.10 on your machine if it isn't already installed. It can be downloaded from the official [Python website](https://www.python.org/downloads/) or with a package manager like `brew` or `apt`, depending on your system.
Clone the repository from GitHub:
```
git clone https://github.com/openai/chatgpt-retrieval-plugin.git
```
Navigate to the cloned repository directory:
```
cd /path/to/chatgpt-retrieval-plugin
```
Install poetry:
```
pip install poetry
```
Create a new virtual environment that uses Python 3.10:
```
poetry env use python3.10
poetry shell
```
Install app dependencies using poetry:
```
poetry install
```
**Note:** If adding dependencies in the `pyproject.toml`, make sure to run `poetry lock` and `poetry install`.
#### General Environment Variables
The API requires the following environment variables to work:
| Name | Required | Description |
| ---------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `DATASTORE` | Yes | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `elasticsearch`, `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`, `mongodb-atlas`. |
| `BEARER_TOKEN` | Yes | This is a secret token that you need to authenticate your requests to the API. You can generate one using any tool or method you prefer, such as [jwt.io](https://jwt.io/). |
| `OPENAI_API_KEY` | Yes | This is your OpenAI API key that you need to generate embeddings using one of the OpenAI embeddings models. You can get an API key by creating an account on [OpenAI](https://openai.com/). |
### Using the plugin with Azure OpenAI
Azure OpenAI uses URLs that are specific to your resource and references models by deployment ID rather than by model name. As a result, you need to set additional environment variables for this case.
In addition to the `OPENAI_API_BASE` (your specific URL) and `OPENAI_API_TYPE` (azure), you should also set `OPENAI_EMBEDDINGMODEL_DEPLOYMENTID` which specifies the model to use for getting embeddings on upsert and query. For this, we recommend deploying `text-embedding-ada-002` model and using the deployment name here.
If you wish to use the data preparation scripts, you will also need to set `OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID`, used for metadata extraction and
`OPENAI_COMPLETIONMODEL_DEPLOYMENTID`, used for PII handling.
### Choosing a Vector Database
The plugin supports several vector database providers, each with different features, performance, and pricing. Depending on which one you choose, you will need to use a different Dockerfile and set different environment variables. The following sections provide brief introductions to each vector database provider.
For more detailed instructions on setting up and using each vector database provider, please refer to the respective documentation in the `/docs/providers/<datastore_name>/setup.md` file ([folders here](/docs/providers)).
#### Pinecone
[Pinecone](https://www.pinecone.io) is a managed vector database designed for speed, scale, and rapid deployment to production. It supports hybrid search and is currently the only datastore to natively support SPLADE sparse vectors. For detailed setup instructions, refer to [`/docs/providers/pinecone/setup.md`](/docs/providers/pinecone/setup.md).
#### Weaviate
[Weaviate](https://weaviate.io/) is an open-source vector search engine built to scale seamlessly into billions of data objects. It supports hybrid search out-of-the-box, making it suitable for users who require efficient keyword searches. Weaviate can be self-hosted or managed, offering flexibility in deployment. For detailed setup instructions, refer to [`/docs/providers/weaviate/setup.md`](/docs/providers/weaviate/setup.md).
#### Zilliz
[Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for billion-scale data. It offers a wide range of features, including multiple indexing algorithms, distance metrics, scalar filtering, time travel searches, rollback with snapshots, full RBAC, 99.9% uptime, separated storage and compute, and multi-language SDKs. For detailed setup instructions, refer to [`/docs/providers/zilliz/setup.md`](/docs/providers/zilliz/setup.md).
#### Milvus
[Milvus](https://milvus.io/) is an open-source, cloud-native vector database that scales to billions of vectors. It is the open-source version of Zilliz and shares many of its features, such as various indexing algorithms, distance metrics, scalar filtering, time travel searches, rollback with snapshots, multi-language SDKs, storage and compute separation, and cloud scalability. For detailed setup instructions, refer to [`/docs/providers/milvus/setup.md`](/docs/providers/milvus/setup.md).
#### Qdrant
[Qdrant](https://qdrant.tech/) is a vector database capable of storing documents and vector embeddings. It offers both self-hosted and managed [Qdrant Cloud](https://cloud.qdrant.io/) deployment options, providing flexibility for users with different requirements. For detailed setup instructions, refer to [`/docs/providers/qdrant/setup.md`](/docs/providers/qdrant/setup.md).
#### Redis
[Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform suitable for a variety of use cases, including everyday applications and AI/ML workloads. It can be used as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, [Redis Cloud](https://app.redislabs.com/#/) is available. For detailed setup instructions, refer to [`/docs/providers/redis/setup.md`](/docs/providers/redis/setup.md).
#### LlamaIndex
[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data.
It provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT.
Unlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use-cases.
It is light-weight, easy-to-use, and requires no additional deployment.
All you need to do is specify a few environment variables (optionally pointing to an existing saved Index json file).
Note that metadata filters in queries are not yet supported.
For detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/docs/providers/llama/setup.md).
#### Chroma
[Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make getting started as easy as possible. Chroma runs in-memory, or in a client-server setup. It supports metadata and keyword filtering out of the box. For detailed instructions, refer to [`/docs/providers/chroma/setup.md`](/docs/providers/chroma/setup.md).
#### Azure Cognitive Search
[Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). It also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality. For detailed setup instructions, refer to [`/docs/providers/azuresearch/setup.md`](/docs/providers/azuresearch/setup.md)
#### Azure CosmosDB Mongo vCore
[Azure CosmosDB Mongo vCore](https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/) supports vector search on embeddings, and it could be used to seamlessly integrate your AI-based applications with your data stored in the Azure CosmosDB. For detailed instructions, refer to [`/docs/providers/azurecosmosdb/setup.md`](/docs/providers/azurecosmosdb/setup.md)
#### Supabase
[Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. For detailed setup instructions, refer to [`/docs/providers/supabase/setup.md`](/docs/providers/supabase/setup.md).
#### Postgres
[Postgres](https://www.postgresql.org) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension. To use pgvector, you will need to set up a PostgreSQL database with the pgvector extension enabled. For example, you can [use docker](https://www.docker.com/blog/how-to-use-the-postgres-docker-official-image/) to run locally. For a hosted/managed solution, you can use any of the cloud vendors which support [pgvector](https://github.com/pgvector/pgvector#hosted-postgres). For detailed setup instructions, refer to [`/docs/providers/postgres/setup.md`](/docs/providers/postgres/setup.md).
#### AnalyticDB
[AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is fully compatible with PostgreSQL syntax and managed by Alibaba Cloud. AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing features such as indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. For detailed setup instructions, refer to [`/docs/providers/analyticdb/setup.md`](/docs/providers/analyticdb/setup.md).
#### Elasticsearch
[Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html) currently supports storing vectors through the `dense_vector` field type and uses them to calculate document scores. Elasticsearch 8.0 builds on this functionality to support fast, approximate nearest neighbor search (ANN). This represents a much more scalable approach, allowing vector search to run efficiently on large datasets. For detailed setup instructions, refer to [`/docs/providers/elasticsearch/setup.md`](/docs/providers/elasticsearch/setup.md).
#### Mongodb-Atlas
[MongoDB Atlas](https://www.mongodb.com/docs/atlas/getting-started/) stores and queries embeddings through Atlas Vector Search. Currently, the procedure involves generating an Atlas Vector Search index for all collections featuring vector embeddings of 2048 dimensions or fewer in width. This applies to diverse data types coexisting with additional data on your Atlas cluster, and the process is executed through the Atlas UI and Atlas Administration API. For detailed setup instructions, refer to [`/docs/providers/mongodb/setup.md`](/docs/providers/mongodb/setup.md).
### Running the API locally
To run the API locally, you first need to set the requisite environment variables with the `export` command:
```
export DATASTORE=<your_datastore>
export BEARER_TOKEN=<your_bearer_token>
export OPENAI_API_KEY=<your_openai_api_key>
<Add the environment variables for your chosen vector DB here>
```
Start the API with:
```
poetry run start
```
Append `docs` to the URL shown in the terminal and open it in a browser to access the API documentation and try out the endpoints (e.g. http://0.0.0.0:8000/docs). Make sure to enter your bearer token and test the API endpoints.
**Note:** If you add new dependencies to the pyproject.toml file, you need to run `poetry lock` and `poetry install` to update the lock file and install the new dependencies.
### Personalization
You can personalize the Retrieval Plugin for your own use case by doing the following:
- **Replace the logo**: Replace the image in [logo.png](/.well-known/logo.png) with your own logo.
- **Edit the data models**: Edit the `DocumentMetadata` and `DocumentMetadataFilter` data models in [models.py](/models/models.py) to add custom metadata fields. Update the OpenAPI schema in [openapi.yaml](/.well-known/openapi.yaml) accordingly. To update the OpenAPI schema more easily, you can run the app locally, then navigate to `http://0.0.0.0:8000/sub/openapi.json` and copy the contents of the webpage. Then go to [Swagger Editor](https://editor.swagger.io/) and paste in the JSON to convert it to a YAML format. You could also replace the [openapi.yaml](/.well-known/openapi.yaml) file with an openapi.json file in the [.well-known](/.well-known) folder.
- **Change the plugin name, description, and usage instructions**: Update the plugin name, user-facing description, and usage instructions for the model. You can either edit the descriptions in the [main.py](/server/main.py) file or update the [openapi.yaml](/.well-known/openapi.yaml) file. Follow the same instructions as in the previous step to update the OpenAPI schema.
- **Enable ChatGPT to save information from conversations**: See the instructions in the [memory example folder](/examples/memory).
### Authentication Methods
You can choose from four options for authenticating requests to your plugin:
1. **No Authentication**: Anyone can add your plugin and use its API without any credentials. This option is suitable if you are only exposing documents that are not sensitive or already public. It provides no security for your data. If using this method, copy the contents of this [main.py](/examples/authentication-methods/no-auth/main.py) into the [actual main.py file](/server/main.py). Example manifest [here](/examples/authentication-methods/no-auth/ai-plugin.json).
2. **HTTP Bearer**: You can use a secret token as a header to authorize requests to your plugin. There are two variants of this option:
- **User Level** (default for this implementation): Each user who adds your plugin to ChatGPT must provide the bearer token when adding the plugin. You can generate and distribute these tokens using any tool or method you prefer, such as [jwt.io](https://jwt.io/). This method provides better security as each user has to enter the shared access token. If you require a unique access token for each user, you will need to implement this yourself in the [main.py](/server/main.py) file. Example manifest [here](/examples/authentication-methods/user-http/ai-plugin.json).
- **Service Level**: Anyone can add your plugin and use its API without credentials, but you must add a bearer token when registering the plugin. When you install your plugin, you need to add your bearer token, and will then receive a token from ChatGPT that you must include in your hosted manifest file. Your token will be used by ChatGPT to authorize requests to your plugin on behalf of all users who add it. This method is more convenient for users, but it may be less secure as all users share the same token and do not need to add a token to install the plugin. Example manifest [here](/examples/authentication-methods/service-http/ai-plugin.json).
3. **OAuth**: Users must go through an OAuth flow to add your plugin. You can use an OAuth provider to authenticate users who add your plugin and grant them access to your API. This method offers the highest level of security and control, as users authenticate through a trusted third-party provider. However, you will need to implement the OAuth flow yourself in the [main.py](/server/main.py) file and provide the necessary parameters in your manifest file. Example manifest [here](/examples/authentication-methods/oauth/ai-plugin.json).
Consider the benefits and drawbacks of each authentication method before choosing the one that best suits your use case and security requirements. If you choose to use a method different to the default (User Level HTTP), make sure to update the manifest file [here](/.well-known/ai-plugin.json).
## Deployment
You can deploy your app to different cloud providers, depending on your preferences and requirements. However, regardless of the provider you choose, you will need to update two files in your app: [openapi.yaml](/.well-known/openapi.yaml) and [ai-plugin.json](/.well-known/ai-plugin.json). As outlined above, these files define the API specification and the AI plugin configuration for your app, respectively. You need to change the url field in both files to match the address of your deployed app.
Render has a 1-click deploy option that automatically updates the url field in both files:
[<img src="https://render.com/images/deploy-to-render-button.svg" alt="Deploy to Render" />](https://render.com/deploy?repo=https://github.com/render-examples/chatgpt-retrieval-plugin/tree/main)
Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. Refer to the respective documentation in the [`/docs/deployment/removing-unused-dependencies.md`](/docs/deployment/removing-unused-dependencies.md) file for information on removing unused dependencies for each provider.
Instructions:
- [Deploying to Fly.io](/docs/deployment/flyio.md)
- [Deploying to Heroku](/docs/deployment/heroku.md)
- [Deploying to Render](/docs/deployment/render.md)
- [Other Deployment Options](/docs/deployment/other-options.md) (Azure Container Apps, Google Cloud Run, AWS Elastic Container Service, etc.)
Once you have deployed your app, consider uploading an initial batch of documents using one of [these scripts](/scripts) or by calling the `/upsert` endpoint.
## Webhooks
To keep the documents stored in the vector database up-to-date, consider using tools like [Zapier](https://zapier.com) or [Make](https://www.make.com) to configure incoming webhooks to your plugin's API based on events or schedules. For example, this could allow you to sync new information as you update your notes or receive emails. You can also use a [Zapier Transfer](https://zapier.com/blog/zapier-transfer-guide/) to batch process a collection of existing documents and upload them to the vector database.
If you need to pass custom fields from these tools to your plugin, you might want to create an additional Retrieval Plugin API endpoint that calls the datastore's upsert function, such as `upsert-email`. This custom endpoint can be designed to accept specific fields from the webhook and process them accordingly.
To set up an incoming webhook, follow these general steps:
- Choose a webhook tool like Zapier or Make and create an account.
- Set up a new webhook or transfer in the tool, and configure it to trigger based on events or schedules.
- Specify the target URL for the webhook, which should be the API endpoint of your Retrieval Plugin (e.g. `https://your-plugin-url.com/upsert`).
- Configure the webhook payload to include the necessary data fields and format them according to your Retrieval Plugin's API requirements.
- Test the webhook to ensure it's working correctly and sending data to your Retrieval Plugin as expected.
After setting up the webhook, you may want to run a backfill to ensure that any previously missed data is included in the vector database.
Remember that if you want to use incoming webhooks to continuously sync data, you should consider running a backfill after setting these up to avoid missing any data.
In addition to using tools like Zapier and Make, you can also build your own custom integrations to sync data with your Retrieval Plugin. This allows you to have more control over the data flow and tailor the integration to your specific needs and requirements.
## Scripts
The `scripts` folder contains scripts to batch upsert or process text documents from different data sources, such as a zip file, JSON file, or JSONL file. These scripts use the plugin's upsert utility functions to upload the documents and their metadata to the vector database, after converting them to plain text and splitting them into chunks. Each script folder has a README file that explains how to use it and what parameters it requires. You can also optionally screen the documents for personally identifiable information (PII) using a language model and skip them if detected, with the [`services.pii_detection`](/services/pii_detection.py) module. This can be helpful if you want to avoid uploading sensitive or private documents to the vector database unintentionally. Additionally, you can optionally extract metadata from the document text using a language model, with the [`services.extract_metadata`](/services/extract_metadata.py) module. This can be useful if you want to enrich the document metadata. **Note:** if using incoming webhooks to continuously sync data, consider running a backfill after setting these up to avoid missing any data.
The scripts are:
- [`process_json`](scripts/process_json/): This script processes a file dump of documents in a JSON format and stores them in the vector database with some metadata. The format of the JSON file should be a list of JSON objects, where each object represents a document. The JSON object should have a `text` field and optionally other fields to populate the metadata. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.
- [`process_jsonl`](scripts/process_jsonl/): This script processes a file dump of documents in a JSONL format and stores them in the vector database with some metadata. The format of the JSONL file should be a newline-delimited JSON file, where each line is a valid JSON object representing a document. The JSON object should have a `text` field and optionally other fields to populate the metadata. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.
- [`process_zip`](scripts/process_zip/): This script processes a file dump of documents in a zip file and stores them in the vector database with some metadata. The format of the zip file should be a flat zip file folder of docx, pdf, txt, md, pptx or csv files. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata.
## Pull Request (PR) Checklist
If you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing!
1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`.
2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made.
3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456).
4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch.
5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed.
6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit.
7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR.
8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made.
9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository.
10. **Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. This will help reviewers to better understand your contribution and expedite the review process.
## Pull Request Naming Convention
Use the following naming convention for your PR branches:
```
<type>/<short-description>-<issue-number>
```
- `<type>`: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are OK and should appear as `<type>, <type2>`.
- `<short-description>`: A brief description of the changes made, using hyphens to separate words.
- `<issue-number>`: The issue number associated with the changes made (if applicable).
Example:
```
feature/advanced-chunking-strategy-123
```
## Limitations
While the ChatGPT Retrieval Plugin is designed to provide a flexible solution for semantic search and retrieval, it does have some limitations:
- **Keyword search limitations**: The embeddings generated by the chosen OpenAI embeddings model may not always be effective at capturing exact keyword matches. As a result, the plugin might not return the most relevant results for queries that rely heavily on specific keywords. Some vector databases, like Elasticsearch, Pinecone, Weaviate and Azure Cognitive Search, use hybrid search and might perform better for keyword searches.
- **Sensitive data handling**: The plugin does not automatically detect or filter sensitive data. It is the responsibility of the developers to ensure that they have the necessary authorization to include content in the Retrieval Plugin and that the content complies with data privacy requirements.
- **Scalability**: The performance of the plugin may vary depending on the chosen vector database provider and the size of the dataset. Some providers may offer better scalability and performance than others.
- **Metadata extraction**: The optional metadata extraction feature relies on a language model to extract information from the document text. This process may not always be accurate, and the quality of the extracted metadata may vary depending on the document content and structure.
- **PII detection**: The optional PII detection feature is not foolproof and may not catch all instances of personally identifiable information. Use this feature with caution and verify its effectiveness for your specific use case.
## Future Directions
The ChatGPT Retrieval Plugin provides a flexible solution for semantic search and retrieval, but there is always potential for further development. We encourage users to contribute to the project by submitting pull requests for new features or enhancements. Notable contributions may be acknowledged with OpenAI credits.
Some ideas for future directions include:
- **More vector database providers**: If you are interested in integrating another vector database provider with the ChatGPT Retrieval Plugin, feel free to submit an implementation.
- **Additional scripts**: Expanding the range of scripts available for processing and uploading documents from various data sources would make the plugin even more versatile.
- **User Interface**: Developing a user interface for managing documents and interacting with the plugin could improve the user experience.
- **Hybrid search / TF-IDF option**: Enhancing the [datastore's upsert function](/datastore/datastore.py#L18) with an option to use hybrid search or TF-IDF indexing could improve the plugin's performance for keyword-based queries.
- **Advanced chunking strategies and embeddings calculations**: Implementing more sophisticated chunking strategies and embeddings calculations, such as embedding document titles and summaries, performing weighted averaging of document chunks and summaries, or calculating the average embedding for a document, could lead to better search results.
- **Custom metadata**: Allowing users to add custom metadata to document chunks, such as titles or other relevant information, might improve the retrieved results in some use cases.
- **Additional optional services**: Integrating more optional services, such as summarizing documents or pre-processing documents before embedding them, could enhance the plugin's functionality and quality of retrieved results. These services could be implemented using language models and integrated directly into the plugin, rather than just being available in the scripts.
We welcome contributions from the community to help improve the ChatGPT Retrieval Plugin and expand its capabilities. If you have an idea or feature you'd like to contribute, please submit a pull request to the repository.
## Contributors
We would like to extend our gratitude to the following contributors for their code / documentation contributions, and support in integrating various vector database providers with the ChatGPT Retrieval Plugin:
- [Pinecone](https://www.pinecone.io/)
- [acatav](https://github.com/acatav)
- [gkogan](https://github.com/gkogan)
- [jamescalam](https://github.com/jamescalam)
- [Weaviate](https://www.semi.technology/)
- [byronvoorbach](https://github.com/byronvoorbach)
- [hsm207](https://github.com/hsm207)
- [sebawita](https://github.com/sebawita)
- [Zilliz](https://zilliz.com/)
- [filip-halt](https://github.com/filip-halt)
- [Milvus](https://milvus.io/)
- [filip-halt](https://github.com/filip-halt)
- [Qdrant](https://qdrant.tech/)
- [kacperlukawski](https://github.com/kacperlukawski)
- [Redis](https://redis.io/)
- [spartee](https://github.com/spartee)
- [tylerhutcherson](https://github.com/tylerhutcherson)
- [LlamaIndex](https://github.com/jerryjliu/llama_index)
- [jerryjliu](https://github.com/jerryjliu)
- [Disiok](https://github.com/Disiok)
- [Supabase](https://supabase.com/)
- [egor-romanov](https://github.com/egor-romanov)
- [Postgres](https://www.postgresql.org/)
- [egor-romanov](https://github.com/egor-romanov)
- [mmmaia](https://github.com/mmmaia)
- [Elasticsearch](https://www.elastic.co/)
- [joemcelroy](https://github.com/joemcelroy)
================================================
FILE: datastore/__init__.py
================================================
================================================
FILE: datastore/datastore.py
================================================
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
import asyncio
from models.models import (
Document,
DocumentChunk,
DocumentMetadataFilter,
Query,
QueryResult,
QueryWithEmbedding,
)
from services.chunks import get_document_chunks
from services.openai import get_embeddings
class DataStore(ABC):
    """
    Base class every vector database provider derives from.

    The public `upsert` and `query` coroutines do the provider-independent work
    (removing stale vectors, chunking, embedding the query text) and then hand
    off to the abstract `_upsert`, `_query` and `delete` hooks.
    """

    async def upsert(
        self, documents: List[Document], chunk_token_size: Optional[int] = None
    ) -> List[str]:
        """
        Store the given documents, replacing whatever was stored under the same ids.

        Vectors already stored for a document id are deleted first, then the
        documents are split with `get_document_chunks` and passed to `_upsert`.
        Returns the list of document ids reported by `_upsert`.
        """
        # Documents without an id have nothing to replace, so they are skipped here.
        stale_deletes = []
        for doc in documents:
            if not doc.id:
                continue
            stale_deletes.append(
                self.delete(
                    filter=DocumentMetadataFilter(document_id=doc.id),
                    delete_all=False,
                )
            )
        # Run all the deletions concurrently and wait for every one of them.
        await asyncio.gather(*stale_deletes)

        chunked = get_document_chunks(documents, chunk_token_size)
        return await self._upsert(chunked)

    @abstractmethod
    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Provider hook: write the chunks (keyed by document id) to the database.
        Return a list of document ids.
        """
        raise NotImplementedError

    async def query(self, queries: List[Query]) -> List[QueryResult]:
        """
        Embed the text of each query and run the embedded queries through `_query`.

        Returns whatever `_query` returns: a list of query results with matching
        document chunks and scores.
        """
        # One call embeds all query texts; results are paired back up by position.
        embeddings = get_embeddings([item.query for item in queries])

        hydrated = []
        for original, vector in zip(queries, embeddings):
            hydrated.append(QueryWithEmbedding(**original.dict(), embedding=vector))

        return await self._query(hydrated)

    @abstractmethod
    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
        """
        Provider hook: run queries that already carry embeddings (and optional filters)
        and return a list of query results with matching document chunks and scores.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Provider hook: remove vectors by ids, by filter, or everything in the datastore.
        Multiple parameters can be used at once.
        Returns whether the operation was successful.
        """
        raise NotImplementedError
================================================
FILE: datastore/factory.py
================================================
from datastore.datastore import DataStore
import os
async def get_datastore() -> DataStore:
datastore = os.environ.get("DATASTORE")
assert datastore is not None
match datastore:
case "chroma":
from datastore.providers.chroma_datastore import ChromaDataStore
return ChromaDataStore()
case "llama":
from datastore.providers.llama_datastore import LlamaDataStore
return LlamaDataStore()
case "pinecone":
from datastore.providers.pinecone_datastore import PineconeDataStore
return PineconeDataStore()
case "weaviate":
from datastore.providers.weaviate_datastore import WeaviateDataStore
return WeaviateDataStore()
case "milvus":
from datastore.providers.milvus_datastore import MilvusDataStore
return MilvusDataStore()
case "zilliz":
from datastore.providers.zilliz_datastore import ZillizDataStore
return ZillizDataStore()
case "redis":
from datastore.providers.redis_datastore import RedisDataStore
return await RedisDataStore.init()
case "azurecosmosdb":
from datastore.providers.azurecosmosdb_datastore import (
AzureCosmosDBDataStore,
)
return await AzureCosmosDBDataStore.create()
case "qdrant":
from datastore.providers.qdrant_datastore import QdrantDataStore
return QdrantDataStore()
case "azuresearch":
from datastore.providers.azuresearch_datastore import AzureSearchDataStore
return AzureSearchDataStore()
case "supabase":
from datastore.providers.supabase_datastore import SupabaseDataStore
return SupabaseDataStore()
case "postgres":
from datastore.providers.postgres_datastore import PostgresDataStore
return PostgresDataStore()
case "analyticdb":
from datastore.providers.analyticdb_datastore import AnalyticDBDataStore
return AnalyticDBDataStore()
case "elasticsearch":
from datastore.providers.elasticsearch_datastore import (
ElasticsearchDataStore,
)
return ElasticsearchDataStore()
case "mongodb":
from datastore.providers.mongodb_atlas_datastore import (
MongoDBAtlasDataStore,
)
return MongoDBAtlasDataStore()
case _:
raise ValueError(
f"Unsupported vector database: {datastore}. "
f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, or qdrant"
)
================================================
FILE: datastore/providers/__init__.py
================================================
================================================
FILE: datastore/providers/analyticdb_datastore.py
================================================
import os
import asyncio
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
from loguru import logger
from psycopg2cffi import compat
compat.register()
import psycopg2
from psycopg2.extras import DictCursor
from psycopg2.pool import SimpleConnectionPool
from services.date import to_unix_timestamp
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
DocumentChunkWithScore,
)
# Connection settings for the AnalyticDB (PostgreSQL-compatible) instance.
# Every entry is read from a PG_* environment variable and falls back to the
# local-development default shown here; the port is converted to an int.
PG_CONFIG = dict(
    collection=os.getenv("PG_COLLECTION", "document_chunks"),
    database=os.getenv("PG_DATABASE", "postgres"),
    user=os.getenv("PG_USER", "user"),
    password=os.getenv("PG_PASSWORD", "password"),
    host=os.getenv("PG_HOST", "localhost"),
    port=int(os.getenv("PG_PORT", "5432")),
)
# Dimension of the stored embeddings, taken from EMBEDDING_DIMENSION (256 if unset).
OUTPUT_DIM = int(os.getenv("EMBEDDING_DIMENSION", 256))
class AnalyticDBDataStore(DataStore):
    """
    DataStore that keeps document chunks in a PostgreSQL-compatible AnalyticDB table.

    Every chunk is one row of the table named by ``collection_name``. The embedding
    is stored in a ``real[]`` column covered by an ``ann`` index using L2 distance.
    All database access goes through a psycopg2 ``SimpleConnectionPool``.
    """

    def __init__(self, config: Dict[str, Any] = PG_CONFIG):
        """
        Open the connection pool and make sure the table and its vector index exist.

        Args:
            config: Connection settings with the keys "collection", "database",
                "user", "password", "host" and "port". Defaults to the module-level
                PG_CONFIG.
        """
        self.collection_name = config["collection"]
        self.user = config["user"]
        self.password = config["password"]
        self.database = config["database"]
        self.host = config["host"]
        self.port = config["port"]

        self.connection_pool = SimpleConnectionPool(
            minconn=1,
            maxconn=100,
            dbname=self.database,
            user=self.user,
            password=self.password,
            host=self.host,
            port=self.port,
        )

        self._initialize_db()

    @staticmethod
    def _rollback_quietly(conn) -> None:
        """Roll back the open transaction on conn, logging instead of raising on failure."""
        try:
            conn.rollback()
        except Exception as e:
            logger.error(e)

    def _initialize_db(self):
        """Create the chunk table and the embedding index in one transaction."""
        conn = self.connection_pool.getconn()
        try:
            with conn.cursor() as cur:
                self._create_table(cur)
                self._create_embedding_index(cur)
                conn.commit()
        finally:
            self.connection_pool.putconn(conn)

    def _create_table(self, cur: psycopg2.extensions.cursor):
        """Create the chunk table if it does not exist yet (does not commit)."""
        cur.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self.collection_name} (
                id TEXT PRIMARY KEY DEFAULT uuid_generate_v4()::TEXT,
                source TEXT,
                source_id TEXT,
                content TEXT,
                document_id TEXT,
                author TEXT,
                url TEXT,
                created_at TIMESTAMPTZ DEFAULT NOW(),
                embedding real[]
            );
            """
        )

    def _create_embedding_index(self, cur: psycopg2.extensions.cursor):
        """Create the ``ann`` index on the embedding column unless it already exists (does not commit)."""
        index_name = f"{self.collection_name}_embedding_idx"
        cur.execute(
            "SELECT indexname FROM pg_indexes WHERE tablename = %s;",
            (self.collection_name,),
        )
        index_exists = any(row[0] == index_name for row in cur.fetchall())
        if not index_exists:
            # OUTPUT_DIM has to be interpolated: without the braces the literal
            # text "OUTPUT_DIM" is sent to the server instead of the dimension.
            cur.execute(
                f"""
                CREATE INDEX {index_name}
                ON {self.collection_name}
                USING ann(embedding)
                WITH (
                    distancemeasure=L2,
                    dim={OUTPUT_DIM},
                    pq_segments=64,
                    hnsw_m=100,
                    pq_centers=2048
                );
                """
            )

    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Takes in a dict of document_ids to list of document chunks and inserts them into the database.
        Return a list of document ids.
        """
        # psycopg2 calls block, so each chunk is written on the default executor.
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(None, self._upsert_chunk, chunk)
            for document_chunks in chunks.values()
            for chunk in document_chunks
        ]
        await asyncio.gather(*tasks)

        return list(chunks.keys())

    def _upsert_chunk(self, chunk: DocumentChunk):
        """
        Insert one chunk, or overwrite the row that already has the same id.

        Raises:
            Exception: any database error is re-raised after the transaction
                has been rolled back.
        """
        created_at = (
            datetime.fromtimestamp(to_unix_timestamp(chunk.metadata.created_at))
            if chunk.metadata.created_at
            else None
        )
        data = (
            chunk.id,
            chunk.text,
            chunk.embedding,
            chunk.metadata.document_id,
            chunk.metadata.source,
            chunk.metadata.source_id,
            chunk.metadata.url,
            chunk.metadata.author,
            created_at,
        )

        query = f"""
            INSERT INTO {self.collection_name} (id, content, embedding, document_id, source, source_id, url, author, created_at)
            VALUES (%s::text, %s::text, %s::real[], %s::text, %s::text, %s::text, %s::text, %s::text, %s::timestamp with time zone)
            ON CONFLICT (id) DO UPDATE SET
                content = EXCLUDED.content,
                embedding = EXCLUDED.embedding,
                document_id = EXCLUDED.document_id,
                source = EXCLUDED.source,
                source_id = EXCLUDED.source_id,
                url = EXCLUDED.url,
                author = EXCLUDED.author,
                created_at = EXCLUDED.created_at;
        """

        conn = self.connection_pool.getconn()
        try:
            with conn.cursor() as cur:
                cur.execute(query, data)
                conn.commit()
        except Exception:
            # Do not hand a connection with a failed transaction back to the pool.
            self._rollback_quietly(conn)
            raise
        finally:
            self.connection_pool.putconn(conn)

    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
        """
        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.

        Exactly one QueryResult is returned per input query, in input order. A query
        that fails is logged and yields an empty result list. The score of each
        chunk is the value of ``l2_distance`` between its embedding and the query.
        """

        def generate_where_clause(
            query_filter: Optional[DocumentMetadataFilter],
        ) -> Tuple[str, List[Any]]:
            # Build the WHERE clause from the filter fields that are actually set.
            if query_filter is None:
                return "", []

            conditions = [
                ("document_id=%s", query_filter.document_id),
                ("source_id=%s", query_filter.source_id),
                ("source LIKE %s", query_filter.source),
                ("author LIKE %s", query_filter.author),
                ("created_at >= %s", query_filter.start_date),
                ("created_at <= %s", query_filter.end_date),
            ]
            active = [(sql, value) for sql, value in conditions if value is not None]
            if not active:
                # A filter object with no fields set must not emit a bare "WHERE".
                return "", []

            where_clause = "WHERE " + " AND ".join(sql for sql, _ in active)
            return where_clause, [value for _, value in active]

        def generate_query(query: QueryWithEmbedding) -> Tuple[str, List[Any]]:
            # Returns the SELECT statement plus the parameters for its placeholders.
            embedding = "[" + ", ".join(str(x) for x in query.embedding) + "]"
            where_clause, params = generate_where_clause(query.filter)
            q = f"""
                SELECT
                    id,
                    content,
                    source,
                    source_id,
                    document_id,
                    url,
                    created_at,
                    author,
                    embedding,
                    l2_distance(embedding,array{embedding}::real[]) AS similarity
                FROM
                    {self.collection_name}
                {where_clause}
                ORDER BY embedding <-> array{embedding}::real[] LIMIT {query.top_k};
            """
            return q, params

        def create_results(data) -> List[DocumentChunkWithScore]:
            # Map result rows (accessed by column name) to scored chunks.
            return [
                DocumentChunkWithScore(
                    id=row["id"],
                    text=row["content"],
                    score=float(row["similarity"]),
                    metadata=DocumentChunkMetadata(
                        source=row["source"],
                        source_id=row["source_id"],
                        document_id=row["document_id"],
                        url=row["url"],
                        created_at=str(row["created_at"]),
                        author=row["author"],
                    ),
                )
                for row in data
            ]

        query_results: List[QueryResult] = []
        conn = self.connection_pool.getconn()
        try:
            for query in queries:
                try:
                    q, params = generate_query(query)
                    with conn.cursor(cursor_factory=DictCursor) as cur:
                        cur.execute(q, params)
                        rows = cur.fetchall()
                    results = create_results(rows)
                except Exception as e:
                    logger.error(e)
                    # Clear the failed transaction so the remaining queries can still run.
                    self._rollback_quietly(conn)
                    results = []
                query_results.append(QueryResult(query=query.query, results=results))
            return query_results
        finally:
            self.connection_pool.putconn(conn)

    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Remove rows from the collection.

        Only the first applicable argument is used, in this order: ``delete_all``,
        then ``ids`` (matched against the ``document_id`` column), then ``filter``.
        If none applies, or the filter has no populated field, nothing is deleted
        and True is returned.

        Returns:
            True when the delete was committed (or there was nothing to do),
            False when the database reported an error.
        """

        async def execute_delete(query: str, params: Optional[List] = None) -> bool:
            conn = self.connection_pool.getconn()
            try:
                with conn.cursor() as cur:
                    if params:
                        cur.execute(query, params)
                    else:
                        cur.execute(query)
                    # Commit on the pooled connection; the instance has no `conn` attribute.
                    conn.commit()
                return True
            except Exception as e:
                logger.error(e)
                self._rollback_quietly(conn)
                return False
            finally:
                self.connection_pool.putconn(conn)

        if delete_all:
            query = f"DELETE FROM {self.collection_name} WHERE document_id LIKE %s;"
            return await execute_delete(query, ["%"])
        elif ids:
            query = f"DELETE FROM {self.collection_name} WHERE document_id IN ({','.join(['%s'] * len(ids))});"
            return await execute_delete(query, ids)
        elif filter is not None:
            query, params = self._generate_delete_query(filter)
            if not params:
                # No populated filter field means no WHERE condition; the statement
                # would be malformed, so treat it as "nothing to delete".
                return True
            return await execute_delete(query, params)
        else:
            return True

    def _generate_delete_query(
        self, filter: DocumentMetadataFilter
    ) -> Tuple[str, List]:
        """
        Build a DELETE statement and its parameters from the truthy fields of filter.

        When no field is set the returned parameter list is empty and the statement
        has an empty WHERE clause; callers must not execute it in that case.
        """
        conditions = [
            (filter.document_id, "document_id = %s"),
            (filter.source, "source = %s"),
            (filter.source_id, "source_id = %s"),
            (filter.author, "author = %s"),
            (filter.start_date, "created_at >= %s"),
            (filter.end_date, "created_at <= %s"),
        ]

        where_conditions = [f for value, f in conditions if value]
        where_values = [value for value, _ in conditions if value]

        query = f"DELETE FROM {self.collection_name} WHERE {' AND '.join(where_conditions)};"
        return query, where_values
================================================
FILE: datastore/providers/azurecosmosdb_datastore.py
================================================
import logging
import os
import certifi
import numpy as np
import pymongo
from pymongo.mongo_client import MongoClient
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from datetime import datetime
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentMetadataFilter,
DocumentChunkWithScore,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
)
from services.date import to_unix_timestamp
# Connection settings for CosmosDB Mongo vCore, read from the environment.
AZCOSMOS_API = os.getenv("AZCOSMOS_API", "mongo-vcore")
AZCOSMOS_CONNSTR = os.getenv("AZCOSMOS_CONNSTR")
AZCOSMOS_DATABASE_NAME = os.getenv("AZCOSMOS_DATABASE_NAME")
AZCOSMOS_CONTAINER_NAME = os.getenv("AZCOSMOS_CONTAINER_NAME")

# Fail at import time when a required setting is missing.
assert AZCOSMOS_API is not None
assert AZCOSMOS_CONNSTR is not None
assert AZCOSMOS_DATABASE_NAME is not None
assert AZCOSMOS_CONTAINER_NAME is not None

# Number of dimensions of the stored embeddings (EMBEDDING_DIMENSION, default 256).
VECTOR_DIMENSION = int(os.getenv("EMBEDDING_DIMENSION", 256))
# Abstract class similar to the original data store that allows API level abstraction
class AzureCosmosDBStoreApi(ABC):
    """
    Interface an API-specific CosmosDB backend has to implement.

    Every method is abstract and only raises NotImplementedError here.
    """

    @abstractmethod
    async def ensure(self, num_lists, similarity):
        """Prepare the backing store for use with the given index parameters."""
        raise NotImplementedError

    @abstractmethod
    async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
        """Store the chunks of document ``docId`` and return a list of id strings."""
        raise NotImplementedError

    @abstractmethod
    async def query_core(
        self, query: QueryWithEmbedding
    ) -> List[DocumentChunkWithScore]:
        """Run one query and return the scored chunks it matched."""
        raise NotImplementedError

    @abstractmethod
    async def drop_container(self):
        """Drop the backing container."""
        raise NotImplementedError

    @abstractmethod
    async def delete_filter(self, filter: DocumentMetadataFilter):
        """Delete the entries selected by a metadata filter."""
        raise NotImplementedError

    @abstractmethod
    async def delete_ids(self, ids: List[str]):
        """Delete the entries with the given ids."""
        raise NotImplementedError

    @abstractmethod
    async def delete_document_ids(self, documentIds: List[str]):
        """Delete the entries that belong to the given document ids."""
        raise NotImplementedError
class MongoStoreApi(AzureCosmosDBStoreApi):
    """AzureCosmosDBStoreApi implementation backed by a pymongo ``MongoClient``."""

    def __init__(self, mongoClient: MongoClient):
        # Client used for every database operation of this store.
        self.mongoClient = mongoClient
@staticmethod
def _get_metadata_filter(filter: DocumentMetadataFilter) -> dict:
    """
    Translates a DocumentMetadataFilter into a MongoDB query document.

    Only fields that are not None contribute a condition. ``start_date`` and
    ``end_date`` are parsed with ``datetime.fromisoformat`` and become
    exclusive bounds (``$gt`` / ``$lt``) on ``metadata.created_at``.

    Returns an empty dict when no field is set.
    """
    returnedFilter: dict = {}
    if filter.document_id is not None:
        returnedFilter["document_id"] = filter.document_id
    if filter.author is not None:
        returnedFilter["metadata.author"] = filter.author

    # Both bounds live under the same key, so they must be merged into one
    # condition; assigning them separately would let end_date overwrite
    # start_date.
    created_at_range: dict = {}
    if filter.start_date is not None:
        created_at_range["$gt"] = datetime.fromisoformat(filter.start_date)
    if filter.end_date is not None:
        created_at_range["$lt"] = datetime.fromisoformat(filter.end_date)
    if created_at_range:
        returnedFilter["metadata.created_at"] = created_at_range

    if filter.source is not None:
        returnedFilter["metadata.source"] = filter.source
    if filter.source_id is not None:
        returnedFilter["metadata.source_id"] = filter.source_id
    return returnedFilter
async def ensure(self, num_lists, similarity):
    """
    Binds ``self.collection`` to the configured container and creates the
    ``embedding_cosmosSearch`` vector index when it is not present yet.
    """
    assert self.mongoClient.is_mongos
    database = self.mongoClient[AZCOSMOS_DATABASE_NAME]
    self.collection = database[AZCOSMOS_CONTAINER_NAME]

    existing_indexes = self.collection.index_information()
    if existing_indexes.get("embedding_cosmosSearch") is not None:
        # The vector index already exists, nothing to create.
        return

    # Ensure the vector index exists.
    vector_index: dict = {
        "name": "embedding_cosmosSearch",
        "key": {"embedding": "cosmosSearch"},
        "cosmosSearchOptions": {
            "kind": "vector-ivf",
            "numLists": num_lists,
            "similarity": similarity,
            "dimensions": VECTOR_DIMENSION,
        },
    }
    database.command(
        "createIndexes", AZCOSMOS_CONTAINER_NAME, indexes=[vector_index]
    )
async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -> List[str]:
    """
    Inserts every chunk of document ``docId`` as its own database document.

    Each stored document gets the id ``doc:<docId>:chunk:<chunk.id>``; a
    ``created_at`` string in the metadata is stored as a ``datetime``.

    Returns the ids of the inserted documents, in chunk order.
    """
    # Until nested doc embedding support is done, treat each chunk as a separate doc.
    doc_ids: List[str] = []
    for chunk in chunks:
        # Work on a copy of the metadata: writing the converted created_at
        # into ``chunk.metadata.__dict__`` itself would silently change the
        # caller's chunk.
        metadata: dict = dict(chunk.metadata.__dict__)
        if chunk.metadata.created_at is not None:
            metadata["created_at"] = datetime.fromisoformat(
                chunk.metadata.created_at
            )
        finalDocChunk: dict = {
            "_id": f"doc:{docId}:chunk:{chunk.id}",
            "document_id": docId,
            "embedding": chunk.embedding,
            "text": chunk.text,
            "metadata": metadata,
        }
        self.collection.insert_one(finalDocChunk)
        doc_ids.append(finalDocChunk["_id"])
    return doc_ids
async def query_core(
    self, query: QueryWithEmbedding
) -> List[DocumentChunkWithScore]:
    """
    Runs a vector search for ``query.embedding`` and returns the scored hits.

    The query's metadata filter is not applied (see the TODO below).
    """
    search_stage = {
        "$search": {
            "cosmosSearch": {
                "vector": query.embedding,
                "path": "embedding",
                "k": query.top_k,
            },
            "returnStoredSource": True,
        }
    }
    project_stage = {
        "$project": {
            "similarityScore": {"$meta": "searchScore"},
            "document": "$$ROOT",
        }
    }
    # TODO: Add in match filter (once it can be satisfied).

    # Perform vector search
    matches: List[DocumentChunkWithScore] = []
    for hit in self.collection.aggregate([search_stage, project_stage]):
        stored = hit["document"]
        metadata = stored["metadata"]
        created_at = metadata["created_at"]
        if created_at is not None:
            # Turn the stored value back into an ISO-8601 string.
            metadata["created_at"] = datetime.isoformat(created_at)
        matches.append(
            DocumentChunkWithScore(
                id=hit["_id"],
                score=hit["similarityScore"],
                text=stored["text"],
                metadata=metadata,
            )
        )
    return matches
async def drop_container(self):
    """Drops the whole collection bound to this store."""
    self.collection.drop()
async def delete_filter(self, filter: DocumentMetadataFilter):
    """
    Deletes every stored chunk that matches the metadata filter.

    A filter without any field set deletes nothing.
    """
    delete_filter = self._get_metadata_filter(filter)
    if not delete_filter:
        # An empty query document matches everything, so delete_many({})
        # would wipe the collection. Treat "no criteria" as "nothing to do".
        return
    self.collection.delete_many(delete_filter)
async def delete_ids(self, ids: List[str]):
    """Deletes the stored documents whose ``_id`` is one of ``ids``."""
    id_query = {"_id": {"$in": ids}}
    self.collection.delete_many(id_query)
async def delete_document_ids(self, documentIds: List[str]):
    """Deletes the stored documents whose ``document_id`` is one of ``documentIds``."""
    document_query = {"document_id": {"$in": documentIds}}
    self.collection.delete_many(document_query)
# Datastore implementation.
class AzureCosmosDBDataStore(DataStore):
    """
    A class representing a memory store for Azure CosmosDB DataStore, currently only supports Mongo vCore
    """

    def __init__(self, cosmosStore: AzureCosmosDBStoreApi):
        # API-specific backend that performs the actual storage operations.
        self.cosmosStore = cosmosStore
"""
Creates a new datastore based on the Cosmos Api provided in the environment variables,
only supports Mongo vCore for now
Args:
numLists (int) : This integer is the number of clusters that the inverted file (IVF) index
uses to group the vector data. We recommend that numLists is set to
documentCount/1000 for up to 1 million documents and to sqrt(documentCount)
for more than 1 million documents. Using a numLists value of 1 is akin to
performing brute-force search, which has limited performance.
similarity (str) : Similarity metric to use with the IVF index. Possible options are COS (cosine distance),
L2 (Euclidean distance), and IP (inner product).
"""
@staticmethod
async def create(num_lists, similarity) -> DataStore:
# Create underlying data store based on the API definition.
# Right now this only supports Mongo, but set up to support more.
apiStore: AzureCosmosDBStoreApi = None
if AZCOSMOS_API == "mongo-vcore":
mongoClient = MongoClient(AZCOSMOS_CONNSTR)
apiStore = MongoStoreApi(mongoClient)
else:
raise NotImplementedError
await apiStore.ensure(num_lists, similarity)
store = AzureCosmosDBDataStore(apiStore)
return store
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
    """
    Stores the chunks of every document through the underlying Cosmos store.

    ``chunks`` maps a document id to the list of its chunks. Returns the ids
    reported back by the store, in insertion order.
    """
    stored_ids: List[str] = []
    for document_id, document_chunks in chunks.items():
        stored_ids.extend(
            await self.cosmosStore.upsert_core(document_id, document_chunks)
        )
    return stored_ids
async def _query(
    self,
    queries: List[QueryWithEmbedding],
) -> List[QueryResult]:
    """
    Takes in a list of queries with embeddings and filters and
    returns a list of query results with matching document chunks and scores.

    Queries are executed one after another, and the results are returned in
    the same order as ``queries``.
    """
    # Prepare query responses and results object
    results: List[QueryResult] = []

    # ``flush`` is a print() keyword, not a logging one: passing it makes the
    # call raise TypeError as soon as INFO logging is enabled.
    logging.info(f"Gathering {len(queries)} query results")
    for query in queries:
        logging.info(f"Query: {query.query}")
        query_results = await self.cosmosStore.query_core(query)

        # Add to overall results
        results.append(QueryResult(query=query.query, results=query_results))
    return results
async def delete(
    self,
    ids: Optional[List[str]] = None,
    filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
) -> bool:
    """
    Removes vectors by ids, filter, or everything in the datastore.
    Returns whether the operation was successful.

    ``delete_all`` wins over the other selectors; otherwise the filter and
    the ids are both applied, in that order.
    """
    if delete_all:
        # fast path - truncate/delete all items.
        # NOTE(review): this drops the whole container - confirm the vector
        # index is recreated before the store is queried again.
        await self.cosmosStore.drop_container()
        return True

    if filter:
        if filter.document_id is None:
            await self.cosmosStore.delete_filter(filter)
        else:
            # A document id takes precedence over the other filter fields.
            await self.cosmosStore.delete_document_ids([filter.document_id])

    if ids:
        await self.cosmosStore.delete_ids(ids)

    return True
================================================
FILE: datastore/providers/azuresearch_datastore.py
================================================
import asyncio
import base64
import os
import re
import time
from typing import Dict, List, Optional, Union
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential as DefaultAzureCredentialSync
from azure.identity.aio import DefaultAzureCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents.models import QueryType, Vector
from loguru import logger
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
DocumentChunkWithScore,
DocumentMetadataFilter,
Query,
QueryResult,
QueryWithEmbedding,
)
# Azure Search connection and behaviour settings, read from the environment.
AZURESEARCH_SERVICE = os.environ.get("AZURESEARCH_SERVICE")
AZURESEARCH_INDEX = os.environ.get("AZURESEARCH_INDEX")
AZURESEARCH_API_KEY = os.environ.get("AZURESEARCH_API_KEY")
AZURESEARCH_SEMANTIC_CONFIG = os.environ.get("AZURESEARCH_SEMANTIC_CONFIG")
AZURESEARCH_LANGUAGE = os.environ.get("AZURESEARCH_LANGUAGE", "en-us")
# Any non-empty value disables hybrid search, because the value is only
# tested for truthiness.
AZURESEARCH_DISABLE_HYBRID = os.environ.get("AZURESEARCH_DISABLE_HYBRID")
# Default to 256 dimensions, change if using a different embeddings model.
# Environment values are strings, so convert: the dimension count is a number.
AZURESEARCH_DIMENSIONS = int(os.environ.get("AZURESEARCH_DIMENSIONS", 256))

# The service and index names are mandatory.
assert AZURESEARCH_SERVICE is not None
assert AZURESEARCH_INDEX is not None
# Index field names; each one can be overridden through its environment variable.
FIELDS_ID = os.getenv("AZURESEARCH_FIELDS_ID", "id")
FIELDS_TEXT = os.getenv("AZURESEARCH_FIELDS_TEXT", "text")
FIELDS_EMBEDDING = os.getenv("AZURESEARCH_FIELDS_EMBEDDING", "embedding")
FIELDS_DOCUMENT_ID = os.getenv("AZURESEARCH_FIELDS_DOCUMENT_ID", "document_id")
FIELDS_SOURCE = os.getenv("AZURESEARCH_FIELDS_SOURCE", "source")
FIELDS_SOURCE_ID = os.getenv("AZURESEARCH_FIELDS_SOURCE_ID", "source_id")
FIELDS_URL = os.getenv("AZURESEARCH_FIELDS_URL", "url")
FIELDS_CREATED_AT = os.getenv("AZURESEARCH_FIELDS_CREATED_AT", "created_at")
FIELDS_AUTHOR = os.getenv("AZURESEARCH_FIELDS_AUTHOR", "author")

# Largest number of documents sent in one upload / delete request.
MAX_UPLOAD_BATCH_SIZE = 1000
MAX_DELETE_BATCH_SIZE = 1000
class AzureSearchDataStore(DataStore):
    """DataStore implementation backed by an Azure Search index."""

    def __init__(self):
        """Creates the search client and makes sure the configured index exists."""
        endpoint = f"https://{AZURESEARCH_SERVICE}.search.windows.net"

        # Async client used for document operations.
        self.client = SearchClient(
            endpoint=endpoint,
            index_name=AZURESEARCH_INDEX,
            credential=AzureSearchDataStore._create_credentials(True),
            user_agent="retrievalplugin",
        )

        # Synchronous management client, only needed here for index setup.
        mgmt_client = SearchIndexClient(
            endpoint=endpoint,
            credential=AzureSearchDataStore._create_credentials(False),
            user_agent="retrievalplugin",
        )
        existing_indexes = list(mgmt_client.list_index_names())
        if AZURESEARCH_INDEX in existing_indexes:
            logger.info(
                f"Using existing index {AZURESEARCH_INDEX} in service {AZURESEARCH_SERVICE}"
            )
        else:
            self._create_index(mgmt_client)
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
    """
    Uploads all chunks to the index in batches of at most MAX_UPLOAD_BATCH_SIZE.

    Returns the document ids (the keys of ``chunks``). Raises an Exception
    when the service reports fewer successful uploads than were sent.
    """

    async def flush(batch: List[Dict]) -> None:
        # Send one batch and verify that every document in it was accepted.
        response = await self.client.upload_documents(documents=batch)
        uploaded = sum(1 for outcome in response if outcome.succeeded)
        logger.info(f"Upserted {uploaded} chunks out of {len(batch)}")
        if uploaded < len(batch):
            raise Exception(f"Failed to upload {len(batch) - uploaded} chunks")

    document_ids = []
    pending: List[Dict] = []
    for document_id, document_chunks in chunks.items():
        document_ids.append(document_id)
        for chunk in document_chunks:
            # base64-encode the id string to stay within Azure Search's valid characters for keys
            encoded_id = base64.urlsafe_b64encode(bytes(chunk.id, "utf-8")).decode(
                "ascii"
            )
            metadata = chunk.metadata
            pending.append(
                {
                    FIELDS_ID: encoded_id,
                    FIELDS_TEXT: chunk.text,
                    FIELDS_EMBEDDING: chunk.embedding,
                    FIELDS_DOCUMENT_ID: document_id,
                    FIELDS_SOURCE: metadata.source,
                    FIELDS_SOURCE_ID: metadata.source_id,
                    FIELDS_URL: metadata.url,
                    FIELDS_CREATED_AT: metadata.created_at,
                    FIELDS_AUTHOR: metadata.author,
                }
            )
            if len(pending) >= MAX_UPLOAD_BATCH_SIZE:
                await flush(pending)
                pending = []

    # Send whatever is left over from the last, partial batch.
    if len(pending) > 0:
        await flush(pending)
    return document_ids
async def delete(
    self,
    ids: Optional[List[str]] = None,
    filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
) -> bool:
    """
    Removes chunks by document ids, filter, or everything in the index.

    With ``delete_all`` or a non-empty filter, matching chunks are searched
    and deleted in batches of MAX_DELETE_BATCH_SIZE until the search reports
    no more matches. Each entry of ``ids`` is treated as a document id and
    handled by a recursive call with a ``document_id`` filter.

    Returns True. Raises an Exception when a delete batch is not fully
    successful.
    """
    filter = None if delete_all else self._translate_filter(filter)
    if delete_all or filter is not None:
        # Keys already deleted in this call; the search may keep returning
        # them until the index has caught up.
        deleted = set()
        while True:
            search_result = await self.client.search(
                None,
                filter=filter,
                top=MAX_DELETE_BATCH_SIZE,
                include_total_count=True,
                select=FIELDS_ID,
            )
            if await search_result.get_count() == 0:
                break
            documents = [
                {FIELDS_ID: d[FIELDS_ID]}
                async for d in search_result
                if d[FIELDS_ID] not in deleted
            ]
            if documents:
                logger.info(
                    f"Deleting {len(documents)} chunks "
                    + (
                        "using a filter"
                        if filter is not None
                        else "using delete_all"
                    )
                )
                del_result = await self.client.delete_documents(documents=documents)
                if not all(rr.succeeded for rr in del_result):
                    raise Exception("Failed to delete documents")
                deleted.update(d[FIELDS_ID] for d in documents)
            else:
                # All repeats, delay a bit to let the index refresh and try again.
                # Use the asyncio sleep: time.sleep() would block the event
                # loop and stall every other request for the duration.
                await asyncio.sleep(0.25)

    if ids:
        for document_id in ids:
            logger.info(f"Deleting chunks for document id {document_id}")
            await self.delete(filter=DocumentMetadataFilter(document_id=document_id))
    return True
async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
    """
    Runs all queries concurrently and returns one query result per query,
    each holding the matching document chunks and their scores.
    """
    pending = [self._single_query(query) for query in queries]
    return await asyncio.gather(*pending)
async def _single_query(self, query: QueryWithEmbedding) -> QueryResult:
    """
    Takes in a single query and filters and returns a query result with matching document chunks and scores.

    The vector part always runs. The query text is sent as well unless hybrid
    search is disabled, and semantic ranking is requested when a semantic
    configuration is set and hybrid search is enabled.

    Raises an Exception wrapping any error raised while querying.
    """
    filter = (
        self._translate_filter(query.filter) if query.filter is not None else None
    )

    def field_value(hit, field):
        # A field name of "-" yields None instead of being looked up.
        return hit.get(field) if field != "-" else None

    try:
        use_hybrid = not AZURESEARCH_DISABLE_HYBRID
        use_semantic = AZURESEARCH_SEMANTIC_CONFIG is not None and use_hybrid

        # Ask the vector search for more candidates than are finally returned
        # when a filter and/or the text part of a hybrid query is involved.
        vector_top_k = query.top_k if filter is None else query.top_k * 2
        if use_hybrid:
            vector_top_k *= 2
        if use_semantic:
            # Ensure we're feeding a good number of candidates to the L2 reranker.
            # This has to happen before the Vector is built, otherwise the
            # adjusted value is never sent.
            vector_top_k = max(50, vector_top_k)

        q = query.query if use_hybrid else None
        vector_q = Vector(
            value=query.embedding, k=vector_top_k, fields=FIELDS_EMBEDDING
        )
        if use_semantic:
            r = await self.client.search(
                q,
                filter=filter,
                top=query.top_k,
                vectors=[vector_q],
                query_type=QueryType.SEMANTIC,
                query_language=AZURESEARCH_LANGUAGE,
                semantic_configuration_name=AZURESEARCH_SEMANTIC_CONFIG,
            )
        else:
            r = await self.client.search(
                q, filter=filter, top=query.top_k, vectors=[vector_q]
            )

        results: List[DocumentChunkWithScore] = []
        async for hit in r:
            results.append(
                DocumentChunkWithScore(
                    id=hit[FIELDS_ID],
                    text=hit[FIELDS_TEXT],
                    metadata=DocumentChunkMetadata(
                        document_id=field_value(hit, FIELDS_DOCUMENT_ID),
                        source=field_value(hit, FIELDS_SOURCE) or "file",
                        source_id=field_value(hit, FIELDS_SOURCE_ID),
                        url=field_value(hit, FIELDS_URL),
                        created_at=field_value(hit, FIELDS_CREATED_AT),
                        author=field_value(hit, FIELDS_AUTHOR),
                    ),
                    score=hit["@search.score"],
                )
            )
        return QueryResult(query=query.query, results=results)
    except Exception as e:
        # Keep the original error attached as the cause for debugging.
        raise Exception(f"Error querying the index: {e}") from e
@staticmethod
def _translate_filter(filter: DocumentMetadataFilter) -> Optional[str]:
    """
    Translates a DocumentMetadataFilter into an Azure Search filter string

    String values are quoted with embedded single quotes doubled. Dates must
    look exactly like ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns None when ``filter`` is None or has no field set.
    Raises ValueError when a date does not have the expected format.
    """
    if filter is None:
        return None

    def escape(s: str) -> str:
        # Single quotes are escaped by doubling them.
        return s.replace("'", "''")

    # regex to validate dates are in OData format
    date_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")

    filter_list = []
    if filter.document_id is not None:
        filter_list.append(
            f"{FIELDS_DOCUMENT_ID} eq '{escape(filter.document_id)}'"
        )
    if filter.source is not None:
        filter_list.append(f"{FIELDS_SOURCE} eq '{escape(filter.source)}'")
    if filter.source_id is not None:
        filter_list.append(f"{FIELDS_SOURCE_ID} eq '{escape(filter.source_id)}'")
    if filter.author is not None:
        filter_list.append(f"{FIELDS_AUTHOR} eq '{escape(filter.author)}'")
    # Dates are inserted unquoted, so the whole value has to match the
    # pattern: match() would accept any trailing text after a valid date and
    # let it alter the filter expression.
    if filter.start_date is not None:
        if not date_re.fullmatch(filter.start_date):
            raise ValueError(
                f"start_date must be in OData format, got {filter.start_date}"
            )
        filter_list.append(f"{FIELDS_CREATED_AT} ge {filter.start_date}")
    if filter.end_date is not None:
        if not date_re.fullmatch(filter.end_date):
            raise ValueError(
                f"end_date must be in OData format, got {filter.end_date}"
            )
        filter_list.append(f"{FIELDS_CREATED_AT} le {filter.end_date}")
    return " and ".join(filter_list) if filter_list else None
def _create_index(self, mgmt_client: SearchIndexClient):
    """
    Creates an Azure Cognitive Search index, including a semantic search configuration if a name is specified for it
    """
    announcement = (
        f"Creating index {AZURESEARCH_INDEX} in service {AZURESEARCH_SERVICE}"
    )
    if AZURESEARCH_SEMANTIC_CONFIG is not None:
        announcement += (
            f" with semantic search configuration {AZURESEARCH_SEMANTIC_CONFIG}"
        )
    logger.info(announcement)

    def filterable_string(name):
        # String metadata field that can be used in filters and sorting.
        return SimpleField(
            name=name,
            type=SearchFieldDataType.String,
            filterable=True,
            sortable=True,
        )

    fields = [
        SimpleField(name=FIELDS_ID, type=SearchFieldDataType.String, key=True),
        SearchableField(
            name=FIELDS_TEXT,
            type=SearchFieldDataType.String,
            analyzer_name="standard.lucene",
        ),
        SearchField(
            name=FIELDS_EMBEDDING,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            hidden=False,
            searchable=True,
            filterable=False,
            sortable=False,
            facetable=False,
            vector_search_dimensions=AZURESEARCH_DIMENSIONS,
            vector_search_configuration="default",
        ),
        filterable_string(FIELDS_DOCUMENT_ID),
        filterable_string(FIELDS_SOURCE),
        filterable_string(FIELDS_SOURCE_ID),
        SimpleField(name=FIELDS_URL, type=SearchFieldDataType.String),
        SimpleField(
            name=FIELDS_CREATED_AT,
            type=SearchFieldDataType.DateTimeOffset,
            filterable=True,
            sortable=True,
        ),
        filterable_string(FIELDS_AUTHOR),
    ]

    # Only configure semantic search when a configuration name was provided.
    semantic_settings = None
    if AZURESEARCH_SEMANTIC_CONFIG is not None:
        semantic_settings = SemanticSettings(
            configurations=[
                SemanticConfiguration(
                    name=AZURESEARCH_SEMANTIC_CONFIG,
                    prioritized_fields=PrioritizedFields(
                        title_field=None,
                        prioritized_content_fields=[
                            SemanticField(field_name=FIELDS_TEXT)
                        ],
                    ),
                )
            ]
        )

    vector_search = VectorSearch(
        algorithm_configurations=[
            HnswVectorSearchAlgorithmConfiguration(
                name="default",
                kind="hnsw",
                # Could change to dotproduct for OpenAI's embeddings since they normalize vectors to unit length
                hnsw_parameters=HnswParameters(metric="cosine"),
            )
        ]
    )

    mgmt_client.create_index(
        SearchIndex(
            name=AZURESEARCH_INDEX,
            fields=fields,
            semantic_settings=semantic_settings,
            vector_search=vector_search,
        )
    )
@staticmethod
def _create_credentials(
    use_async: bool,
) -> Union[AzureKeyCredential, DefaultAzureCredential, DefaultAzureCredentialSync]:
    """
    Returns the credential used to talk to Azure Search.

    An API key credential is used when AZURESEARCH_API_KEY is set; otherwise
    a default Azure credential, in its async or sync flavour depending on
    ``use_async``.
    """
    if AZURESEARCH_API_KEY is not None:
        logger.info("Using an API key to authenticate with Azure Search")
        return AzureKeyCredential(AZURESEARCH_API_KEY)

    logger.info(
        "Using DefaultAzureCredential for Azure Search, make sure local identity or managed identity are set up appropriately"
    )
    if use_async:
        return DefaultAzureCredential()
    return DefaultAzureCredentialSync()
================================================
FILE: datastore/providers/chroma_datastore.py
================================================
"""
Chroma datastore support for the ChatGPT retrieval plugin.
Consult the Chroma docs and GitHub repo for more information:
- https://docs.trychroma.com/usage-guide?lang=py
- https://github.com/chroma-core/chroma
- https://www.trychroma.com/
"""
import os
from datetime import datetime
from typing import Dict, List, Optional
import chromadb
from datastore.datastore import DataStore
from models.models import (
Document,
DocumentChunk,
DocumentChunkMetadata,
DocumentChunkWithScore,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
Source,
)
from services.chunks import get_document_chunks
# Environment variables are strings, and every non-empty string is truthy, so
# CHROMA_IN_MEMORY="False" used to keep the in-memory mode switched on. Parse
# the usual "off" spellings explicitly; any other value still enables it.
CHROMA_IN_MEMORY = os.environ.get("CHROMA_IN_MEMORY", "True").strip().lower() not in (
    "",
    "0",
    "false",
    "no",
    "off",
)
# Directory used to persist the in-memory database.
CHROMA_PERSISTENCE_DIR = os.environ.get("CHROMA_PERSISTENCE_DIR", "openai")
# Host and port of the Chroma server, used when not running in memory.
CHROMA_HOST = os.environ.get("CHROMA_HOST", "http://127.0.0.1")
CHROMA_PORT = os.environ.get("CHROMA_PORT", "8000")
# Name of the collection that holds the chunks.
CHROMA_COLLECTION = os.environ.get("CHROMA_COLLECTION", "openaiembeddings")
class ChromaDataStore(DataStore):
    """DataStore implementation backed by a Chroma collection."""

    def __init__(
        self,
        in_memory: bool = CHROMA_IN_MEMORY,  # type: ignore
        persistence_dir: Optional[str] = CHROMA_PERSISTENCE_DIR,
        collection_name: str = CHROMA_COLLECTION,
        host: str = CHROMA_HOST,
        port: str = CHROMA_PORT,
        client: Optional[chromadb.Client] = None,
    ):
        """
        Picks the Chroma client and opens (or creates) the collection.

        A ``client`` passed in is used as is. Otherwise an in-memory client
        is created when ``in_memory`` is truthy (persisted to
        ``persistence_dir`` if one is given), else a REST client for
        ``host``/``port``.
        """
        if client:
            self._client = client
        elif in_memory:
            if persistence_dir:
                settings = chromadb.config.Settings(
                    chroma_db_impl="duckdb+parquet",
                    persist_directory=persistence_dir,
                )
            else:
                settings = chromadb.config.Settings()
            self._client = chromadb.Client(settings=settings)
        else:
            rest_settings = chromadb.config.Settings(
                chroma_api_impl="rest",
                chroma_server_host=host,
                chroma_server_http_port=port,
            )
            self._client = chromadb.Client(settings=rest_settings)

        # No embedding function: embeddings are always supplied by the caller.
        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            embedding_function=None,
        )
async def upsert(
    self, documents: List[Document], chunk_token_size: Optional[int] = None
) -> List[str]:
    """
    Splits the documents into chunks and stores them. If an id already exists, the document is updated.
    Return a list of document ids.
    """
    # Chroma has a true upsert, so we don't need to delete first
    document_chunks = get_document_chunks(documents, chunk_token_size)
    stored_ids = await self._upsert(document_chunks)
    return stored_ids
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
    """
    Upserts the chunks of all documents into the collection with one call.
    Return a list of document ids.
    """
    # Flatten once; the four parallel lists below share this order.
    all_chunks = [
        chunk for document_chunks in chunks.values() for chunk in document_chunks
    ]
    self._collection.upsert(
        ids=[chunk.id for chunk in all_chunks],
        embeddings=[chunk.embedding for chunk in all_chunks],
        documents=[chunk.text for chunk in all_chunks],
        metadatas=[
            self._process_metadata_for_storage(chunk.metadata)
            for chunk in all_chunks
        ],
    )
    return list(chunks)
def _where_from_query_filter(self, query_filter: DocumentMetadataFilter) -> Dict:
    """
    Converts a metadata filter into a Chroma ``where`` dictionary.

    Set fields are copied as equality conditions, ``source`` by its enum
    value, and the date bounds become conditions on ``created_at`` expressed
    as integer timestamps.
    """
    special_fields = ("start_date", "end_date", "source")
    where: Dict = {}
    for field, value in query_filter.dict().items():
        if value is None or field in special_fields:
            continue
        where[field] = value

    if query_filter.source:
        where["source"] = query_filter.source.value

    def to_epoch(iso_date: str) -> int:
        # ISO date string -> whole seconds since the epoch.
        return int(datetime.fromisoformat(iso_date).timestamp())

    start, end = query_filter.start_date, query_filter.end_date
    if start and end:
        # Two conditions on the same key have to be combined with $and.
        where["$and"] = [
            {"created_at": {"$gte": to_epoch(start)}},
            {"created_at": {"$lte": to_epoch(end)}},
        ]
    elif start:
        where["created_at"] = {"$gte": to_epoch(start)}
    elif end:
        where["created_at"] = {"$lte": to_epoch(end)}
    return where
def _process_metadata_for_storage(self, metadata: DocumentChunkMetadata) -> Dict:
    """
    Builds the metadata dictionary stored with a chunk.

    Only truthy fields are included; ``source`` is stored by its enum value
    and ``created_at`` as an integer timestamp.
    """
    stored: Dict = {}
    if metadata.source:
        stored["source"] = metadata.source.value
    for field in ("source_id", "url"):
        value = getattr(metadata, field)
        if value:
            stored[field] = value
    if metadata.created_at:
        parsed = datetime.fromisoformat(metadata.created_at)
        stored["created_at"] = int(parsed.timestamp())
    for field in ("author", "document_id"):
        value = getattr(metadata, field)
        if value:
            stored[field] = value
    return stored
def _process_metadata_from_storage(self, metadata: Dict) -> DocumentChunkMetadata:
    """
    Rebuilds chunk metadata from a stored metadata dictionary.

    Missing keys become None; a stored ``created_at`` timestamp is turned
    back into an ISO date string.
    """
    source = Source(metadata["source"]) if "source" in metadata else None
    created_at = None
    if "created_at" in metadata:
        created_at = datetime.fromtimestamp(metadata["created_at"]).isoformat()
    return DocumentChunkMetadata(
        source=source,
        source_id=metadata.get("source_id", None),
        url=metadata.get("url", None),
        created_at=created_at,
        author=metadata.get("author", None),
        document_id=metadata.get("document_id", None),
    )
async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
    """
    Runs every query against the collection and returns one query result per
    query with the matching document chunks; the score of a chunk is the
    distance reported by the collection.
    """
    # First pass: execute all queries.
    raw_results = []
    for query in queries:
        # Never ask for more results than there are stored entries.
        n_results = min(query.top_k, self._collection.count())  # type: ignore
        where = self._where_from_query_filter(query.filter) if query.filter else {}
        raw_results.append(
            self._collection.query(
                query_embeddings=[query.embedding],
                include=["documents", "distances", "metadatas"],  # embeddings
                n_results=n_results,
                where=where,
            )
        )

    # Second pass: convert the raw answers into query results.
    output = []
    for query, raw in zip(queries, raw_results):
        # One embedding was sent per query, so each field holds one inner list.
        (ids,) = raw["ids"]
        (documents,) = raw["documents"]
        (metadatas,) = raw["metadatas"]
        (distances,) = raw["distances"]
        # embeddings are not returned (https://github.com/openai/chatgpt-retrieval-plugin/pull/59#discussion_r1154985153)
        scored_chunks = [
            DocumentChunkWithScore(
                id=chunk_id,
                text=text,
                metadata=self._process_metadata_from_storage(metadata),
                score=distance,
            )
            for chunk_id, text, metadata, distance in zip(
                ids, documents, metadatas, distances
            )
        ]
        output.append(QueryResult(query=query.query, results=scored_chunks))
    return output
async def delete(
    self,
    ids: Optional[List[str]] = None,
    filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
) -> bool:
    """
    Removes vectors by ids, filter, or everything in the datastore.
    Multiple parameters can be used at once.
    Returns whether the operation was successful.

    ``ids`` are document ids. When both ``ids`` and ``filter`` are given,
    only entries matching both are removed. When no selector yields any
    condition, nothing is deleted and True is returned.
    """
    if delete_all:
        self._collection.delete()
        return True

    # A filter without any set field produces no condition at all.
    filter_where = self._where_from_query_filter(filter) if filter else {}

    where_clause = None
    if ids:
        if len(ids) > 1:
            where_clause = {"$or": [{"document_id": id_} for id_ in ids]}
        else:
            (id_,) = ids
            where_clause = {"document_id": id_}
        if filter_where:
            where_clause = {"$and": [filter_where, where_clause]}
    elif filter_where:
        where_clause = filter_where

    if where_clause is None:
        # Nothing was selected. Previously this path hit an unbound
        # ``where_clause``; treat it as a successful no-op instead.
        return True

    self._collection.delete(where=where_clause)
    return True
================================================
FILE: datastore/providers/elasticsearch_datastore.py
================================================
import os
from typing import Dict, List, Any, Optional
import elasticsearch
from elasticsearch import Elasticsearch, helpers
from loguru import logger
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkWithScore,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
)
from services.date import to_unix_timestamp
# Connection settings, read from the environment.
ELASTICSEARCH_URL = os.getenv("ELASTICSEARCH_URL", "http://localhost:9200")
ELASTICSEARCH_CLOUD_ID = os.getenv("ELASTICSEARCH_CLOUD_ID")
ELASTICSEARCH_USERNAME = os.getenv("ELASTICSEARCH_USERNAME")
ELASTICSEARCH_PASSWORD = os.getenv("ELASTICSEARCH_PASSWORD")
ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY")

# Index settings.
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
ELASTICSEARCH_REPLICAS = int(os.getenv("ELASTICSEARCH_REPLICAS", "1"))
ELASTICSEARCH_SHARDS = int(os.getenv("ELASTICSEARCH_SHARDS", "1"))

# Number of dimensions of the stored embeddings (EMBEDDING_DIMENSION, default 256).
VECTOR_SIZE = int(os.getenv("EMBEDDING_DIMENSION", 256))
UPSERT_BATCH_SIZE = 100
class ElasticsearchDataStore(DataStore):
    """DataStore implementation backed by an Elasticsearch index."""

    def __init__(
        self,
        index_name: Optional[str] = None,
        vector_size: int = VECTOR_SIZE,
        similarity: str = "cosine",
        replicas: int = ELASTICSEARCH_REPLICAS,
        shards: int = ELASTICSEARCH_SHARDS,
        recreate_index: bool = True,
    ):
        """
        Args:
            index_name: Name of the index to be used; falls back to the
                ELASTICSEARCH_INDEX environment variable
            vector_size: Size of the embedding stored in a collection
            similarity:
                Any of "cosine" / "l2_norm" / "dot_product".
            replicas: Number of replicas, zero or more
            shards: Number of shards, at least one
            recreate_index: Whether the index is recreated during set-up

        Raises:
            AssertionError: if an argument is invalid or no index name is
                configured.
        """
        assert similarity in [
            "cosine",
            "l2_norm",
            "dot_product",
        ], "Similarity must be one of 'cosine' / 'l2_norm' / 'dot_product'."
        # Zero replicas is accepted, as the message always stated.
        assert replicas >= 0, "Replicas must be greater than or equal to 0."
        assert shards > 0, "Shards must be greater than 0."

        # Resolve the name first and check the result: comparing each source
        # against "" let a missing name (None) slip through unnoticed.
        resolved_index_name = index_name or ELASTICSEARCH_INDEX
        assert resolved_index_name, "Please provide an index name."

        self.client = connect_to_elasticsearch(
            ELASTICSEARCH_URL,
            ELASTICSEARCH_CLOUD_ID,
            ELASTICSEARCH_API_KEY,
            ELASTICSEARCH_USERNAME,
            ELASTICSEARCH_PASSWORD,
        )
        self.index_name = resolved_index_name

        # Set up the collection so the documents might be inserted or queried
        self._set_up_index(vector_size, similarity, replicas, shards, recreate_index)
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
    """
    Takes in a mapping of document id to document chunks and indexes all
    chunks with a single bulk request.
    Return a list of document ids.
    """
    actions: List[Dict] = []
    for chunk_list in chunks.values():
        for chunk in chunk_list:
            # extend() appends in place; rebuilding the list with ``+`` for
            # every chunk copies it each time.
            actions.extend(
                self._convert_document_chunk_to_es_document_operation(chunk)
            )

    # Nothing to index: skip the request instead of sending a bulk call
    # without operations.
    if actions:
        self.client.bulk(operations=actions, index=self.index_name)
    return list(chunks.keys())
async def _query(
    self,
    queries: List[QueryWithEmbedding],
) -> List[QueryResult]:
    """
    Sends all queries in one multi-search request and returns one query
    result per query with the matching document chunks and scores.
    """
    searches = self._convert_queries_to_msearch_query(queries)
    responses = self.client.msearch(searches=searches)["responses"]

    query_results: List[QueryResult] = []
    for query, response in zip(queries, responses):
        chunks = [
            self._convert_hit_to_document_chunk_with_score(hit)
            for hit in response["hits"]["hits"]
        ]
        query_results.append(QueryResult(query=query.query, results=chunks))
    return query_results
async def delete(
    self,
    ids: Optional[List[str]] = None,
    filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
) -> bool:
    """
    Removes vectors by ids, filter, or everything in the datastore.
    Returns whether the operation was successful.

    ``delete_all`` wins over the other selectors; otherwise the filter and
    the document ids are both applied. Errors are logged and re-raised.
    """
    index = self.index_name

    # Delete all vectors from the index if delete_all is True
    if delete_all:
        try:
            logger.info("Deleting all vectors from index")
            self.client.delete_by_query(index=index, query={"match_all": {}})
            logger.info("Deleted all vectors successfully")
            return True
        except Exception as e:
            logger.error(f"Error deleting all vectors: {e}")
            raise e

    # Convert the metadata filter object to a dict with elasticsearch filter expressions
    filter_query = self._get_es_filters(filter)

    # Delete vectors that match the filter from the index if the filter is not empty
    if filter_query != {}:
        try:
            logger.info(f"Deleting vectors with filter {filter_query}")
            self.client.delete_by_query(index=index, query=filter_query)
            logger.info("Deleted vectors with filter successfully")
        except Exception as e:
            logger.error(f"Error deleting vectors with filter: {e}")
            raise e

    if ids:
        try:
            document_ids = list(ids)
            logger.info(f"Deleting {len(document_ids)} documents")
            self.client.delete_by_query(
                index=index,
                query={"terms": {"metadata.document_id": document_ids}},
            )
            logger.info("Deleted documents successfully")
        except Exception as e:
            logger.error(f"Error deleting documents: {e}")
            raise e

    return True
def _get_es_filters(
    self, filter: Optional[DocumentMetadataFilter] = None
) -> Dict[str, Any]:
    """
    Converts a metadata filter into an Elasticsearch ``bool`` query.

    Returns an empty dict when ``filter`` is None or has no field set, so
    callers can tell "no criteria" apart from a real query.
    """
    if filter is None:
        return {}

    # For each field in the MetadataFilter, check if it has a value and add the corresponding filter expression
    # For start_date and end_date, uses the range query - gte and lte operators respectively
    # For other fields, uses the term query
    must: List[Dict[str, Any]] = []
    for field, value in filter.dict().items():
        if value is None:
            continue
        if field == "start_date":
            must.append({"range": {"created_at": {"gte": to_unix_timestamp(value)}}})
        elif field == "end_date":
            must.append({"range": {"created_at": {"lte": to_unix_timestamp(value)}}})
        else:
            must.append({"term": {f"metadata.{field}": value}})

    if not must:
        # Without clauses the query below would be a bool with an empty
        # "must", which is expected to match every document; passed to
        # delete_by_query that would delete everything. Report "no filter".
        return {}
    return {"bool": {"must": must}}
def _convert_document_chunk_to_es_document_operation(
    self, document_chunk: DocumentChunk
) -> List[Dict]:
    """
    Builds the two bulk entries for one chunk: the ``index`` action line
    followed by the document source.
    """
    metadata = document_chunk.metadata
    created_at = None
    if metadata.created_at is not None:
        created_at = to_unix_timestamp(metadata.created_at)

    action = {"index": {"_index": self.index_name, "_id": document_chunk.id}}
    document = {
        "id": document_chunk.id,
        "text": document_chunk.text,
        "metadata": metadata.dict(),
        # Stored at the top level in addition to the copy inside metadata.
        "created_at": created_at,
        "embedding": document_chunk.embedding,
    }
    return [action, document]
def _convert_queries_to_msearch_query(self, queries: List[QueryWithEmbedding]):
searches = []
for query in queries:
searches.append({"index": self.index_name})
searches.append(
{
"_source": True,
"knn": {
"field": "embedding",
"query_vector": query.embedding,
"k": query.top_k,
"num_candidates": query.top_k,
},
"size": query.top_k,
}
)
return searches
def _convert_hit_to_document_chunk_with_score(self, hit) -> DocumentChunkWithScore:
    """Map one search hit (`_id`, `_score`, `_source`) onto a DocumentChunkWithScore."""
    source = hit["_source"]
    return DocumentChunkWithScore(
        id=hit["_id"],
        text=source["text"],  # type: ignore
        metadata=source["metadata"],  # type: ignore
        embedding=source["embedding"],  # type: ignore
        score=hit["_score"],
    )
def _set_up_index(
self,
vector_size: int,
similarity: str,
replicas: int,
shards: int,
recreate_index: bool,
) -> None:
if recreate_index:
self._recreate_index(similarity, vector_size, replicas, shards)
try:
index_mapping = self.client.indices.get_mapping(index=self.index_name)
current_similarity = index_mapping[self.index_name]["mappings"]["properties"]["embedding"]["similarity"] # type: ignore
current_vector_size = index_mapping[self.index_name]["mappings"]["properties"]["embedding"]["dims"] # type: ignore
if current_similarity != similarity:
raise ValueError(
f"Collection '{self.index_name}' already exists in Elasticsearch, "
f"but it is configured with a similarity '{current_similarity}'. "
f"If you want to use that collection, but with a different "
f"similarity, please set `recreate_index=True` argument."
)
if current_vector_size != vector_size:
raise ValueError(
f"Collection '{self.index_name}' already exists in Elasticsearch, "
f"but it is configured with a vector size '{current_vector_size}'. "
f"If you want to use that collection, but with a different "
f"vector size, please set `recreate_index=True` argument."
)
except elasticsearch.exceptions.NotFoundError:
self._recreate_index(similarity, vector_size, replicas, shards)
def _recreate_index(
self, similarity: str, vector_size: int, replicas: int, shards: int
) -> None:
settings = {
"index": {
"number_of_shards": shards,
"number_of_replicas": replicas,
"refresh_interval": "1s",
}
}
mappings = {
"properties": {
"embedding": {
"type": "dense_vector",
"dims": vector_size,
"index": True,
"similarity": similarity,
}
}
}
self.client.indices.delete(
index=self.index_name, ignore_unavailable=True, allow_no_indices=True
)
self.client.indices.create(
index=self.index_name, mappings=mappings, settings=settings
)
def connect_to_elasticsearch(
    elasticsearch_url=None, cloud_id=None, api_key=None, username=None, password=None
):
    """
    Create an Elasticsearch client and verify it with an `info()` call.

    Exactly one of `elasticsearch_url` or `cloud_id` must be given. An
    `api_key` takes precedence over `username`/`password`; with neither, the
    client is created without credentials and a warning is logged.

    Raises:
        ValueError: both or neither of `elasticsearch_url` / `cloud_id` were given.
        Exception: whatever `info()` raises is logged and re-raised.
    """
    if elasticsearch_url and cloud_id:
        raise ValueError(
            "Both elasticsearch_url and cloud_id are defined. Please provide only one."
        )
    if not elasticsearch_url and not cloud_id:
        raise ValueError("Please provide either elasticsearch_url or cloud_id.")

    # Where to connect.
    if elasticsearch_url:
        connection_params = {"hosts": [elasticsearch_url]}
    else:
        connection_params = {"cloud_id": cloud_id}

    # How to authenticate.
    if api_key:
        connection_params["api_key"] = api_key
    elif username and password:
        connection_params["basic_auth"] = (username, password)
    else:
        logger.warning(
            "No authentication details provided. Please consider using an api_key or username and password to secure your connection."
        )

    es_client = Elasticsearch(**connection_params)
    try:
        es_client.info()
    except Exception as e:
        logger.error(f"Error connecting to Elasticsearch: {e}")
        raise
    return es_client
================================================
FILE: datastore/providers/llama_datastore.py
================================================
import json
import os
from typing import Dict, List, Optional, Type
from loguru import logger
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
DocumentChunkWithScore,
DocumentMetadataFilter,
Query,
QueryResult,
QueryWithEmbedding,
)
from llama_index.indices.base import BaseGPTIndex
from llama_index.indices.vector_store.base import GPTVectorStoreIndex
from llama_index.indices.query.schema import QueryBundle
from llama_index.response.schema import Response
from llama_index.data_structs.node_v2 import Node, DocumentRelationship, NodeWithScore
from llama_index.indices.registry import INDEX_STRUCT_TYPE_TO_INDEX_CLASS
from llama_index.data_structs.struct_type import IndexStructType
from llama_index.indices.response.builder import ResponseMode
# Index struct type to build, read from LLAMA_INDEX_TYPE (defaults to the SIMPLE_DICT struct type).
INDEX_STRUCT_TYPE_STR = os.environ.get(
"LLAMA_INDEX_TYPE", IndexStructType.SIMPLE_DICT.value
)
# Optional path of a saved index JSON; when set, the index is loaded from it instead of created empty.
INDEX_JSON_PATH = os.environ.get("LLAMA_INDEX_JSON_PATH", None)
# Optional path of a JSON file holding extra keyword arguments for index queries.
QUERY_KWARGS_JSON_PATH = os.environ.get("LLAMA_QUERY_KWARGS_JSON_PATH", None)
# Response mode passed to index queries, read from LLAMA_RESPONSE_MODE (defaults to NO_TEXT).
RESPONSE_MODE = os.environ.get("LLAMA_RESPONSE_MODE", ResponseMode.NO_TEXT.value)
# Index struct types that _create_or_load_index refuses with "Please use vector store directly."
EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES = [
IndexStructType.DICT,
IndexStructType.WEAVIATE,
IndexStructType.PINECONE,
IndexStructType.QDRANT,
IndexStructType.CHROMA,
IndexStructType.VECTOR_STORE,
]
def _create_or_load_index(
    index_type_str: Optional[str] = None,
    index_json_path: Optional[str] = None,
    index_type_to_index_cls: Optional[dict[str, Type[BaseGPTIndex]]] = None,
) -> BaseGPTIndex:
    """Create an empty index, or load one from disk when a JSON path is available.

    Every argument falls back to its module-level default (environment-driven)
    when it is missing or falsy.

    Raises:
        ValueError: the index type is not in the registry, or it is one of
            the external vector store types.
    """
    json_path = index_json_path or INDEX_JSON_PATH
    registry = index_type_to_index_cls or INDEX_STRUCT_TYPE_TO_INDEX_CLASS
    index_type = IndexStructType(index_type_str or INDEX_STRUCT_TYPE_STR)

    if index_type not in registry:
        raise ValueError(f"Unknown index type: {index_type}")
    if index_type in EXTERNAL_VECTOR_STORE_INDEX_STRUCT_TYPES:
        raise ValueError("Please use vector store directly.")

    index_cls = registry[index_type]
    if json_path is not None:
        return index_cls.load_from_disk(json_path)  # Load index from disk
    return index_cls(nodes=[])  # Create empty index
def _create_or_load_query_kwargs(
query_kwargs_json_path: Optional[str] = None,
) -> Optional[dict]:
"""Create or load query kwargs from json path."""
query_kwargs_json_path = query_kwargs_json_path or QUERY_KWARGS_JSON_PATH
query_kargs: Optional[dict] = None
if query_kwargs_json_path is not None:
with open(INDEX_JSON_PATH, "r") as f:
query_kargs = json.load(f)
return query_kargs
def _doc_chunk_to_node(doc_chunk: DocumentChunk, source_doc_id: str) -> Node:
    """Build a Node from a document chunk, recording `source_doc_id` as its SOURCE relationship."""
    relationships = {DocumentRelationship.SOURCE: source_doc_id}
    extra_info = doc_chunk.metadata.dict()
    return Node(
        doc_id=doc_chunk.id,
        text=doc_chunk.text,
        embedding=doc_chunk.embedding,
        extra_info=extra_info,
        relationships=relationships,
    )
def _query_with_embedding_to_query_bundle(query: QueryWithEmbedding) -> QueryBundle:
    """Wrap the query text and its embedding in a QueryBundle."""
    text, vector = query.query, query.embedding
    return QueryBundle(query_str=text, embedding=vector)
def _source_node_to_doc_chunk_with_score(
    node_with_score: NodeWithScore,
) -> DocumentChunkWithScore:
    """Convert a scored node into a DocumentChunkWithScore.

    Missing `extra_info` yields default metadata; a missing score becomes 1.0.
    """
    node = node_with_score.node

    extra_info = node.extra_info
    if extra_info is None:
        metadata = DocumentChunkMetadata()
    else:
        metadata = DocumentChunkMetadata(**extra_info)

    score = node_with_score.score
    if score is None:
        score = 1.0

    return DocumentChunkWithScore(
        id=node.doc_id,
        text=node.text,
        score=score,
        metadata=metadata,
    )
def _response_to_query_result(
    response: Response, query: QueryWithEmbedding
) -> QueryResult:
    """Turn the source nodes of a response into the QueryResult for `query`."""
    chunks = list(map(_source_node_to_doc_chunk_with_score, response.source_nodes))
    return QueryResult(query=query.query, results=chunks)
class LlamaDataStore(DataStore):
    """DataStore that keeps document chunks as nodes of a LlamaIndex index."""

    def __init__(
        self, index: Optional[BaseGPTIndex] = None, query_kwargs: Optional[dict] = None
    ):
        """
        Args:
            index: Index to use; when missing, one is created or loaded via
                _create_or_load_index().
            query_kwargs: Extra keyword arguments for every query; when missing
                or empty, they are loaded via _create_or_load_query_kwargs().
        """
        self._index = index or _create_or_load_index()
        self._query_kwargs = query_kwargs or _create_or_load_query_kwargs()

    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Takes in a mapping of document id to document chunks and inserts the
        chunks into the index as nodes.
        Return a list of document ids.
        """
        doc_ids = []
        for doc_id, doc_chunks in chunks.items():
            logger.debug(f"Upserting {doc_id} with {len(doc_chunks)} chunks")

            nodes = [
                _doc_chunk_to_node(doc_chunk=doc_chunk, source_doc_id=doc_id)
                for doc_chunk in doc_chunks
            ]

            self._index.insert_nodes(nodes)
            doc_ids.append(doc_id)
        return doc_ids

    async def _query(
        self,
        queries: List[QueryWithEmbedding],
    ) -> List[QueryResult]:
        """
        Takes in a list of queries with embeddings and filters and
        returns a list of query results with matching document chunks and scores.
        Filters are not supported; they are ignored with a warning.
        """
        query_result_all = []
        for query in queries:
            if query.filter is not None:
                logger.warning("Filters are not supported yet, ignoring for now.")

            query_bundle = _query_with_embedding_to_query_bundle(query)

            # Work on a per-query copy so that adding `similarity_top_k` below
            # never mutates the kwargs stored on the instance (or the dict the
            # caller handed to the constructor).
            if self._query_kwargs is not None:
                query_kwargs = dict(self._query_kwargs)
            else:
                query_kwargs = {}

            # TODO: support top_k for other indices
            if isinstance(self._index, GPTVectorStoreIndex):
                query_kwargs["similarity_top_k"] = query.top_k

            response = await self._index.aquery(
                query_bundle, response_mode=RESPONSE_MODE, **query_kwargs
            )

            query_result = _response_to_query_result(response, query)
            query_result_all.append(query_result)

        return query_result_all

    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Removes vectors by ids.
        `delete_all` and `filter` are not supported: each logs a warning and
        makes the call return False without deleting anything.
        Returns whether the operation was successful.
        """
        if delete_all:
            logger.warning("Delete all not supported yet.")
            return False

        if filter is not None:
            logger.warning("Filters are not supported yet.")
            return False

        if ids is not None:
            for id_ in ids:
                try:
                    self._index.delete(id_)
                except NotImplementedError:
                    # NOTE: some indices does not support delete yet.
                    logger.warning(f"{type(self._index)} does not support delete yet.")
                    return False

        return True
================================================
FILE: datastore/providers/milvus_datastore.py
================================================
import json
import os
import asyncio
from loguru import logger
from typing import Dict, List, Optional
from pymilvus import (
Collection,
connections,
utility,
FieldSchema,
DataType,
CollectionSchema,
MilvusException,
)
from uuid import uuid4
from services.date import to_unix_timestamp
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
Source,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
DocumentChunkWithScore,
)
# Collection name; a random "c" + uuid hex name is generated when MILVUS_COLLECTION is unset or empty.
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION") or "c" + uuid4().hex
# Server address. MILVUS_PORT is a str when it comes from the environment and the int 19530 otherwise.
MILVUS_HOST = os.environ.get("MILVUS_HOST") or "localhost"
MILVUS_PORT = os.environ.get("MILVUS_PORT") or 19530
# Optional credentials; the `secure` connection flag is switched on whenever a password is set.
MILVUS_USER = os.environ.get("MILVUS_USER")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD")
MILVUS_USE_SECURITY = False if MILVUS_PASSWORD is None else True
# Optional JSON strings with index / search parameters (parsed with json.loads in _create_index).
MILVUS_INDEX_PARAMS = os.environ.get("MILVUS_INDEX_PARAMS")
MILVUS_SEARCH_PARAMS = os.environ.get("MILVUS_SEARCH_PARAMS")
# Optional override of the collection consistency level.
MILVUS_CONSISTENCY_LEVEL = os.environ.get("MILVUS_CONSISTENCY_LEVEL")
# Batch size used by _upsert when slicing the data to insert.
UPSERT_BATCH_SIZE = 100
# Dimension of the embedding vector field, read from EMBEDDING_DIMENSION (default 256).
OUTPUT_DIM = int(os.environ.get("EMBEDDING_DIMENSION", 256))
# Name of the vector field that is indexed and searched.
EMBEDDING_FIELD = "embedding"
class Required:
    """Sentinel used as the schema "default" of fields that must be supplied."""
# The fields stored within Milvus. Each entry is a tuple of
# (field name, FieldSchema used for schema creation, default value);
# `Required` in the default slot marks a field that has no default.
SCHEMA_V1 = [
(
"pk",
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
Required,
),
(
EMBEDDING_FIELD,
FieldSchema(name=EMBEDDING_FIELD, dtype=DataType.FLOAT_VECTOR, dim=OUTPUT_DIM),
Required,
),
(
"text",
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
Required,
),
(
"document_id",
FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=65535),
"",
),
(
"source_id",
FieldSchema(name="source_id", dtype=DataType.VARCHAR, max_length=65535),
"",
),
(
"id",
FieldSchema(
name="id",
dtype=DataType.VARCHAR,
max_length=65535,
),
"",
),
(
"source",
FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=65535),
"",
),
("url", FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=65535), ""),
("created_at", FieldSchema(name="created_at", dtype=DataType.INT64), -1),
(
"author",
FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=65535),
"",
),
]
# V2 schema: remove the "pk" field and make "id" (index 4 after the slice) the primary key.
# NOTE: the slice is shallow, so both schemas share the same FieldSchema objects;
# marking "id" as primary here therefore also changes the "id" FieldSchema held by SCHEMA_V1.
SCHEMA_V2 = SCHEMA_V1[1:]
SCHEMA_V2[4][1].is_primary = True
class MilvusDataStore(DataStore):
    """DataStore that keeps document chunks and their metadata in a Milvus collection."""

    def __init__(
        self,
        create_new: Optional[bool] = False,
        consistency_level: str = "Bounded",
    ):
        """Create a Milvus DataStore.

        The Milvus Datastore allows for storing your indexes and metadata within a Milvus instance.

        Args:
            create_new (Optional[bool], optional): Whether to overwrite if collection already exists. Defaults to False.
            consistency_level(str, optional): Specify the collection consistency level.
                                                Defaults to "Bounded" for search performance.
                                                Set to "Strong" in test cases for result validation.
        """
        # Overwrite the default consistency level by MILVUS_CONSISTENCY_LEVEL
        self._consistency_level = MILVUS_CONSISTENCY_LEVEL or consistency_level
        self._create_connection()

        self._create_collection(MILVUS_COLLECTION, create_new)  # type: ignore
        self._create_index()

    def _get_schema(self):
        """Return the schema table (SCHEMA_V1 or SCHEMA_V2) matching the collection in use."""
        return SCHEMA_V1 if self._schema_ver == "V1" else SCHEMA_V2

    def _create_connection(self):
        """Reuse an existing connection to the configured server or open a new one.

        The connection alias is stored in `self.alias`. Errors are logged and
        not re-raised.
        """
        try:
            self.alias = ""
            # Check if the connection already exists
            for x in connections.list_connections():
                addr = connections.get_connection_addr(x[0])
                if (
                    x[1]
                    and ("address" in addr)
                    and (addr["address"] == "{}:{}".format(MILVUS_HOST, MILVUS_PORT))
                ):
                    self.alias = x[0]
                    logger.info(
                        "Reuse connection to Milvus server '{}:{}' with alias '{:s}'".format(
                            MILVUS_HOST, MILVUS_PORT, self.alias
                        )
                    )
                    break

            # Connect to the Milvus instance using the passed in Environment variables
            if len(self.alias) == 0:
                self.alias = uuid4().hex
                connections.connect(
                    alias=self.alias,
                    host=MILVUS_HOST,
                    port=MILVUS_PORT,
                    user=MILVUS_USER,  # type: ignore
                    password=MILVUS_PASSWORD,  # type: ignore
                    secure=MILVUS_USE_SECURITY,
                )
                logger.info(
                    "Create connection to Milvus server '{}:{}' with alias '{:s}'".format(
                        MILVUS_HOST, MILVUS_PORT, self.alias
                    )
                )
        except Exception as e:
            logger.error(
                "Failed to create connection to Milvus server '{}:{}', error: {}".format(
                    MILVUS_HOST, MILVUS_PORT, e
                )
            )

    def _create_collection(self, collection_name, create_new: bool) -> None:
        """Create a collection based on environment and passed in variables.

        New collections are always created with the V2 schema. For an existing
        collection the schema version is detected from whether its "id" field
        is the primary key. Errors are logged and not re-raised.

        Args:
            collection_name: Name of the collection to create or open.
            create_new (bool): Whether to overwrite if collection already exists.
        """
        try:
            self._schema_ver = "V1"
            # If the collection exists and create_new is True, drop the existing collection
            if utility.has_collection(collection_name, using=self.alias) and create_new:
                utility.drop_collection(collection_name, using=self.alias)

            # Check if the collection doesnt exist
            if utility.has_collection(collection_name, using=self.alias) is False:
                # If it doesnt exist use the V2 field declarations to create a new schema
                schema = [field[1] for field in SCHEMA_V2]
                schema = CollectionSchema(schema)
                # Use the schema to create a new collection
                self.col = Collection(
                    collection_name,
                    schema=schema,
                    using=self.alias,
                    consistency_level=self._consistency_level,
                )
                self._schema_ver = "V2"
                logger.info(
                    "Create Milvus collection '{}' with schema {} and consistency level {}".format(
                        collection_name, self._schema_ver, self._consistency_level
                    )
                )
            else:
                # If the collection exists, point to it
                self.col = Collection(collection_name, using=self.alias)  # type: ignore
                # Detect which schema is used
                for field in self.col.schema.fields:
                    if field.name == "id" and field.is_primary:
                        self._schema_ver = "V2"
                        break
                logger.info(
                    "Milvus collection '{}' already exists with schema {}".format(
                        collection_name, self._schema_ver
                    )
                )
        except Exception as e:
            logger.error(
                "Failed to create collection '{}', error: {}".format(collection_name, e)
            )

    def _create_index(self):
        """Make sure the embedding field is indexed, load the collection and pick search params.

        Index and search parameters come from MILVUS_INDEX_PARAMS /
        MILVUS_SEARCH_PARAMS (JSON strings) when set; otherwise an HNSW index
        is attempted, falling back to AUTOINDEX, and default search parameters
        are chosen by index type. Errors are logged and not re-raised.
        """
        # TODO: verify index/search params passed by os.environ
        self.index_params = MILVUS_INDEX_PARAMS or None
        self.search_params = MILVUS_SEARCH_PARAMS or None
        try:
            # If no index on the collection, create one
            if len(self.col.indexes) == 0:
                if self.index_params is not None:
                    # Convert the string format to JSON format parameters passed by MILVUS_INDEX_PARAMS
                    self.index_params = json.loads(self.index_params)
                    logger.info("Create Milvus index: {}".format(self.index_params))
                    # Create an index on the 'embedding' field with the supplied index params
                    self.col.create_index(
                        EMBEDDING_FIELD, index_params=self.index_params
                    )
                else:
                    # If no index param supplied, first try to create an HNSW index
                    try:
                        i_p = {
                            "metric_type": "IP",
                            "index_type": "HNSW",
                            "params": {"M": 8, "efConstruction": 64},
                        }
                        logger.info(
                            "Attempting creation of Milvus '{}' index".format(
                                i_p["index_type"]
                            )
                        )
                        self.col.create_index(EMBEDDING_FIELD, index_params=i_p)
                        self.index_params = i_p
                        logger.info(
                            "Creation of Milvus '{}' index successful".format(
                                i_p["index_type"]
                            )
                        )
                    # If create fails, most likely due to being Zilliz Cloud instance, try to create an AutoIndex
                    except MilvusException:
                        logger.info("Attempting creation of Milvus default index")
                        i_p = {
                            "metric_type": "IP",
                            "index_type": "AUTOINDEX",
                            "params": {},
                        }
                        self.col.create_index(EMBEDDING_FIELD, index_params=i_p)
                        self.index_params = i_p
                        logger.info("Creation of Milvus default index successful")
            # If an index already exists, grab its params
            else:
                # Only the index on the embedding field is of interest
                for index in self.col.indexes:
                    idx = index.to_dict()
                    if idx["field"] == EMBEDDING_FIELD:
                        logger.info("Index already exists: {}".format(idx))
                        self.index_params = idx["index_param"]
                        break

            self.col.load()

            if self.search_params is not None:
                # Convert the string format to JSON format parameters passed by MILVUS_SEARCH_PARAMS
                self.search_params = json.loads(self.search_params)
            else:
                # The default search params
                metric_type = "IP"
                if "metric_type" in self.index_params:
                    metric_type = self.index_params["metric_type"]
                default_search_params = {
                    "IVF_FLAT": {"metric_type": metric_type, "params": {"nprobe": 10}},
                    "IVF_SQ8": {"metric_type": metric_type, "params": {"nprobe": 10}},
                    "IVF_PQ": {"metric_type": metric_type, "params": {"nprobe": 10}},
                    "HNSW": {"metric_type": metric_type, "params": {"ef": 10}},
                    "RHNSW_FLAT": {"metric_type": metric_type, "params": {"ef": 10}},
                    "RHNSW_SQ": {"metric_type": metric_type, "params": {"ef": 10}},
                    "RHNSW_PQ": {"metric_type": metric_type, "params": {"ef": 10}},
                    "IVF_HNSW": {
                        "metric_type": metric_type,
                        "params": {"nprobe": 10, "ef": 10},
                    },
                    "ANNOY": {"metric_type": metric_type, "params": {"search_k": 10}},
                    "AUTOINDEX": {"metric_type": metric_type, "params": {}},
                }
                # Set the search params
                self.search_params = default_search_params[
                    self.index_params["index_type"]
                ]
            logger.info("Milvus search parameters: {}".format(self.search_params))
        except Exception as e:
            logger.error("Failed to create index, error: {}".format(e))

    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """Upsert chunks into the datastore.

        Chunks missing a required field are skipped. The remaining rows are
        inserted in batches of at most UPSERT_BATCH_SIZE entities.

        Args:
            chunks (Dict[str, List[DocumentChunk]]): The chunks to insert, keyed by document id.

        Returns:
            List[str]: The document_id's that were inserted, or an empty list
            when the insert failed (the error is logged, not raised).
        """
        try:
            # The doc id's to return for the upsert
            doc_ids: List[str] = []
            # One list per field (column-based data), skipping the "pk" for schema V1
            offset = 1 if self._schema_ver == "V1" else 0
            insert_data = [[] for _ in range(len(self._get_schema()) - offset)]

            # Go through each document chunklist and grab the data
            for doc_id, chunk_list in chunks.items():
                # Append the doc_id to the list we are returning
                doc_ids.append(doc_id)
                # Examine each chunk in the chunklist
                for chunk in chunk_list:
                    # Extract data from the chunk; None means a required field is missing
                    list_of_data = self._get_values(chunk)
                    if list_of_data is not None:
                        # Append each value to the column of its field
                        for column, value in zip(insert_data, list_of_data):
                            column.append(value)

            # Batch over rows: every column is sliced with the same row window so
            # the columns of a batch stay aligned. (Slicing `insert_data` itself
            # would split the list of columns, not the rows.)
            num_rows = len(insert_data[0]) if insert_data else 0
            for start in range(0, num_rows, UPSERT_BATCH_SIZE):
                batch = [
                    column[start : start + UPSERT_BATCH_SIZE] for column in insert_data
                ]
                # batch data can work with both V1 and V2 schema
                try:
                    logger.info(f"Upserting batch of size {len(batch[0])}")
                    self.col.insert(batch)
                    logger.info("Upserted batch successfully")
                except Exception as e:
                    logger.error(f"Failed to insert batch records, error: {e}")
                    raise e

            # This setting performs flushes after insert. Small insert == bad to use
            # self.col.flush()
            return doc_ids
        except Exception as e:
            logger.error("Failed to insert records, error: {}".format(e))
            return []

    def _get_values(self, chunk: DocumentChunk) -> List[any] | None:  # type: ignore
        """Convert the chunk into a list of values to insert whose indexes align with fields.

        Missing or falsy values are replaced by the field default from the schema.

        Args:
            chunk (DocumentChunk): The chunk to convert.

        Returns:
            List (any): The values to insert, or None when a required field is missing.
        """
        # Convert DocumentChunk and its sub models to dict
        values = chunk.dict()
        # Unpack the metadata into the same dict
        meta = values.pop("metadata")
        values.update(meta)

        # Convert date to int timestamp form
        if values["created_at"]:
            values["created_at"] = to_unix_timestamp(values["created_at"])

        # If source exists, change from Source object to the string value it holds
        if values["source"]:
            values["source"] = values["source"].value

        # List to collect data we will return
        ret = []
        # Grab data responding to each field, excluding the hidden auto pk field for schema V1
        offset = 1 if self._schema_ver == "V1" else 0
        for key, _, default in self._get_schema()[offset:]:
            # Grab the data at the key and fall back to the schema default
            x = values.get(key) or default
            # If one of our required fields is missing, ignore the entire entry
            if x is Required:
                logger.info("Chunk " + values["id"] + " missing " + key + " skipping")
                return None
            # Add the corresponding value if it passes the tests
            ret.append(x)
        return ret

    async def _query(
        self,
        queries: List[QueryWithEmbedding],
    ) -> List[QueryResult]:
        """Query the QueryWithEmbedding against the MilvusDocumentSearch

        Search the embedding and its filter in the collection. A query that
        fails is logged and yields an empty result list.

        Args:
            queries (List[QueryWithEmbedding]): The list of searches to perform.

        Returns:
            List[QueryResult]: Results for each search.
        """

        # Async to perform the query, adapted from pinecone implementation
        async def _single_query(query: QueryWithEmbedding) -> QueryResult:
            try:
                expr = None
                # Set the filter to expression that is valid for Milvus
                if query.filter is not None:
                    # An expression string; empty when the filter sets no field
                    expr = self._get_filter(query.filter)

                # Names of the fields to return, ignoring pk and embedding
                return_from = 2 if self._schema_ver == "V1" else 1
                output_fields = [
                    field[0] for field in self._get_schema()[return_from:]
                ]

                # Perform our search
                res = self.col.search(
                    data=[query.embedding],
                    anns_field=EMBEDDING_FIELD,
                    param=self.search_params,
                    limit=query.top_k,
                    expr=expr,
                    output_fields=output_fields,
                )
                # Results that will hold our DocumentChunkWithScores
                results = []
                # Parse every result for our search
                for hit in res[0]:  # type: ignore
                    # The distance score for the search result, falls under DocumentChunkWithScore
                    score = hit.score
                    # Our metadata info, falls under DocumentChunkMetadata
                    metadata = {}
                    # Grab the values that correspond to our fields
                    for x in output_fields:
                        metadata[x] = hit.entity.get(x)
                    # If the source isn't valid, convert to None
                    if metadata["source"] not in Source.__members__:
                        metadata["source"] = None
                    # Text falls under the DocumentChunk
                    text = metadata.pop("text")
                    # Id falls under the DocumentChunk
                    ids = metadata.pop("id")
                    chunk = DocumentChunkWithScore(
                        id=ids,
                        score=score,
                        text=text,
                        metadata=DocumentChunkMetadata(**metadata),
                    )
                    results.append(chunk)

                # TODO: decide on doing queries to grab the embedding itself, slows down performance as double query occurs

                return QueryResult(query=query.query, results=results)
            except Exception as e:
                logger.error("Failed to query, error: {}".format(e))
                return QueryResult(query=query.query, results=[])

        results: List[QueryResult] = await asyncio.gather(
            *[_single_query(query) for query in queries]
        )
        return results

    def _query_pk_literals(self, expr: str, pk_name: str) -> List[str]:
        """Return the primary keys of the entities matching `expr`, formatted for use inside a delete expression."""
        pks = [str(entry[pk_name]) for entry in self.col.query(expr)]  # type: ignore
        # for schema V2, the "id" is varchar, so the values must be quoted
        if self._schema_ver != "V1":
            pks = ['"' + pk + '"' for pk in pks]
        return pks

    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """Delete entities by document id, by metadata filter, or all of them.

        Failures while deleting by ids or by filter are logged and not raised,
        so the method returns True in every case.

        Args:
            ids (Optional[List[str]], optional): The document_ids to delete. Defaults to None.
            filter (Optional[DocumentMetadataFilter], optional): The filter to delete by. Defaults to None.
            delete_all (Optional[bool], optional): Whether to drop the collection and recreate it. Defaults to None.
        """
        # If deleting all, drop and create the new collection
        if delete_all:
            coll_name = self.col.name
            logger.info(
                "Delete the entire collection {} and create new one".format(coll_name)
            )
            # Release the collection from memory
            self.col.release()
            # Drop the collection
            self.col.drop()
            # Recreate the new collection
            self._create_collection(coll_name, True)
            self._create_index()
            return True

        # Keep track of how many we have deleted for later printing
        delete_count = 0
        batch_size = 100
        pk_name = "pk" if self._schema_ver == "V1" else "id"
        try:
            # According to the api design, the ids is a list of document_id,
            # document_id is not primary key, use query+delete to workaround,
            # in future version we can delete by expression
            if (ids is not None) and len(ids) > 0:
                # Add quotation marks around the string format id
                quoted_ids = ['"' + str(doc_id) + '"' for doc_id in ids]
                # Query for the pk's of entries that match id's
                pks = self._query_pk_literals(
                    f"document_id in [{','.join(quoted_ids)}]", pk_name
                )
                logger.info(
                    "Apply {:d} deletions to schema {:s}".format(
                        len(pks), self._schema_ver
                    )
                )
                # Delete batch by batch (avoid too long expression)
                for start in range(0, len(pks), batch_size):
                    batch_pks = pks[start : start + batch_size]
                    res = self.col.delete(f"{pk_name} in [{','.join(batch_pks)}]")
                    # Increment our deleted count
                    delete_count += int(res.delete_count)  # type: ignore
        except Exception as e:
            logger.error("Failed to delete by ids, error: {}".format(e))

        try:
            # Check if empty filter
            if filter is not None:
                # Convert filter to milvus expression
                expr = self._get_filter(filter)  # type: ignore
                # Check if there is anything to filter
                if expr:
                    # Query for the pk's of entries that match filter
                    pks = self._query_pk_literals(expr, pk_name)
                    # Delete batch by batch (avoid too long expression)
                    for start in range(0, len(pks), batch_size):
                        batch_pks = pks[start : start + batch_size]
                        res = self.col.delete(f"{pk_name} in [{','.join(batch_pks)}]")  # type: ignore
                        # Increment our delete count
                        delete_count += int(res.delete_count)  # type: ignore
        except Exception as e:
            logger.error("Failed to delete by filter, error: {}".format(e))

        logger.info("{:d} records deleted".format(delete_count))

        # This setting performs flushes after delete. Small delete == bad to use
        # self.col.flush()

        return True

    def _get_filter(self, filter: DocumentMetadataFilter) -> Optional[str]:
        """Converts a DocumentMetadataFilter to the expression that Milvus takes.

        Values are embedded in the expression as-is, without escaping.

        Args:
            filter (DocumentMetadataFilter): The Filter to convert to Milvus expression.

        Returns:
            Optional[str]: The clauses joined with `and`; an empty string when
            the filter sets no field.
        """
        filters = []
        # Go through all the fields and their values
        for field, value in filter.dict().items():
            # Check if the Value is empty
            if value is not None:
                # Convert start_date to int and add greater than or equal logic
                if field == "start_date":
                    filters.append(
                        "(created_at >= " + str(to_unix_timestamp(value)) + ")"
                    )
                # Convert end_date to int and add less than or equal logic
                elif field == "end_date":
                    filters.append(
                        "(created_at <= " + str(to_unix_timestamp(value)) + ")"
                    )
                # Convert Source to its string value and check equivalency
                elif field == "source":
                    filters.append("(" + field + ' == "' + str(value.value) + '")')
                # Check equivalency of rest of string fields
                else:
                    filters.append("(" + field + ' == "' + str(value) + '")')
        # Join all our expressions with `and`
        return " and ".join(filters)
================================================
FILE: datastore/providers/mongodb_atlas_datastore.py
================================================
import os
from typing import Dict, List, Any, Optional
from loguru import logger
from importlib.metadata import version
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.driver_info import DriverInfo
from pymongo import UpdateOne
from datastore.datastore import DataStore
from functools import cached_property
from models.models import (
Document,
DocumentChunk,
DocumentChunkWithScore,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
)
from services.chunks import get_document_chunks
from services.date import to_unix_timestamp
MONGODB_CONNECTION_URI = os.environ.get("MONGODB_URI")
MONGODB_DATABASE = os.environ.get("MONGODB_DATABASE", "default")
MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION", "default")
MONGODB_INDEX = os.environ.get("MONGODB_INDEX", "default")
OVERSAMPLING_FACTOR = 10
MAX_CANDIDATES = 10_000
class MongoDBAtlasDataStore(DataStore):
def __init__(
    self,
    atlas_connection_uri: str = MONGODB_CONNECTION_URI,
    index_name: str = MONGODB_INDEX,
    database_name: str = MONGODB_DATABASE,
    collection_name: str = MONGODB_COLLECTION,
    oversampling_factor: float = OVERSAMPLING_FACTOR,
):
    """
    Initialize a MongoDBAtlasDataStore instance.

    Parameters:
    - atlas_connection_uri (str, optional): Connection URI. Defaults to MONGODB_CONNECTION_URI.
    - index_name (str, optional): Vector search index. Defaults to MONGODB_INDEX.
    - database_name (str, optional): Database. Defaults to MONGODB_DATABASE.
    - collection_name (str, optional): Collection. Defaults to MONGODB_COLLECTION.
    - oversampling_factor (float, optional): Multiplier applied to a query's top_k
      to choose the number of vector search candidates. Defaults to OVERSAMPLING_FACTOR.

    Raises:
    - ValueError: If index_name is not a non-empty string.

    Every parameter is stored on the instance under the same name.
    """
    self.collection_name = collection_name
    self.database_name = database_name
    self.oversampling_factor = oversampling_factor
    self.atlas_connection_uri = atlas_connection_uri

    if not isinstance(index_name, str) or not index_name:
        raise ValueError("Provide a valid index name")
    self.index_name = index_name

    # TODO: Create index via driver https://jira.mongodb.org/browse/PYTHON-4175
    # self._create_search_index(num_dimensions=1536, path="embedding", similarity="dotProduct", type="vector")
@cached_property
def client(self):
    """
    Create the MongoDB client on first access and cache it on the instance.

    Connects with the URI given to the constructor (`self.atlas_connection_uri`)
    rather than always using the module-level MONGODB_CONNECTION_URI.
    """
    return self._connect_to_mongodb_atlas(
        atlas_connection_uri=self.atlas_connection_uri
    )
async def upsert(
    self, documents: List[Document], chunk_token_size: Optional[int] = None
) -> List[str]:
    """
    Split the given Documents into chunks and upsert those chunks into the database.
    Returns whatever `_upsert` returns for the resulting chunks.
    """
    chunks_by_document = get_document_chunks(documents, chunk_token_size)
    upserted_ids = await self._upsert(chunks_by_document)
    return upserted_ids
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
"""
Takes in a list of document chunks and inserts them into the database.
Return a list of document ids.
"""
documents_to_upsert = []
inserted_ids = []
for chunk_list in chunks.values():
for chunk in chunk_list:
inserted_ids.append(chunk.id)
documents_to_upsert.append(
UpdateOne({'_id': chunk.id}, {"$set": chunk.dict()}, upsert=True)
)
logger.info(f"Upsert documents into MongoDB collection: {self.database_name}: {self.collection_name}")
await self.client[self.database_name][self.collection_name].bulk_write(documents_to_upsert)
logger.info("Upsert successful")
return inserted_ids
async def _query(
self,
queries: List[QueryWithEmbedding],
) -> List[QueryResult]:
"""
Takes in a list of queries with embeddings and filters and returns
a list of query results with matching document chunks and scores.
"""
results = []
for query in queries:
query_result = await self._execute_embedding_query(query)
results.append(query_result)
return results
async def _execute_embedding_query(self, query: QueryWithEmbedding) -> QueryResult:
    """
    Run a single $vectorSearch aggregation for the query's embedding on the
    configured collection and return the matched chunks with their scores.

    Only `text`, `metadata` and the vector search score are projected
    (plus `_id`, which MongoDB includes by default).
    NOTE: `query.filter` is not applied by this pipeline.
    """
    # $vectorSearch expects numCandidates to be an integer that is at least
    # `limit`. oversampling_factor is documented as a float, so the product
    # is truncated to int and floored at top_k; MAX_CANDIDATES still caps
    # the oversampled value.
    oversampled = min(query.top_k * self.oversampling_factor, MAX_CANDIDATES)
    num_candidates = max(int(oversampled), query.top_k)
    pipeline = [
        {
            '$vectorSearch': {
                'index': self.index_name,
                'path': 'embedding',
                'queryVector': query.embedding,
                'numCandidates': num_candidates,
                'limit': query.top_k
            }
        }, {
            '$project': {
                'text': 1,
                'metadata': 1,
                'score': {
                    '$meta': 'vectorSearchScore'
                }
            }
        }
    ]
    collection = self.client[self.database_name][self.collection_name]
    async with collection.aggregate(pipeline) as cursor:
        results = [
            self._convert_mongodb_document_to_document_chunk_with_score(doc)
            async for doc in cursor
        ]
    return QueryResult(
        query=query.query,
        results=results,
    )
async def delete(
    self,
    ids: Optional[List[str]] = None,
    filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
) -> bool:
    """
    Removes documents by ids, filter, or everything in the datastore.
    Exactly one criterion is used, in this order of precedence:
    delete_all, then ids, then filter.
    Returns False if the delete call raised, True otherwise (including when
    no criterion was given and nothing was deleted).
    Note that ids refer to those in the datastore,
    which are those of the **DocumentChunks**
    """
    # Delete all documents from the collection if delete_all is True
    if delete_all:
        logger.info("Deleting all documents from collection")
        mg_filter = {}
    # Delete by ids
    elif ids:
        logger.info(f"Deleting documents with ids: {ids}")
        mg_filter = {"_id": {"$in": ids}}
    # Delete by filters
    elif filter:
        mg_filter = self._build_mongo_filter(filter)
        logger.info(f"Deleting documents with filter: {mg_filter}")
    # Do nothing
    else:
        # f-strings keep these messages consistent with the other log calls
        # in this method and render the values regardless of whether the
        # logger interpolates %-style arguments.
        logger.warning(
            f"No criteria set; nothing to delete args: ids: {ids}, filter: {filter} delete_all: {delete_all}"
        )
        return True
    try:
        await self.client[self.database_name][self.collection_name].delete_many(mg_filter)
        logger.info("Deleted documents successfully")
    except Exception as e:
        logger.error(f"Error deleting documents with filter: {mg_filter} -- error: {e}")
        return False
    return True
def _convert_mongodb_document_to_document_chunk_with_score(
    self, document: Dict
) -> DocumentChunkWithScore:
    """
    Build a DocumentChunkWithScore from one document returned by MongoDB.
    `text` must be present (KeyError otherwise); `_id`, `metadata` and
    `score` are read with `.get` and are passed as None when missing.
    """
    chunk_id = document.get("_id")
    chunk_text = document["text"]
    chunk_metadata = document.get("metadata")
    chunk_score = document.get("score")
    return DocumentChunkWithScore(
        id=chunk_id,
        text=chunk_text,
        metadata=chunk_metadata,
        score=chunk_score,
    )
def _build_mongo_filter(
self, filter: Optional[DocumentMetadataFilter] = None
) -> Dict[str, Any]:
"""
Generate MongoDB query filters based on the provided DocumentMetadataFilter.
"""
if filter is None:
return {}
mongo_filters = {
"$and": [],
}
# For each field in the MetadataFilter,
# check if it has a value and add the corresponding MongoDB filter expression
for field, value in filter.dict().items():
if value is not None:
if field == "start_date":
mongo_filters["$and"].append(
{"created_at": {"$gte": to_unix_timestamp(value)}}
)
elif field == "end_date":
mongo_filters["$and"].append(
{"created_at": {"$lte": to_unix_timestamp(value)}}
)
else:
mongo_filters["$and"].append(
{f"metadata.{field}": value}
)
return mongo_filters
@staticmethod
def _connect_to_mongodb_atlas(atlas_connection_uri: str):
    """
    Create an AsyncIOMotorClient for the given Atlas connection URI.

    The client is tagged with driver info naming this plugin and the value
    of `version("chatgpt_retrieval_plugin")`.
    """
    driver_info = DriverInfo(
        name="Chatgpt Retrieval Plugin",
        version=version("chatgpt_retrieval_plugin"),
    )
    return AsyncIOMotorClient(atlas_connection_uri, driver=driver_info)
================================================
FILE: datastore/providers/pgvector_datastore.py
================================================
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from datetime import datetime
from loguru import logger
from services.date import to_unix_timestamp
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
DocumentChunkWithScore,
)
# interface for Postgres client to implement pg based Datastore providers
class PGClient(ABC):
    """
    Abstract database client used by the pgvector based datastores.

    Every operation is an async abstract method; concrete clients must
    implement all of them before they can be instantiated.
    """

    @abstractmethod
    async def upsert(self, table: str, json: dict[str, Any]) -> None:
        """
        Upsert one row, given as a column-name to value dict, into the table.
        """
        raise NotImplementedError

    @abstractmethod
    async def rpc(self, function_name: str, params: dict[str, Any]) -> Any:
        """
        Invoke the named stored procedure with the given parameters and
        return its result.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete_like(self, table: str, column: str, pattern: str) -> None:
        """
        Delete the rows of the table whose column matches the pattern.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete_in(self, table: str, column: str, ids: List[str]) -> None:
        """
        Delete the rows of the table whose column value is one of the ids.
        """
        raise NotImplementedError

    @abstractmethod
    async def delete_by_filters(
        self, table: str, filter: DocumentMetadataFilter
    ) -> None:
        """
        Delete the rows of the table selected by the metadata filter.
        """
        raise NotImplementedError
# abstract class for Postgres based Datastore providers that implements DataStore interface
class PgVectorDataStore(DataStore):
    """
    Base class for Postgres/pgvector backed datastores.

    Subclasses provide the database access layer by implementing
    `create_db_client`. This class maps the DataStore operations onto that
    PGClient, using the "documents" table and the "match_page_sections"
    stored procedure.
    """

    def __init__(self):
        # The concrete subclass decides how the database is reached.
        self.client = self.create_db_client()

    @abstractmethod
    def create_db_client(self) -> PGClient:
        """
        Create db client, can be accessing postgres database via different APIs.
        Can be supabase client or psycopg2 based client.
        Return a client for postgres DB.
        """
        raise NotImplementedError

    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
        """
        Takes in a dict of document_ids to list of document chunks and upserts
        each chunk as one row of the "documents" table.
        Return a list of document ids.
        """
        for document_id, document_chunks in chunks.items():
            for chunk in document_chunks:
                json = {
                    "id": chunk.id,
                    "content": chunk.text,
                    "embedding": chunk.embedding,
                    "document_id": document_id,
                    "source": chunk.metadata.source,
                    "source_id": chunk.metadata.source_id,
                    "url": chunk.metadata.url,
                    "author": chunk.metadata.author,
                }
                if chunk.metadata.created_at:
                    # NOTE(review): the trailing comma makes this a 1-tuple
                    # holding the datetime, not a bare datetime. PGClient
                    # implementations may already depend on that shape, so
                    # it is preserved here -- confirm before simplifying.
                    json["created_at"] = (
                        datetime.fromtimestamp(
                            to_unix_timestamp(chunk.metadata.created_at)
                        ),
                    )
                await self.client.upsert("documents", json)
        return list(chunks.keys())

    async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]:
        """
        Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores.

        A query whose database call or row conversion fails is logged and
        reported with an empty result list; the remaining queries still run.
        """
        query_results: List[QueryResult] = []
        for query in queries:
            # Build the arguments for the "match_page_sections" database
            # function; optional arguments are only sent when they are set.
            params = {
                "in_embedding": query.embedding,
            }
            if query.top_k:
                params["in_match_count"] = query.top_k
            if query.filter:
                if query.filter.document_id:
                    params["in_document_id"] = query.filter.document_id
                if query.filter.source:
                    params["in_source"] = query.filter.source.value
                if query.filter.source_id:
                    params["in_source_id"] = query.filter.source_id
                if query.filter.author:
                    params["in_author"] = query.filter.author
                if query.filter.start_date:
                    params["in_start_date"] = datetime.fromtimestamp(
                        to_unix_timestamp(query.filter.start_date)
                    )
                if query.filter.end_date:
                    params["in_end_date"] = datetime.fromtimestamp(
                        to_unix_timestamp(query.filter.end_date)
                    )
            try:
                data = await self.client.rpc("match_page_sections", params=params)
                results: List[DocumentChunkWithScore] = []
                for row in data:
                    document_chunk = DocumentChunkWithScore(
                        id=row["id"],
                        text=row["content"],
                        # TODO: add embedding to the response ?
                        # embedding=row["embedding"],
                        score=float(row["similarity"]),
                        metadata=DocumentChunkMetadata(
                            source=row["source"],
                            source_id=row["source_id"],
                            document_id=row["document_id"],
                            url=row["url"],
                            created_at=row["created_at"],
                            author=row["author"],
                        ),
                    )
                    results.append(document_chunk)
                query_results.append(QueryResult(query=query.query, results=results))
            except Exception as e:
                logger.error(e)
                query_results.append(QueryResult(query=query.query, results=[]))
        return query_results

    async def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ) -> bool:
        """
        Removes vectors by ids, filter, or everything in the datastore.
        Only one criterion is applied, in this order of precedence:
        delete_all, then ids, then filter. `ids` are matched against the
        "document_id" column.
        Returns False if the database call raised, True otherwise (including
        when no criterion was given).
        """
        try:
            if delete_all:
                await self.client.delete_like("documents", "document_id", "%")
            elif ids:
                await self.client.delete_in("documents", "document_id", ids)
            elif filter:
                await self.client.delete_by_filters("documents", filter)
        except Exception as e:
            # Catch Exception rather than using a bare except so that task
            # cancellation and interpreter shutdown are not swallowed, and
            # record why the delete failed instead of failing silently.
            logger.error(f"Error deleting documents: {e}")
            return False
        return True
================================================
FILE: datastore/providers/pinecone_datastore.py
================================================
import os
from typing import Any, Dict, List, Optional
import pinecone
from tenacity import retry, wait_random_exponential, stop_after_attempt
import asyncio
from loguru import logger
from datastore.datastore import DataStore
from models.models import (
DocumentChunk,
DocumentChunkMetadata,
DocumentChunkWithScore,
DocumentMetadataFilter,
QueryResult,
QueryWithEmbedding,
Source,
)
from services.date import to_unix_timestamp
# Read environment variables for Pinecone configuration
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX")
# All three settings are required: importing this module fails with an
# AssertionError if any of them is unset. An empty string still passes.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert PINECONE_API_KEY is not None
assert PINECONE_ENVIRONMENT is not None
assert PINECONE_INDEX is not None
# Initialize Pinecone with the API key and environment
# (this runs at import time, before any PineconeDataStore is constructed)
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
# Set the batch size for upserting vectors to Pinecone
UPSERT_BATCH_SIZE = 100
# Vector dimension passed to pinecone.create_index when a new index is
# created; taken from the EMBEDDING_DIMENSION environment variable, 256 if unset.
EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256))
class PineconeDataStore(DataStore):
def __init__(self):
    """
    Bind `self.index` to the Pinecone index named by PINECONE_INDEX.

    If no index with that name is listed, it is created first with
    EMBEDDING_DIMENSION dimensions and with every DocumentChunkMetadata
    field marked as indexed metadata. Errors while creating or connecting
    are logged and re-raised. If PINECONE_INDEX is empty, nothing is done
    and `self.index` is left unset.
    """
    if not PINECONE_INDEX:
        return
    # List the indexes once and branch on the result, instead of listing
    # them again for the "already exists" case.
    index_exists = PINECONE_INDEX in pinecone.list_indexes()
    if not index_exists:
        # Get all fields in the metadata object in a list
        fields_to_index = list(DocumentChunkMetadata.__fields__.keys())
        # Create a new index with the specified name, dimension, and metadata configuration
        try:
            logger.info(
                f"Creating index {PINECONE_INDEX} with metadata config {fields_to_index}"
            )
            pinecone.create_index(
                PINECONE_INDEX,
                dimension=EMBEDDING_DIMENSION,
                metadata_config={"indexed": fields_to_index},
            )
            self.index = pinecone.Index(PINECONE_INDEX)
            logger.info(f"Index {PINECONE_INDEX} created successfully")
        except Exception as e:
            logger.error(f"Error creating index {PINECONE_INDEX}: {e}")
            raise
    else:
        # Connect to an existing index with the specified name
        try:
            logger.info(f"Connecting to existing index {PINECONE_INDEX}")
            self.index = pinecone.Index(PINECONE_INDEX)
            logger.info(f"Connected to index {PINECONE_INDEX} successfully")
        except Exception as e:
            logger.error(f"Error connecting to index {PINECONE_INDEX}: {e}")
            raise
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))
async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
    """
    Upsert every chunk of every document into the Pinecone index, in
    batches of UPSERT_BATCH_SIZE vectors.

    Each vector is (chunk id, embedding, metadata); the metadata comes from
    `_get_pinecone_metadata` plus the chunk text and the document id.
    Returns the document ids (the keys of `chunks`). A failing batch is
    logged and its exception re-raised.
    """
    document_ids: List[str] = []
    pending = []
    for document_id, document_chunks in chunks.items():
        document_ids.append(document_id)
        logger.info(f"Upserting document_id: {document_id}")
        for piece in document_chunks:
            # Start from the converted metadata, then attach the text and
            # the owning document id so they are stored with the vector.
            meta = self._get_pinecone_metadata(piece.metadata)
            meta["text"] = piece.text
            meta["document_id"] = document_id
            pending.append((piece.id, piece.embedding, meta))
    # Walk the collected vectors in fixed-size windows.
    for start in range(0, len(pending), UPSERT_BATCH_SIZE):
        batch = pending[start : start + UPSERT_BATCH_SIZE]
        try:
            logger.info(f"Upserting batch of size {len(batch)}")
            self.index.upsert(vectors=batch)
            logger.info("Upserted batch successfully")
        except Exception as e:
            logger.error(f"Error upserting batch: {e}")
            raise e
    return document_ids
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))
async def _query(
    self,
    queries: List[QueryWithEmbedding],
) -> List[QueryResult]:
    """
    Query the Pinecone index once per input query and return one
    QueryResult per query, in input order.

    For each match the stored "text" metadata entry becomes the chunk text
    ("" if absent) and the remaining metadata entries become the chunk
    metadata. A "source" value that is not a member name of Source is
    replaced by None. Query errors are logged and re-raised.
    """

    async def _run_one(query: QueryWithEmbedding) -> QueryResult:
        logger.debug(f"Query: {query.query}")
        # Translate the metadata filter into Pinecone's filter syntax.
        pinecone_filter = self._get_pinecone_filter(query.filter)
        try:
            response = self.index.query(
                # namespace=namespace,
                top_k=query.top_k,
                vector=query.embedding,
                filter=pinecone_filter,
                include_metadata=True,
            )
        except Exception as e:
            logger.error(f"Error querying index: {e}")
            raise e
        scored_chunks: List[DocumentChunkWithScore] = []
        for match in response.matches:
            score = match.score
            metadata = match.metadata
            # Everything except the text goes back out as chunk metadata.
            if metadata:
                remaining = {
                    key: value for key, value in metadata.items() if key != "text"
                }
            else:
                remaining = None
            # Unknown source names are dropped rather than passed through.
            if remaining and "source" in remaining:
                if remaining["source"] not in Source.__members__:
                    remaining["source"] = None
            if metadata and "text" in metadata:
                chunk_text = str(metadata["text"])
            else:
                chunk_text = ""
            scored_chunks.append(
                DocumentChunkWithScore(
                    id=match.id,
                    score=score,
                    text=chunk_text,
                    metadata=remaining,
                )
            )
        return QueryResult(query=query.query, results=scored_chunks)

    # gather returns the results in the order the coroutines were passed.
    results: List[QueryResult] = await asyncio.gather(
        *(_run_one(query) for query in queries)
    )
    return results
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))
async def delete(
self,
ids: Optional[List[str]] = None,
filter: Optional[DocumentMetadataFilter] = None,
delete_all: Optional[bool] = None,
) -> bool:
"""
Removes vectors by ids, filter, or everything from the index.
"""
# Delete all vectors from the index if delete_all is True
if delete_all:
try:
logger.info(f"Deleting all vectors from index")
self.index.delete(delete_all=True)
logger.info(f"Deleted all vectors successfully")
return True
except Exception as e:
logger.error(f"Error deleting all vectors: {e}")
raise e
# Convert the metadata filter object to a dict with pinecone filter expressions
pinecone_filter = self._get_pinecone_filter(filter)
# Delete vectors that match the filter from the index if the filter is not empty
if pinecone_filter != {}:
try:
logger.info(f"Deleting vectors with filter {pinecone_filter}")
self.index.delete(filter=pinecone_filter)
logger.info(f"Deleted vectors with filter successfully")
except Exception as e:
logger.error(f"Error deleting vectors with filter: {e}")
raise e
# Delete vectors that match the document ids from the index if the ids list is not empty
gitextract_txp1w2j2/
├── .dockerignore
├── .env.example
├── .github/
│ └── pull_request_template.md
├── .gitignore
├── .well-known/
│ ├── ai-plugin.json
│ └── openapi.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── datastore/
│ ├── __init__.py
│ ├── datastore.py
│ ├── factory.py
│ └── providers/
│ ├── __init__.py
│ ├── analyticdb_datastore.py
│ ├── azurecosmosdb_datastore.py
│ ├── azuresearch_datastore.py
│ ├── chroma_datastore.py
│ ├── elasticsearch_datastore.py
│ ├── llama_datastore.py
│ ├── milvus_datastore.py
│ ├── mongodb_atlas_datastore.py
│ ├── pgvector_datastore.py
│ ├── pinecone_datastore.py
│ ├── postgres_datastore.py
│ ├── qdrant_datastore.py
│ ├── redis_datastore.py
│ ├── supabase_datastore.py
│ ├── weaviate_datastore.py
│ └── zilliz_datastore.py
├── docs/
│ ├── deployment/
│ │ ├── flyio.md
│ │ ├── heroku.md
│ │ ├── other-options.md
│ │ ├── removing-unused-dependencies.md
│ │ └── render.md
│ ├── deprecated/
│ │ └── plugins.md
│ └── providers/
│ ├── analyticdb/
│ │ └── setup.md
│ ├── azurecosmosdb/
│ │ └── setup.md
│ ├── azuresearch/
│ │ └── setup.md
│ ├── chroma/
│ │ └── setup.md
│ ├── elasticsearch/
│ │ └── setup.md
│ ├── llama/
│ │ └── setup.md
│ ├── milvus/
│ │ └── setup.md
│ ├── mongodb/
│ │ └── setup.md
│ ├── pinecone/
│ │ └── setup.md
│ ├── postgres/
│ │ └── setup.md
│ ├── qdrant/
│ │ └── setup.md
│ ├── redis/
│ │ └── setup.md
│ ├── supabase/
│ │ └── setup.md
│ ├── weaviate/
│ │ └── setup.md
│ └── zilliz/
│ └── setup.md
├── examples/
│ ├── authentication-methods/
│ │ ├── no-auth/
│ │ │ ├── ai-plugin.json
│ │ │ └── main.py
│ │ ├── oauth/
│ │ │ └── ai-plugin.json
│ │ ├── service-http/
│ │ │ └── ai-plugin.json
│ │ └── user-http/
│ │ └── ai-plugin.json
│ ├── docker/
│ │ ├── elasticsearch/
│ │ │ ├── README.md
│ │ │ └── docker-compose.yaml
│ │ ├── milvus/
│ │ │ └── docker-compose.yaml
│ │ ├── qdrant/
│ │ │ ├── README.md
│ │ │ ├── docker-compose.yaml
│ │ │ ├── documents.json
│ │ │ └── queries.json
│ │ └── redis/
│ │ └── docker-compose.yml
│ ├── function-calling/
│ │ └── README.md
│ ├── memory/
│ │ ├── README.md
│ │ ├── ai-plugin.json
│ │ ├── main.py
│ │ └── openapi.yaml
│ └── providers/
│ ├── azurecosmosdb/
│ │ └── semantic-search.ipynb
│ ├── elasticsearch/
│ │ └── search.ipynb
│ ├── mongodb/
│ │ └── semantic-search.ipynb
│ ├── pinecone/
│ │ └── semantic-search.ipynb
│ ├── redis/
│ │ └── semantic-search-and-filter.ipynb
│ └── supabase/
│ ├── .gitignore
│ ├── config.toml
│ ├── migrations/
│ │ └── 20230414142107_init_pg_vector.sql
│ └── seed.sql
├── local_server/
│ ├── ai-plugin.json
│ ├── main.py
│ └── openapi.yaml
├── models/
│ ├── api.py
│ └── models.py
├── pyproject.toml
├── scripts/
│ ├── process_json/
│ │ ├── README.md
│ │ ├── example.json
│ │ └── process_json.py
│ ├── process_jsonl/
│ │ ├── README.md
│ │ ├── example.jsonl
│ │ └── process_jsonl.py
│ └── process_zip/
│ ├── README.md
│ └── process_zip.py
├── server/
│ └── main.py
├── services/
│ ├── chunks.py
│ ├── date.py
│ ├── extract_metadata.py
│ ├── file.py
│ ├── openai.py
│ └── pii_detection.py
└── tests/
├── __init__.py
└── datastore/
└── providers/
├── analyticdb/
│ └── test_analyticdb_datastore.py
├── azurecosmosdb/
│ └── test_azurecosmosdb_datastore.py
├── azuresearch/
│ └── test_azuresearch_datastore.py
├── chroma/
│ └── test_chroma_datastore.py
├── elasticsearch/
│ └── test_elasticsearch_datastore.py
├── llama/
│ └── test_llama_datastore.py
├── milvus/
│ └── test_milvus_datastore.py
├── mongodb_atlas/
│ ├── test_integration.py
│ └── test_mongodb_datastore.py
├── postgres/
│ └── test_postgres_datastore.py
├── qdrant/
│ └── test_qdrant_datastore.py
├── redis/
│ └── test_redis_datastore.py
├── supabase/
│ └── test_supabase_datastore.py
├── weaviate/
│ ├── docker-compose.yml
│ └── test_weaviate_datastore.py
└── zilliz/
└── test_zilliz_datastore.py
SYMBOL INDEX (424 symbols across 49 files)
FILE: datastore/datastore.py
class DataStore (line 17) | class DataStore(ABC):
method upsert (line 18) | async def upsert(
method _upsert (line 45) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method query (line 53) | async def query(self, queries: List[Query]) -> List[QueryResult]:
method _query (line 68) | async def _query(self, queries: List[QueryWithEmbedding]) -> List[Quer...
method delete (line 75) | async def delete(
FILE: datastore/factory.py
function get_datastore (line 5) | async def get_datastore() -> DataStore:
FILE: datastore/providers/analyticdb_datastore.py
class AnalyticDBDataStore (line 36) | class AnalyticDBDataStore(DataStore):
method __init__ (line 37) | def __init__(self, config: Dict[str, str] = PG_CONFIG):
method _initialize_db (line 57) | def _initialize_db(self):
method _create_table (line 67) | def _create_table(self, cur: psycopg2.extensions.cursor):
method _create_embedding_index (line 84) | def _create_embedding_index(self, cur: psycopg2.extensions.cursor):
method _upsert (line 110) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _upsert_chunk (line 125) | def _upsert_chunk(self, chunk: DocumentChunk):
method _query (line 169) | async def _query(self, queries: List[QueryWithEmbedding]) -> List[Quer...
method delete (line 262) | async def delete(
method _generate_delete_query (line 296) | def _generate_delete_query(
FILE: datastore/providers/azurecosmosdb_datastore.py
class AzureCosmosDBStoreApi (line 40) | class AzureCosmosDBStoreApi(ABC):
method ensure (line 42) | async def ensure(self, num_lists, similarity):
method upsert_core (line 46) | async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -...
method query_core (line 50) | async def query_core(
method drop_container (line 56) | async def drop_container(self):
method delete_filter (line 60) | async def delete_filter(self, filter: DocumentMetadataFilter):
method delete_ids (line 64) | async def delete_ids(self, ids: List[str]):
method delete_document_ids (line 68) | async def delete_document_ids(self, documentIds: List[str]):
class MongoStoreApi (line 72) | class MongoStoreApi(AzureCosmosDBStoreApi):
method __init__ (line 73) | def __init__(self, mongoClient: MongoClient):
method _get_metadata_filter (line 77) | def _get_metadata_filter(filter: DocumentMetadataFilter) -> dict:
method ensure (line 97) | async def ensure(self, num_lists, similarity):
method upsert_core (line 122) | async def upsert_core(self, docId: str, chunks: List[DocumentChunk]) -...
method query_core (line 142) | async def query_core(
method drop_container (line 182) | async def drop_container(self):
method delete_filter (line 185) | async def delete_filter(self, filter: DocumentMetadataFilter):
method delete_ids (line 189) | async def delete_ids(self, ids: List[str]):
method delete_document_ids (line 192) | async def delete_document_ids(self, documentIds: List[str]):
class AzureCosmosDBDataStore (line 202) | class AzureCosmosDBDataStore(DataStore):
method __init__ (line 203) | def __init__(self, cosmosStore: AzureCosmosDBStoreApi):
method create (line 222) | async def create(num_lists, similarity) -> DataStore:
method _upsert (line 236) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 249) | async def _query(
method delete (line 270) | async def delete(
FILE: datastore/providers/azuresearch_datastore.py
class AzureSearchDataStore (line 55) | class AzureSearchDataStore(DataStore):
method __init__ (line 56) | def __init__(self):
method _upsert (line 76) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method delete (line 116) | async def delete(
method _query (line 164) | async def _query(self, queries: List[QueryWithEmbedding]) -> List[Quer...
method _single_query (line 170) | async def _single_query(self, query: QueryWithEmbedding) -> QueryResult:
method _translate_filter (line 225) | def _translate_filter(filter: DocumentMetadataFilter) -> str:
method _create_index (line 262) | def _create_index(self, mgmt_client: SearchIndexClient):
method _create_credentials (line 358) | def _create_credentials(
FILE: datastore/providers/chroma_datastore.py
class ChromaDataStore (line 36) | class ChromaDataStore(DataStore):
method __init__ (line 37) | def __init__(
method upsert (line 73) | async def upsert(
method _upsert (line 86) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _where_from_query_filter (line 110) | def _where_from_query_filter(self, query_filter: DocumentMetadataFilte...
method _process_metadata_for_storage (line 146) | def _process_metadata_for_storage(self, metadata: DocumentChunkMetadat...
method _process_metadata_from_storage (line 165) | def _process_metadata_from_storage(self, metadata: Dict) -> DocumentCh...
method _query (line 177) | async def _query(self, queries: List[QueryWithEmbedding]) -> List[Quer...
method delete (line 220) | async def delete(
FILE: datastore/providers/elasticsearch_datastore.py
class ElasticsearchDataStore (line 33) | class ElasticsearchDataStore(DataStore):
method __init__ (line 34) | def __init__(
method _upsert (line 77) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 93) | async def _query(
method delete (line 113) | async def delete(
method _get_es_filters (line 164) | def _get_es_filters(
method _convert_document_chunk_to_es_document_operation (line 196) | def _convert_document_chunk_to_es_document_operation(
method _convert_queries_to_msearch_query (line 222) | def _convert_queries_to_msearch_query(self, queries: List[QueryWithEmb...
method _convert_hit_to_document_chunk_with_score (line 242) | def _convert_hit_to_document_chunk_with_score(self, hit) -> DocumentCh...
method _set_up_index (line 251) | def _set_up_index(
method _recreate_index (line 285) | def _recreate_index(
function connect_to_elasticsearch (line 314) | def connect_to_elasticsearch(
FILE: datastore/providers/llama_datastore.py
function _create_or_load_index (line 42) | def _create_or_load_index(
function _create_or_load_query_kwargs (line 68) | def _create_or_load_query_kwargs(
function _doc_chunk_to_node (line 80) | def _doc_chunk_to_node(doc_chunk: DocumentChunk, source_doc_id: str) -> ...
function _query_with_embedding_to_query_bundle (line 91) | def _query_with_embedding_to_query_bundle(query: QueryWithEmbedding) -> ...
function _source_node_to_doc_chunk_with_score (line 98) | def _source_node_to_doc_chunk_with_score(
function _response_to_query_result (line 115) | def _response_to_query_result(
class LlamaDataStore (line 127) | class LlamaDataStore(DataStore):
method __init__ (line 128) | def __init__(
method _upsert (line 134) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 152) | async def _query(
method delete (line 185) | async def delete(
FILE: datastore/providers/milvus_datastore.py
class Required (line 47) | class Required:
class MilvusDataStore (line 106) | class MilvusDataStore(DataStore):
method __init__ (line 107) | def __init__(
method _get_schema (line 129) | def _get_schema(self):
method _create_connection (line 132) | def _create_connection(self):
method _create_collection (line 174) | def _create_collection(self, collection_name, create_new: bool) -> None:
method _create_index (line 222) | def _create_index(self):
method _upsert (line 311) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _get_values (line 368) | def _get_values(self, chunk: DocumentChunk) -> List[any] | None: # ty...
method _query (line 405) | async def _query(
method delete (line 479) | async def delete(
method _get_filter (line 574) | def _get_filter(self, filter: DocumentMetadataFilter) -> Optional[str]:
FILE: datastore/providers/mongodb_atlas_datastore.py
class MongoDBAtlasDataStore (line 31) | class MongoDBAtlasDataStore(DataStore):
method __init__ (line 33) | def __init__(
method client (line 74) | def client(self):
method upsert (line 79) | async def upsert(
method _upsert (line 89) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 108) | async def _query(
method _execute_embedding_query (line 123) | async def _execute_embedding_query(self, query: QueryWithEmbedding) ->...
method delete (line 159) | async def delete(
method _convert_mongodb_document_to_document_chunk_with_score (line 200) | def _convert_mongodb_document_to_document_chunk_with_score(
method _build_mongo_filter (line 211) | def _build_mongo_filter(
method _connect_to_mongodb_atlas (line 244) | def _connect_to_mongodb_atlas(atlas_connection_uri: str):
FILE: datastore/providers/pgvector_datastore.py
class PGClient (line 19) | class PGClient(ABC):
method upsert (line 21) | async def upsert(self, table: str, json: dict[str, Any]) -> None:
method rpc (line 28) | async def rpc(self, function_name: str, params: dict[str, Any]) -> Any:
method delete_like (line 35) | async def delete_like(self, table: str, column: str, pattern: str) -> ...
method delete_in (line 42) | async def delete_in(self, table: str, column: str, ids: List[str]) -> ...
method delete_by_filters (line 49) | async def delete_by_filters(
class PgVectorDataStore (line 59) | class PgVectorDataStore(DataStore):
method __init__ (line 60) | def __init__(self):
method create_db_client (line 64) | def create_db_client(self) -> PGClient:
method _upsert (line 73) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 100) | async def _query(self, queries: List[QueryWithEmbedding]) -> List[Quer...
method delete (line 155) | async def delete(
FILE: datastore/providers/pinecone_datastore.py
class PineconeDataStore (line 37) | class PineconeDataStore(DataStore):
method __init__ (line 38) | def __init__(self):
method _upsert (line 70) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 112) | async def _query(
method delete (line 179) | async def delete(
method _get_pinecone_filter (line 224) | def _get_pinecone_filter(
method _get_pinecone_metadata (line 252) | def _get_pinecone_metadata(
FILE: datastore/providers/postgres_datastore.py
class PostgresDataStore (line 24) | class PostgresDataStore(PgVectorDataStore):
method create_db_client (line 25) | def create_db_client(self):
class PostgresClient (line 29) | class PostgresClient(PGClient):
method __init__ (line 30) | def __init__(self) -> None:
method __del__ (line 37) | def __del__(self):
method upsert (line 41) | async def upsert(self, table: str, json: dict[str, Any]):
method rpc (line 73) | async def rpc(self, function_name: str, params: dict[str, Any]):
method delete_like (line 88) | async def delete_like(self, table: str, column: str, pattern: str):
method delete_in (line 99) | async def delete_in(self, table: str, column: str, ids: List[str]):
method delete_by_filters (line 110) | async def delete_by_filters(self, table: str, filter: DocumentMetadata...
FILE: datastore/providers/qdrant_datastore.py
class QdrantDataStore (line 32) | class QdrantDataStore(DataStore):
method __init__ (line 35) | def __init__(
method _upsert (line 63) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 80) | async def _query(
method delete (line 105) | async def delete(
method _convert_document_chunk_to_point (line 133) | def _convert_document_chunk_to_point(
method _create_document_chunk_id (line 152) | def _create_document_chunk_id(self, external_id: Optional[str]) -> str:
method _convert_query_to_search_request (line 157) | def _convert_query_to_search_request(
method _convert_metadata_filter_to_qdrant_filter (line 168) | def _convert_metadata_filter_to_qdrant_filter(
method _convert_scored_point_to_document_chunk_with_score (line 233) | def _convert_scored_point_to_document_chunk_with_score(
method _set_up_collection (line 245) | def _set_up_collection(
method _recreate_collection (line 276) | def _recreate_collection(self, distance: rest.Distance, vector_size: i...
FILE: datastore/providers/redis_datastore.py
function unpack_schema (line 53) | def unpack_schema(d: dict):
function _check_redis_module_exist (line 61) | async def _check_redis_module_exist(client: redis.Redis, modules: List[d...
class RedisDataStore (line 76) | class RedisDataStore(DataStore):
method __init__ (line 77) | def __init__(self, client: redis.Redis, redisearch_schema: dict):
method init (line 89) | async def init(cls, **kwargs):
method _redis_key (line 147) | def _redis_key(document_id: str, chunk_id: str) -> str:
method _escape (line 161) | def _escape(value: str) -> str:
method _get_redis_chunk (line 178) | def _get_redis_chunk(self, chunk: DocumentChunk) -> dict:
method _get_redis_query (line 206) | def _get_redis_query(self, query: QueryWithEmbedding) -> RediSearchQuery:
method _redis_delete (line 266) | async def _redis_delete(self, keys: List[str]):
method _upsert (line 278) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 301) | async def _query(
method _find_keys (line 345) | async def _find_keys(self, pattern: str) -> List[str]:
method delete (line 348) | async def delete(
FILE: datastore/providers/supabase_datastore.py
class SupabaseDataStore (line 23) | class SupabaseDataStore(PgVectorDataStore):
method create_db_client (line 24) | def create_db_client(self):
class SupabaseClient (line 28) | class SupabaseClient(PGClient):
method __init__ (line 29) | def __init__(self) -> None:
method upsert (line 36) | async def upsert(self, table: str, json: dict[str, Any]):
method rpc (line 45) | async def rpc(self, function_name: str, params: dict[str, Any]):
method delete_like (line 57) | async def delete_like(self, table: str, column: str, pattern: str):
method delete_in (line 63) | async def delete_in(self, table: str, column: str, ids: List[str]):
method delete_by_filters (line 69) | async def delete_by_filters(self, table: str, filter: DocumentMetadata...
FILE: datastore/providers/weaviate_datastore.py
function extract_schema_properties (line 79) | def extract_schema_properties(schema):
class WeaviateDataStore (line 85) | class WeaviateDataStore(DataStore):
method handle_errors (line 86) | def handle_errors(self, results: Optional[List[dict]]) -> List[str]:
method __init__ (line 104) | def __init__(self):
method _build_auth_credentials (line 137) | def _build_auth_credentials():
method _upsert (line 149) | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> Lis...
method _query (line 188) | async def _query(
method delete (line 264) | async def delete(
method build_filters (line 316) | def build_filters(filter):
method _is_valid_weaviate_id (line 354) | def _is_valid_weaviate_id(candidate_id: str) -> bool:
method _is_wcs_domain (line 374) | def _is_wcs_domain(url: str) -> bool:
FILE: datastore/providers/zilliz_datastore.py
class ZillizDataStore (line 24) | class ZillizDataStore(MilvusDataStore):
method __init__ (line 25) | def __init__(self, create_new: Optional[bool] = False):
method _create_connection (line 40) | def _create_connection(self):
method _create_index (line 54) | def _create_index(self):
FILE: examples/authentication-methods/no-auth/main.py
function upsert_file (line 40) | async def upsert_file(
function upsert (line 67) | async def upsert(
function query_main (line 82) | async def query_main(
function query (line 100) | async def query(
function delete (line 117) | async def delete(
function startup (line 138) | async def startup():
function start (line 143) | def start():
FILE: examples/memory/main.py
function validate_token (line 32) | def validate_token(credentials: HTTPAuthorizationCredentials = Depends(b...
function upsert_file (line 56) | async def upsert_file(
function upsert_main (line 83) | async def upsert_main(
function upsert (line 101) | async def upsert(
function query_main (line 117) | async def query_main(
function query (line 137) | async def query(
function delete (line 155) | async def delete(
function startup (line 177) | async def startup():
function start (line 182) | def start():
FILE: examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql
type documents (line 3) | create table if not exists documents (
type ix_documents_document_id (line 15) | create index ix_documents_document_id on documents using btree ( documen...
type ix_documents_source (line 16) | create index ix_documents_source on documents using btree ( source )
type ix_documents_source_id (line 17) | create index ix_documents_source_id on documents using btree ( source_id )
type ix_documents_author (line 18) | create index ix_documents_author on documents using btree ( author )
type ix_documents_created_at (line 19) | create index ix_documents_created_at on documents using brin ( created_at )
function match_page_sections (line 23) | create or replace function match_page_sections(in_embedding vector(256) ...
FILE: local_server/main.py
function get_manifest (line 44) | async def get_manifest(request):
function get_logo (line 52) | async def get_logo(request):
function get_openapi (line 58) | async def get_openapi(request):
function upsert_file (line 67) | async def upsert_file(
function upsert (line 94) | async def upsert(
function query_main (line 106) | async def query_main(request: QueryRequest = Body(...)):
function delete (line 121) | async def delete(
function startup (line 142) | async def startup():
function start (line 147) | def start():
FILE: models/api.py
class UpsertRequest (line 11) | class UpsertRequest(BaseModel):
class UpsertResponse (line 15) | class UpsertResponse(BaseModel):
class QueryRequest (line 19) | class QueryRequest(BaseModel):
class QueryResponse (line 23) | class QueryResponse(BaseModel):
class DeleteRequest (line 27) | class DeleteRequest(BaseModel):
class DeleteResponse (line 33) | class DeleteResponse(BaseModel):
FILE: models/models.py
class Source (line 6) | class Source(str, Enum):
class DocumentMetadata (line 12) | class DocumentMetadata(BaseModel):
class DocumentChunkMetadata (line 20) | class DocumentChunkMetadata(DocumentMetadata):
class DocumentChunk (line 24) | class DocumentChunk(BaseModel):
class DocumentChunkWithScore (line 31) | class DocumentChunkWithScore(DocumentChunk):
class Document (line 35) | class Document(BaseModel):
class DocumentWithChunks (line 41) | class DocumentWithChunks(Document):
class DocumentMetadataFilter (line 45) | class DocumentMetadataFilter(BaseModel):
class Query (line 54) | class Query(BaseModel):
class QueryWithEmbedding (line 60) | class QueryWithEmbedding(Query):
class QueryResult (line 64) | class QueryResult(BaseModel):
FILE: scripts/process_json/process_json.py
function process_json_dump (line 16) | async def process_json_dump(
function main (line 109) | async def main():
FILE: scripts/process_jsonl/process_jsonl.py
function process_jsonl_dump (line 16) | async def process_jsonl_dump(
function main (line 107) | async def main():
FILE: scripts/process_zip/process_zip.py
function process_file_dump (line 19) | async def process_file_dump(
function main (line 114) | async def main():
FILE: server/main.py
function validate_token (line 27) | def validate_token(credentials: HTTPAuthorizationCredentials = Depends(b...
function upsert_file (line 51) | async def upsert_file(
function upsert (line 78) | async def upsert(
function query_main (line 93) | async def query_main(
function query (line 112) | async def query(
function delete (line 129) | async def delete(
function startup (line 150) | async def startup():
function start (line 155) | def start():
FILE: services/chunks.py
function get_text_chunks (line 25) | def get_text_chunks(text: str, chunk_token_size: Optional[int]) -> List[...
function create_document_chunks (line 102) | def create_document_chunks(
function get_document_chunks (line 152) | def get_document_chunks(
FILE: services/date.py
function to_unix_timestamp (line 5) | def to_unix_timestamp(date_str: str) -> int:
FILE: services/extract_metadata.py
function extract_metadata_from_document (line 9) | def extract_metadata_from_document(text: str) -> Dict[str, str]:
FILE: services/file.py
function get_document_from_file (line 15) | async def get_document_from_file(
function extract_text_from_filepath (line 25) | def extract_text_from_filepath(filepath: str, mimetype: Optional[str] = ...
function extract_text_from_file (line 48) | def extract_text_from_file(file: BufferedReader, mimetype: str) -> str:
function extract_text_from_form_file (line 91) | async def extract_text_from_form_file(file: UploadFile):
FILE: services/openai.py
function get_embeddings (line 13) | def get_embeddings(texts: List[str]) -> List[List[float]]:
function get_chat_completion (line 44) | def get_chat_completion(
FILE: services/pii_detection.py
function screen_text_for_pii (line 5) | def screen_text_for_pii(text: str) -> bool:
FILE: tests/datastore/providers/analyticdb/test_analyticdb_datastore.py
function analyticdb_datastore (line 16) | def analyticdb_datastore():
function document_chunk_one (line 21) | def document_chunk_one():
function document_chunk_two (line 63) | def document_chunk_two():
function test_upsert (line 142) | async def test_upsert(analyticdb_datastore, document_chunk_one):
function test_reload (line 156) | async def test_reload(analyticdb_datastore, document_chunk_one, document...
function test_upsert_query_all (line 180) | async def test_upsert_query_all(analyticdb_datastore, document_chunk_two):
function test_query_accuracy (line 197) | async def test_query_accuracy(analyticdb_datastore, document_chunk_one):
function test_query_filter (line 215) | async def test_query_filter(analyticdb_datastore, document_chunk_one):
function test_delete_with_date_filter (line 236) | async def test_delete_with_date_filter(analyticdb_datastore, document_ch...
function test_delete_with_source_filter (line 259) | async def test_delete_with_source_filter(analyticdb_datastore, document_...
function test_delete_with_document_id_filter (line 282) | async def test_delete_with_document_id_filter(analyticdb_datastore, docu...
function test_delete_with_document_id (line 303) | async def test_delete_with_document_id(analyticdb_datastore, document_ch...
FILE: tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py
function create_embedding (line 20) | def create_embedding(non_zero_pos: int) -> List[float]:
function azure_cosmos_db_settings_from_dot_env (line 28) | def azure_cosmos_db_settings_from_dot_env() -> dict:
function initial_document_chunks (line 50) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function queries (line 66) | def queries() -> List[QueryWithEmbedding]:
function azurecosmosdb_datastore (line 83) | async def azurecosmosdb_datastore() -> DataStore:
function test_upsert (line 90) | async def test_upsert(
function test_query (line 104) | async def test_query(
function test_delete (line 130) | async def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -...
function test_delete_all (line 168) | async def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStor...
FILE: tests/datastore/providers/azuresearch/test_azuresearch_datastore.py
function azuresearch_mgmt_client (line 26) | def azuresearch_mgmt_client():
function test_translate_filter (line 34) | def test_translate_filter():
function test_lifecycle_hybrid (line 88) | async def test_lifecycle_hybrid(azuresearch_mgmt_client: SearchIndexClie...
function test_lifecycle_vectors_only (line 95) | async def test_lifecycle_vectors_only(azuresearch_mgmt_client: SearchInd...
function test_lifecycle_semantic (line 102) | async def test_lifecycle_semantic(azuresearch_mgmt_client: SearchIndexCl...
function lifecycle (line 110) | async def lifecycle(azuresearch_mgmt_client: SearchIndexClient):
FILE: tests/datastore/providers/chroma/test_chroma_datastore.py
function ephemeral_chroma_datastore (line 19) | def ephemeral_chroma_datastore() -> ChromaDataStore:
function persisted_chroma_datastore (line 26) | def persisted_chroma_datastore() -> ChromaDataStore:
function get_chroma_datastore (line 35) | def get_chroma_datastore() -> ChromaDataStore:
function cleanup (line 42) | def cleanup():
function create_embedding (line 51) | def create_embedding(dim: int) -> List[float]:
function initial_document_chunks (line 61) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function document_chunks (line 77) | def document_chunks(initial_document_chunks) -> Dict[str, List[DocumentC...
function test_add_chunks (line 103) | async def test_add_chunks(document_chunks: Dict[str, List[DocumentChunk]]):
function test_upsert (line 117) | async def test_upsert(
function test_add_and_query_all (line 138) | async def test_add_and_query_all(document_chunks):
function test_query_accuracy (line 155) | async def test_query_accuracy(document_chunks):
function test_query_filter_by_id (line 188) | async def test_query_filter_by_id(document_chunks):
function test_query_filter_by_date (line 213) | async def test_query_filter_by_date(document_chunks):
function test_delete_by_id (line 267) | async def test_delete_by_id(document_chunks):
FILE: tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py
function elasticsearch_datastore (line 19) | def elasticsearch_datastore():
function sample_embedding (line 23) | def sample_embedding(one_element_poz: int):
function sample_embeddings (line 29) | def sample_embeddings(num: int, one_element_start: int = 0):
function document_chunk_one (line 39) | def document_chunk_one():
function test_upsert (line 77) | async def test_upsert(elasticsearch_datastore, document_chunk_one):
function test_upsert_query_all (line 92) | async def test_upsert_query_all(elasticsearch_datastore, document_chunk_...
function test_delete_with_document_id (line 109) | async def test_delete_with_document_id(elasticsearch_datastore, document...
function test_delete_with_source_filter (line 132) | async def test_delete_with_source_filter(elasticsearch_datastore, docume...
FILE: tests/datastore/providers/llama/test_llama_datastore.py
function create_embedding (line 7) | def create_embedding(non_zero_pos: int, size: int) -> List[float]:
function initial_document_chunks (line 14) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function queries (line 30) | def queries() -> List[QueryWithEmbedding]:
function llama_datastore (line 47) | def llama_datastore() -> LlamaDataStore:
function test_upsert (line 52) | async def test_upsert(
function test_query (line 62) | async def test_query(
function test_delete (line 87) | async def test_delete(
FILE: tests/datastore/providers/milvus/test_milvus_datastore.py
function milvus_datastore (line 21) | def milvus_datastore():
function sample_embedding (line 25) | def sample_embedding(one_element_poz: int):
function sample_embeddings (line 31) | def sample_embeddings(num: int, one_element_start: int = 0):
function document_chunk_one (line 42) | def document_chunk_one():
function document_chunk_two (line 85) | def document_chunk_two():
function test_upsert (line 164) | async def test_upsert(milvus_datastore, document_chunk_one):
function test_reload (line 174) | async def test_reload(milvus_datastore, document_chunk_one, document_chu...
function test_upsert_query_all (line 198) | async def test_upsert_query_all(milvus_datastore, document_chunk_two):
function test_query_accuracy (line 218) | async def test_query_accuracy(milvus_datastore, document_chunk_one):
function test_query_filter (line 238) | async def test_query_filter(milvus_datastore, document_chunk_one):
function test_delete_with_date_filter (line 261) | async def test_delete_with_date_filter(milvus_datastore, document_chunk_...
function test_delete_with_source_filter (line 286) | async def test_delete_with_source_filter(milvus_datastore, document_chun...
function test_delete_with_document_id_filter (line 311) | async def test_delete_with_document_id_filter(milvus_datastore, document...
function test_delete_with_document_id (line 334) | async def test_delete_with_document_id(milvus_datastore, document_chunk_...
FILE: tests/datastore/providers/mongodb_atlas/test_integration.py
function documents (line 22) | def documents():
function client (line 35) | def client():
function delete (line 44) | def delete(client) -> bool:
function upsert (line 52) | def upsert(delete, documents, client) -> bool:
function test_delete (line 59) | def test_delete(delete) -> None:
function test_upsert (line 65) | def test_upsert(upsert) -> None:
function test_query (line 71) | def test_query(upsert, client) -> None: # upsert,
function test_required_vars (line 103) | def test_required_vars() -> None:
function test_mongodb_connection (line 112) | def test_mongodb_connection() -> None:
function test_openai_connection (line 118) | def test_openai_connection() -> None:
FILE: tests/datastore/providers/mongodb_atlas/test_mongodb_datastore.py
function assert_when_ready (line 41) | async def assert_when_ready(callable: Callable, tries: int = 5, interval...
function collection_size_callback_factory (line 57) | def collection_size_callback_factory(collection, num: int):
function _mongodb_datastore (line 67) | def _mongodb_datastore():
function mongodb_datastore (line 72) | async def mongodb_datastore(_mongodb_datastore):
function sample_embedding (line 81) | def sample_embedding(one_element_poz: int):
function sample_embeddings (line 88) | def sample_embeddings(num: int, one_element_start: int = 0):
function document_id (line 93) | def document_id():
function chunk_ids (line 98) | def chunk_ids(document_id):
function one_documents_chunks (line 104) | def one_documents_chunks(document_id, chunk_ids):
function test_upsert (line 145) | async def test_upsert(mongodb_datastore: MongoDBAtlasDataStore, one_docu...
function test_upsert_query_all (line 154) | async def test_upsert_query_all(mongodb_datastore, one_documents_chunks,...
function test_delete_with_document_id (line 172) | async def test_delete_with_document_id(mongodb_datastore, one_documents_...
function test_delete_with_source_filter (line 186) | async def test_delete_with_source_filter(mongodb_datastore, one_document...
function build_mongo_filter (line 213) | def build_mongo_filter():
function test_build_mongo_filter_with_no_filter (line 217) | async def test_build_mongo_filter_with_no_filter(build_mongo_filter):
function test_build_mongo_filter_with_start_date (line 222) | async def test_build_mongo_filter_with_start_date(build_mongo_filter):
function test_build_mongo_filter_with_end_date (line 234) | async def test_build_mongo_filter_with_end_date(build_mongo_filter):
function test_build_mongo_filter_with_metadata_field (line 246) | async def test_build_mongo_filter_with_metadata_field(build_mongo_filter):
FILE: tests/datastore/providers/postgres/test_postgres_datastore.py
function create_embedding (line 15) | def create_embedding(non_zero_pos: int) -> List[float]:
function initial_document_chunks (line 23) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function queries (line 39) | def queries() -> List[QueryWithEmbedding]:
function postgres_datastore (line 56) | def postgres_datastore() -> PostgresDataStore:
function test_upsert (line 61) | async def test_upsert(
function test_query (line 71) | async def test_query(
function test_delete (line 96) | async def test_delete(
function test_upsert_new_chunk (line 108) | async def test_upsert_new_chunk(postgres_datastore):
function test_upsert_existing_chunk (line 121) | async def test_upsert_existing_chunk(postgres_datastore):
function test_query_score (line 154) | async def test_query_score(postgres_datastore):
function test_query_filter (line 182) | async def test_query_filter(postgres_datastore):
function test_delete (line 234) | async def test_delete(postgres_datastore):
function test_delete_all (line 269) | async def test_delete_all(postgres_datastore):
FILE: tests/datastore/providers/qdrant/test_qdrant_datastore.py
function create_embedding (line 17) | def create_embedding(non_zero_pos: int, size: int) -> List[float]:
function qdrant_datastore (line 24) | def qdrant_datastore() -> QdrantDataStore:
function client (line 31) | def client() -> qdrant_client.QdrantClient:
function initial_document_chunks (line 36) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function document_chunks (line 52) | def document_chunks() -> Dict[str, List[DocumentChunk]]:
function test_datastore_creates_payload_indexes (line 82) | async def test_datastore_creates_payload_indexes(
function test_upsert_creates_all_points (line 98) | async def test_upsert_creates_all_points(
function test_upsert_does_not_remove_existing_documents_but_store_new (line 110) | async def test_upsert_does_not_remove_existing_documents_but_store_new(
function test_query_returns_all_on_single_query (line 129) | async def test_query_returns_all_on_single_query(qdrant_datastore, docum...
function test_query_returns_closest_entry (line 146) | async def test_query_returns_closest_entry(qdrant_datastore, document_ch...
function test_query_filter_by_document_id_returns_this_document_chunks (line 168) | async def test_query_filter_by_document_id_returns_this_document_chunks(
function test_query_start_date_converts_datestring (line 197) | async def test_query_start_date_converts_datestring(
function test_query_end_date_converts_datestring (line 219) | async def test_query_end_date_converts_datestring(
function test_delete_removes_by_ids (line 240) | async def test_delete_removes_by_ids(
function test_delete_removes_by_document_id_filter (line 254) | async def test_delete_removes_by_document_id_filter(
function test_delete_removes_all (line 270) | async def test_delete_removes_all(
FILE: tests/datastore/providers/redis/test_redis_datastore.py
function redis_datastore (line 17) | async def redis_datastore():
function create_embedding (line 21) | def create_embedding(i, dim):
function create_document_chunk (line 27) | def create_document_chunk(i, dim):
function create_document_chunks (line 38) | def create_document_chunks(n, dim):
function test_redis_upsert_query (line 44) | async def test_redis_upsert_query(redis_datastore):
function test_redis_filter_query (line 60) | async def test_redis_filter_query(redis_datastore):
function test_redis_delete_docs (line 74) | async def test_redis_delete_docs(redis_datastore):
FILE: tests/datastore/providers/supabase/test_supabase_datastore.py
function create_embedding (line 15) | def create_embedding(non_zero_pos: int) -> List[float]:
function initial_document_chunks (line 23) | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]:
function queries (line 39) | def queries() -> List[QueryWithEmbedding]:
function supabase_datastore (line 56) | def supabase_datastore() -> SupabaseDataStore:
function test_upsert (line 61) | async def test_upsert(
function test_query (line 71) | async def test_query(
function test_delete (line 96) | async def test_delete(
function test_upsert_new_chunk (line 108) | async def test_upsert_new_chunk(supabase_datastore):
function test_upsert_existing_chunk (line 121) | async def test_upsert_existing_chunk(supabase_datastore):
function test_query_score (line 154) | async def test_query_score(supabase_datastore):
function test_query_filter (line 182) | async def test_query_filter(supabase_datastore):
function test_delete (line 234) | async def test_delete(supabase_datastore):
function test_delete_all (line 269) | async def test_delete_all(supabase_datastore):
FILE: tests/datastore/providers/weaviate/test_weaviate_datastore.py
function weaviate_client (line 26) | def weaviate_client():
function test_db (line 37) | def test_db(weaviate_client, documents):
function documents (line 52) | def documents():
function caplog (line 105) | def caplog(caplog: LogCaptureFixture):
function test_upsert (line 114) | def test_upsert(weaviate_client, document_id):
function test_upsert_no_metadata (line 197) | def test_upsert_no_metadata(weaviate_client):
function test_upsert_invalid_documents (line 234) | def test_upsert_invalid_documents(weaviate_client, test_document, expect...
function test_query (line 276) | def test_query(test_db, query, expected_num_results):
function test_delete (line 286) | def test_delete(test_db, weaviate_client, caplog):
function test_build_auth_credentials (line 318) | def test_build_auth_credentials(monkeypatch):
function test_extract_schema_properties (line 352) | def test_extract_schema_properties():
function test_reuse_schema (line 379) | def test_reuse_schema(weaviate_client, caplog):
function test_build_date_filters (line 391) | def test_build_date_filters():
function test_is_valid_weaviate_id (line 430) | def test_is_valid_weaviate_id(test_input, expected_result):
function test_upsert_same_docid (line 435) | def test_upsert_same_docid(test_db, weaviate_client):
function test_is_wcs_domain (line 537) | def test_is_wcs_domain(url, expected_result):
FILE: tests/datastore/providers/zilliz/test_zilliz_datastore.py
function zilliz_datastore (line 20) | def zilliz_datastore():
function test_zilliz (line 25) | async def test_zilliz(zilliz_datastore):
Condensed preview — 116 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (709K chars).
[
{
"path": ".dockerignore",
"chars": 143,
"preview": "# Ignore files that are already ignored by git\n.gitignore\n\nscripts/\ntests/\nexamples/\nlocal_server/\nassets/\n*.md\n*.pyc\n.d"
},
{
"path": ".env.example",
"chars": 4099,
"preview": "# Core environment variables\nDATASTORE=\"<your_datastore>\"\nBEARER_TOKEN=\"<your_bearer_token>\"\nOPENAI_API_KEY=\"<your_opena"
},
{
"path": ".github/pull_request_template.md",
"chars": 2426,
"preview": "## Pull Request (PR) Checklist\nIf you'd like to contribute, please follow the checklist below when submitting a PR. This"
},
{
"path": ".gitignore",
"chars": 1919,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\n"
},
{
"path": ".well-known/ai-plugin.json",
"chars": 804,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": ".well-known/openapi.yaml",
"chars": 4746,
"preview": "openapi: 3.0.2\ninfo:\n title: Retrieval Plugin API\n description: A retrieval API for querying and filtering documents b"
},
{
"path": "Dockerfile",
"chars": 584,
"preview": "\nFROM python:3.10 as requirements-stage\n\nWORKDIR /tmp\n\nRUN pip install poetry\n\nCOPY ./pyproject.toml ./poetry.lock* /tmp"
},
{
"path": "LICENSE",
"chars": 1063,
"preview": "MIT License\n\nCopyright (c) 2023 OpenAI\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "Makefile",
"chars": 357,
"preview": "# Heroku\n# make heroku-login\n# make heroku-push\n\nHEROKU_APP = <your app name> \n\nheroku-push:\n\tdocker buildx build --plat"
},
{
"path": "README.md",
"chars": 56798,
"preview": "# ChatGPT Retrieval Plugin\n\nBuild Custom GPTs with a Retrieval Plugin backend to give ChatGPT access to personal documen"
},
{
"path": "datastore/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "datastore/datastore.py",
"chars": 2993,
"preview": "from abc import ABC, abstractmethod\nfrom typing import Dict, List, Optional\nimport asyncio\n\nfrom models.models import (\n"
},
{
"path": "datastore/factory.py",
"chars": 2778,
"preview": "from datastore.datastore import DataStore\nimport os\n\n\nasync def get_datastore() -> DataStore:\n datastore = os.environ"
},
{
"path": "datastore/providers/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "datastore/providers/analyticdb_datastore.py",
"chars": 11260,
"preview": "import os\nimport asyncio\nfrom typing import Dict, List, Optional, Tuple, Any\nfrom datetime import datetime\nfrom loguru i"
},
{
"path": "datastore/providers/azurecosmosdb_datastore.py",
"chars": 10726,
"preview": "import logging\nimport os\n\nimport certifi\nimport numpy as np\nimport pymongo\n\nfrom pymongo.mongo_client import MongoClient"
},
{
"path": "datastore/providers/azuresearch_datastore.py",
"chars": 15843,
"preview": "import asyncio\nimport base64\nimport os\nimport re\nimport time\nfrom typing import Dict, List, Optional, Union\n\nfrom azure."
},
{
"path": "datastore/providers/chroma_datastore.py",
"chars": 9107,
"preview": "\"\"\"\nChroma datastore support for the ChatGPT retrieval plugin.\n\nConsult the Chroma docs and GitHub repo for more informa"
},
{
"path": "datastore/providers/elasticsearch_datastore.py",
"chars": 12922,
"preview": "import os\nfrom typing import Dict, List, Any, Optional\n\nimport elasticsearch\nfrom elasticsearch import Elasticsearch, he"
},
{
"path": "datastore/providers/llama_datastore.py",
"chars": 7192,
"preview": "import json\nimport os\nfrom typing import Dict, List, Optional, Type\nfrom loguru import logger\nfrom datastore.datastore i"
},
{
"path": "datastore/providers/milvus_datastore.py",
"chars": 25922,
"preview": "import json\nimport os\nimport asyncio\n\nfrom loguru import logger\nfrom typing import Dict, List, Optional\nfrom pymilvus im"
},
{
"path": "datastore/providers/mongodb_atlas_datastore.py",
"chars": 9215,
"preview": "import os\nfrom typing import Dict, List, Any, Optional\nfrom loguru import logger\nfrom importlib.metadata import version\n"
},
{
"path": "datastore/providers/pgvector_datastore.py",
"chars": 6902,
"preview": "from abc import ABC, abstractmethod\nfrom typing import Any, Dict, List, Optional\nfrom datetime import datetime\nfrom logu"
},
{
"path": "datastore/providers/pinecone_datastore.py",
"chars": 11318,
"preview": "import os\nfrom typing import Any, Dict, List, Optional\nimport pinecone\nfrom tenacity import retry, wait_random_exponenti"
},
{
"path": "datastore/providers/postgres_datastore.py",
"chars": 4830,
"preview": "import os\nfrom typing import Any, List\nfrom datetime import datetime\nimport numpy as np\n\nfrom psycopg2 import connect\nfr"
},
{
"path": "datastore/providers/qdrant_datastore.py",
"chars": 10981,
"preview": "import os\nimport uuid\nfrom typing import Dict, List, Optional\n\nfrom grpc._channel import _InactiveRpcError\nfrom qdrant_c"
},
{
"path": "datastore/providers/redis_datastore.py",
"chars": 14188,
"preview": "import asyncio\nimport os\nimport re\nimport json\nimport redis.asyncio as redis\nimport numpy as np\n\nfrom redis.commands.sea"
},
{
"path": "datastore/providers/supabase_datastore.py",
"chars": 3415,
"preview": "import os\nfrom typing import Any, List\nfrom datetime import datetime\n\nfrom supabase import Client\n\nfrom datastore.provid"
},
{
"path": "datastore/providers/weaviate_datastore.py",
"chars": 13818,
"preview": "import asyncio\nimport os\nimport re\nimport uuid\nfrom typing import Dict, List, Optional\n\nimport weaviate\nfrom loguru impo"
},
{
"path": "datastore/providers/zilliz_datastore.py",
"chars": 2565,
"preview": "import os\n\nfrom loguru import logger\nfrom typing import Optional\nfrom pymilvus import (\n connections,\n)\nfrom uuid imp"
},
{
"path": "docs/deployment/flyio.md",
"chars": 3286,
"preview": "# Deploying to Fly.io\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused depen"
},
{
"path": "docs/deployment/heroku.md",
"chars": 3435,
"preview": "# Deploying to Heroku\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused depen"
},
{
"path": "docs/deployment/other-options.md",
"chars": 2602,
"preview": "# Other Deployment Options\n\nSome possible other options for deploying the app are:\n\n- **Azure Container Apps**: This is "
},
{
"path": "docs/deployment/removing-unused-dependencies.md",
"chars": 3199,
"preview": "# Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused dependencies from your [pypro"
},
{
"path": "docs/deployment/render.md",
"chars": 1844,
"preview": "# Deploying to Render\n\n## Removing Unused Dependencies\n\nBefore deploying your app, you might want to remove unused depen"
},
{
"path": "docs/deprecated/plugins.md",
"chars": 3444,
"preview": "## Plugins (deprecated)\n\nPlugins are chat extensions designed specifically for language models like ChatGPT, enabling th"
},
{
"path": "docs/providers/analyticdb/setup.md",
"chars": 3603,
"preview": "# AnalyticDB\n\n[AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-ov"
},
{
"path": "docs/providers/azurecosmosdb/setup.md",
"chars": 2120,
"preview": "# Azure Cosmos DB\n\n[Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) Azure Cosmos DB is a fully m"
},
{
"path": "docs/providers/azuresearch/setup.md",
"chars": 5943,
"preview": "# Azure Cognitive Search\n\n[Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval"
},
{
"path": "docs/providers/chroma/setup.md",
"chars": 3083,
"preview": "[Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make it easy to work with emb"
},
{
"path": "docs/providers/elasticsearch/setup.md",
"chars": 3610,
"preview": "# Elasticsearch\n\nElasticsearch is a search engine based on the Lucene library. It provides a distributed, full-text and "
},
{
"path": "docs/providers/llama/setup.md",
"chars": 2850,
"preview": "\n# LlamaIndex\n\n[LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLM's with "
},
{
"path": "docs/providers/milvus/setup.md",
"chars": 3444,
"preview": "# Milvus\n\n[Milvus](https://milvus.io/) is the open-source, cloud-native vector database that scales to billions of vecto"
},
{
"path": "docs/providers/mongodb/setup.md",
"chars": 6933,
"preview": "# Setting up MongoDB Atlas as the Datastore Provider\n\nMongoDB Atlas is a multi-cloud database service made by the same p"
},
{
"path": "docs/providers/pinecone/setup.md",
"chars": 3307,
"preview": "# Pinecone\n\n[Pinecone](https://www.pinecone.io) is a managed vector database built for speed, scale, and shipping to pro"
},
{
"path": "docs/providers/postgres/setup.md",
"chars": 4222,
"preview": "# Postgres\n\nPostgres Database offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvect"
},
{
"path": "docs/providers/qdrant/setup.md",
"chars": 2657,
"preview": "# Qdrant\n\n[Qdrant](https://qdrant.tech/) is a vector database that can store documents and vector embeddings. It can run"
},
{
"path": "docs/providers/redis/setup.md",
"chars": 3580,
"preview": "# Redis\n\n[Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform that supports a va"
},
{
"path": "docs/providers/supabase/setup.md",
"chars": 5710,
"preview": "# Supabase\n\n[Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to "
},
{
"path": "docs/providers/weaviate/setup.md",
"chars": 4629,
"preview": "# Weaviate\n\n## Set up a Weaviate Instance\n\n[Weaviate](https://weaviate.io/) is an open-source vector search engine desig"
},
{
"path": "docs/providers/zilliz/setup.md",
"chars": 2531,
"preview": "# Zilliz\n\n[Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for the billion scale. Zilliz "
},
{
"path": "examples/authentication-methods/no-auth/ai-plugin.json",
"chars": 712,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": "examples/authentication-methods/no-auth/main.py",
"chars": 4199,
"preview": "# This is a version of the main.py file found in ../../../server/main.py without authentication.\n# Copy and paste this i"
},
{
"path": "examples/authentication-methods/oauth/ai-plugin.json",
"chars": 1075,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": "examples/authentication-methods/service-http/ai-plugin.json",
"chars": 870,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": "examples/authentication-methods/user-http/ai-plugin.json",
"chars": 789,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": "examples/docker/elasticsearch/README.md",
"chars": 107,
"preview": "## Running Elasticsearch\n\n```bash\ndocker-compose up -d\n```\n\nshould now be running at http://localhost:9200\n"
},
{
"path": "examples/docker/elasticsearch/docker-compose.yaml",
"chars": 498,
"preview": "version: \"3.7\"\n\nservices:\n elasticsearch:\n image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2\n container_"
},
{
"path": "examples/docker/milvus/docker-compose.yaml",
"chars": 1356,
"preview": "version: '3.5'\n\nservices:\n etcd:\n container_name: milvus-etcd\n image: quay.io/coreos/etcd:v3.5.0\n environment:"
},
{
"path": "examples/docker/qdrant/README.md",
"chars": 1193,
"preview": "# Running the Retrieval Plugin with Qdrant in Docker Containers\n\nTo set up the ChatGPT retrieval plugin with a single in"
},
{
"path": "examples/docker/qdrant/docker-compose.yaml",
"chars": 391,
"preview": "services:\n retrieval-app:\n build:\n context: ../../../\n dockerfile: Dockerfile\n image: openai/chatgpt-re"
},
{
"path": "examples/docker/qdrant/documents.json",
"chars": 1068,
"preview": "{\n \"documents\": [\n {\n \"id\": \"openai\",\n \"text\": \"OpenAI is an AI research and deployment company. Our missi"
},
{
"path": "examples/docker/qdrant/queries.json",
"chars": 86,
"preview": "{\n \"queries\": [\n {\n \"query\": \"What vector database should I use?\"\n }\n ]\n}"
},
{
"path": "examples/docker/redis/docker-compose.yml",
"chars": 343,
"preview": "version: \"3.9\"\n\nservices:\n redis:\n image: redis/redis-stack-server:latest\n ports:\n - \"6379:6379\"\n volumes"
},
{
"path": "examples/function-calling/README.md",
"chars": 15944,
"preview": "# Retrieval Plugin Function Calling Guide\n\nThis guide provides an overview of how to use the Retrieval Plugin with funct"
},
{
"path": "examples/memory/README.md",
"chars": 1469,
"preview": "# ChatGPT Retrieval Plugin with Memory\n\nThis example demonstrates how to give ChatGPT the ability to remember informatio"
},
{
"path": "examples/memory/ai-plugin.json",
"chars": 868,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"descript"
},
{
"path": "examples/memory/main.py",
"chars": 6287,
"preview": "# This is a version of the main.py file found in ../../server/main.py that also gives ChatGPT access to the upsert endpo"
},
{
"path": "examples/memory/openapi.yaml",
"chars": 6864,
"preview": "openapi: 3.0.2\ninfo:\n title: Retrieval Plugin API\n description: A retrieval API for querying and filtering documents b"
},
{
"path": "examples/providers/azurecosmosdb/semantic-search.ipynb",
"chars": 6932,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"de02cdc9\",\n \"metadata\": {},\n \"outputs\":"
},
{
"path": "examples/providers/elasticsearch/search.ipynb",
"chars": 29238,
"preview": "{\n \"cells\": [\n {\n \"attachments\": {},\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Using Elast"
},
{
"path": "examples/providers/mongodb/semantic-search.ipynb",
"chars": 26445,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"735ae737-86be-4497-a8e9-38525e422380\",\n \"metadata\": {},\n \"so"
},
{
"path": "examples/providers/pinecone/semantic-search.ipynb",
"chars": 40434,
"preview": "{\n \"cells\": [\n {\n \"attachments\": {},\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Using the P"
},
{
"path": "examples/providers/redis/semantic-search-and-filter.ipynb",
"chars": 19584,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 16,\n \"metadata\": {\n \"tags\": []\n },\n \"outputs\": "
},
{
"path": "examples/providers/supabase/.gitignore",
"chars": 27,
"preview": "# Supabase\n.branches\n.temp\n"
},
{
"path": "examples/providers/supabase/config.toml",
"chars": 2827,
"preview": "# A string used to distinguish different Supabase projects on the same host. Defaults to the working\n# directory name wh"
},
{
"path": "examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql",
"chars": 2786,
"preview": "create extension vector;\n\ncreate table if not exists documents (\n id text primary key default gen_random_uuid()::text"
},
{
"path": "examples/providers/supabase/seed.sql",
"chars": 0,
"preview": ""
},
{
"path": "local_server/ai-plugin.json",
"chars": 706,
"preview": "{\n \"schema_version\": \"v1\",\n \"name_for_model\": \"retrieval\",\n \"name_for_human\": \"Retrieval Plugin\",\n \"description_for_"
},
{
"path": "local_server/main.py",
"chars": 3889,
"preview": "# This is a version of the main.py file found in ../../../server/main.py for testing the plugin locally.\n# Use the comma"
},
{
"path": "local_server/openapi.yaml",
"chars": 4629,
"preview": "openapi: 3.0.2\ninfo:\n title: Retrieval Plugin API\n description: A retrieval API for querying and filtering documents b"
},
{
"path": "models/api.py",
"chars": 620,
"preview": "from models.models import (\n Document,\n DocumentMetadataFilter,\n Query,\n QueryResult,\n)\nfrom pydantic import"
},
{
"path": "models/models.py",
"chars": 1460,
"preview": "from pydantic import BaseModel\nfrom typing import List, Optional\nfrom enum import Enum\n\n\nclass Source(str, Enum):\n em"
},
{
"path": "pyproject.toml",
"chars": 1352,
"preview": "[tool.poetry]\nname = \"chatgpt-retrieval-plugin\"\nversion = \"0.1.0\"\ndescription = \"\"\nauthors = [\"isafulf <isabella@openai."
},
{
"path": "scripts/process_json/README.md",
"chars": 3016,
"preview": "## Process a JSON File\n\nThis script is a utility to process a file dump of documents in a JSON format and store them in "
},
{
"path": "scripts/process_json/example.json",
"chars": 630,
"preview": "[\n {\n \"id\": \"123\",\n \"text\": \"This is a document about something\",\n \"source\": \"file\",\n \"source_id\""
},
{
"path": "scripts/process_json/process_json.py",
"chars": 5367,
"preview": "import uuid\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.models import Document, Do"
},
{
"path": "scripts/process_jsonl/README.md",
"chars": 3091,
"preview": "## Process a JSONL File\n\nThis script is a utility to process a file dump of documents in a JSONL format and store them i"
},
{
"path": "scripts/process_jsonl/example.jsonl",
"chars": 879,
"preview": "{\"id\": \"4\", \"text\": \"This document only has an ID and text. The other fields are missing.\"}\n{\"text\": \"This document has "
},
{
"path": "scripts/process_jsonl/process_jsonl.py",
"chars": 5264,
"preview": "import uuid\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.models import Document, Do"
},
{
"path": "scripts/process_zip/README.md",
"chars": 2699,
"preview": "## Process a ZIP File\n\nThis script is a utility to process a file dump of documents in a zip file and store them in the "
},
{
"path": "scripts/process_zip/process_zip.py",
"chars": 5659,
"preview": "import uuid\nimport zipfile\nimport os\nimport json\nimport argparse\nimport asyncio\n\nfrom loguru import logger\nfrom models.m"
},
{
"path": "server/main.py",
"chars": 4718,
"preview": "import os\nfrom typing import Optional\nimport uvicorn\nfrom fastapi import FastAPI, File, Form, HTTPException, Depends, Bo"
},
{
"path": "services/chunks.py",
"chars": 7649,
"preview": "from typing import Dict, List, Optional, Tuple\nimport uuid\nimport os\nfrom models.models import Document, DocumentChunk, "
},
{
"path": "services/date.py",
"chars": 825,
"preview": "import arrow\nfrom loguru import logger\n\n\ndef to_unix_timestamp(date_str: str) -> int:\n \"\"\"\n Convert a date string "
},
{
"path": "services/extract_metadata.py",
"chars": 1472,
"preview": "from models.models import Source\nfrom services.openai import get_chat_completion\nimport json\nfrom typing import Dict\nimp"
},
{
"path": "services/file.py",
"chars": 3681,
"preview": "import os\nfrom io import BufferedReader\nfrom typing import Optional\nfrom fastapi import UploadFile\nimport mimetypes\nfrom"
},
{
"path": "services/openai.py",
"chars": 2572,
"preview": "from typing import List\nimport openai\nimport os\nfrom loguru import logger\n\nfrom tenacity import retry, wait_random_expon"
},
{
"path": "services/pii_detection.py",
"chars": 1414,
"preview": "import os\nfrom services.openai import get_chat_completion\n\n\ndef screen_text_for_pii(text: str) -> bool:\n # This promp"
},
{
"path": "tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/datastore/providers/analyticdb/test_analyticdb_datastore.py",
"chars": 9971,
"preview": "import pytest\nfrom models.models import (\n DocumentChunkMetadata,\n DocumentMetadataFilter,\n DocumentChunk,\n "
},
{
"path": "tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py",
"chars": 5988,
"preview": "import pytest\nfrom typing import Dict, List\nfrom dotenv import dotenv_values\n\nfrom datastore.datastore import DataStore\n"
},
{
"path": "tests/datastore/providers/azuresearch/test_azuresearch_datastore.py",
"chars": 8748,
"preview": "import pytest\nimport os\nimport time\nfrom typing import Union\nfrom azure.search.documents.indexes import SearchIndexClien"
},
{
"path": "tests/datastore/providers/chroma/test_chroma_datastore.py",
"chars": 9287,
"preview": "import shutil\nfrom typing import Dict, List\nimport pytest\nimport random\n\nfrom datastore.providers.chroma_datastore impor"
},
{
"path": "tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py",
"chars": 4620,
"preview": "import pytest\nfrom models.models import (\n DocumentChunkMetadata,\n DocumentMetadataFilter,\n DocumentChunk,\n "
},
{
"path": "tests/datastore/providers/llama/test_llama_datastore.py",
"chars": 2623,
"preview": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.llama_datastore import LlamaDataStore\nfrom models.m"
},
{
"path": "tests/datastore/providers/milvus/test_milvus_datastore.py",
"chars": 10839,
"preview": "# from pathlib import Path\n# from dotenv import find_dotenv, load_dotenv\n# env_path = Path(\".\") / \"milvus.env\"\n# load_do"
},
{
"path": "tests/datastore/providers/mongodb_atlas/test_integration.py",
"chars": 4512,
"preview": "\"\"\"Integration Tests of ChatGPT Retrieval Plugin\nwith MongoDB Atlas Vector Datastore and OPENAI Embedding model.\n\nAs des"
},
{
"path": "tests/datastore/providers/mongodb_atlas/test_mongodb_datastore.py",
"chars": 7958,
"preview": "\"\"\"\nIntegration tests of MongoDB Atlas Datastore.\n\nThese tests require one to have a running Cluster, Database, Collecti"
},
{
"path": "tests/datastore/providers/postgres/test_postgres_datastore.py",
"chars": 8810,
"preview": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.postgres_datastore import PostgresDataStore\nfrom mo"
},
{
"path": "tests/datastore/providers/qdrant/test_qdrant_datastore.py",
"chars": 8429,
"preview": "from typing import Dict, List\n\nimport pytest\nimport qdrant_client\nfrom qdrant_client.http.models import PayloadSchemaTyp"
},
{
"path": "tests/datastore/providers/redis/test_redis_datastore.py",
"chars": 2068,
"preview": "from datastore.providers.redis_datastore import RedisDataStore\nfrom models.models import (\n DocumentChunk,\n Docume"
},
{
"path": "tests/datastore/providers/supabase/test_supabase_datastore.py",
"chars": 8810,
"preview": "from typing import Dict, List\nimport pytest\nfrom datastore.providers.supabase_datastore import SupabaseDataStore\nfrom mo"
},
{
"path": "tests/datastore/providers/weaviate/docker-compose.yml",
"chars": 538,
"preview": "---\nversion: '3.4'\nservices:\n weaviate:\n command:\n - --host\n - 0.0.0.0\n - --port\n - '8080'\n - --schem"
},
{
"path": "tests/datastore/providers/weaviate/test_weaviate_datastore.py",
"chars": 17088,
"preview": "import logging\nimport os\n\nimport pytest\nimport weaviate\nfrom _pytest.logging import LogCaptureFixture\nfrom fastapi.testc"
},
{
"path": "tests/datastore/providers/zilliz/test_zilliz_datastore.py",
"chars": 818,
"preview": "# from pathlib import Path\n# from dotenv import find_dotenv, load_dotenv\n# env_path = Path(\".\") / \"zilliz.env\"\n# load_do"
}
]
About this extraction
This page contains the full source code of the openai/chatgpt-retrieval-plugin GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 116 files (654.3 KB), approximately 156.5k tokens, and a symbol index with 424 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.