Repository: jerpint/buster
Branch: main
Commit: 07b6bb893f47
Files: 44
Total size: 152.0 KB
Directory structure:
gitextract_5l_frr4b/
├── .github/
│ └── workflows/
│ ├── publish_pypi.yaml
│ └── tests.yaml
├── .gitignore
├── LICENSE.md
├── README.md
├── buster/
│ ├── __init__.py
│ ├── busterbot.py
│ ├── completers/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── chatgpt.py
│ │ └── user_inputs.py
│ ├── documents_manager/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── deeplake.py
│ │ └── service.py
│ ├── examples/
│ │ ├── cfg.py
│ │ ├── generate_embeddings.py
│ │ ├── gradio_app.py
│ │ └── stackoverflow.csv
│ ├── formatters/
│ │ ├── documents.py
│ │ └── prompts.py
│ ├── llm_utils/
│ │ ├── __init__.py
│ │ ├── embeddings.py
│ │ └── question_reformulator.py
│ ├── parsers/
│ │ ├── __init__.py
│ │ └── parser.py
│ ├── retriever/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── deeplake.py
│ │ └── service.py
│ ├── tokenizers/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── gpt.py
│ ├── utils.py
│ └── validators/
│ ├── __init__.py
│ ├── base.py
│ └── validators.py
├── pyproject.toml
├── requirements.txt
└── tests/
├── test_chatbot.py
├── test_documents.py
├── test_formatters.py
├── test_read_write.py
└── test_validator.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/publish_pypi.yaml
================================================
name: publish-pypi
on:
workflow_dispatch:
release:
types: [created]
jobs:
deploy:
runs-on: ubuntu-latest
environment: secrets
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install poetry
- name: Build and publish
env:
POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }}
run: |
poetry version $(git describe --tags --abbrev=0)
poetry add $(cat requirements.txt)
poetry build
poetry publish
================================================
FILE: .github/workflows/tests.yaml
================================================
name: Tests
on: [pull_request]
jobs:
tests:
runs-on: ubuntu-latest
environment: secrets
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: black linter
uses: psf/black@stable
with:
options: "--check --diff --line-length 120"
- name: isort
run: |
pip install isort
isort --profile black --check-only .
- name: unit tests
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
python3 -m pip install --upgrade pip
pip install -e .
pytest
================================================
FILE: .gitignore
================================================
# database files
*.db
buster/apps/data/
deeplake_store/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Macos
*.DS_Store*
albenchmark/data/
# Ignore notebooks by default
*.ipynb
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# VSCode
.vscode/
================================================
FILE: LICENSE.md
================================================
MIT License
Copyright (c) 2023 Buster dev team
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Buster, the QA documentation chatbot!
[](https://github.com/jerpint/buster)
[](https://pypi.org/project/buster-doctalk)
[](https://github.com/psf/black)
[](https://huggingface.co/spaces/jerpint/buster)
Buster is a question-answering chatbot that can be tuned to any source of documentations.
# Demo
In order to view the full abilities of Buster, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster).
We scraped the documentation of [huggingface 🤗 Transformers](https://huggingface.co/docs/transformers/index) and instructed Buster to answer questions related to its usage.
# Quickstart
This section is meant to help you install and run a local version of Buster.
First step, install buster:
**Note**: Buster requires python>=3.10
```bash
pip install buster-doctalk
```
Then, go to the examples folder and launch the app.
We've included a small sample of Stack Overflow AI questions that you can use to test your setup and try the app:
```bash
cd buster/buster/examples
gradio gradio_app.py
```
This will launch the gradio app locally.
**NOTE**: The demo uses chatGPT to generate text and compute embeddings, make sure to set a valid openai API key:
```bash
export OPENAI_API_KEY=sk-...
```
# Generating your own embeddings
Once your local version of Buster is up and running, the next step is for you to be able to import your own data.
We will be using the `stackoverflow.csv` file in the `buster/examples/` folder for this. This is the same data that was used to generate the demo app's embeddings.
You will first ingest the documents to be ready for buster. In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentManager`:
```python
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager
# Read the csv
df = pd.read_csv("stackoverflow.csv")
# Generate the embeddings for our documents and store them in a deeplake format
dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True)
dm.add(df)
```
You can also just simply run the script:
```bash
python generate_embeddings.py --csv stackoverflow.csv
```
This will generate the embeddings and save them locally in the `deeplake_store`.
**NOTE**: You will need to set a valid openai key for computing embeddings:
```bash
export OPENAI_API_KEY=sk-...
```
You only need to run this operation one time.
In the .csv, we expect columns ["title", "url", "content", "source"] for each row of the csv:
* title: this will be the title of the url to display
* url: the link that clicking the title will redirect to
* source: where the content was originally sourced from (e.g. wikipedia, stackoverflow, etc.)
* content: plaintext of the documents to be embedded. It is your responsibility to chunk your documents appropriately. For better results, we recommend chunks of 400-600 words.
# Additional Configurations
Properly prompting models as well as playing around with various model parameters can lead to different results. We use a `BusterConfig` object to keep track of the various Buster configurations. In the `buster/examples/` folder, the config is stored inside `cfg.py`. Modify this config to update parameters, prompts, etc.
# How does Buster work?
First, we parsed the documentation into snippets. For each snippet, we obtain an embedding by using the [OpenAI API](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings).
Then, when a user asks a question, we compute its embedding, and find the snippets from the doc with the highest cosine similarity to the question.
Finally, we craft the prompt:
- The most relevant snippets from the doc.
- The engineering prompt.
- The user's question.
We send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!
### Currently available models
- For embeddings: "text-embedding-ada-002"
- For completion: We support both "gpt-3.5-turbo" and "gpt-4"
### Livestream
For more information, you can watch the livestream where we explain how Buster works in detail!
- [Livestream recording](https://youtu.be/LB5g-AhfPG8)
================================================
FILE: buster/__init__.py
================================================
================================================
FILE: buster/busterbot.py
================================================
import logging
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
from buster.completers import Completion, DocumentAnswerer, UserInputs
from buster.llm_utils import QuestionReformulator, get_openai_embedding
from buster.retriever import Retriever
from buster.validators import Validator
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@dataclass
class BusterConfig:
    """Configuration object for a chatbot.

    Each attribute is a kwargs dict handed to the corresponding component
    (validator, tokenizer, retriever, prompt/documents formatters, answerer,
    question reformulator, completer) when assembling a Buster instance.
    """

    validator_cfg: dict = field(
        default_factory=lambda: {
            "use_reranking": True,
            "validate_documents": False,
        }
    )
    tokenizer_cfg: dict = field(
        default_factory=lambda: {
            "model_name": "gpt-3.5-turbo",
        }
    )
    retriever_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3000,
            "top_k": 3,
            "thresh": 0.7,
            "embedding_fn": get_openai_embedding,
        }
    )
    prompt_formatter_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3500,
            "text_before_docs": "You are a chatbot answering questions.\n",
            "text_after_docs": "Answer the following question:\n",
            "formatter": "{text_before_docs}\n{documents}\n{text_after_docs}",
        }
    )
    # Bug fix: this field was previously wrapped in a stray tuple, i.e.
    # `documents_formatter_cfg: dict = (field(...),)`. The trailing comma made
    # the default a tuple containing a dataclasses.Field object rather than the
    # intended dict produced by the default factory.
    documents_formatter_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3500,
            "formatter": "{content}",
        }
    )
    documents_answerer_cfg: dict = field(
        default_factory=lambda: {
            "no_documents_message": "No documents are available for this question.",
        }
    )
    question_reformulator_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "system_prompt": """
        Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
        Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.""",
        }
    )
    completion_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "temperature": 0,
                "stream": True,
            },
        }
    )
class Buster:
    """Ties together retrieval, validation, and answer generation for a question."""

    def __init__(
        self,
        retriever: Retriever,
        document_answerer: DocumentAnswerer,
        validator: Validator,
        question_reformulator: Optional[QuestionReformulator] = None,
    ):
        self.retriever = retriever
        self.document_answerer = document_answerer
        self.validator = validator
        self.question_reformulator = question_reformulator

    def process_input(
        self,
        user_input: str,
        sources: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        reformulate_question: Optional[bool] = False,
    ) -> Completion:
        """
        Main function to process the input question and generate a formatted output.
        """
        logger.info(f"User Input:\n{user_input}")

        # Guarantee a trailing newline so the model answers the question
        # instead of trying to complete it.
        if not user_input.endswith("\n"):
            user_input += "\n"

        user_inputs = UserInputs(original_input=user_input)

        # The returned message is either a generic invalid question message or an error handling message
        question_relevant, irrelevant_question_message = self.validator.check_question_relevance(user_input)

        if not question_relevant:
            # Question judged irrelevant: short-circuit with the user-configured generic response.
            return Completion(
                error=False,
                user_inputs=user_inputs,
                matched_documents=pd.DataFrame(),
                answer_text=irrelevant_question_message,
                answer_relevant=False,
                question_relevant=False,
                validator=self.validator,
            )

        # Optionally rewrite the question before retrieval, when a reformulator is configured.
        if self.question_reformulator is not None and reformulate_question:
            reformulated_input, reformulation_error = self.question_reformulator.reformulate(
                user_inputs.original_input
            )
            user_inputs.reformulated_input = reformulated_input
            if reformulation_error:
                return Completion(
                    error=True,
                    user_inputs=user_inputs,
                    matched_documents=pd.DataFrame(),
                    answer_text="Something went wrong reformulating the question. Try again soon.",
                    answer_relevant=False,
                    question_relevant=False,
                    validator=self.validator,
                )

        # Retrieve supporting documents and let the answerer produce the completion.
        matched_documents = self.retriever.retrieve(user_inputs, sources=sources, top_k=top_k)
        return self.document_answerer.get_completion(
            user_inputs=user_inputs,
            matched_documents=matched_documents,
            validator=self.validator,
            question_relevant=question_relevant,
        )
================================================
FILE: buster/completers/__init__.py
================================================
from .base import Completer, Completion, DocumentAnswerer
from .chatgpt import ChatGPTCompleter
from .user_inputs import UserInputs

# Bug fix: __all__ must be a list of *strings* naming the public API.
# Listing the objects themselves breaks `from buster.completers import *`
# and confuses linters/IDEs.
__all__ = [
    "ChatGPTCompleter",
    "Completer",
    "Completion",
    "DocumentAnswerer",
    "UserInputs",
]
================================================
FILE: buster/completers/base.py
================================================
import io
import logging
import warnings
from abc import ABC, abstractmethod
from typing import Any, Iterator, Optional
import pandas as pd
from fastapi.encoders import jsonable_encoder
from buster.completers.user_inputs import UserInputs
from buster.formatters.documents import DocumentsFormatter
from buster.formatters.prompts import PromptFormatter
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class Completion:
    """
    A class to represent the completion object of a model's output for a user's question.

    Attributes:
        error (bool): A boolean indicating if an error occurred when generating the completion.
        user_inputs (UserInputs): The inputs from the user.
        matched_documents (pd.DataFrame): The documents that were matched to the user's question.
        answer_generator (Iterator): An optional iterator used to generate the model's answer.
        answer_text (str): An optional answer text.
        answer_relevant (bool): An optional boolean indicating if the answer is relevant.
        question_relevant (bool): An optional boolean indicating if the question is relevant.
        completion_kwargs (dict): Optional arguments for the completion.
        validator (Validator): An optional Validator object.

    Methods:
        __repr__: Outputs a string representation of the object.
        _validate_arguments: Validates answer_generator and answer_text arguments.
        answer_relevant: Determines if the answer is relevant or not.
        question_relevant: Retrieves the relevance of the question.
        answer_text: Retrieves the answer text.
        answer_generator: Retrieves the answer generator.
        postprocess: Postprocesses the results after generating the model's answer.
        to_json: Outputs selected attributes of the object in JSON format.
        from_dict: Creates a Completion object from a dictionary.
    """

    def __init__(
        self,
        error: bool,
        user_inputs: UserInputs,
        matched_documents: pd.DataFrame,
        answer_generator: Optional[Iterator] = None,
        answer_text: Optional[str] = None,
        answer_relevant: Optional[bool] = None,
        question_relevant: Optional[bool] = None,
        completion_kwargs: Optional[dict] = None,
        validator=None,
    ):
        self.error = error
        self.user_inputs = user_inputs
        self.matched_documents = matched_documents
        self.validator = validator
        self.completion_kwargs = completion_kwargs
        # Relevance flags are stored privately; None means "not yet determined"
        # and is resolved lazily by the answer_relevant property.
        self._answer_relevant = answer_relevant
        self._question_relevant = question_relevant
        # Exactly one of answer_generator / answer_text must be provided; this
        # call also sets self._answer_text and self._answer_generator.
        self._validate_arguments(answer_generator, answer_text)

    def __repr__(self):
        class_name = type(self).__name__
        return (
            f"{class_name}("
            f"user_inputs={self.user_inputs!r}, "
            f"error={self.error!r}, "
            f"matched_documents={self.matched_documents!r}, "
            f"answer_text={self._answer_text!r}, "
            f"answer_generator={self.answer_generator!r}, "
            f"answer_relevant={self._answer_relevant!r}, "
            f"question_relevant={self.question_relevant!r}, "
            f"completion_kwargs={self.completion_kwargs!r}, "
            "),"
        )

    def _validate_arguments(self, answer_generator: Optional[Iterator], answer_text: Optional[str]):
        """Sets answer_generator and answer_text properties depending on the provided inputs.

        Checks that one of either answer_generator or answer_text is not None.
        If answer_text is set, a generator can simply be inferred from answer_text.
        If answer_generator is set, answer_text will be set only once the generator gets called. Set to None for now.

        Raises:
            ValueError: If both or neither of answer_generator / answer_text are set.
        """
        if (answer_generator is None and answer_text is None) or (
            answer_generator is not None and answer_text is not None
        ):
            raise ValueError("Only one of 'answer_generator' and 'answer_text' must be set.")
        # If text is provided, the generator can be inferred by yielding the
        # text one character at a time.
        if answer_text is not None:
            assert isinstance(answer_text, str)
            answer_generator = (msg for msg in answer_text)
        self._answer_text = answer_text
        self._answer_generator = answer_generator

    @property
    def answer_relevant(self) -> bool:
        """Property determining the relevance of an answer (bool).

        If an error occurred, the relevance is False.
        If no documents were retrieved, the relevance is also False.
        Otherwise, the relevance is computed as defined by the validator (e.g. comparing to embeddings).
        """
        if self.error:
            self._answer_relevant = False
        elif len(self.matched_documents) == 0:
            self._answer_relevant = False
        elif self._answer_relevant is not None:
            # Already computed (or supplied at construction); reuse it.
            return self._answer_relevant
        else:
            # Check the answer relevance by looking at the embeddings.
            # NOTE: accessing self.answer_text here may consume the underlying generator.
            self._answer_relevant = self.validator.check_answer_relevance(self.answer_text)
        return self._answer_relevant

    @property
    def question_relevant(self):
        """Property determining the relevance of the question asked (bool)."""
        return self._question_relevant

    @property
    def answer_text(self):
        if self._answer_text is None:
            # generates the text if it wasn't already generated
            self._answer_text = "".join([i for i in self.answer_generator])
        return self._answer_text

    @answer_text.setter
    def answer_text(self, value: str) -> None:
        self._answer_text = value

    @property
    def answer_generator(self):
        # This getter is itself a generator function: each access returns a fresh
        # generator that re-streams self._answer_generator (which is single-use)
        # while accumulating the yielded tokens into self._answer_text.
        # keeps track of the yielded text
        self._answer_text = ""
        for token in self._answer_generator:
            self._answer_text += token
            yield token
        # Once the stream is exhausted, run reranking/relevance checks.
        self.postprocess()

    @answer_generator.setter
    def answer_generator(self, generator: Iterator) -> None:
        self._answer_generator = generator

    def postprocess(self):
        """Function executed after the answer text is generated by the answer_generator"""
        if self.validator is None:
            # TODO: This should only happen if declaring a Completion using .from_dict() method.
            # This behaviour is not ideal and we may want to remove support for .from_dict() in the future.
            logger.info("No validator was set, skipping postprocessing.")
            return
        if self.validator.use_reranking:
            # rerank docs in order of cosine similarity to the question
            self.matched_documents = self.validator.rerank_docs(
                answer=self.answer_text, matched_documents=self.matched_documents
            )
        if self.validator.validate_documents:
            self.matched_documents = self.validator.check_documents_relevance(
                answer=self.answer_text, matched_documents=self.matched_documents
            )
        # access the property so it gets set if not computed already
        self.answer_relevant

    def to_json(self, columns_to_ignore: Optional[list[str]] = None) -> Any:
        """Converts selected attributes of the object to a JSON format.

        Args:
            columns_to_ignore (list[str]): A list of column names to ignore in the resulting matched_documents dataframe.

        Returns:
            Any: The object's attributes encoded as JSON.

        Notes:
            - The 'matched_documents' attribute of type pd.DataFrame is encoded separately
              using a custom encoder.
            - The resulting JSON may exclude specified columns based on the 'columns_to_ignore' parameter.
        """

        def encode_df(df: pd.DataFrame) -> dict:
            if columns_to_ignore is not None:
                df = df.drop(columns=columns_to_ignore, errors="ignore")
            return df.to_json(orient="index")

        custom_encoder = {
            # Converts the matched_documents in the user_responses to json
            pd.DataFrame: encode_df,
        }
        to_encode = {
            "user_inputs": self.user_inputs,
            "answer_text": self.answer_text,
            "matched_documents": self.matched_documents,
            "answer_relevant": self.answer_relevant,
            "question_relevant": self.question_relevant,
            "completion_kwargs": self.completion_kwargs,
            "error": self.error,
        }
        return jsonable_encoder(to_encode, custom_encoder=custom_encoder)

    @classmethod
    def from_dict(cls, completion_dict: dict):
        """Rebuild a Completion from a dict (e.g. a previously serialized completion).

        Raises:
            ValueError: If matched_documents is neither a JSON string nor a dict.
        """
        # Map a dict of user inputs to the UserInputs class
        if isinstance(completion_dict["user_inputs"], dict):
            completion_dict["user_inputs"] = UserInputs(**completion_dict["user_inputs"])
        # Map the matched documents back to a dataframe
        if isinstance(completion_dict["matched_documents"], str):
            # avoids deprecation warning
            json_data = io.StringIO(completion_dict["matched_documents"])
            completion_dict["matched_documents"] = pd.read_json(json_data, orient="index")
        elif isinstance(completion_dict["matched_documents"], dict):
            completion_dict["matched_documents"] = pd.DataFrame(completion_dict["matched_documents"]).T
        else:
            raise ValueError(f"Unknown type for matched_documents: {type(completion_dict['matched_documents'])}")
        return cls(**completion_dict)
class Completer(ABC):
"""
Abstract base class for completers, which generate an answer to a prompt.
Methods:
complete: The method that should be implemented by any child class to provide an answer to a prompt.
"""
@abstractmethod
def complete(self, prompt: str, user_input) -> (str | Iterator, bool):
"""Returns the completed message (can be a generator), and a boolean to indicate if an error occured or not."""
...
class DocumentAnswerer:
    """
    A class that answers questions based on documents.

    It takes care of formatting the prompts and the documents, and generating the answer when relevant.

    Attributes:
        completer (Completer): Object that actually generates an answer to the prompt.
        documents_formatter (DocumentsFormatter): Object that formats the documents for the prompt.
        prompt_formatter (PromptFormatter): Object that prepares the prompt for the completer.
        no_documents_message (str): Message to display when no documents are found to match the query.
        completion_class (Completion): Class to use for the resulting completion.

    Methods:
        prepare_prompt: Prepares the prompt that will be passed to the completer.
        get_completion: Generates a completion to the user's question based on matched documents.
    """

    def __init__(
        self,
        documents_formatter: DocumentsFormatter,
        prompt_formatter: PromptFormatter,
        completer: Completer,
        completion_class: Completion = Completion,
        no_documents_message: str = "No documents were found that match your question.",
    ):
        self.completer = completer
        self.documents_formatter = documents_formatter
        self.prompt_formatter = prompt_formatter
        self.no_documents_message = no_documents_message
        # completion_class is injectable so callers can substitute a Completion subclass.
        self.completion_class = completion_class

    def prepare_prompt(self, matched_documents) -> str:
        """Prepare the prompt with prompt engineering.

        A user's question is not included here. We use the documents formatter and prompt formatter to
        compose the prompt itself.
        """
        # format the matched documents, (will truncate them if too long)
        formatted_documents, _ = self.documents_formatter.format(matched_documents)
        prompt = self.prompt_formatter.format(formatted_documents)
        return prompt

    def get_completion(
        self,
        user_inputs: UserInputs,
        matched_documents: pd.DataFrame,
        validator,
        question_relevant: bool = True,
    ) -> Completion:
        """Generate a completion to a user's question based on matched documents.

        It is safe to assume the question_relevance to be True if we made it here."""
        logger.info(f"{user_inputs=}")
        if len(matched_documents) == 0:
            warning_msg = "No documents found during retrieval."
            warnings.warn(warning_msg)
            logger.warning(warning_msg)
            # empty dataframe (keeps the original columns so downstream code sees the schema)
            matched_documents = pd.DataFrame(columns=matched_documents.columns)
            # because we are requesting a completion, we assume the question is relevant.
            # However, no documents were found, so we pass the no documents found message instead of generating the answer.
            # The completion does not get triggered, so we do not pass completion kwargs here either.
            completion = self.completion_class(
                user_inputs=user_inputs,
                answer_text=self.no_documents_message,
                error=False,
                matched_documents=matched_documents,
                question_relevant=question_relevant,
                validator=validator,
            )
            return completion
        # prepare the prompt with matched documents
        prompt = self.prepare_prompt(matched_documents)
        logger.info(f"{prompt=}")
        # NOTE(review): assumes the concrete completer exposes .completion_kwargs
        # (true for ChatGPTCompleter); the Completer ABC does not guarantee it.
        logger.info(f"querying model with parameters: {self.completer.completion_kwargs}...")
        try:
            answer_generator, error = self.completer.complete(prompt=prompt, user_input=user_inputs.current_input)
        except Exception as e:
            error = True
            # A plain string also iterates character-by-character, so it works
            # as a degenerate answer_generator for the Completion below.
            answer_generator = "Something went wrong with the request, try again soon!"
            logger.exception("Unknown error when attempting to generate response. See traceback:")
        completion = self.completion_class(
            answer_generator=answer_generator,
            error=error,
            matched_documents=matched_documents,
            user_inputs=user_inputs,
            question_relevant=question_relevant,
            validator=validator,
            completion_kwargs=self.completer.completion_kwargs,
        )
        return completion
================================================
FILE: buster/completers/chatgpt.py
================================================
import logging
import os
from typing import Iterator, Optional
import openai
from openai import OpenAI
from buster.completers import Completer
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Check if an API key exists for promptlayer, if it does, use it.
# This is best-effort: any failure is logged and the plain openai module is kept.
promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY")
if promptlayer_api_key:
    # TODO: Check if this still works with latest openAI API...
    try:
        import promptlayer

        logger.info("Enabling prompt layer...")
        promptlayer.api_key = promptlayer_api_key
        # replace openai with the promptlayer wrapper
        openai = promptlayer.openai
    except Exception as e:
        logger.exception("Something went wrong enabling promptlayer.")
class ChatGPTCompleter(Completer):
    """Completer backed by the OpenAI chat completions API."""

    def __init__(self, completion_kwargs: dict, client_kwargs: Optional[dict] = None):
        """Initialize the ChatGPTCompleter with completion and client keyword arguments.

        Args:
            completion_kwargs: A dictionary of keyword arguments to be used for completions.
            client_kwargs: An optional dictionary of keyword arguments to be used for the OpenAI client.
        """
        # use default client if none passed
        self.completion_kwargs = completion_kwargs
        if client_kwargs is None:
            client_kwargs = {}
        self.client = OpenAI(**client_kwargs)

    def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str | Iterator, bool):
        """Given a prompt and user input, returns the generated message and error flag.

        Args:
            prompt: The prompt containing the formatted documents and instructions.
            user_input: The user input to be responded to.
            completion_kwargs: An optional dictionary of keyword arguments to override the default completion kwargs.

        Returns:
            A tuple containing the completed message and a boolean indicating if an error occurred.
            On error, the message is a user-facing error string rather than a model answer.
        """
        # Uses default configuration if not overridden
        if completion_kwargs is None:
            completion_kwargs = self.completion_kwargs
        # The prompt acts as the system message; the user's question is the user message.
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_input},
        ]
        try:
            error = False
            response = self.client.chat.completions.create(messages=messages, **completion_kwargs)
        except openai.BadRequestError:
            error = True
            logger.exception("Invalid request to OpenAI API. See traceback:")
            error_message = "Something went wrong while connecting with OpenAI, try again soon!"
            return error_message, error
        except openai.RateLimitError:
            error = True
            logger.exception("RateLimit error from OpenAI. See traceback:")
            error_message = "OpenAI servers seem to be overloaded, try again later!"
            return error_message, error
        except Exception as e:
            # Catch-all so a completer failure never crashes the app; callers
            # receive the error flag and a generic message instead.
            error = True
            logger.exception("Some kind of error happened trying to generate the response. See traceback:")
            error_message = "Something went wrong with connecting with OpenAI, try again soon!"
            return error_message, error
        if completion_kwargs.get("stream") is True:
            # We are entering streaming mode, so here we're just wrapping the streamed
            # openai response to be easier to handle later
            def answer_generator():
                for chunk in response:
                    token = chunk.choices[0].delta.content
                    # Always stream a string, openAI returns None on last token
                    token = "" if token is None else token
                    yield token

            return answer_generator(), error
        else:
            # Non-streaming: the full answer text is available immediately.
            full_response: str = response.choices[0].message.content
            return full_response, error
================================================
FILE: buster/completers/user_inputs.py
================================================
from dataclasses import dataclass
from typing import Optional
@dataclass
class UserInputs:
    """Holds a user's question in both its raw and rewritten forms.

    Attributes:
        original_input: The question exactly as the user submitted it.
        reformulated_input: A rewritten version of the question, when one exists.
    """

    original_input: str
    reformulated_input: Optional[str] = None

    @property
    def current_input(self):
        """The input to act on.

        Prefers the reformulated input whenever one has been set (even an
        empty string); otherwise falls back to the original input.
        """
        if self.reformulated_input is None:
            return self.original_input
        return self.reformulated_input
================================================
FILE: buster/documents_manager/__init__.py
================================================
from .base import DocumentsManager
from .deeplake import DeepLakeDocumentsManager
from .service import DocumentsService

# Bug fix: __all__ must list the public names as *strings*; listing the class
# objects themselves breaks `from buster.documents_manager import *` semantics
# and confuses linters/IDEs.
__all__ = ["DocumentsManager", "DocumentsService", "DeepLakeDocumentsManager"]
================================================
FILE: buster/documents_manager/base.py
================================================
import logging
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Callable, Optional
import numpy as np
import pandas as pd
from tqdm import tqdm
from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding
tqdm.pandas()
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@dataclass
class DocumentsManager(ABC):
    # NOTE(review): no dataclass fields are declared and __init__ is written by
    # hand, so @dataclass only contributes the generated __repr__/__eq__; it is
    # kept for backward compatibility.
    def __init__(self, required_columns: Optional[list[str]] = None):
        """
        Constructor for DocumentsManager class.

        Args:
            required_columns (Optional[list[str]]): A list of column names that are required for the dataframe to contain.
                If None, no columns are enforced.
        """
        self.required_columns = required_columns

    def _check_required_columns(self, df: pd.DataFrame):
        """Raise a ValueError unless df contains every column in self.required_columns."""
        if not all(col in df.columns for col in self.required_columns):
            raise ValueError(f"DataFrame is missing one or more of {self.required_columns=}")

    def _checkpoint_csv(self, df, csv_filename: str, csv_overwrite: bool = True):
        """
        Saves DataFrame with embeddings to a CSV checkpoint.

        Args:
            df (pd.DataFrame): The DataFrame with embeddings.
            csv_filename (str): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True.
        """
        import os

        if csv_overwrite:
            df.to_csv(csv_filename)
            logger.info(f"Saved DataFrame with embeddings to {csv_filename}")
        else:
            if os.path.exists(csv_filename):
                # append to existing file
                append_df = pd.read_csv(csv_filename)
                append_df = pd.concat([append_df, df])
            else:
                # will create the new file
                append_df = df.copy()
            # NOTE(review): the index is re-written on every save, so repeated
            # append cycles accumulate unnamed index columns — TODO confirm intended.
            append_df.to_csv(csv_filename)
            logger.info(f"Appending DataFrame embeddings to {csv_filename}")

    def add(
        self,
        df: pd.DataFrame,
        num_workers: int = 16,
        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,
        sparse_embedding_fn: Optional[Callable[[str], dict[str, list[float]]]] = None,
        csv_filename: Optional[str] = None,
        csv_overwrite: bool = True,
        **add_kwargs,
    ):
        """Write documents from a DataFrame into the DocumentManager store.

        This method adds documents from the provided DataFrame to the database. It performs the following steps:
        1. Checks if the required columns are present in the DataFrame.
        2. Computes embeddings for the 'content' column if they are not already present.
        3. Optionally saves the DataFrame with computed embeddings to a CSV checkpoint.
        4. Calls the '_add_documents' method to add documents with embeddings to the DocumentsManager.

        Args:
            df (pd.DataFrame): The DataFrame containing the documents to be added.
            num_workers (int, optional): The number of parallel workers to use for computing embeddings. Default is 16.
            embedding_fn (callable, optional): A function that computes embeddings for a given input string.
                Default is 'get_openai_embedding'.
            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
                Default is None. Only use if you want sparse embeddings.
            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True.
            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.
        """
        if self.required_columns is not None:
            self._check_required_columns(df)

        # Check if embeddings are present, computes them if not.
        # NOTE: this adds columns to the caller's dataframe in place.
        if "embedding" not in df.columns:
            df["embedding"] = compute_embeddings_parallelized(df, embedding_fn=embedding_fn, num_workers=num_workers)

        if "sparse_embedding" not in df.columns and sparse_embedding_fn is not None:
            df["sparse_embedding"] = sparse_embedding_fn(df.content.to_list())

        if csv_filename is not None:
            self._checkpoint_csv(df, csv_filename=csv_filename, csv_overwrite=csv_overwrite)

        self._add_documents(df, **add_kwargs)

    def batch_add(
        self,
        df: pd.DataFrame,
        batch_size: int = 3000,
        min_time_interval: int = 60,
        num_workers: int = 16,
        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,
        sparse_embedding_fn: Optional[Callable[[str], dict[str, list[float]]]] = None,
        csv_filename: Optional[str] = None,
        csv_overwrite: bool = False,
        **add_kwargs,
    ):
        """
        Adds DataFrame data to a DataManager instance in batches.

        This function takes a DataFrame and adds its data to a DataManager instance in batches.
        It ensures that a minimum time interval is maintained between successive batches
        to prevent timeouts or excessive load. This is useful for APIs like openAI with rate limits.

        Args:
            df (pd.DataFrame): The input DataFrame containing data to be added.
            batch_size (int, optional): The size of each batch. Defaults to 3000.
            min_time_interval (int, optional): The minimum time interval (in seconds) between batches.
                Defaults to 60.
            num_workers (int, optional): The number of parallel workers to use when adding data.
                Defaults to 16.
            embedding_fn (callable, optional): A function that computes embeddings for a given input string.
                Default is 'get_openai_embedding'.
            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
                Default is None. Only use if you want sparse embeddings.
            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to False.
                When using batches, set to False to keep all embeddings in the same file. You may want to manually remove the file if experimenting.
            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.
        """
        # Ceiling division; the previous `len(df) // batch_size + 1` produced a
        # trailing empty batch whenever len(df) was an exact multiple of
        # batch_size. Keep at least one batch so an empty df still logs a pass.
        total_batches = max(1, (len(df) + batch_size - 1) // batch_size)

        logger.info(f"Adding {len(df)} documents with {batch_size=} for {total_batches=}")

        for batch_idx in range(total_batches):
            logger.info(f"Processing batch {batch_idx + 1}/{total_batches}")
            start_time = time.time()

            # Calculate batch indices and extract batch DataFrame
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(df))
            batch_df = df.iloc[start_idx:end_idx]

            # Add the batch data to using specified parameters
            self.add(
                batch_df,
                num_workers=num_workers,
                csv_filename=csv_filename,
                csv_overwrite=csv_overwrite,
                embedding_fn=embedding_fn,
                sparse_embedding_fn=sparse_embedding_fn,
                **add_kwargs,
            )

            elapsed_time = time.time() - start_time

            # Sleep to ensure the minimum time interval is maintained
            # Only sleep if it's not the last iteration
            if batch_idx < total_batches - 1:
                sleep_time = max(0, min_time_interval - elapsed_time)
                if sleep_time > 0:
                    logger.info(f"Sleeping for {round(sleep_time)} seconds...")
                    time.sleep(sleep_time)

        logger.info("All batches processed.")

    @abstractmethod
    def _add_documents(self, df: pd.DataFrame, **add_kwargs):
        """Abstract method to be implemented by each inherited member.

        This method should handle the actual process of adding documents to the database.
        """
        ...
================================================
FILE: buster/documents_manager/deeplake.py
================================================
import logging
from typing import Optional
import pandas as pd
from buster.utils import zip_contents
from .base import DocumentsManager
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class DeepLakeDocumentsManager(DocumentsManager):
    def __init__(
        self,
        vector_store_path: str = "deeplake_store",
        required_columns: Optional[list[str]] = None,
        **vector_store_kwargs,
    ):
        """Initialize a DeepLakeDocumentsManager object.

        Args:
            vector_store_path: The path to the vector store.
            required_columns: A list of columns that are required in the dataframe.
            **vector_store_kwargs: Additional keyword arguments to pass to the VectorStore initializer.
        """
        # Lazy import: deeplake is only needed when this backend is used.
        from deeplake.core.vectorstore import VectorStore

        # Delegate required_columns handling to the base class instead of
        # setting the attribute directly, so base-class behavior stays in sync.
        super().__init__(required_columns=required_columns)

        self.vector_store_path = vector_store_path
        self.vector_store = VectorStore(
            path=self.vector_store_path,
            **vector_store_kwargs,
        )

    def __len__(self):
        """Get the number of documents in the vector store.

        Returns:
            The number of documents in the vector store.
        """
        return len(self.vector_store)

    @classmethod
    def _extract_metadata(cls, df: pd.DataFrame) -> dict:
        """Extract metadata from the dataframe in DeepLake dict format.

        Args:
            df: The dataframe from which to extract metadata.

        Returns:
            A list with one metadata dict per row (DeepLake format).
        """
        # Ignore the content and embedding column for metadata
        df = df.drop(columns=["content", "embedding"], errors="ignore")

        columns = list(df.columns)

        metadata = df.apply(
            lambda x: {col: x[col] for col in columns},
            axis=1,
        ).to_list()
        return metadata

    def _add_documents(self, df: pd.DataFrame, **add_kwargs):
        """Write all documents from the dataframe into the vector store as a new version.

        Each entry in the dataframe is expected to have at least the following columns:
        ["content", "embedding"]

        Embeddings will have been precomputed in the self.add() method, which calls this one.

        Args:
            df: The dataframe containing the documents to add.
            **add_kwargs: Additional keyword arguments to pass to the add method of the vector store.
        """
        # Embedding should already be computed in the .add method
        assert "embedding" in df.columns, "expected column=embedding in the dataframe"

        # extract the chunked text + metadata
        metadata = self._extract_metadata(df)

        chunked_text = df.content.to_list()
        embeddings = df.embedding.to_list()
        self.vector_store.add(
            text=chunked_text,
            embedding=embeddings,
            metadata=metadata,
            **add_kwargs,
        )

    def to_zip(self, output_path: str = "."):
        """Zip the contents of the vector store path folder to a .zip file in the output path.

        Args:
            output_path: The path where the zip file should be created.

        Returns:
            The path to the created zip file.
        """
        vector_store_path = self.vector_store_path
        logger.info(f"Compressing {vector_store_path}...")

        zip_file_path = zip_contents(input_path=vector_store_path, output_path=output_path)

        logger.info(f"Compressed {vector_store_path} to {zip_file_path}.")
        return zip_file_path
================================================
FILE: buster/documents_manager/service.py
================================================
import logging
import pandas as pd
import pinecone
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from buster.documents_manager.base import DocumentsManager
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class DocumentsService(DocumentsManager):
    """Manager to use in production. Mixed Pinecone and MongoDB backend.

    Pinecone stores the dense (and optionally sparse) vectors; MongoDB stores
    the document bodies/metadata and the `sources` registry.
    """

    def __init__(
        self,
        pinecone_api_key: str,
        pinecone_index: str,
        pinecone_namespace: str,
        mongo_uri: str,
        mongo_db_name: str,
        **kwargs,
    ):
        """Initialize the DocumentsService.

        Args:
            pinecone_api_key: The Pinecone API key.
            pinecone_index: The Pinecone index name.
            pinecone_namespace: The Pinecone namespace.
            mongo_uri: The MongoDB URI.
            mongo_db_name: The MongoDB database name.
            **kwargs: Additional keyword arguments to pass to the parent class.
        """
        super().__init__(**kwargs)

        # Pinecone side: all vectors live under self.namespace of this index.
        pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index = pc.Index(pinecone_index)
        self.namespace = pinecone_namespace

        # MongoDB side: document contents and source registry.
        self.mongo_db_name = mongo_db_name
        self.client = MongoClient(mongo_uri, server_api=ServerApi("1"))
        self.db = self.client[mongo_db_name]

    def __repr__(self):
        """Return a string representation of the DocumentsService."""
        return "DocumentsService"

    def get_source_id(self, source: str) -> str:
        """Get the id of a source.

        Raises a TypeError (subscripting None) if the source does not exist.

        Args:
            source: The name of the source.

        Returns:
            The id of the source, as a string.
        """
        return str(self.db.sources.find_one({"name": source})["_id"])

    def _add_documents(self, df: pd.DataFrame):
        """Write all documents from the dataframe into the db as a new version.

        For each unique source, documents (minus their embeddings) are inserted
        into MongoDB, and the MongoDB-assigned id links each Pinecone vector
        back to its document.

        Args:
            df: The dataframe containing the documents.
        """
        # Sparse vectors are uploaded only if the column is present.
        use_sparse_vector = "sparse_embedding" in df.columns
        if use_sparse_vector:
            logger.info("Uploading sparse embeddings too.")

        for source in df.source.unique():
            # Register the source if it's new, then resolve its id.
            source_exists = self.db.sources.find_one({"name": source})
            if source_exists is None:
                self.db.sources.insert_one({"name": source})

            source_id = self.get_source_id(source)

            df_source = df[df.source == source]
            to_upsert = []
            for row in df_source.to_dict(orient="records"):
                # assumes row["embedding"] is a numpy array — .tolist() converts it; TODO confirm
                embedding = row["embedding"].tolist()
                if use_sparse_vector:
                    sparse_embedding = row["sparse_embedding"]

                # Store the document without its embeddings in MongoDB.
                document = row.copy()
                document.pop("embedding")
                if use_sparse_vector:
                    document.pop("sparse_embedding")
                document["source_id"] = source_id

                document_id = str(self.db.documents.insert_one(document).inserted_id)
                vector = {"id": document_id, "values": embedding, "metadata": {"source": source}}
                if use_sparse_vector:
                    vector["sparse_values"] = sparse_embedding

                to_upsert.append(vector)

            # Current (February 2024) Pinecone upload rules:
            # - Max 100 vectors per batch
            MAX_PINECONE_BATCH_SIZE = 100
            for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE):
                self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace)

    def update_source(self, source: str, display_name: str = None, note: str = None):
        """Update the display name and/or note of a source. Also create the source if it does not exist.

        Note: passing None for display_name or note overwrites existing values with None.

        Args:
            source: The name of the source.
            display_name: The new display name of the source.
            note: The new note of the source.
        """
        self.db.sources.update_one(
            {"name": source}, {"$set": {"display_name": display_name, "note": note}}, upsert=True
        )

    def delete_source(self, source: str) -> tuple[int, int]:
        """Delete a source and all its documents. Return if the source was deleted and the number of deleted documents.

        Args:
            source: The name of the source.

        Returns:
            A tuple containing the number of deleted sources and the number of deleted documents.
        """
        source_id = self.get_source_id(source)

        # MongoDB: remove the source entry and every document pointing to it.
        source_deleted = self.db.sources.delete_one({"name": source}).deleted_count
        documents_deleted = self.db.documents.delete_many({"source_id": source_id}).deleted_count

        # Pinecone: remove vectors tagged with this source's metadata.
        self.index.delete(filter={"source": source}, namespace=self.namespace)

        return source_deleted, documents_deleted

    def drop_db(self):
        """Drop the currently accessible database.

        Prompts for interactive confirmation before doing anything.

        For Pinecone, this means deleting everything in the namespace.
        For Mongo DB, this means dropping the database. However this needs to be done manually through the GUI.
        """
        confirmation = input("Dropping the database is irreversible. Are you sure you want to proceed? (y/N): ")
        if confirmation.strip().lower() == "y":
            self.index.delete(namespace=self.namespace, delete_all=True)
            logging.info(f"Deleted all documents from Pinecone namespace: {self.namespace=}")

            logging.info(f"The MongoDB database needs to be dropped manually: {self.mongo_db_name=}")
        else:
            logging.info("Operation cancelled.")
================================================
FILE: buster/examples/cfg.py
================================================
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding_constructor
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import Validator
# kwargs to pass to OpenAI client: bound each request's latency and retries.
client_kwargs = {
    "timeout": 20,
    "max_retries": 3,
}

# Embedding function shared by the retriever and the answer validator below.
embedding_fn = get_openai_embedding_constructor(client_kwargs=client_kwargs)
# Example end-to-end Buster configuration. Each *_cfg dict is passed as kwargs
# to the matching component in setup_buster() below.
buster_cfg = BusterConfig(
    # Validation of questions, answers and (optionally) retrieved documents.
    validator_cfg={
        "question_validator_cfg": {
            "invalid_question_response": "This question does not seem relevant to my current knowledge.",
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
            "check_question_prompt": """You are a chatbot answering questions on artificial intelligence.
Your job is to determine wether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
For example:
Q: What is backpropagation?
true
Q: What is the meaning of life?
false
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        },
        # An answer is flagged "unknown" when its embedding is too similar to
        # one of these templates (cosine similarity above unknown_threshold).
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.85,
            "embedding_fn": embedding_fn,
        },
        "documents_validator_cfg": {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
        },
        "use_reranking": True,
        # Per-document validation disabled by default (extra LLM calls).
        "validate_documents": False,
    },
    # Retrieval: top-k nearest documents above the similarity threshold.
    retriever_cfg={
        "path": "deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "embedding_fn": embedding_fn,
    },
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    # Completion used for the final (streamed) answer.
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
        "client_kwargs": client_kwargs,
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    # Token budgets for the documents section and the full prompt.
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "title", "source"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation: "
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
            "For example:\n"
            "What is the meaning of life for an qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
            "Now answer the following question:\n"
        ),
    },
)
def setup_buster(buster_cfg: BusterConfig):
    """Assemble and return a fully-wired Buster instance from a BusterConfig."""
    # Shared tokenizer used by both the documents and the prompt formatters.
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)

    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    validator: Validator = Validator(**buster_cfg.validator_cfg)

    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
        **buster_cfg.documents_answerer_cfg,
    )

    return Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
================================================
FILE: buster/examples/generate_embeddings.py
================================================
import click
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager
# Columns every input CSV row must provide before embedding.
REQUIRED_COLUMNS = ["url", "title", "content", "source"]


@click.command(
    help="This script processes a CSV file and generates embeddings. The CSV argument specifies the path to the input CSV file."
)
@click.argument("csv", metavar="")
def main(csv):
    """Embed the documents from the CSV and package the resulting store as a zip."""
    # Read the csv
    df = pd.read_csv(csv)

    # initialize our vector store from scratch (overwrite discards any existing store)
    dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS)

    # Generate the embeddings for our documents and store them to the deeplake store
    dm.add(df, csv_filename="embeddings.csv")

    # Save it to a zip file
    dm.to_zip()


if __name__ == "__main__":
    main()
================================================
FILE: buster/examples/gradio_app.py
================================================
import os
from typing import Optional, Tuple
import cfg
import gradio as gr
import pandas as pd
from cfg import setup_buster
from buster.completers import Completion
from buster.utils import extract_zip
# Check if an openai key is set as an env. variable
if os.getenv("OPENAI_API_KEY") is None:
    print("Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'.")

# Typehint for chatbot history: a list of [user_message, bot_message] pairs.
# NOTE(review): `list[list[X, Y]]` is not valid typing syntax but is accepted
# at runtime; it is only used as documentation here.
ChatHistory = list[list[Optional[str], Optional[str]]]

# Unpack the pre-built vector store, then wire up Buster from the example config.
extract_zip("deeplake_store.zip", "deeplake_store")

buster = setup_buster(cfg.buster_cfg)
def add_user_question(user_question: str, chat_history: Optional[ChatHistory] = None) -> ChatHistory:
    """Append a new [question, no-answer-yet] turn to the chat history.

    Starts a fresh history when none is provided; otherwise mutates and
    returns the given history.
    """
    history = [] if chat_history is None else chat_history
    history.append([user_question, None])
    return history
def format_sources(matched_documents: pd.DataFrame) -> str:
    """Render matched documents as a Markdown list of linked sources.

    Scales similarity scores to percentages (mutating the input dataframe),
    keeps only the highest-ranked entry per title, and returns an empty
    string when no documents matched.
    """
    if len(matched_documents) == 0:
        return ""

    # Express similarity as a percentage (in-place on the caller's dataframe).
    matched_documents["similarity_to_answer"] = matched_documents["similarity_to_answer"] * 100

    # drop duplicate pages (by title), keep highest ranking ones
    matched_documents = matched_documents.sort_values("similarity_to_answer", ascending=False).drop_duplicates(
        "title", keep="first"
    )

    documents_answer_template: str = (
        "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
    )
    document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"

    formatted_docs = []
    for _, document in matched_documents.iterrows():
        formatted_docs.append(document_template.format(document=document))
    documents = "\n".join(formatted_docs)

    footnote: str = "I'm a bot 🤖 and not always perfect."

    return documents_answer_template.format(documents=documents, footnote=footnote)
def add_sources(history, completion):
    """Append a formatted source listing to the history when the answer was relevant."""
    if not completion.answer_relevant:
        # Irrelevant answer: leave the history untouched.
        return history
    history.append([None, format_sources(completion.matched_documents)])
    return history
def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
    """Stream Buster's answer to the latest user question into the chat history."""
    # The question is the user's most recent turn.
    question = chat_history[-1][0]

    # Do retrieval + augmented generation with buster
    completion = buster.process_input(question)

    # Accumulate streamed tokens into the bot side of the last turn,
    # yielding after each token so the UI updates incrementally.
    chat_history[-1][1] = ""
    for token in completion.answer_generator:
        chat_history[-1][1] += token
        yield chat_history, completion
demo = gr.Blocks()

with demo:
    with gr.Row():
        # Fix: the original markdown string was split across two source lines
        # without continuation, which is a syntax error; restored as one literal.
        gr.Markdown("Buster 🤖: A Question-Answering Bot for your documentation")

    chatbot = gr.Chatbot()

    with gr.Row():
        question_textbox = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        send_button = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "How can I perform backpropagation?",
            "How do I deal with noisy data?",
            "How do I deal with noisy data in 2 words?",
        ],
        inputs=question_textbox,
    )

    gr.Markdown("This application uses GPT to search the docs for relevant info and answer questions.")

    gr.HTML("️ Created with ❤️ by @jerpint and @hadrienbertrand")

    response = gr.State()

    # Event chain: append the user question, stream the answer, then add sources.
    # fmt: off
    gr.on(
        triggers=[send_button.click, question_textbox.submit],
        fn=add_user_question,
        inputs=[question_textbox],
        outputs=[chatbot]
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )
    # fmt: on

demo.queue()
demo.launch(debug=True, share=False)
================================================
FILE: buster/examples/stackoverflow.csv
================================================
,source,title,content,url
0,stackoverflow,stackoverflow question #1,"""Backprop"" is the same as ""backpropagation"": it's just a shorter way to say it. It is sometimes abbreviated as ""BP"".
",https://ai.stackexchange.com/questions/1
1,stackoverflow,stackoverflow question #2,"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (""true"" vs. artificial).
The AI FAQ on ANN gives a good overview. Excerpt:
Noise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set.
In some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation.
",https://ai.stackexchange.com/questions/2
2,stackoverflow,stackoverflow question #4,"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches.
There exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016.
",https://ai.stackexchange.com/questions/4
3,stackoverflow,stackoverflow question #6,"It rather depends on how one defines several of the terms used. For example:
Whether the term ""expected"" is interpreted in a formal (i.e.
statistical) sense.
Whether it's assumed that humans have any kind of utilitarian
""performance measure"".
The motivation for this description of ""agent"" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition.
However, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans.
",https://ai.stackexchange.com/questions/6
4,stackoverflow,stackoverflow question #7,"
To put it simply in layman terms, what are the possible threats from AI?
Currently, there are no threat.
The threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity). However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are.
This is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like.
If we know that AI is so dangerous why are we still promoting it? Why is it not banned?
As I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us.
",https://ai.stackexchange.com/questions/7
5,stackoverflow,stackoverflow question #10,"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). It's useful for making calculated decisions when all information needed isn't necessarily available.
Wikipedia has a fantastic page for this.
",https://ai.stackexchange.com/questions/10
6,stackoverflow,stackoverflow question #15,"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize.
The intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this.
",https://ai.stackexchange.com/questions/15
7,stackoverflow,stackoverflow question #17,"The concept of ""the singularity"" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator.
We would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet).
The risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans.
",https://ai.stackexchange.com/questions/17
8,stackoverflow,stackoverflow question #26,"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time.
I strongly recommend the film Plug & Pray to have an idea of what people are researching in this area.
You may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?)
On the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all?
",https://ai.stackexchange.com/questions/26
9,stackoverflow,stackoverflow question #28,"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say ""yes, genetic algorithms are part of AI"". If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms).
One area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks. I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard.
",https://ai.stackexchange.com/questions/28
================================================
FILE: buster/formatters/documents.py
================================================
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
import pandas as pd
from buster.tokenizers import Tokenizer
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class DocumentsFormatter(ABC):
    """Interface for turning retrieved documents into a prompt-ready string.

    Concrete subclasses must implement `format`, which serializes a DataFrame
    of matched documents and may drop or truncate rows to respect token limits.
    """

    @abstractmethod
    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """Serialize `matched_documents` into a single string.

        Args:
            matched_documents: DataFrame of documents matched by the retriever.

        Returns:
            A tuple of (formatted documents string, the possibly truncated
            matched documents DataFrame).
        """
        ...
@dataclass
class DocumentsFormatterHTML(DocumentsFormatter):
    """Formats matched documents in an HTML-like structure, e.g.
    <DOCUMENTS><DOCUMENT>...</DOCUMENT></DOCUMENTS>.

    Attributes:
        tokenizer: Tokenizer used to measure and truncate document content.
        max_tokens: Token budget for the combined formatted documents.
        formatter: Per-document format string applied to each DataFrame row.
        inner_tag: Tag wrapping each individual document.
        outer_tag: Tag wrapping the whole collection.
    """

    tokenizer: Tokenizer
    max_tokens: int
    formatter: str = "{content}"
    inner_tag: str = "DOCUMENT"
    outer_tag: str = "DOCUMENTS"

    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """Render the documents, truncating to stay within `max_tokens`.

        Documents are appended in order; the first document that would exceed
        the budget is truncated to the remaining tokens and all later documents
        are dropped.

        Args:
            matched_documents: DataFrame containing the documents to format.

        Returns:
            A tuple of (formatted string, the possibly truncated DataFrame).
        """
        documents_str = ""
        total_tokens = 0
        max_tokens = self.max_tokens

        num_total_docs = len(matched_documents)
        num_preserved_docs = 0
        # TODO: uniformize this logic with the DocumentsFormatterJSON
        for _, row in matched_documents.iterrows():
            doc = self.formatter.format_map(row.to_dict())
            num_preserved_docs += 1
            token_count, encoded = self.tokenizer.num_tokens(doc, return_encoded=True)
            if total_tokens + token_count <= max_tokens:
                # Fits entirely within the remaining budget.
                # BUGFIX: closing tags were previously emitted as "<\TAG>"
                # (a literal backslash) instead of the valid "</TAG>".
                documents_str += f"<{self.inner_tag}>{doc}</{self.inner_tag}>"
                total_tokens += token_count
            else:
                # Budget exceeded: keep only the tokens that still fit, then stop.
                logger.warning("truncating document to fit...")
                remaining_tokens = max_tokens - total_tokens
                truncated_doc = self.tokenizer.decode(encoded[:remaining_tokens])
                documents_str += f"<{self.inner_tag}>{truncated_doc}</{self.inner_tag}>"
                logger.warning(f"Documents after truncation: {documents_str}")
                break

        if num_preserved_docs < num_total_docs:
            logger.warning(
                f"{num_preserved_docs}/{num_total_docs} documents were preserved from the matched documents due to truncation."
            )
            # Keep only the documents that made it in (including the truncated one).
            matched_documents = matched_documents.iloc[:num_preserved_docs]

        documents_str = f"<{self.outer_tag}>{documents_str}</{self.outer_tag}>"

        return documents_str, matched_documents
@dataclass
class DocumentsFormatterJSON(DocumentsFormatter):
    """Formats matched documents as a JSON array of records.

    Attributes:
        tokenizer: Tokenizer used to measure the serialized output.
        max_tokens: Token budget for the serialized documents.
        columns: DataFrame columns to include in each JSON record.
    """

    tokenizer: Tokenizer
    max_tokens: int
    columns: list[str]

    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """Serialize the documents to JSON, dropping rows until they fit.

        Unlike the HTML formatter, documents are never partially truncated:
        whole rows are removed from the end until the serialization fits.

        Args:
            matched_documents: DataFrame containing the documents to format.

        Returns:
            A tuple of (JSON string, the possibly truncated DataFrame).

        Raises:
            ValueError: If the limit cannot be met even with no documents left.
        """
        max_tokens = self.max_tokens
        num_initial_docs = len(matched_documents)

        documents_str = matched_documents[self.columns].to_json(orient="records")
        token_count = self.tokenizer.num_tokens(documents_str)

        while token_count > max_tokens:
            # Truncated too much, no documents left, raise an error
            if len(matched_documents) == 0:
                raise ValueError(
                    f"Could not truncate documents to fit {max_tokens=}. Consider increasing max_tokens or decreasing chunk lengths."
                )

            # Too many tokens, drop a document and try again.
            matched_documents = matched_documents.iloc[:-1]
            documents_str = matched_documents[self.columns].to_json(orient="records")
            token_count = self.tokenizer.num_tokens(documents_str)

        # Only warn when documents were actually dropped.
        if len(matched_documents) < num_initial_docs:
            logger.warning(
                f"Truncating documents to fit. Remaining documents after truncation: {len(matched_documents)}"
            )

        return documents_str, matched_documents
================================================
FILE: buster/formatters/prompts.py
================================================
import logging
from dataclasses import dataclass
import pandas as pd
from buster.tokenizers import Tokenizer
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@dataclass
class PromptFormatter:
    """Assembles the system prompt from surrounding text and formatted documents.

    Attributes:
        tokenizer: Tokenizer used to enforce the token budget.
        max_tokens: Maximum allowed tokens for the assembled prompt.
        text_before_docs: Text placed before the documents.
        text_after_docs: Text placed after the documents.
        formatter: Template joining the three pieces.
    """

    tokenizer: Tokenizer
    max_tokens: int
    text_before_docs: str
    text_after_docs: str
    formatter: str = "{text_before_docs}\n{documents}\n{text_after_docs}"

    def format(self, documents: str) -> str:
        """Build the system prompt around the pre-formatted documents.

        Args:
            documents: The already formatted documents to embed in the prompt.

        Returns:
            The assembled system prompt.

        Raises:
            ValueError: If the prompt exceeds `max_tokens` tokens.
        """
        prompt = self.formatter.format(
            text_before_docs=self.text_before_docs,
            documents=documents,
            text_after_docs=self.text_after_docs,
        )
        if self.tokenizer.num_tokens(prompt) > self.max_tokens:
            raise ValueError(f"System prompt tokens > {self.max_tokens=}")
        return prompt
def prompt_formatter_factory(tokenizer: Tokenizer, prompt_cfg) -> PromptFormatter:
    """Build a PromptFormatter from a configuration mapping.

    Args:
        tokenizer: Tokenizer passed through to the PromptFormatter.
        prompt_cfg: Mapping with keys "max_tokens", "text_before_documents"
            and "text_before_prompt".

    Returns:
        The configured PromptFormatter instance.
    """
    # NOTE(review): the cfg key "text_before_prompt" feeds text_after_docs
    # (text inserted after the documents, before the user prompt) — confirm
    # against the config schema.
    kwargs = {
        "tokenizer": tokenizer,
        "max_tokens": prompt_cfg["max_tokens"],
        "text_before_docs": prompt_cfg["text_before_documents"],
        "text_after_docs": prompt_cfg["text_before_prompt"],
    }
    return PromptFormatter(**kwargs)
================================================
FILE: buster/llm_utils/__init__.py
================================================
from buster.llm_utils.embeddings import (
BM25,
compute_embeddings_parallelized,
cosine_similarity,
get_openai_embedding,
get_openai_embedding_constructor,
)
from buster.llm_utils.question_reformulator import QuestionReformulator
__all__ = [
QuestionReformulator,
cosine_similarity,
get_openai_embedding,
compute_embeddings_parallelized,
get_openai_embedding_constructor,
BM25,
]
================================================
FILE: buster/llm_utils/embeddings.py
================================================
import logging
from functools import lru_cache
from typing import Optional
import numpy as np
import pandas as pd
from openai import OpenAI
from pinecone_text.sparse import BM25Encoder
from tqdm.contrib.concurrent import thread_map
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def get_openai_embedding_constructor(client_kwargs: Optional[dict] = None, model: str = "text-embedding-ada-002"):
    """Build a cached embedding function bound to an OpenAI client.

    Args:
        client_kwargs: Keyword arguments forwarded to the OpenAI client constructor.
        model: Default embedding model used by the returned function.

    Returns:
        A function mapping a string to its embedding as a float32 numpy array,
        or None when the API call fails.
    """
    client = OpenAI(**(client_kwargs or {}))

    @lru_cache
    def embedding_fn(text: str, model: str = model) -> np.array:
        try:
            # Newlines are known to degrade embedding quality per OpenAI guidance.
            text = text.replace("\n", " ")
            response = client.embeddings.create(
                input=text,
                model=model,
            )
            return np.array(response.data[0].embedding, dtype="float32")
        except Exception as e:
            # This rarely happens with the API, but in the off chance it does,
            # returning None instead of raising lets callers keep their progress.
            logger.exception(e)
            logger.warning(f"Embedding failed to compute for {text=}")
            return None

    return embedding_fn
# Default embedding function: a module-level instance built with default client
# settings. NOTE: constructing it here creates an OpenAI client at import time
# (credentials are read from the environment by the OpenAI client).
get_openai_embedding = get_openai_embedding_constructor()
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors `a` and `b`."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product
def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
    """Compute embeddings for the 'content' column of a DataFrame in parallel.

    Each entry of `df.content` (expected to contain text) is passed to
    `embedding_fn`; calls are distributed over a pool of `num_workers` threads.

    Args:
        df: DataFrame whose 'content' column holds the text to embed.
        embedding_fn: Function mapping a string to its embedding.
        num_workers: Number of worker threads to use.

    Returns:
        The computed embeddings, one per row of `df`, in row order.
    """
    contents = df.content.to_list()
    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
    embeddings = thread_map(embedding_fn, contents, max_workers=num_workers)
    logger.info(f"Finished computing embeddings")
    return embeddings
class BM25:
    """Thin wrapper around pinecone's BM25Encoder for sparse embeddings."""

    def __init__(self, path_to_params: str = None) -> None:
        # Optionally restore previously fitted BM25 parameters from disk.
        self.encoder = BM25Encoder()
        if path_to_params:
            self.encoder.load(path_to_params)

    def fit(self, df: pd.DataFrame):
        """Fit the encoder on the 'content' column of `df`."""
        corpus = df.content.to_list()
        self.encoder.fit(corpus)

    def dump_params(self, path: str):
        """Persist the fitted encoder parameters to `path`."""
        self.encoder.dump(path)

    def get_sparse_embedding_fn(self):
        """Return a function that encodes a query string into a sparse embedding."""

        def sparse_embedding_fn(query: str):
            return self.encoder.encode_queries(query)

        return sparse_embedding_fn
================================================
FILE: buster/llm_utils/question_reformulator.py
================================================
import logging
from typing import Optional
from buster.completers import ChatGPTCompleter
class QuestionReformulator:
    """Rewrites a user's question to be more effective for semantic retrieval."""

    def __init__(
        self,
        system_prompt: Optional[str] = None,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
    ):
        """Set up the completer, completion kwargs and system prompt.

        Args:
            system_prompt: Instructions given to the model; a retrieval-oriented
                default is used when None.
            completion_kwargs: Completion parameters; defaults to gpt-3.5-turbo,
                non-streaming, temperature 0 when None.
            client_kwargs: Forwarded to the ChatGPTCompleter's client.
        """
        # NOTE(review): the completer is constructed with the raw
        # completion_kwargs (possibly None) BEFORE the defaults below are
        # filled in; the defaulted kwargs are passed explicitly at completion
        # time in reformulate(), so this appears benign — confirm.
        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)

        if completion_kwargs is None:
            # Default kwargs
            completion_kwargs = {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            }
        self.completion_kwargs = completion_kwargs

        if system_prompt is None:
            # Default prompt
            system_prompt = """
            Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
            Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval."""
        self.system_prompt = system_prompt

    def reformulate(self, user_input: str) -> tuple:
        """Reformulate a user's question.

        Args:
            user_input: The raw user question.

        Returns:
            A (reformulated_question, error) tuple as returned by the
            completer's `complete` method.
        """
        reformulated_question, error = self.completer.complete(
            self.system_prompt, user_input=user_input, completion_kwargs=self.completion_kwargs
        )
        # NOTE(review): logs via the root logging module, not a module logger.
        logging.info(f"Reformulated question from {user_input=} to {reformulated_question=}")
        return reformulated_question, error
================================================
FILE: buster/parsers/__init__.py
================================================
from buster.parsers.parser import HuggingfaceParser, SphinxParser, get_all_documents
__all__ = [get_all_documents, SphinxParser, HuggingfaceParser]
================================================
FILE: buster/parsers/parser.py
================================================
import glob
import os
import re
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from itertools import takewhile, zip_longest
from pathlib import Path
from typing import Iterator, Type
import bs4
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
@dataclass
class Section:
    """A named chunk of documentation text extracted from an HTML page.

    Attributes:
        url: Link to the section within the rendered docs.
        name: Section title.
        nodes: HTML nodes composing the section (consumed at init, not stored).
        text: Cleaned plain-text content, built from `nodes` in __post_init__.
    """

    url: str
    name: str
    nodes: InitVar[list[bs4.element.NavigableString]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.NavigableString]):
        """Flatten `nodes` into cleaned text, rendering tables as markdown."""
        section = []
        for node in nodes:
            if node.name == "table":
                # Tables are rendered as github-flavoured markdown via pandas.
                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                # Inline scripts carry no documentation content; skip them.
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "\n".join(section).strip()

        # Remove tabs
        self.text = self.text.replace("\t", "")

        # Replace group of newlines with a single newline
        self.text = re.sub("\n{2,}", "\n", self.text)

        # Replace non-breaking spaces with regular spaces
        self.text = self.text.replace("\xa0", " ")

    def __len__(self) -> int:
        """Length of the cleaned text."""
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor, without parsing."""
        section = cls.__new__(cls)  # Allocate memory, does not call __init__

        # Does the init here (bypasses __post_init__, so no re-cleaning).
        section.text = text
        section.url = url
        section.name = name

        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks.

        Sections longer than `max_length` are split into roughly equal chunks;
        sections shorter than `min_length` are dropped (nothing is yielded).
        """
        if len(self) > max_length:
            # Get the number of chunk, by dividing and rounding up.
            # Then, split the section into equal lenght chunks.
            # This could results in chunks below the minimum length,
            # and will truncate the end of the section.
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
            yield self
        return
@dataclass
class Parser(ABC):
    """Base class for HTML documentation parsers.

    Attributes:
        soup: Parsed HTML document.
        base_url: Root URL of the rendered documentation.
        root_dir: Local directory containing the HTML files.
        filepath: Path of the file being parsed.
        min_section_length: Sections shorter than this are dropped.
        max_section_length: Sections longer than this are split into chunks.
    """

    soup: BeautifulSoup
    base_url: str
    root_dir: str
    filepath: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @property
    def relative_path(self) -> str:
        """Gets the relative path of the file to the root dir, without its extension.

        This is particularly useful for websites with pages, subdomains, etc.
        """
        parent = Path(self.root_dir)
        son = Path(self.filepath)
        # BUGFIX: drop only the file extension (e.g. ".html"). The previous
        # `str(...).split(".")[0]` truncated at the FIRST dot anywhere in the
        # relative path, mangling paths like "api/v1.2/page.html" -> "api/v1".
        self._relative_path = str(son.relative_to(parent).with_suffix(""))
        return self._relative_path

    @abstractmethod
    def find_sections(self) -> Iterator[Section]: ...

    def parse(self) -> list[Section]:
        """Parse the documents into sections, respecting the lenght constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections
class SphinxParser(Parser):
    """Parser for Sphinx-generated documentation pages."""

    def find_sections(self) -> Iterator[Section]:
        """Yield one Section per headerlink anchor found in the page."""
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            # The headerlink anchor sits inside a heading inside the section container.
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")

            url = self.build_url(section["href"].strip().replace("\n", ""))
            # [:-1] drops the trailing headerlink character from the title text.
            name = section.parent.text.strip()[:-1].replace("\n", "")

            # If sections has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                # NOTE: `section` is rebound here from anchor tag to Section object.
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        """Join base url, relative page path, ".html" extension and anchor suffix."""
        return self.base_url + self.relative_path + ".html" + suffix
class HuggingfaceParser(Parser):
    """Parser for HuggingFace-style documentation pages."""

    def find_sections(self) -> Iterator[Section]:
        """Yield a Section for each h1/h2/h3 heading, spanning up to the next heading."""
        headings = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
        for heading, next_heading in zip_longest(headings, headings[1:]):
            anchor = heading.find("a", href=True, class_="header-link")
            # Collect every sibling node until the next heading starts.
            nodes = list(takewhile(lambda sibling: sibling != next_heading, heading.find_next_siblings()))

            suffix = anchor["href"].strip().replace("\n", "")
            url = self.build_url(suffix)
            name = heading.text.strip().replace("\n", "")
            yield Section(url, name, nodes)

        return

    def build_url(self, suffix: str) -> str:
        """Join base url, relative page path and anchor suffix."""
        return self.base_url + self.relative_path + suffix
def get_document(
    root_dir: str,
    file: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Extract all sections from one file.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    filepath = os.path.join(root_dir, file)
    with open(filepath, "r") as f:
        source = f.read()

    soup = BeautifulSoup(source, "html.parser")
    parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)

    parsed_sections = parser.parse()
    documents_df = pd.DataFrame.from_dict(
        {
            "title": [section.name for section in parsed_sections],
            "url": [section.url for section in parsed_sections],
            "content": [section.text for section in parsed_sections],
        }
    )

    return documents_df
def get_all_documents(
    root_dir: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.

    Files that fail to parse are skipped with a message. Returns an empty
    DataFrame when no file could be parsed at all.
    """
    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

    dfs = []
    for file in tqdm(files):
        try:
            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)
            dfs.append(df)
        except Exception as e:
            print(f"Skipping {file} due to the following error: {e}")
            continue

    if not dfs:
        # BUGFIX: pd.concat raises ValueError on an empty list; return an empty
        # frame with the expected columns instead of crashing.
        return pd.DataFrame(columns=["title", "url", "content"])

    documents_df = pd.concat(dfs, ignore_index=True)

    return documents_df
================================================
FILE: buster/retriever/__init__.py
================================================
from .base import Retriever
from .deeplake import DeepLakeRetriever
from .service import ServiceRetriever
__all__ = [Retriever, ServiceRetriever, DeepLakeRetriever]
================================================
FILE: buster/retriever/base.py
================================================
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Callable, Optional
import numpy as np
import pandas as pd
from buster.completers import UserInputs
from buster.llm_utils import get_openai_embedding
ALL_SOURCES = "All"
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@dataclass
class Retriever(ABC):
    """Abstract base class for document retrievers.

    NOTE: the @dataclass decorator is inert here since __init__ is defined
    explicitly and no fields are declared; it is kept for compatibility.
    """

    def __init__(
        self,
        top_k: int,
        thresh: float,
        embedding_fn: Callable[[str], np.ndarray] = None,
        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,
        *args,
        **kwargs,
    ):
        """Initializes a Retriever instance.

        Args:
            top_k: The maximum number of documents to retrieve.
            thresh: The similarity threshold for document retrieval.
            embedding_fn: The function to compute document embeddings.
                Defaults to the OpenAI embedding function when None.
            sparse_embedding_fn: (Optional) The function to compute sparse document embeddings.
            *args, **kwargs: Additional arguments and keyword arguments.
        """
        if embedding_fn is None:
            embedding_fn = get_openai_embedding

        self.top_k = top_k
        self.thresh = thresh
        self.embedding_fn = embedding_fn
        self.sparse_embedding_fn = sparse_embedding_fn

        # Add your access to documents in your own init

    @abstractmethod
    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
            source: The source from which to retrieve documents. If None, retrieves documents from all sources.

        Returns:
            A pandas DataFrame containing the documents.
            If source does not exist, returns an empty dataframe.
        """
        ...

    @abstractmethod
    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
            source: The source for which to retrieve the display name.

        Returns:
            The display name of the source.
        """
        ...

    @abstractmethod
    def get_topk_documents(
        self, query: str, sources: Optional[list[str]] = None, top_k: Optional[int] = None
    ) -> pd.DataFrame:
        """Get the topk documents matching a user's query.

        BUGFIX: the abstract signature previously declared `source: Optional[str]`,
        but `retrieve` calls this method with `sources=` and every concrete
        subclass accepts a `sources` list; the declaration now matches.

        Args:
            query: The user's query.
            sources: The sources from which to retrieve documents. If None, retrieves documents from all sources.
            top_k: The maximum number of documents to retrieve.

        Returns:
            A pandas DataFrame containing the topk matched documents.
            If no matches are found, returns an empty dataframe.
        """
        ...

    def threshold_documents(self, matched_documents: pd.DataFrame, thresh: float) -> pd.DataFrame:
        """Filters out matched documents using a similarity threshold.

        Args:
            matched_documents: The DataFrame containing the matched documents.
            thresh: The similarity threshold.

        Returns:
            A pandas DataFrame containing the filtered matched documents.
        """
        # filter out matched_documents using a threshold
        return matched_documents[matched_documents.similarity > thresh]

    def retrieve(
        self,
        user_inputs: UserInputs,
        sources: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        thresh: Optional[float] = None,
    ) -> pd.DataFrame:
        """Retrieves documents based on user inputs.

        Args:
            user_inputs: The user's inputs.
            sources: The sources from which to retrieve documents. If None, retrieves documents from all sources.
            top_k: The maximum number of documents to retrieve. Defaults to self.top_k.
            thresh: The similarity threshold for document retrieval. Defaults to self.thresh.

        Returns:
            A pandas DataFrame containing the retrieved documents.
        """
        if top_k is None:
            top_k = self.top_k
        if thresh is None:
            thresh = self.thresh

        query = user_inputs.current_input

        matched_documents = self.get_topk_documents(query=query, sources=sources, top_k=top_k)

        # log matched_documents to the console
        logger.info(f"matched documents before thresh: {matched_documents}")

        # No matches were found, simply return at this point
        if len(matched_documents) == 0:
            return matched_documents

        # otherwise, make sure we have the minimum required fields
        assert "similarity" in matched_documents.columns
        assert "embedding" in matched_documents.columns
        assert "content" in matched_documents.columns
        assert "title" in matched_documents.columns

        # filter out matched_documents using a threshold
        matched_documents = self.threshold_documents(matched_documents, thresh)

        logger.info(f"matched documents after thresh: {matched_documents}")

        return matched_documents
================================================
FILE: buster/retriever/deeplake.py
================================================
import logging
import os
from typing import Optional
import numpy as np
import pandas as pd
from buster.retriever.base import ALL_SOURCES, Retriever
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def extract_metadata(x: pd.DataFrame, columns) -> pd.DataFrame:
    """Promote entries of the `metadata` mapping to top-level columns.

    Args:
        x: A row (or frame) whose `metadata` entry is a mapping.
        columns: Keys of the metadata to copy into their own columns.

    Returns:
        The same object, with one new entry per requested column.
    """
    metadata = x.metadata
    for col in columns:
        x[col] = metadata[col]
    return x
def data_dict_to_df(data: dict) -> pd.DataFrame:
    """Converts a dictionary of data to a Pandas DataFrame.

    Args:
        data: The dictionary containing the data, with "score" and "text" keys.

    Returns:
        The DataFrame with "score" renamed to "similarity", "text" renamed to
        "content", and metadata expanded into "source", "title" and "url"
        columns. Empty DataFrame when there are no matches.
    """
    # BUGFIX: work on a shallow copy so the caller's dict is not mutated by the
    # pop() calls below.
    data = dict(data)

    # rename 'score' to 'similarity'
    data["similarity"] = data.pop("score")
    data["content"] = data.pop("text")

    matched_documents = pd.DataFrame(data)

    if len(matched_documents) == 0:
        logger.info("No matches found...")
        return pd.DataFrame()

    matched_documents = matched_documents.apply(extract_metadata, columns=["source", "title", "url"], axis=1)
    matched_documents = matched_documents.drop(columns="metadata")

    return matched_documents
def build_tql_query(embedding, sources=None, top_k: int = 3) -> str:
    """Builds a TQL query.

    Args:
        embedding: The embedding vector.
        sources: The sources to filter by. NOTE(review): source names are
            interpolated directly into the query string; fine for trusted
            configuration values, but not safe for untrusted input.
        top_k: The number of top documents to retrieve.

    Returns:
        The TQL query string.
    """
    # Initialize the where_clause to an empty string.
    where_clause = ""

    embedding_string = ",".join([str(item) for item in embedding])

    # If sources is provided and it's not empty, build the where clause.
    if sources:
        conditions = [f"contains(metadata['source'], '{source}')" for source in sources]
        where_clause = "where " + " or ".join(conditions)

    # Construct the entire query
    query = f"""
    select * from (
        select embedding, text, metadata, cosine_similarity(embedding, ARRAY[{embedding_string}]) as score
        {where_clause}
    )
    order by score desc limit {top_k}
    """
    return query
class DeepLakeRetriever(Retriever):
    """Retriever backed by an Activeloop DeepLake vector store."""

    def __init__(
        self,
        path,
        exec_option: str = "python",
        use_tql: bool = False,
        deep_memory: bool = False,
        activeloop_token: str = None,
        **kwargs,
    ):
        """Open the vector store at `path` read-only.

        Args:
            path: Path or URI of the DeepLake vector store.
            exec_option: DeepLake execution backend ("python" or "compute_engine").
            use_tql: Whether to query via TQL (requires "compute_engine").
            deep_memory: Whether to enable DeepLake's deep-memory search.
            activeloop_token: Token for Activeloop enterprise features.
            **kwargs: Forwarded to the Retriever base class.
        """
        # Imported lazily so deeplake is only required when this retriever is used.
        from deeplake.core.vectorstore import VectorStore

        super().__init__(**kwargs)
        self.use_tql = use_tql
        self.exec_option = exec_option
        self.deep_memory = deep_memory

        self.vector_store = VectorStore(
            path=path,
            read_only=True,
            token=activeloop_token,
            exec_option=exec_option,
        )
        if activeloop_token is None and use_tql:
            logger.warning(
                """
                No activeloop token detected, enterprise features will not be available.
                You can set it using: export ACTIVELOOP_TOKEN=...
                """
            )

    def get_documents(self, sources: Optional[list[str]] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
            sources: The sources to retrieve documents from.

        Returns:
            The DataFrame containing the retrieved documents.
        """
        k = len(self.vector_store)

        # currently this is the only way to retrieve all embeddings in deeplake
        # generate a dummy embedding and specify top-k equals the length of the vector store.
        embedding_dim = self.vector_store.tensors()["embedding"].shape[1]
        dummy_embedding = np.random.random(embedding_dim)

        return self.get_topk_documents(query=None, embedding=dummy_embedding, top_k=k, sources=sources)

    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
            source: The name of the source.

        Raises:
            NotImplementedError: Always; not supported by this retriever.
        """
        raise NotImplementedError()

    def get_topk_documents(
        self,
        query: str = None,
        embedding: np.array = None,
        sources: Optional[list[str]] = None,
        top_k: int = None,
        return_tensors: str = "*",
    ) -> pd.DataFrame:
        """Get the topk documents matching a user's query.

        If no matches are found, returns an empty dataframe.

        Exactly one of `query` or `embedding` must be provided; when `query`
        is given, it is embedded with `self.embedding_fn`.

        Args:
            query: The user's query.
            embedding: The embedding vector.
            sources: The sources to filter by.
            top_k: The number of top documents to retrieve.
            return_tensors: The tensors to include in the result.

        Returns:
            The DataFrame containing the matched documents.
        """
        if query is not None:
            query_embedding = self.embedding_fn(query)
        elif embedding is not None:
            query_embedding = embedding
        else:
            raise ValueError("must provide either a query or an embedding")

        if self.use_tql:
            # TQL search is only available on the enterprise compute engine.
            assert self.exec_option == "compute_engine", "cant use tql without compute_engine"

            tql_query = build_tql_query(query_embedding, sources=sources, top_k=top_k)
            data = self.vector_store.search(query=tql_query, deep_memory=self.deep_memory)
        else:
            # build the filter clause
            if sources:

                def filter(x):
                    # Keep only documents whose metadata source is in `sources`.
                    return x["metadata"].data()["value"]["source"] in sources

            else:
                filter = None

            data = self.vector_store.search(
                k=top_k,
                embedding=query_embedding,
                exec_option=self.exec_option,
                return_tensors=return_tensors,
                filter=filter,
            )
        matched_documents = data_dict_to_df(data)
        return matched_documents
================================================
FILE: buster/retriever/service.py
================================================
import logging
from typing import List, Optional
import numpy as np
import pandas as pd
import pinecone
from bson.objectid import ObjectId
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from buster.retriever.base import ALL_SOURCES, Retriever
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class ServiceRetriever(Retriever):
    """Hybrid retriever combining Pinecone (vector store) and MongoDB (metadata)."""

    def __init__(
        self,
        pinecone_api_key: str,
        pinecone_index: str,
        pinecone_namespace: str,
        mongo_uri: str,
        mongo_db_name: str,
        **kwargs,
    ):
        """
        Initializes a ServiceRetriever instance.

        The ServiceRetriever is a hybrid retrieval combining pinecone and mongodb services.
        Pinecone is exclusively used as a vector store.
        The id of the pinecone vectors are used as a key in the mongodb database to store its associated metadata.

        Args:
            pinecone_api_key: The API key for Pinecone.
            pinecone_index: The name of the Pinecone index.
            pinecone_namespace: The namespace for Pinecone.
            mongo_uri: The URI for MongoDB.
            mongo_db_name: The name of the MongoDB database.
            **kwargs: Forwarded to the Retriever base class.
        """
        super().__init__(**kwargs)

        pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index = pc.Index(pinecone_index)
        self.namespace = pinecone_namespace

        self.client = MongoClient(mongo_uri, server_api=ServerApi("1"))
        self.db = self.client[mongo_db_name]

    def get_source_id(self, source: str) -> str:
        """Get the id of a source. Returns an empty string if the source does not exist.

        Args:
            source: The name of the source.

        Returns:
            The id of the source, or "" when not found.
        """
        source_pointer = self.db.sources.find_one({"name": source})
        return "" if source_pointer is None else str(source_pointer["_id"])

    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
            source: The name of the source. Defaults to None.

        Returns:
            A DataFrame containing all the documents. If the source does not exist, returns an empty DataFrame.
        """
        if source is None:
            # No source specified, return all documents
            documents = self.db.documents.find()
        else:
            assert isinstance(source, str), "source must be a valid string."
            source_id = self.get_source_id(source)

            if source_id == "":
                logger.warning(f"{source=} not found.")

            # An unknown source yields source_id == "", which matches no
            # documents, so the query below returns an empty result.
            documents = self.db.documents.find({"source_id": source_id})

        return pd.DataFrame(list(documents))

    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
            source: The name of the source, or None for all sources.

        Returns:
            The display name of the source.
        """
        if source is None:
            return ALL_SOURCES
        else:
            # NOTE(review): find_one returns None for an unknown source, which
            # would raise a TypeError here — confirm callers pre-validate.
            display_name = self.db.sources.find_one({"name": source})["display_name"]
            return display_name

    def get_topk_documents(self, query: str, sources: Optional[List[str]], top_k: int) -> pd.DataFrame:
        """Get the top k documents matching a query from the specified sources.

        Args:
            query: The query string.
            sources: The list of source names to search. Defaults to None.
            top_k: The number of top matches to return.

        Returns:
            A DataFrame containing the top k matching documents, sorted by
            similarity. Empty DataFrame when nothing matches.
        """
        if sources is None:
            filter = None
        else:
            filter = {"source": {"$in": sources}}
            # Bail out early when none of the requested sources exist.
            source_exists = self.db.sources.find_one({"name": {"$in": sources}})
            if source_exists is None:
                logger.warning(f"Sources {sources} do not exist. Returning empty dataframe.")
                return pd.DataFrame()

        query_embedding = self.embedding_fn(query)
        sparse_query_embedding = self.sparse_embedding_fn(query) if self.sparse_embedding_fn is not None else None

        if isinstance(query_embedding, np.ndarray):
            # pinecone expects a list of floats, so convert from ndarray if necessary
            query_embedding = query_embedding.tolist()

        # Pinecone retrieval
        matches = self.index.query(
            vector=query_embedding,
            sparse_vector=sparse_query_embedding,
            top_k=top_k,
            filter=filter,
            include_values=True,
            namespace=self.namespace,
        )["matches"]
        matching_ids = [ObjectId(match.id) for match in matches]
        matching_scores = {match.id: match.score for match in matches}
        matching_embeddings = {match.id: match.values for match in matches}

        if len(matching_ids) == 0:
            return pd.DataFrame()

        # MongoDB retrieval
        matched_documents = self.db.documents.find({"_id": {"$in": matching_ids}})
        matched_documents = pd.DataFrame(list(matched_documents))

        # add additional information from matching; the ObjectId is stringified
        # to look up the pinecone match keyed by its string id.
        matched_documents["similarity"] = matched_documents["_id"].apply(lambda x: matching_scores[str(x)])
        matched_documents["embedding"] = matched_documents["_id"].apply(lambda x: matching_embeddings[str(x)])

        # sort by similarity
        matched_documents = matched_documents.sort_values(by="similarity", ascending=False, ignore_index=True)

        return matched_documents
================================================
FILE: buster/tokenizers/__init__.py
================================================
from .base import Tokenizer
from .gpt import GPTTokenizer
def tokenizer_factory(tokenizer_cfg: dict) -> Tokenizer:
    """Instantiate the tokenizer matching ``tokenizer_cfg["model_name"]``.

    Args:
        tokenizer_cfg: Configuration dict; must contain a "model_name" key.

    Returns:
        A concrete Tokenizer for the requested model.

    Raises:
        ValueError: If no tokenizer is implemented for the model.
    """
    model_name = tokenizer_cfg["model_name"]

    supported_gpt_models = ("text-davinci-003", "gpt-3.5-turbo", "gpt-4")
    if model_name in supported_gpt_models:
        return GPTTokenizer(model_name)

    raise ValueError(f"Tokenizer not implemented for {model_name=}")
# __all__ entries must be strings, not the objects themselves; a non-string
# __all__ makes `from buster.tokenizers import *` raise a TypeError.
__all__ = ["Tokenizer", "GPTTokenizer", "tokenizer_factory"]
================================================
FILE: buster/tokenizers/base.py
================================================
from abc import ABC, abstractmethod
from typing import Union
class Tokenizer(ABC):
    """Base interface all tokenizers must implement.

    Args:
        model_name: Name of the underlying tokenizer model.

    Attributes:
        model_name: Name of the underlying tokenizer model.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name

    @abstractmethod
    def encode(self, string: str) -> list[int]:
        """Convert a string into its list of token ids."""
        ...

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        """Convert a list of token ids back into a string."""
        ...

    def num_tokens(self, string: str, return_encoded: bool = False) -> Union[int, tuple[int, list[int]]]:
        """Count the tokens in `string`.

        Args:
            string: The input text.
            return_encoded: When True, also return the token ids.

        Returns:
            The token count, or a ``(count, token_ids)`` tuple when
            `return_encoded` is True.
        """
        token_ids = self.encode(string)
        count = len(token_ids)
        return (count, token_ids) if return_encoded else count
================================================
FILE: buster/tokenizers/gpt.py
================================================
import tiktoken
from buster.tokenizers import Tokenizer
class GPTTokenizer(Tokenizer):
    """Tokenizer for GPT-family models, backed by the tiktoken library.

    Args:
        model_name (str): The name of the GPT model to be used.

    Attributes:
        encoder: tiktoken encoding object resolved via `encoding_for_model`.
    """

    def __init__(self, model_name: str):
        super().__init__(model_name)
        self.encoder = tiktoken.encoding_for_model(model_name=model_name)

    def encode(self, string: str):
        """Return the tiktoken token ids for `string`.

        Args:
            string (str): The string to be encoded.

        Returns:
            list[int]: The encoded representation of the string.
        """
        return self.encoder.encode(string)

    def decode(self, encoded: list[int]):
        """Return the string represented by the token ids `encoded`.

        Args:
            encoded (list[int]): The list of tokens to be decoded.

        Returns:
            str: The decoded string representation of the tokens.
        """
        return self.encoder.decode(encoded)
================================================
FILE: buster/utils.py
================================================
import os
import urllib.request
import zipfile
def get_file_extension(filepath: str) -> str:
    """Return the extension of `filepath` including the leading dot ('' if none)."""
    _, extension = os.path.splitext(filepath)
    return extension
def download_db(db_url: str, output_dir: str):
    """Download the documents database into `output_dir` unless already present.

    Args:
        db_url: URL to fetch the database file from.
        output_dir: Directory for `documents.db` (created if missing).

    Returns:
        The path of the (downloaded or pre-existing) database file.
    """
    os.makedirs(output_dir, exist_ok=True)
    fname = os.path.join(output_dir, "documents.db")

    # Guard clause: never re-download an existing file.
    if os.path.exists(fname):
        print("File already exists. Skipping.")
        return fname

    print(f"Downloading db file from {db_url} to {fname}...")
    urllib.request.urlretrieve(db_url, fname)
    print("Downloaded.")
    return fname
def zip_contents(input_path, output_path):
    """Zip the entire contents of `input_path` into a zip file under `output_path`.

    Authored by ChatGPT

    Args:
        input_path (str): The path of the directory to be zipped.
        output_path (str): The path where the zip file will be created.

    Returns:
        str: The path of the created zip file.

    Raises:
        ValueError: If `input_path` does not exist.
    """
    if not os.path.exists(input_path):
        raise ValueError("The specified input path does not exist.")

    # The archive is named after the zipped directory itself.
    archive_path = os.path.join(output_path, f"{os.path.basename(input_path)}.zip")

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for folder, _, filenames in os.walk(input_path):
            for filename in filenames:
                full_path = os.path.join(folder, filename)
                # Store paths relative to input_path so the archive has no absolute paths.
                archive.write(full_path, arcname=os.path.relpath(full_path, input_path))

    return archive_path
def extract_zip(zip_file_path, output_path):
    """Unpack `zip_file_path` into `output_path` and return `output_path`.

    Authored by ChatGPT

    Args:
        zip_file_path (str): The path of the zip file to be extracted.
        output_path (str): The path where the zip contents will be extracted.

    Returns:
        str: The path of the directory where the zip contents are extracted.

    Raises:
        ValueError: If the zip file does not exist.
    """
    if not os.path.exists(zip_file_path):
        raise ValueError("The specified zip file does not exist.")

    with zipfile.ZipFile(zip_file_path, "r") as archive:
        archive.extractall(output_path)

    return output_path
================================================
FILE: buster/validators/__init__.py
================================================
from .base import Validator
# __all__ entries must be strings, not the objects themselves; a non-string
# __all__ makes `from buster.validators import *` raise a TypeError.
__all__ = ["Validator"]
================================================
FILE: buster/validators/base.py
================================================
import logging
import pandas as pd
from buster.llm_utils import cosine_similarity, get_openai_embedding
from buster.validators.validators import (
AnswerValidator,
DocumentsValidator,
QuestionValidator,
)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class Validator:
    """Facade bundling question, answer and document validation plus answer-based reranking."""

    def __init__(
        self,
        use_reranking: bool,
        validate_documents: bool,
        question_validator_cfg=None,
        answer_validator_cfg=None,
        documents_validator_cfg=None,
    ):
        """
        Initializes the Validator class.

        Args:
            use_reranking: A boolean indicating whether to use reranking.
            validate_documents: A boolean indicating whether to validate documents.
            question_validator_cfg: A configuration dictionary for the QuestionValidator.
            answer_validator_cfg: A configuration dictionary for the AnswerValidator.
            documents_validator_cfg: A configuration dictionary for the DocumentsValidator.
        """
        # Each sub-validator falls back to its own defaults when no config is given.
        self.question_validator = (
            QuestionValidator(**question_validator_cfg) if question_validator_cfg is not None else QuestionValidator()
        )
        self.answer_validator = (
            AnswerValidator(**answer_validator_cfg) if answer_validator_cfg is not None else AnswerValidator()
        )
        self.documents_validator = (
            DocumentsValidator(**documents_validator_cfg)
            if documents_validator_cfg is not None
            else DocumentsValidator()
        )

        self.use_reranking = use_reranking
        self.validate_documents = validate_documents

    def check_question_relevance(self, question: str) -> tuple[bool, str]:
        """
        Checks the relevance of a question.

        Args:
            question: The question to be checked.

        Returns:
            A tuple containing a boolean indicating the relevance and a string describing the result.
        """
        return self.question_validator.check_question_relevance(question)

    def check_answer_relevance(self, answer: str) -> bool:
        """
        Checks the relevance of an answer.

        Args:
            answer: The answer to be checked.

        Returns:
            A boolean indicating the relevance of the answer.
        """
        return self.answer_validator.check_answer_relevance(answer)

    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:
        """
        Checks the relevance of documents.

        Args:
            answer: The answer to be checked.
            matched_documents: The DataFrame containing the matched documents.

        Returns:
            A DataFrame containing the relevance of the documents.
        """
        return self.documents_validator.check_documents_relevance(answer, matched_documents)

    def rerank_docs(
        self, answer: str, matched_documents: pd.DataFrame, embedding_fn=get_openai_embedding
    ) -> pd.DataFrame:
        """
        Reranks the matched documents according to their similarity to the LLM's answer.

        This score could be used to determine whether a document was actually relevant
        to generation. An extra `similarity_to_answer` column is added in-place.
        (FIX: the second, dead docstring expression that used to follow this one was
        merged into it.)

        Args:
            answer: The answer for reranking.
            matched_documents: The DataFrame containing the matched documents.
            embedding_fn: The function used to calculate document embeddings.

        Returns:
            A DataFrame containing the reranked documents (unchanged when empty).
        """
        if len(matched_documents) == 0:
            return matched_documents

        logger.info("Reranking documents based on answer similarity...")

        answer_embedding = embedding_fn(answer)
        col = "similarity_to_answer"
        matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding))

        return matched_documents.sort_values(by=col, ascending=False)
================================================
FILE: buster/validators/validators.py
================================================
import concurrent.futures
import logging
from typing import Callable, List, Optional
import numpy as np
import pandas as pd
from buster.completers import ChatGPTCompleter, Completer
from buster.llm_utils import cosine_similarity
from buster.llm_utils.embeddings import get_openai_embedding
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class QuestionValidator:
    """Uses an LLM to decide whether a user question is in-scope and should be answered."""

    def __init__(
        self,
        check_question_prompt: Optional[str] = None,
        invalid_question_response: Optional[str] = None,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
    ):
        """
        Args:
            check_question_prompt: System prompt instructing the model to reply 'true'/'false'.
            invalid_question_response: Message returned to the user when the question is invalid.
            completion_kwargs: Keyword arguments forwarded to the completion API.
            client_kwargs: Keyword arguments forwarded to the client constructor.
        """
        if check_question_prompt is None:
            # BUGFIX: a stray trailing comma previously wrapped this prompt in a
            # 1-tuple, so a tuple (not a str) was passed to the completer.
            check_question_prompt = """You are a chatbot answering questions on documentation.
Your job is to determine whether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
For example:
Q: What is backpropagation?
true
Q: What is the meaning of life?
false
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid."""

        if completion_kwargs is None:
            # default completion kwargs
            # BUGFIX: a stray trailing comma previously wrapped this dict in a 1-tuple.
            completion_kwargs = {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            }

        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)
        self.check_question_prompt = check_question_prompt
        self.invalid_question_response = invalid_question_response

    def check_question_relevance(self, question: str) -> tuple[bool, str]:
        """Determines whether a question is relevant for our given framework.

        Returns:
            (relevance, response): the relevance flag, and the canned response to
            show when the question is invalid (or an error message on failure).
        """
        try:
            outputs, _ = self.completer.complete(self.check_question_prompt, user_input=question)
            outputs = outputs.strip(".").lower()
            if outputs not in ["true", "false"]:
                logger.warning(f"the question validation returned an unexpected value: {outputs=}. Assuming Invalid...")
            # `outputs` is already stripped/lowered above, so compare directly.
            relevance = outputs == "true"
            response = self.invalid_question_response

        except Exception:
            logger.exception("Error during question relevance detection.")
            relevance = False
            response = "Unable to process your question at the moment, try again soon"

        return relevance, response
class AnswerValidator:
    """Flags answers that look like the chatbot's canned 'cannot answer' responses."""

    def __init__(
        self,
        unknown_response_templates: Optional[list[str]] = None,
        unknown_threshold: Optional[float] = None,
        # FIX: annotation used np.array (a function) as a type; np.ndarray is the type.
        embedding_fn: Optional[Callable[[str], np.ndarray]] = None,
    ):
        """
        Args:
            unknown_response_templates: Reference 'unknown' answers to compare against.
            unknown_threshold: Cosine-similarity above which an answer counts as 'unknown'.
            embedding_fn: Function mapping a string to its embedding vector.
        """
        if unknown_threshold is None:
            unknown_threshold = 0.85

        if embedding_fn is None:
            embedding_fn = get_openai_embedding

        if unknown_response_templates is None:
            unknown_response_templates = [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ]

        self.embedding_fn = embedding_fn
        self.unknown_response_templates = unknown_response_templates
        self.unknown_threshold = unknown_threshold

    def check_answer_relevance(self, answer: str) -> bool:
        """Check if a generated answer is relevant to the chatbot's knowledge.

        Raises:
            ValueError: If `answer` is an empty string.
        """
        if answer == "":
            raise ValueError("Cannot compute embedding of an empty string.")

        unknown_embeddings = [
            self.embedding_fn(unknown_response) for unknown_response in self.unknown_response_templates
        ]

        answer_embedding = self.embedding_fn(answer)
        unknown_similarity_scores = [
            cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings
        ]

        # If any score is above the threshold, the answer is considered not relevant
        return not any(score > self.unknown_threshold for score in unknown_similarity_scores)
class DocumentsValidator:
    """Uses an LLM to judge, per document, whether it supports a generated answer."""

    def __init__(
        self,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
        system_prompt: Optional[str] = None,
        user_input_formatter: Optional[str] = None,
        max_calls: int = 30,
    ):
        """
        Args:
            completion_kwargs: Keyword arguments forwarded to the completion API.
            client_kwargs: Keyword arguments forwarded to the client constructor.
            system_prompt: Prompt instructing the model to reply only 'true'/'false'.
            user_input_formatter: Template with {answer} and {document} placeholders.
            max_calls: Upper bound on parallel LLM calls per relevance check.
        """
        if system_prompt is None:
            system_prompt = """
Your goal is to determine if the content of a document can be attributed to a provided answer.
This means that if information in the document is found in the answer, it is relevant. Otherwise it is not.
Your goal is to determine if the information contained in a document was used to generate an answer.
You will be comparing a document to an answer. If the answer can be inferred from the document, return 'true'. Otherwise return 'false'.
Only respond with 'true' or 'false'."""
        self.system_prompt = system_prompt

        if user_input_formatter is None:
            user_input_formatter = """
answer: {answer}
document: {document}
"""
        self.user_input_formatter = user_input_formatter

        if completion_kwargs is None:
            completion_kwargs = {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            }

        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)
        self.max_calls = max_calls

    def check_document_relevance(self, answer: str, document: str) -> bool:
        """Ask the LLM whether `document` supports `answer`; defaults to True on unexpected output."""
        user_input = self.user_input_formatter.format(answer=answer, document=document)
        output, _ = self.completer.complete(prompt=self.system_prompt, user_input=user_input)

        # remove trailing periods, happens sometimes...
        output = output.strip(".").lower()

        if output not in ["true", "false"]:
            # Default assume it's relevant if the detector didn't give one of [true, false]
            logger.warning(f"the validation returned an unexpected value: {output}. Assuming valid...")
            return True
        return output == "true"

    # FIX: return annotation previously said list[bool]; this method returns the
    # DataFrame with a new boolean 'relevance' column added in-place.
    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:
        """Determines whether each matched document is relevant to the answer.

        Calls are parallelized with a thread pool (bounded by `max_calls`).

        Raises:
            ValueError: If more than `max_calls` documents are passed.
        """
        logger.info(f"Checking document relevance of {len(matched_documents)} documents")

        if len(matched_documents) > self.max_calls:
            raise ValueError("Max calls exceeded, increase max_calls to allow this.")

        # Here we parallelize the calls. We introduce a wrapper as a workaround.
        def _check_documents(args):
            "Thin wrapper so we can pass args as a Tuple and use ThreadPoolExecutor."
            answer, document = args
            return self.check_document_relevance(answer=answer, document=document)

        args_list = [(answer, doc) for doc in matched_documents.content.to_list()]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            relevance = list(executor.map(_check_documents, args_list))

        logger.info(f"{relevance=}")
        # add it back to the dataframe
        matched_documents["relevance"] = relevance

        return matched_documents
================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[project]
name = "buster-doctalk"
version = "0.0.1"
description = "Buster 🤖: A chatbot for retrieval-augmented generation"
readme = "README.md"
requires-python = ">=3.10"
dynamic = ["dependencies"]
[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
[tool.setuptools.packages.find]
include = ["buster"]
[tool.isort]
profile = "black"
[tool.black]
line-length = 120
[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"
[tool.poetry]
name = "buster-doctalk"
version = "0.0.1"
description = "Buster 🤖: A chatbot for retrieval-augmented generation"
license = "MIT"
authors = [
"Jeremy Pinto ",
"Hadrien Bertrand ",
]
readme = "README.md"
repository = "https://github.com/jerpint/buster"
packages = [
{ include = "buster" },
{ include = "buster/**/*.py" },
]
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
================================================
FILE: requirements.txt
================================================
bs4
click
deeplake
gradio>=3.40
matplotlib
numpy>=1.25
openai>=1.0
pandas>=2.1.3
pinecone-client>=3.0.2
pinecone-text>=0.6.0
pymongo
pytest
tabulate
tenacity
tiktoken
================================================
FILE: tests/test_chatbot.py
================================================
import copy
import logging
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, Completer, Completion, DocumentAnswerer
from buster.documents_manager import DeepLakeDocumentsManager
from buster.formatters.documents import DocumentsFormatterHTML
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers.gpt import GPTTokenizer
from buster.validators import Validator
logging.basicConfig(level=logging.INFO)
# Sample documents shipped with buster, used to build the test vector store.
DOCUMENTS_CSV = Path(__file__).resolve().parent.parent / "buster/examples/stackoverflow.csv"
# Canned "I don't know" answer that the prompts below instruct the model to emit.
UNKNOWN_PROMPT = "I'm sorry but I don't know how to answer."
# Number of parallel workers used when computing embeddings in fixtures.
NUM_WORKERS = 1

# default class used by our tests
buster_cfg_template = BusterConfig(
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "temperature": 0,
        },
        "client_kwargs": {
            "timeout": 20,
            "max_retries": 2,
        },
    },
    validator_cfg={
        "validate_documents": False,
        "use_reranking": True,
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.85,
        },
        "question_validator_cfg": {
            "invalid_question_response": "This question does not seem relevant to my current knowledge.",
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": {
                "timeout": 20,
                "max_retries": 2,
            },
            # NOTE(review): 'irrlevant' typo below is inside a live prompt string; left as-is.
            "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrlevant, respond with 'false'.",
        },
    },
    retriever_cfg={
        # "db_path": to be set using pytest fixture,
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_after_docs": ("""Now answer the following question:\n"""),
        "text_before_docs": (
            """You are a chatbot assistant answering technical questions about artificial intelligence (AI). """
            """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer."""
            """Use this response when you cannot answer:\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """For example:\n"""
            """What is the meaning of life?\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """Only use these prodived documents as reference:\n"""
        ),
    },
    documents_formatter_cfg={
        "max_tokens": 3000,
        "formatter": "{content}",
    },
)
def get_fake_embedding(length=1536):
    """Return a list of `length` random float32 values, mimicking an embedding."""
    generator = np.random.default_rng()
    return list(generator.random(length, dtype=np.float32))
class MockAnswerer(Completer):
    """Completer stub that always responds with a fixed, canned answer."""

    def __init__(self, expected_answer):
        self.expected_answer = expected_answer

    def prepare_prompt(self, user_inputs, matched_documents):
        """No-op; the canned answer needs no prompt."""
        pass

    def complete(self):
        return

    def get_completion(self, user_inputs, matched_documents, validator, *arg, **kwarg) -> Completion:
        """Package the canned answer into a Completion, echoing the inputs back."""
        return Completion(
            error=False,
            answer_text=self.expected_answer,
            user_inputs=user_inputs,
            matched_documents=matched_documents,
            validator=validator,
        )
class MockRetriever(Retriever):
    """Retriever stub backed by an in-memory DataFrame of identical fake documents."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.path = kwargs["path"]

        num_docs = 100
        self.documents = pd.DataFrame.from_dict(
            {
                "title": ["test"] * num_docs,
                "url": ["http://url.com"] * num_docs,
                "content": ["cool text"] * num_docs,
                "embedding": [get_fake_embedding()] * num_docs,
                "n_tokens": [10] * num_docs,
                "source": ["fake source"] * num_docs,
            }
        )
        self.embedding_fn = get_fake_embedding

    def get_documents(self, source):
        """Return every fake document, ignoring `source`."""
        return self.documents

    def get_topk_documents(self, query: str, sources: list[str] = None, top_k: int = None) -> pd.DataFrame:
        """Return all fake documents with freshly randomized embeddings and similarities."""
        docs = self.documents
        docs["embedding"] = [get_fake_embedding() for _ in range(len(docs))]
        docs["similarity"] = [np.random.random() for _ in range(len(docs))]
        return docs

    def get_source_display_name(self, source):
        """Echo the source name back as its display name."""
        return source
class MockValidator:
    """Validator stub: every question and answer is always considered relevant."""

    def __init__(self, *args, **kwargs):
        return

    def validate(self, completion):
        """Mark the completion's answer as relevant and pass it through."""
        completion.answer_relevant = True
        return completion

    def check_question_relevance(self, *args, **kwargs):
        """Always relevant, with an empty response message."""
        return True, ""

    def check_answer_relevance(self, *args, **kwargs):
        """Always relevant."""
        return True
@pytest.fixture(scope="session")
def vector_store_path(tmp_path_factory):
    """Build a session-scoped DeepLake store from the sample CSV and return its path."""
    # Create a temporary directory and folder for the database manager
    dm_path = tmp_path_factory.mktemp("data").joinpath("deeplake_store")

    # Add the documents (will generate embeddings)
    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)
    df = pd.read_csv(DOCUMENTS_CSV)
    dm.add(df, num_workers=NUM_WORKERS)
    return dm_path
def test_chatbot_mock_data(tmp_path, monkeypatch):
    """End-to-end smoke test of Buster wired entirely with mocks."""
    expected_answer = "this is GPT answer"
    fake_db_path = tmp_path / "not_a_real_file.tar.gz"

    cfg = copy.deepcopy(buster_cfg_template)
    cfg.retriever_cfg["path"] = fake_db_path
    cfg.completion_cfg = {
        "expected_answer": expected_answer,
    }

    buster = Buster(
        retriever=MockRetriever(**cfg.retriever_cfg),
        document_answerer=MockAnswerer(**cfg.completion_cfg),
        validator=MockValidator(**cfg.validator_cfg),
    )

    completion = buster.process_input(user_input="What is a transformer?", sources=["fake_source"])
    assert isinstance(completion.answer_text, str)
    assert completion.answer_text.startswith(expected_answer)
def test_chatbot_real_data__chatGPT(vector_store_path):
    """Happy path against the real DeepLake store: an in-scope question gets a relevant answer."""
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    # NOTE(review): tokenizer_cfg is not set on buster_cfg_template above;
    # presumably BusterConfig supplies a default — confirm.
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

    completion = buster.process_input("What is backpropagation?")

    assert isinstance(completion.answer_text, str)
    assert completion.question_relevant == True
    assert completion.answer_relevant == True
    assert completion.completion_kwargs == buster_cfg.completion_cfg["completion_kwargs"]
def test_chatbot_real_data__chatGPT_OOD(vector_store_path):
    """An out-of-domain question must be flagged irrelevant and get no completion kwargs."""
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path
    # Override the prompt so the model is explicitly told how to refuse OOD questions.
    buster_cfg.prompt_formatter_cfg = {
        "max_tokens": 3500,
        "text_before_docs": (
            """You are a chatbot assistant answering technical questions about artificial intelligence (AI)."""
            """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer."""
            """Use this response: """
            f"""'{UNKNOWN_PROMPT}'\n"""
            """For example:\n"""
            """What is the meaning of life?\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """Now answer the following question:\n"""
        ),
        "text_after_docs": "Only use these documents as reference:\n",
    }
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

    completion: Completion = buster.process_input("What is a good recipe for brocolli soup?")

    assert isinstance(completion.answer_text, str)
    # Out-of-domain: both question and answer should be flagged irrelevant.
    assert completion.question_relevant == False
    assert completion.answer_relevant == False
    assert completion.completion_kwargs is None
def test_chatbot_real_data__no_docs_found(vector_store_path):
    """With an impossibly high retrieval threshold, the no-documents message is returned."""
    with pytest.warns():
        buster_cfg = copy.deepcopy(buster_cfg_template)
        buster_cfg.retriever_cfg = {
            "path": vector_store_path,
            "embedding_fn": get_openai_embedding,
            "top_k": 3,
            "thresh": 1,  # Set threshold very high to be sure no docs are matched
            "max_tokens": 3000,
        }
        buster_cfg.documents_answerer_cfg["no_documents_message"] = "No documents available."

        retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
        tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
        document_answerer = DocumentAnswerer(
            completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
            documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
            prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
            **buster_cfg.documents_answerer_cfg,
        )
        validator: Validator = Validator(**buster_cfg.validator_cfg)
        buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

        completion = buster.process_input("What is backpropagation?")

        assert isinstance(completion.answer_text, str)
        # The question is valid, but with no docs the answer must be the fallback message.
        assert completion.question_relevant == True
        assert completion.answer_relevant == False
        assert completion.answer_text == "No documents available."
================================================
FILE: tests/test_documents.py
================================================
import os
import numpy as np
import pandas as pd
import pytest
from buster.documents_manager import DeepLakeDocumentsManager
from buster.documents_manager.base import compute_embeddings_parallelized
from buster.llm_utils import get_openai_embedding
from buster.retriever import DeepLakeRetriever
# Patch the get_embedding function to return a fixed, fake embedding
NUM_WORKERS = 1
fake_embedding = [-0.005, 0.0018]


def get_fake_embedding(*arg, **kwargs):
    """Constant stand-in for a real embedding function; ignores all inputs."""
    return fake_embedding
@pytest.mark.parametrize(
    "documents_manager, retriever",
    [(DeepLakeDocumentsManager, DeepLakeRetriever)],
)
def test_write_read(tmp_path, documents_manager, retriever):
    """Round-trip: documents added via the manager can be read back via the retriever."""
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    dm_path = tmp_path / "tmp_dir_2"
    retriever_cfg["path"] = dm_path

    data = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "source": ["sourceA"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "n_tokens": 5,
        }
    )
    # NOTE(review): the parametrized `documents_manager` is unused here;
    # DeepLakeDocumentsManager is hard-coded below — confirm intent.
    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)
    dm.add(df=data)

    dm_data = retriever(**retriever_cfg).get_documents(sources=["sourceA"])

    assert dm_data["title"].iloc[0] == data["title"].iloc[0]
    assert dm_data["url"].iloc[0] == data["url"].iloc[0]
    assert dm_data["content"].iloc[0] == data["content"].iloc[0]
    assert dm_data["source"].iloc[0] == data["source"].iloc[0]
    assert np.allclose(dm_data["embedding"].iloc[0], data["embedding"].iloc[0])
@pytest.mark.parametrize(
    "documents_manager, retriever",
    [
        (DeepLakeDocumentsManager, DeepLakeRetriever),
    ],
)
def test_write_write_read(tmp_path, documents_manager, retriever):
    """Two successive writes from different sources can be read back independently."""
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    db_path = tmp_path / "tmp_dir"
    retriever_cfg["path"] = db_path

    db = documents_manager(db_path)

    data_1 = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "source": ["sourceA"],
            "n_tokens": 10,
        }
    )
    db.add(df=data_1, num_workers=NUM_WORKERS)

    data_2 = pd.DataFrame.from_dict(
        {
            "title": ["other"],
            "url": ["http://url.com/page.html"],
            "content": ["lorem ipsum"],
            "embedding": [np.arange(10, dtype=np.float32) / 10 - 2.3],
            "source": ["sourceB"],
            "n_tokens": 5,
        }
    )
    db.add(df=data_2, num_workers=NUM_WORKERS)

    # Filtering on sourceB should return only the second write.
    db_data = retriever(**retriever_cfg).get_documents(sources=["sourceB"])

    assert len(db_data) == len(data_2)
    assert db_data["title"].iloc[0] == data_2["title"].iloc[0]
    assert db_data["url"].iloc[0] == data_2["url"].iloc[0]
    assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
    assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])
def test_generate_embeddings(tmp_path, monkeypatch):
    """Documents added with a custom embedding_fn get that embedding stored verbatim."""
    # Create fake data
    df = pd.DataFrame.from_dict(
        {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
    )

    # Generate embeddings, store in a file
    path = tmp_path / f"test_document_embeddings"
    dm = DeepLakeDocumentsManager(path)
    dm.add(df, embedding_fn=get_fake_embedding, num_workers=NUM_WORKERS)

    # Read the embeddings from the file
    retriever_cfg = {
        "path": path,
        "top_k": 3,
        "thresh": 0.85,
        "max_tokens": 3000,
        "embedding_fn": get_fake_embedding,
    }
    read_df = DeepLakeRetriever(**retriever_cfg).get_documents("my_source")

    # Check all the values are correct across the files
    # NOTE(review): comparing df to itself twice is redundant; the meaningful
    # comparison in each chained assert is against read_df.
    assert df["title"].iloc[0] == df["title"].iloc[0] == read_df["title"].iloc[0]
    assert df["url"].iloc[0] == df["url"].iloc[0] == read_df["url"].iloc[0]
    assert df["content"].iloc[0] == df["content"].iloc[0] == read_df["content"].iloc[0]
    assert np.allclose(fake_embedding, read_df["embedding"].iloc[0])
def test_generate_embeddings_parallelized():
    """Parallel embedding computation matches the sequential per-row result."""
    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * 5,
            "url": ["http://url.com"] * 5,
            "content": ["cool text" + str(x) for x in range(5)],
            "source": ["my_source"] * 5,
        }
    )

    embeddings_parallel = compute_embeddings_parallelized(
        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
    )

    embeddings = df.content.apply(get_openai_embedding)

    # embeddings comes out as a series because of the apply, so cast it back to an array
    embeddings_arr = np.array(embeddings.to_list())

    # Not clear why a tolerance needs to be specified, likely because it is computed on different machines
    # since the requests are done in parallel...
    assert np.allclose(embeddings_parallel, embeddings_arr, atol=1e-2)
def test_add_batches(tmp_path):
    """batch_add splits documents into batches and dumps embeddings to CSV along the way."""
    dm_path = tmp_path / "deeplake_store"
    num_samples = 20
    batch_size = 16
    csv_filename = os.path.join(tmp_path, "embedding_")

    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)

    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * num_samples,
            "url": ["http://url.com"] * num_samples,
            "content": ["cool text" + str(x) for x in range(num_samples)],
            "source": ["my_source"] * num_samples,
        }
    )

    dm.batch_add(
        df,
        embedding_fn=get_fake_embedding,
        num_workers=NUM_WORKERS,
        batch_size=batch_size,
        min_time_interval=0,
        csv_filename=csv_filename,
    )

    # NOTE(review): csv_files is computed but never asserted on — presumably meant
    # to verify per-batch CSVs exist; confirm or add an assertion.
    csv_files = [f for f in os.listdir(tmp_path) if f.endswith(".csv")]

    # check that we registered the good number of doucments and that files were generated
    assert len(dm) == num_samples

    df_saved = pd.read_csv(csv_filename)
    assert len(df_saved) == num_samples
    assert "embedding" in df_saved.columns
================================================
FILE: tests/test_formatters.py
================================================
import json
import pandas as pd
import pytest
from buster.formatters.documents import DocumentsFormatterHTML, DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.tokenizers import GPTTokenizer
def test_DocumentsFormatterHTML__simple():
    """In this test, we expect all 3 documents to be matched and returned normally.

    Fix: renamed from test_DocumentsDormatterHTML__simple ("Dormatter" typo).
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    expected_docs_str = (
        ""
        f"{document_1}<\\DOCUMENT>"
        f"{document_2}<\\DOCUMENT>"
        f"{document_3}<\\DOCUMENT>"
        "<\\DOCUMENTS>"
    )

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})
    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # All documents fit in the token budget, so nothing is dropped or truncated.
    assert all(matched_documents.content == matched_documents_new.content)
    assert docs_str == expected_docs_str
def test_DocumentsFormatterJSON__simple():
    """In this test, we expect all 3 documents to be matched and returned normally.

    Fix: renamed from test_DocumentsDormatterJSON__simple ("Dormatter" typo).
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"])

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    source_1 = "source 1"
    source_2 = "source 2"
    source_3 = "source 3"

    data_dict = {
        "content": [document_1, document_2, document_3],
        "source": [source_1, source_2, source_3],
    }

    # Compact JSON (no spaces after separators), matching the formatter's output.
    expected_docs_str = json.dumps(
        [
            {"content": document_1, "source": source_1},
            {"content": document_2, "source": source_2},
            {"content": document_3, "source": source_3},
        ],
        separators=(",", ":"),
    )

    matched_documents = pd.DataFrame(data_dict)
    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # All documents fit in the token budget, so nothing is dropped or truncated.
    assert all(matched_documents.content == matched_documents_new.content)
    assert docs_str == expected_docs_str
def test_DocumentsFormatterHTML__doc_too_long():
    """In this test, document_1 doesn't entirely fit.

    We only expect a part of it to be contained in the output.
    Fix: renamed from "doc_to_long" for consistency with the JSON sibling test.
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    long_sentence = "This is a very long document. It is long on purpose."
    document_1 = long_sentence * 50
    document_2 = "This is a very short document."
    document_3 = "This is also a short document"

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})
    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # Only the first (truncated) document remains; the output is shorter than it.
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 1
    assert len(docs_str) < len(document_1)

    # The long document gets truncated, the others don't make it in.
    assert long_sentence in docs_str
    assert document_2 not in docs_str
    assert document_3 not in docs_str
def test_DocumentsFormatterJSON__doc_too_long():
    """In this test, document_3 doesn't fit.

    We expect it to be excluded completely from the formatted output.
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"])

    long_sentence = "This is a very long document. It is long on purpose."
    contents = [
        "This is a very short document.",
        "This is also a short document",
        long_sentence * 50,
    ]
    sources = ["source 1", "source 2", "source 3"]

    matched_documents = pd.DataFrame({"content": contents, "source": sources})

    # Only the two short documents fit in the token budget.
    expected_docs_str = json.dumps(
        [
            {"content": contents[0], "source": sources[0]},
            {"content": contents[1], "source": sources[1]},
        ],
        separators=(",", ":"),
    )

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    assert docs_str == expected_docs_str
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 2

    # The last document gets ignored completely, the first 2 make it.
    assert contents[0] in docs_str
    assert contents[1] in docs_str
    assert long_sentence not in docs_str
def test_DocumentsFormatterHTML__doc_too_long_2():
    """In this test, document_2 doesn't entirely fit.

    We only expect a part of it to be contained, as well as all of document_1, and none of document_3.
    Fix: renamed from "doc_to_long_2" for consistency with the JSON sibling test.
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    document_1 = "This is a very short document."
    document_2 = "This is a very long document. It is long on purpose." * 50
    document_3 = "This is also a short document"

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})
    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # The first two documents are kept (the second truncated); the third is dropped.
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 2
    assert document_1 in docs_str
    assert "This is a very long document. It is long on purpose." in docs_str  # at least a subset should be in there
    assert document_3 not in docs_str
def test_DocumentsFormatterHTML__complex_format():
    """All 3 documents fit and are rendered with a custom per-document format string."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
        formatter="Title: {title}\n{content}\n",
    )

    contents = [
        "This is a very short document.",
        "This is another very short document.",
        "This is also a short document.",
    ]
    titles = ["doc1", "doc2", "doc3"]
    countries = ["Canada", "France", "Germany"]  # extra column, not used by the format string

    expected_docs_str = (
        "".join(f"Title: {title}\n{content}\n<\\DOCUMENT>" for title, content in zip(titles, contents))
        + "<\\DOCUMENTS>"
    )

    matched_documents = pd.DataFrame(
        {
            "content": contents,
            "title": titles,
            "country": countries,
        }
    )

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # Nothing dropped: the returned documents match the input exactly.
    assert all(matched_documents.content == matched_documents_new.content)
    assert docs_str == expected_docs_str
def test_system_prompt_formatter():
    """The prompt is assembled as before-text, documents, then after-text."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer,
        max_tokens=200,
        text_after_docs="After docs.",
        text_before_docs="Before docs.",
        formatter="{text_before_docs}\n{documents}\n{text_after_docs}",
    )

    documents = "Here are some docs"
    prompt = prompt_formatter.format(documents)

    assert prompt == "Before docs.\nHere are some docs\nAfter docs."
    assert documents in prompt
def test_system_prompt_formatter__too_long():
    """Formatting raises ValueError when the documents exceed the token budget.

    Fix: renamed from "to_long" (typo) to "too_long".
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer,
        max_tokens=200,
        text_after_docs="After docs.",
        text_before_docs="Before docs.",
    )

    documents = "Here are some documents that are WAY too long." * 100
    with pytest.raises(ValueError):
        prompt_formatter.format(documents)
================================================
FILE: tests/test_read_write.py
================================================
import pandas as pd
from buster.completers import Completion, UserInputs
class MockValidator:
    """Minimal stand-in for a Validator used by the read/write round-trip test.

    Fix: `rerank_docs` was annotated as returning ``bool`` but returns the
    DataFrame unchanged; the annotation now matches the behavior. The
    ``Completion`` annotation is quoted so evaluation is deferred.
    """

    def __init__(self):
        # Mirrors the real validator's reranking switch.
        self.use_reranking = True

    def check_answer_relevance(self, completion: "Completion") -> bool:
        """Always consider the answer relevant."""
        return True

    def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:
        """Identity rerank: return the matched documents unchanged."""
        return matched_documents
def test_read_write_completion():
    """A Completion survives a round-trip through to_json / from_dict."""
    n_samples = 3
    completion_kwargs = {"param_1": "a"}

    matched_documents = pd.DataFrame.from_dict(
        {
            "title": ["test"] * n_samples,
            "url": ["http://url.com"] * n_samples,
            "content": ["cool text"] * n_samples,
            "embedding": [[0.0] * 1000] * n_samples,
            "n_tokens": [10] * n_samples,
            "source": ["fake source"] * n_samples,
        }
    )

    original = Completion(
        user_inputs=UserInputs(original_input="What is the meaning of life?"),
        error=False,
        answer_text="This is my actual answer",
        matched_documents=matched_documents,
        validator=MockValidator(),
        completion_kwargs=completion_kwargs,
    )

    restored = Completion.from_dict(original.to_json())

    # Scalar fields round-trip unchanged.
    assert original.error == restored.error
    assert original.answer_text == restored.answer_text
    assert original.user_inputs == restored.user_inputs
    assert original.answer_relevant == restored.answer_relevant
    assert original.completion_kwargs == restored.completion_kwargs

    # Every column of the restored documents matches the original, value for value.
    for col in restored.matched_documents.columns.tolist():
        assert col in original.matched_documents.columns.tolist()
        assert restored.matched_documents[col].tolist() == original.matched_documents[col].tolist()
================================================
FILE: tests/test_validator.py
================================================
import pandas as pd
from buster.llm_utils import get_openai_embedding
from buster.validators import Validator
# Shared configuration for the Validator exercised by the tests below.
# Fix: corrected the typo "irrlevant" -> "irrelevant" in the LLM prompt.
validator_cfg = {
    "use_reranking": True,
    "validate_documents": True,
    "answer_validator_cfg": {
        "unknown_response_templates": [
            "I Don't know how to answer your question.",
        ],
        # Cosine-similarity threshold above which an answer counts as "unknown".
        "unknown_threshold": 0.85,
    },
    "question_validator_cfg": {
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
        "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrelevant, respond with 'false'.",
    },
}

# Instantiated once at module import; reused by every test in this file.
validator = Validator(**validator_cfg)
def test_validator_check_question_relevance():
    """An AI-related question is accepted; an unrelated one is rejected.

    Fix: replaced `== True` / `== False` comparisons with truthiness asserts (PEP 8 E712).
    """
    question = "What is backpropagation?"
    relevance, _ = validator.check_question_relevance(question)
    assert relevance

    question = "How can I make a broccoli soup?"
    relevance, _ = validator.check_question_relevance(question)
    assert not relevance
def test_validator_check_answer_relevance():
    """An "I don't know"-style answer is flagged irrelevant; a substantive one is relevant.

    Fix: replaced `== True` / `== False` comparisons with truthiness asserts (PEP 8 E712).
    """
    answer = "Not sure how to answer your question"
    assert not validator.check_answer_relevance(answer)

    answer = "According to the documentation, the answer should be 2+2 = 4."
    assert validator.check_answer_relevance(answer)
def test_validator_check_documents_relevance():
    """Each matched document gets a boolean 'relevance' flag with respect to the answer."""
    contents = [
        "A panda is a bear native to China, known for its black and white fur.",
        "An apple is a sweet fruit, often red, green, or yellow in color.",
        "A car is a wheeled vehicle used for transportation, typically powered by an engine.",
    ]
    matched_documents = pd.DataFrame({"content": contents})

    answer = "Pandas live in China."
    matched_documents = validator.check_documents_relevance(answer=answer, matched_documents=matched_documents)

    # Only the panda document is relevant to the answer.
    assert "relevance" in matched_documents.columns
    assert matched_documents.relevance.to_list() == [True, False, False]
def test_validator_rerank_docs():
    """Documents are reordered by semantic similarity to the answer, most similar first.

    Fix: pass `get_openai_embedding` to `.apply` directly — the `lambda x: ...`
    wrapper was redundant.
    """
    documents = [
        "A basketball player practicing",
        "A cat eating an orange",
        "A green apple on the counter",
    ]
    matched_documents = pd.DataFrame({"documents": documents})
    matched_documents["embedding"] = matched_documents.documents.apply(get_openai_embedding)

    answer = "An apple is a delicious fruit."
    reranked_documents = validator.rerank_docs(answer, matched_documents)

    # The apple document should rank first, the basketball one last.
    assert reranked_documents.documents.to_list() == [
        "A green apple on the counter",
        "A cat eating an orange",
        "A basketball player practicing",
    ]