Repository: jerpint/buster Branch: main Commit: 07b6bb893f47 Files: 44 Total size: 152.0 KB Directory structure: gitextract_5l_frr4b/ ├── .github/ │ └── workflows/ │ ├── publish_pypi.yaml │ └── tests.yaml ├── .gitignore ├── LICENSE.md ├── README.md ├── buster/ │ ├── __init__.py │ ├── busterbot.py │ ├── completers/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── chatgpt.py │ │ └── user_inputs.py │ ├── documents_manager/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── deeplake.py │ │ └── service.py │ ├── examples/ │ │ ├── cfg.py │ │ ├── generate_embeddings.py │ │ ├── gradio_app.py │ │ └── stackoverflow.csv │ ├── formatters/ │ │ ├── documents.py │ │ └── prompts.py │ ├── llm_utils/ │ │ ├── __init__.py │ │ ├── embeddings.py │ │ └── question_reformulator.py │ ├── parsers/ │ │ ├── __init__.py │ │ └── parser.py │ ├── retriever/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── deeplake.py │ │ └── service.py │ ├── tokenizers/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── gpt.py │ ├── utils.py │ └── validators/ │ ├── __init__.py │ ├── base.py │ └── validators.py ├── pyproject.toml ├── requirements.txt └── tests/ ├── test_chatbot.py ├── test_documents.py ├── test_formatters.py ├── test_read_write.py └── test_validator.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/publish_pypi.yaml ================================================ name: publish-pypi on: workflow_dispatch: release: types: [created] jobs: deploy: runs-on: ubuntu-latest environment: secrets steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - name: Install dependencies run: | python -m pip install --upgrade pip pip install poetry - name: Build and publish env: POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }} run: | poetry version $(git describe --tags --abbrev=0) poetry add $(cat requirements.txt) poetry build poetry publish ================================================ FILE: .github/workflows/tests.yaml ================================================ name: Tests on: [pull_request] jobs: tests: runs-on: ubuntu-latest environment: secrets steps: - name: Check out repository code uses: actions/checkout@v3 - name: black linter uses: psf/black@stable with: options: "--check --diff --line-length 120" - name: isort run: | pip install isort isort --profile black --check-only . - name: unit tests env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | python3 -m pip install --upgrade pip pip install -e . pytest ================================================ FILE: .gitignore ================================================ # database files *.db buster/apps/data/ deeplake_store/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # Macos *.DS_Store* albenchmark/data/ # Ignore notebooks by default *.ipynb # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # VSCode .vscode/ ================================================ FILE: LICENSE.md ================================================ MIT License Copyright (c) 2023 Buster dev team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Buster, the QA documentation chatbot!
[![GitHub](https://img.shields.io/github/license/jerpint/buster)](https://github.com/jerpint/buster) [![PyPI](https://img.shields.io/pypi/v/buster-doctalk?logo=pypi)](https://pypi.org/project/buster-doctalk) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Buster%20Demo-blue)](https://huggingface.co/spaces/jerpint/buster)
Buster is a question-answering chatbot that can be tuned to any source of documentation.

# Demo

To see the full abilities of Buster, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster). We scraped the documentation of [huggingface 🤗 Transformers](https://huggingface.co/docs/transformers/index) and instructed Buster to answer questions related to its usage.

# Quickstart

This section is meant to help you install and run a local version of Buster.

First, install buster:

**Note**: Buster requires python>=3.10

```bash
pip install buster-doctalk
```

Then, go to the examples folder and launch the app. We've included a small sample of Stack Overflow AI questions that you can use to test your setup and try the app:

```bash
cd buster/buster/examples
gradio gradio_app.py
```

This will launch the gradio app locally.

**NOTE**: The demo uses ChatGPT to generate text and compute embeddings, so make sure to set a valid OpenAI API key:

```bash
export OPENAI_API_KEY=sk-...
```

# Generating your own embeddings

Once your local version of Buster is up and running, the next step is to import your own data. We will use the `stackoverflow.csv` file in the `buster/examples/` folder for this. This is the same data that was used to generate the demo app's embeddings.

You will first ingest the documents so they are ready for Buster. In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentsManager`:

```python
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager

# Read the csv
df = pd.read_csv("stackoverflow.csv")

# Generate the embeddings for our documents and store them in a deeplake format
dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True)
dm.add(df)
```

You can also simply run the script:

```bash
python generate_embeddings.py --csv stackoverflow.csv
```

This will generate the embeddings and save them locally in the `deeplake_store`.

**NOTE**: You will need to set a valid OpenAI API key for computing embeddings:

```bash
export OPENAI_API_KEY=sk-...
```

You only need to run this operation once.

In the .csv, we expect columns ["title", "url", "content", "source"] for each row:

* title: the title of the url to display
* url: the link that clicking the title will redirect to
* source: where the content was originally sourced from (e.g. wikipedia, stackoverflow, etc.)
* content: plaintext of the documents to be embedded. It is your responsibility to chunk your documents appropriately. For better results, we recommend chunks of 400-600 words.

# Additional Configurations

Properly prompting models, as well as playing around with various model parameters, can lead to different results. We use a `BusterConfig` object to keep track of the various Buster configurations. In the `buster/examples/` folder, the config is stored inside `cfg.py`. Modify this config to update parameters, prompts, etc.

# How does Buster work?

First, we parse the documentation into snippets. For each snippet, we obtain an embedding using the [OpenAI API](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings).

Then, when a user asks a question, we compute its embedding and find the snippets from the docs with the highest cosine similarity to the question.

Finally, we craft the prompt (sketched below):

- The most relevant snippets from the doc.
- The engineering prompt.
- The user's question.
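The following is a minimal sketch of this retrieval and prompt-crafting step, not Buster's actual implementation; `embed_fn`, `snippets`, and the prompt strings are placeholders you would supply yourself:

```python
import numpy as np


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


def craft_prompt(question: str, snippets: list[str], snippet_embeddings: list[np.ndarray], embed_fn, top_k: int = 3) -> str:
    # Embed the question and rank snippets by cosine similarity
    question_embedding = embed_fn(question)
    scores = [cosine_similarity(question_embedding, e) for e in snippet_embeddings]
    best = sorted(range(len(snippets)), key=lambda i: scores[i], reverse=True)[:top_k]

    # Prompt = most relevant snippets + engineering prompt + the user's question
    relevant_snippets = "\n".join(snippets[i] for i in best)
    engineering_prompt = "Answer the question using only the documents above."
    return f"{relevant_snippets}\n\n{engineering_prompt}\n\n{question}\n"
```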
We send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!

### Currently available models

- For embeddings: "text-embedding-ada-002"
- For completion: We support both "gpt-3.5-turbo" and "gpt-4"

### Livestream

For more information, you can watch the livestream where we explain how Buster works in detail!

- [Livestream recording](https://youtu.be/LB5g-AhfPG8)

================================================
FILE: buster/__init__.py
================================================

================================================
FILE: buster/busterbot.py
================================================
import logging
from dataclasses import dataclass, field
from typing import Optional

import pandas as pd

from buster.completers import Completion, DocumentAnswerer, UserInputs
from buster.llm_utils import QuestionReformulator, get_openai_embedding
from buster.retriever import Retriever
from buster.validators import Validator

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class BusterConfig:
    """Configuration object for a chatbot."""

    validator_cfg: dict = field(
        default_factory=lambda: {
            "use_reranking": True,
            "validate_documents": False,
        }
    )
    tokenizer_cfg: dict = field(
        default_factory=lambda: {
            "model_name": "gpt-3.5-turbo",
        }
    )
    retriever_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3000,
            "top_k": 3,
            "thresh": 0.7,
            "embedding_fn": get_openai_embedding,
        }
    )
    prompt_formatter_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3500,
            "text_before_docs": "You are a chatbot answering questions.\n",
            "text_after_docs": "Answer the following question:\n",
            "formatter": "{text_before_docs}\n{documents}\n{text_after_docs}",
        }
    )
    documents_formatter_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3500,
            "formatter": "{content}",
        }
    )
    documents_answerer_cfg: dict = field(
        default_factory=lambda: {
            "no_documents_message": "No documents are available for this question.",
        }
    )
    question_reformulator_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "system_prompt": """
Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.""",
        }
    )
    completion_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "temperature": 0,
                "stream": True,
            },
        }
    )


class Buster:
    def __init__(
        self,
        retriever: Retriever,
        document_answerer: DocumentAnswerer,
        validator: Validator,
        question_reformulator: Optional[QuestionReformulator] = None,
    ):
        self.document_answerer = document_answerer
        self.retriever = retriever
        self.validator = validator
        self.question_reformulator = question_reformulator

    def process_input(
        self,
        user_input: str,
        sources: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        reformulate_question: Optional[bool] = False,
    ) -> Completion:
        """Main function to process the input question and generate a formatted output."""
        logger.info(f"User Input:\n{user_input}")

        # We make sure there is always a newline at the end of the question to avoid completing the question.
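        # (Chat models will sometimes try to finish an unterminated sentence instead of answering it.)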
        if not user_input.endswith("\n"):
            user_input += "\n"

        user_inputs = UserInputs(original_input=user_input)

        # The returned message is either a generic invalid question message or an error handling message
        question_relevant, irrelevant_question_message = self.validator.check_question_relevance(user_input)

        if question_relevant:
            # The question is relevant, so get the completer to generate a completion.

            # Reformulate the question if a reformulator is defined
            if self.question_reformulator is not None and reformulate_question:
                reformulated_input, reformulation_error = self.question_reformulator.reformulate(
                    user_inputs.original_input
                )
                user_inputs.reformulated_input = reformulated_input
                if reformulation_error:
                    completion = Completion(
                        error=True,
                        user_inputs=user_inputs,
                        matched_documents=pd.DataFrame(),
                        answer_text="Something went wrong reformulating the question. Try again soon.",
                        answer_relevant=False,
                        question_relevant=False,
                        validator=self.validator,
                    )
                    return completion

            # Retrieve and answer
            matched_documents = self.retriever.retrieve(user_inputs, sources=sources, top_k=top_k)
            completion: Completion = self.document_answerer.get_completion(
                user_inputs=user_inputs,
                matched_documents=matched_documents,
                validator=self.validator,
                question_relevant=question_relevant,
            )
            return completion

        else:
            # The question was determined irrelevant, so we instead return a generic response set by the user.
            completion = Completion(
                error=False,
                user_inputs=user_inputs,
                matched_documents=pd.DataFrame(),
                answer_text=irrelevant_question_message,
                answer_relevant=False,
                question_relevant=False,
                validator=self.validator,
            )
            return completion

================================================
FILE: buster/completers/__init__.py
================================================
from .base import Completer, Completion, DocumentAnswerer
from .chatgpt import ChatGPTCompleter
from .user_inputs import UserInputs

__all__ = [
    "ChatGPTCompleter",
    "Completer",
    "Completion",
    "DocumentAnswerer",
    "UserInputs",
]

================================================
FILE: buster/completers/base.py
================================================
import io
import logging
import warnings
from abc import ABC, abstractmethod
from typing import Any, Iterator, Optional

import pandas as pd
from fastapi.encoders import jsonable_encoder

from buster.completers.user_inputs import UserInputs
from buster.formatters.documents import DocumentsFormatter
from buster.formatters.prompts import PromptFormatter

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class Completion:
    """A class to represent the completion object of a model's output for a user's question.

    Attributes:
        error (bool): A boolean indicating if an error occurred when generating the completion.
        user_inputs (UserInputs): The inputs from the user.
        matched_documents (pd.DataFrame): The documents that were matched to the user's question.
        answer_generator (Iterator): An optional iterator used to generate the model's answer.
        answer_text (str): An optional answer text.
        answer_relevant (bool): An optional boolean indicating if the answer is relevant.
        question_relevant (bool): An optional boolean indicating if the question is relevant.
        completion_kwargs (dict): Optional arguments for the completion.
        validator (Validator): An optional Validator object.

    Methods:
        __repr__: Outputs a string representation of the object.
        _validate_arguments: Validates answer_generator and answer_text arguments.
        answer_relevant: Determines if the answer is relevant or not.
        question_relevant: Retrieves the relevance of the question.
        answer_text: Retrieves the answer text.
        answer_generator: Retrieves the answer generator.
        postprocess: Postprocesses the results after generating the model's answer.
        to_json: Outputs selected attributes of the object in JSON format.
        from_dict: Creates a Completion object from a dictionary.
    """

    def __init__(
        self,
        error: bool,
        user_inputs: UserInputs,
        matched_documents: pd.DataFrame,
        answer_generator: Optional[Iterator] = None,
        answer_text: Optional[str] = None,
        answer_relevant: Optional[bool] = None,
        question_relevant: Optional[bool] = None,
        completion_kwargs: Optional[dict] = None,
        validator=None,
    ):
        self.error = error
        self.user_inputs = user_inputs
        self.matched_documents = matched_documents
        self.validator = validator
        self.completion_kwargs = completion_kwargs
        self._answer_relevant = answer_relevant
        self._question_relevant = question_relevant

        self._validate_arguments(answer_generator, answer_text)

    def __repr__(self):
        class_name = type(self).__name__
        return (
            f"{class_name}("
            f"user_inputs={self.user_inputs!r}, "
            f"error={self.error!r}, "
            f"matched_documents={self.matched_documents!r}, "
            f"answer_text={self._answer_text!r}, "
            f"answer_generator={self.answer_generator!r}, "
            f"answer_relevant={self._answer_relevant!r}, "
            f"question_relevant={self.question_relevant!r}, "
            f"completion_kwargs={self.completion_kwargs!r}"
            ")"
        )

    def _validate_arguments(self, answer_generator: Optional[Iterator], answer_text: Optional[str]):
        """Sets the answer_generator and answer_text properties depending on the provided inputs.

        Checks that exactly one of answer_generator and answer_text is set.

        If answer_text is set, a generator can simply be inferred from answer_text.
        If answer_generator is set, answer_text will be set only once the generator gets called. Set to None for now.
        """
        if (answer_generator is None and answer_text is None) or (
            answer_generator is not None and answer_text is not None
        ):
            raise ValueError("Exactly one of 'answer_generator' and 'answer_text' must be set.")

        # If text is provided, the generator can be inferred
        if answer_text is not None:
            assert isinstance(answer_text, str)
            answer_generator = (msg for msg in answer_text)

        self._answer_text = answer_text
        self._answer_generator = answer_generator

    @property
    def answer_relevant(self) -> bool:
        """Property determining the relevance of an answer (bool).

        If an error occurred, the relevance is False.
        If no documents were retrieved, the relevance is also False.
        Otherwise, the relevance is computed as defined by the validator (e.g.
        comparing to embeddings).
        """
        if self.error:
            self._answer_relevant = False
        elif len(self.matched_documents) == 0:
            self._answer_relevant = False
        elif self._answer_relevant is not None:
            return self._answer_relevant
        else:
            # Check the answer relevance by looking at the embeddings
            self._answer_relevant = self.validator.check_answer_relevance(self.answer_text)
        return self._answer_relevant

    @property
    def question_relevant(self):
        """Property determining the relevance of the question asked (bool)."""
        return self._question_relevant

    @property
    def answer_text(self):
        if self._answer_text is None:
            # Generates the text if it wasn't already generated
            self._answer_text = "".join([i for i in self.answer_generator])
        return self._answer_text

    @answer_text.setter
    def answer_text(self, value: str) -> None:
        self._answer_text = value

    @property
    def answer_generator(self):
        # Keeps track of the yielded text
        self._answer_text = ""
        for token in self._answer_generator:
            self._answer_text += token
            yield token

        self.postprocess()

    @answer_generator.setter
    def answer_generator(self, generator: Iterator) -> None:
        self._answer_generator = generator

    def postprocess(self):
        """Function executed after the answer text is generated by the answer_generator."""
        if self.validator is None:
            # TODO: This should only happen if declaring a Completion using the .from_dict() method.
            # This behaviour is not ideal and we may want to remove support for .from_dict() in the future.
            logger.info("No validator was set, skipping postprocessing.")
            return

        if self.validator.use_reranking:
            # Rerank docs in order of cosine similarity to the question
            self.matched_documents = self.validator.rerank_docs(
                answer=self.answer_text, matched_documents=self.matched_documents
            )

        if self.validator.validate_documents:
            self.matched_documents = self.validator.check_documents_relevance(
                answer=self.answer_text, matched_documents=self.matched_documents
            )

        # Access the property so it gets set if not computed already
        self.answer_relevant

    def to_json(self, columns_to_ignore: Optional[list[str]] = None) -> Any:
        """Converts selected attributes of the object to a JSON format.

        Args:
            columns_to_ignore (list[str]): A list of column names to ignore in the resulting matched_documents dataframe.

        Returns:
            Any: The object's attributes encoded as JSON.

        Notes:
            - The 'matched_documents' attribute of type pd.DataFrame is encoded separately using a custom encoder.
            - The resulting JSON may exclude specified columns based on the 'columns_to_ignore' parameter.
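        Example (illustrative; the exact values depend on the completion):
            >>> completion.to_json(columns_to_ignore=["embedding"])  # doctest: +SKIP
            {'user_inputs': ..., 'answer_text': ..., 'matched_documents': ..., 'answer_relevant': ...,
             'question_relevant': ..., 'completion_kwargs': ..., 'error': ...}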
""" def encode_df(df: pd.DataFrame) -> dict: if columns_to_ignore is not None: df = df.drop(columns=columns_to_ignore, errors="ignore") return df.to_json(orient="index") custom_encoder = { # Converts the matched_documents in the user_responses to json pd.DataFrame: encode_df, } to_encode = { "user_inputs": self.user_inputs, "answer_text": self.answer_text, "matched_documents": self.matched_documents, "answer_relevant": self.answer_relevant, "question_relevant": self.question_relevant, "completion_kwargs": self.completion_kwargs, "error": self.error, } return jsonable_encoder(to_encode, custom_encoder=custom_encoder) @classmethod def from_dict(cls, completion_dict: dict): # Map a dict of user inputs to the UserInputs class if isinstance(completion_dict["user_inputs"], dict): completion_dict["user_inputs"] = UserInputs(**completion_dict["user_inputs"]) # Map the matched documents back to a dataframe if isinstance(completion_dict["matched_documents"], str): # avoids deprecation warning json_data = io.StringIO(completion_dict["matched_documents"]) completion_dict["matched_documents"] = pd.read_json(json_data, orient="index") elif isinstance(completion_dict["matched_documents"], dict): completion_dict["matched_documents"] = pd.DataFrame(completion_dict["matched_documents"]).T else: raise ValueError(f"Unknown type for matched_documents: {type(completion_dict['matched_documents'])}") return cls(**completion_dict) class Completer(ABC): """ Abstract base class for completers, which generate an answer to a prompt. Methods: complete: The method that should be implemented by any child class to provide an answer to a prompt. """ @abstractmethod def complete(self, prompt: str, user_input) -> (str | Iterator, bool): """Returns the completed message (can be a generator), and a boolean to indicate if an error occured or not.""" ... class DocumentAnswerer: """ A class that answers questions based on documents. It takes care of formatting the prompts and the documents, and generating the answer when relevant. Attributes: completer (Completer): Object that actually generates an answer to the prompt. documents_formatter (DocumentsFormatter): Object that formats the documents for the prompt. prompt_formatter (PromptFormatter): Object that prepares the prompt for the completer. no_documents_message (str): Message to display when no documents are found to match the query. completion_class (Completion): Class to use for the resulting completion. Methods: prepare_prompt: Prepares the prompt that will be passed to the completer. get_completion: Generates a completion to the user's question based on matched documents. """ def __init__( self, documents_formatter: DocumentsFormatter, prompt_formatter: PromptFormatter, completer: Completer, completion_class: Completion = Completion, no_documents_message: str = "No documents were found that match your question.", ): self.completer = completer self.documents_formatter = documents_formatter self.prompt_formatter = prompt_formatter self.no_documents_message = no_documents_message self.completion_class = completion_class def prepare_prompt(self, matched_documents) -> str: """Prepare the prompt with prompt engineering. A user's question is not included here. We use the documents formatter and prompt formatter to compose the prompt itself. 
""" # format the matched documents, (will truncate them if too long) formatted_documents, _ = self.documents_formatter.format(matched_documents) prompt = self.prompt_formatter.format(formatted_documents) return prompt def get_completion( self, user_inputs: UserInputs, matched_documents: pd.DataFrame, validator, question_relevant: bool = True, ) -> Completion: """Generate a completion to a user's question based on matched documents. It is safe to assume the question_relevance to be True if we made it here.""" logger.info(f"{user_inputs=}") if len(matched_documents) == 0: warning_msg = "No documents found during retrieval." warnings.warn(warning_msg) logger.warning(warning_msg) # empty dataframe matched_documents = pd.DataFrame(columns=matched_documents.columns) # because we are requesting a completion, we assume the question is relevant. # However, no documents were found, so we pass the no documents found message instead of generating the answer. # The completion does not get triggered, so we do not pass completion kwargs here either. completion = self.completion_class( user_inputs=user_inputs, answer_text=self.no_documents_message, error=False, matched_documents=matched_documents, question_relevant=question_relevant, validator=validator, ) return completion # prepare the prompt with matched documents prompt = self.prepare_prompt(matched_documents) logger.info(f"{prompt=}") logger.info(f"querying model with parameters: {self.completer.completion_kwargs}...") try: answer_generator, error = self.completer.complete(prompt=prompt, user_input=user_inputs.current_input) except Exception as e: error = True answer_generator = "Something went wrong with the request, try again soon!" logger.exception("Unknown error when attempting to generate response. See traceback:") completion = self.completion_class( answer_generator=answer_generator, error=error, matched_documents=matched_documents, user_inputs=user_inputs, question_relevant=question_relevant, validator=validator, completion_kwargs=self.completer.completion_kwargs, ) return completion ================================================ FILE: buster/completers/chatgpt.py ================================================ import logging import os from typing import Iterator, Optional import openai from openai import OpenAI from buster.completers import Completer logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) # Check if an API key exists for promptlayer, if it does, use it promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY") if promptlayer_api_key: # TODO: Check if this still works with latest openAI API... try: import promptlayer logger.info("Enabling prompt layer...") promptlayer.api_key = promptlayer_api_key # replace openai with the promptlayer wrapper openai = promptlayer.openai except Exception as e: logger.exception("Something went wrong enabling promptlayer.") class ChatGPTCompleter(Completer): def __init__(self, completion_kwargs: dict, client_kwargs: Optional[dict] = None): """Initialize the ChatGPTCompleter with completion and client keyword arguments. Args: completion_kwargs: A dictionary of keyword arguments to be used for completions. client_kwargs: An optional dictionary of keyword arguments to be used for the OpenAI client. 
""" # use default client if none passed self.completion_kwargs = completion_kwargs if client_kwargs is None: client_kwargs = {} self.client = OpenAI(**client_kwargs) def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str | Iterator, bool): """Given a prompt and user input, returns the generated message and error flag. Args: prompt: The prompt containing the formatted documents and instructions. user_input: The user input to be responded to. completion_kwargs: An optional dictionary of keyword arguments to override the default completion kwargs. Returns: A tuple containing the completed message and a boolean indicating if an error occurred. Raises: openai.BadRequestError: If the completion request is invalid. openai.RateLimitError: If the OpenAI servers are overloaded. """ # Uses default configuration if not overridden if completion_kwargs is None: completion_kwargs = self.completion_kwargs messages = [ {"role": "system", "content": prompt}, {"role": "user", "content": user_input}, ] try: error = False response = self.client.chat.completions.create(messages=messages, **completion_kwargs) except openai.BadRequestError: error = True logger.exception("Invalid request to OpenAI API. See traceback:") error_message = "Something went wrong while connecting with OpenAI, try again soon!" return error_message, error except openai.RateLimitError: error = True logger.exception("RateLimit error from OpenAI. See traceback:") error_message = "OpenAI servers seem to be overloaded, try again later!" return error_message, error except Exception as e: error = True logger.exception("Some kind of error happened trying to generate the response. See traceback:") error_message = "Something went wrong with connecting with OpenAI, try again soon!" return error_message, error if completion_kwargs.get("stream") is True: # We are entering streaming mode, so here we're just wrapping the streamed # openai response to be easier to handle later def answer_generator(): for chunk in response: token = chunk.choices[0].delta.content # Always stream a string, openAI returns None on last token token = "" if token is None else token yield token return answer_generator(), error else: full_response: str = response.choices[0].message.content return full_response, error ================================================ FILE: buster/completers/user_inputs.py ================================================ from dataclasses import dataclass from typing import Optional @dataclass class UserInputs: """A class that represents user inputs. Attributes: original_input: The original user input. reformulated_input: The reformulated user input (optional). """ original_input: str reformulated_input: Optional[str] = None @property def current_input(self): """Returns the current user input. If the reformulated input is not None, it returns the reformulated input. Otherwise, it returns the original input. Returns: The current user input. 
""" return self.reformulated_input if self.reformulated_input is not None else self.original_input ================================================ FILE: buster/documents_manager/__init__.py ================================================ from .base import DocumentsManager from .deeplake import DeepLakeDocumentsManager from .service import DocumentsService __all__ = [DocumentsManager, DocumentsService, DeepLakeDocumentsManager] ================================================ FILE: buster/documents_manager/base.py ================================================ import logging import time from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Callable, Optional import numpy as np import pandas as pd from tqdm import tqdm from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding tqdm.pandas() logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @dataclass class DocumentsManager(ABC): def __init__(self, required_columns: Optional[list[str]] = None): """ Constructor for DocumentsManager class. Args: required_columns (Optional[list[str]]): A list of column names that are required for the dataframe to contain. If None, no columns are enforced. """ self.required_columns = required_columns def _check_required_columns(self, df: pd.DataFrame): """Each entry in the df is expected to have the columns in self.required_columns""" if not all(col in df.columns for col in self.required_columns): raise ValueError(f"DataFrame is missing one or more of {self.required_columns=}") def _checkpoint_csv(self, df, csv_filename: str, csv_overwrite: bool = True): """ Saves DataFrame with embeddings to a CSV checkpoint. Args: df (pd.DataFrame): The DataFrame with embeddings. csv_filename (str): Path to save a copy of the dataframe with computed embeddings for later use. csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True. """ import os if csv_overwrite: df.to_csv(csv_filename) logger.info(f"Saved DataFrame with embeddings to {csv_filename}") else: if os.path.exists(csv_filename): # append to existing file append_df = pd.read_csv(csv_filename) append_df = pd.concat([append_df, df]) else: # will create the new file append_df = df.copy() append_df.to_csv(csv_filename) logger.info(f"Appending DataFrame embeddings to {csv_filename}") def add( self, df: pd.DataFrame, num_workers: int = 16, embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding, sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None, csv_filename: Optional[str] = None, csv_overwrite: bool = True, **add_kwargs, ): """Write documents from a DataFrame into the DocumentManager store. This method adds documents from the provided DataFrame to the database. It performs the following steps: 1. Checks if the required columns are present in the DataFrame. 2. Computes embeddings for the 'content' column if they are not already present. 3. Optionally saves the DataFrame with computed embeddings to a CSV checkpoint. 4. Calls the '_add_documents' method to add documents with embeddings to the DocumentsManager. Args: df (pd.DataFrame): The DataFrame containing the documents to be added. num_workers (int, optional): The number of parallel workers to use for computing embeddings. Default is 32. embedding_fn (callable, optional): A function that computes embeddings for a given input string. Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model. 
sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string. Default is None. Only use if you want sparse embeddings. csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use. csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True. **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method. """ if self.required_columns is not None: self._check_required_columns(df) # Check if embeddings are present, computes them if not if "embedding" not in df.columns: df["embedding"] = compute_embeddings_parallelized(df, embedding_fn=embedding_fn, num_workers=num_workers) if "sparse_embedding" not in df.columns and sparse_embedding_fn is not None: df["sparse_embedding"] = sparse_embedding_fn(df.content.to_list()) if csv_filename is not None: self._checkpoint_csv(df, csv_filename=csv_filename, csv_overwrite=csv_overwrite) self._add_documents(df, **add_kwargs) def batch_add( self, df: pd.DataFrame, batch_size: int = 3000, min_time_interval: int = 60, num_workers: int = 16, embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding, sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None, csv_filename: Optional[str] = None, csv_overwrite: bool = False, **add_kwargs, ): """ Adds DataFrame data to a DataManager instance in batches. This function takes a DataFrame and adds its data to a DataManager instance in batches. It ensures that a minimum time interval is maintained between successive batches to prevent timeouts or excessive load. This is useful for APIs like openAI with rate limits. Args: df (pd.DataFrame): The input DataFrame containing data to be added. batch_size (int, optional): The size of each batch. Defaults to 3000. min_time_interval (int, optional): The minimum time interval (in seconds) between batches. Defaults to 60. num_workers (int, optional): The number of parallel workers to use when adding data. Defaults to 32. embedding_fn (callable, optional): A function that computes embeddings for a given input string. Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model. sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string. Default is None. Only use if you want sparse embeddings. csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use. csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to False. When using batches, set to False to keep all embeddings in the same file. You may want to manually remove the file if experimenting. **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method. 
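        Example (illustrative; assumes `dm` is a DocumentsManager instance and `df` has a 'content' column):
            >>> dm.batch_add(df, batch_size=1000, min_time_interval=60, csv_filename="embeddings.csv")  # doctest: +SKIP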
""" total_batches = (len(df) // batch_size) + 1 logger.info(f"Adding {len(df)} documents with {batch_size=} for {total_batches=}") for batch_idx in range(total_batches): logger.info(f"Processing batch {batch_idx + 1}/{total_batches}") start_time = time.time() # Calculate batch indices and extract batch DataFrame start_idx = batch_idx * batch_size end_idx = min((batch_idx + 1) * batch_size, len(df)) batch_df = df.iloc[start_idx:end_idx] # Add the batch data to using specified parameters self.add( batch_df, num_workers=num_workers, csv_filename=csv_filename, csv_overwrite=csv_overwrite, embedding_fn=embedding_fn, sparse_embedding_fn=sparse_embedding_fn, **add_kwargs, ) elapsed_time = time.time() - start_time # Sleep to ensure the minimum time interval is maintained # Only sleep if it's not the last iteration if batch_idx < total_batches - 1: sleep_time = max(0, min_time_interval - elapsed_time) if sleep_time > 0: logger.info(f"Sleeping for {round(sleep_time)} seconds...") time.sleep(sleep_time) logger.info("All batches processed.") @abstractmethod def _add_documents(self, df: pd.DataFrame, **add_kwargs): """Abstract method to be implemented by each inherited member. This method should handle the actual process of adding documents to the database. """ ... ================================================ FILE: buster/documents_manager/deeplake.py ================================================ import logging from typing import Optional import pandas as pd from buster.utils import zip_contents from .base import DocumentsManager logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class DeepLakeDocumentsManager(DocumentsManager): def __init__( self, vector_store_path: str = "deeplake_store", required_columns: Optional[list[str]] = None, **vector_store_kwargs, ): """Initialize a DeepLakeDocumentsManager object. Args: vector_store_path: The path to the vector store. required_columns: A list of columns that are required in the dataframe. **vector_store_kwargs: Additional keyword arguments to pass to the VectorStore initializer. """ from deeplake.core.vectorstore import VectorStore self.vector_store_path = vector_store_path self.required_columns = required_columns self.vector_store = VectorStore( path=self.vector_store_path, **vector_store_kwargs, ) def __len__(self): """Get the number of documents in the vector store. Returns: The number of documents in the vector store. """ return len(self.vector_store) @classmethod def _extract_metadata(cls, df: pd.DataFrame) -> dict: """Extract metadata from the dataframe in DeepLake dict format. Args: df: The dataframe from which to extract metadata. Returns: The extracted metadata in DeepLake dict format. """ # Ignore the content and embedding column for metadata df = df.drop(columns=["content", "embedding"], errors="ignore") columns = list(df.columns) metadata = df.apply( lambda x: {col: x[col] for col in columns}, axis=1, ).to_list() return metadata def _add_documents(self, df: pd.DataFrame, **add_kwargs): """Write all documents from the dataframe into the vector store as a new version. Each entry in the dataframe is expected to have at least the following columns: ["content", "embedding"] Embeddings will have been precomputed in the self.add() method, which calls this one. Args: df: The dataframe containing the documents to add. **add_kwargs: Additional keyword arguments to pass to the add method of the vector store. 
""" # Embedding should already be computed in the .add method assert "embedding" in df.columns, "expected column=embedding in the dataframe" # extract the chunked text + metadata metadata = self._extract_metadata(df) chunked_text = df.content.to_list() embeddings = df.embedding.to_list() self.vector_store.add( text=chunked_text, embedding=embeddings, metadata=metadata, **add_kwargs, ) def to_zip(self, output_path: str = "."): """Zip the contents of the vector store path folder to a .zip file in the output path. Args: output_path: The path where the zip file should be created. Returns: The path to the created zip file. """ vector_store_path = self.vector_store_path logger.info(f"Compressing {vector_store_path}...") zip_file_path = zip_contents(input_path=vector_store_path, output_path=output_path) logger.info(f"Compressed {vector_store_path} to {zip_file_path}.") return zip_file_path ================================================ FILE: buster/documents_manager/service.py ================================================ import logging import pandas as pd import pinecone from pymongo.mongo_client import MongoClient from pymongo.server_api import ServerApi from buster.documents_manager.base import DocumentsManager logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class DocumentsService(DocumentsManager): """Manager to use in production. Mixed Pinecone and MongoDB backend.""" def __init__( self, pinecone_api_key: str, pinecone_index: str, pinecone_namespace: str, mongo_uri: str, mongo_db_name: str, **kwargs, ): """Initialize the DocumentsService. Args: pinecone_api_key: The Pinecone API key. pinecone_env: The Pinecone environment. pinecone_index: The Pinecone index. pinecone_namespace: The Pinecone namespace. mongo_uri: The MongoDB URI. mongo_db_name: The MongoDB database name. **kwargs: Additional keyword arguments to pass to the parent class. """ super().__init__(**kwargs) pc = pinecone.Pinecone(api_key=pinecone_api_key) self.index = pc.Index(pinecone_index) self.namespace = pinecone_namespace self.mongo_db_name = mongo_db_name self.client = MongoClient(mongo_uri, server_api=ServerApi("1")) self.db = self.client[mongo_db_name] def __repr__(self): """Return a string representation of the DocumentsService.""" return "DocumentsService" def get_source_id(self, source: str) -> str: """Get the id of a source. Args: source: The name of the source. Returns: The id of the source. """ return str(self.db.sources.find_one({"name": source})["_id"]) def _add_documents(self, df: pd.DataFrame): """Write all documents from the dataframe into the db as a new version. Args: df: The dataframe containing the documents. 
""" use_sparse_vector = "sparse_embedding" in df.columns if use_sparse_vector: logger.info("Uploading sparse embeddings too.") for source in df.source.unique(): source_exists = self.db.sources.find_one({"name": source}) if source_exists is None: self.db.sources.insert_one({"name": source}) source_id = self.get_source_id(source) df_source = df[df.source == source] to_upsert = [] for row in df_source.to_dict(orient="records"): embedding = row["embedding"].tolist() if use_sparse_vector: sparse_embedding = row["sparse_embedding"] document = row.copy() document.pop("embedding") if use_sparse_vector: document.pop("sparse_embedding") document["source_id"] = source_id document_id = str(self.db.documents.insert_one(document).inserted_id) vector = {"id": document_id, "values": embedding, "metadata": {"source": source}} if use_sparse_vector: vector["sparse_values"] = sparse_embedding to_upsert.append(vector) # Current (February 2024) Pinecone upload rules: # - Max 100 vectors per batch MAX_PINECONE_BATCH_SIZE = 100 for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE): self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace) def update_source(self, source: str, display_name: str = None, note: str = None): """Update the display name and/or note of a source. Also create the source if it does not exist. Args: source: The name of the source. display_name: The new display name of the source. note: The new note of the source. """ self.db.sources.update_one( {"name": source}, {"$set": {"display_name": display_name, "note": note}}, upsert=True ) def delete_source(self, source: str) -> tuple[int, int]: """Delete a source and all its documents. Return if the source was deleted and the number of deleted documents. Args: source: The name of the source. Returns: A tuple containing the number of deleted sources and the number of deleted documents. """ source_id = self.get_source_id(source) # MongoDB source_deleted = self.db.sources.delete_one({"name": source}).deleted_count documents_deleted = self.db.documents.delete_many({"source_id": source_id}).deleted_count # Pinecone self.index.delete(filter={"source": source}, namespace=self.namespace) return source_deleted, documents_deleted def drop_db(self): """Drop the currently accessible database. For Pinecone, this means deleting everything in the namespace. For Mongo DB, this means dropping the database. However this needs to be done manually through the GUI. """ confirmation = input("Dropping the database is irreversible. Are you sure you want to proceed? 
(y/N): ") if confirmation.strip().lower() == "y": self.index.delete(namespace=self.namespace, delete_all=True) logging.info(f"Deleted all documents from Pinecone namespace: {self.namespace=}") logging.info(f"The MongoDB database needs to be dropped manually: {self.mongo_db_name=}") else: logging.info("Operation cancelled.") ================================================ FILE: buster/examples/cfg.py ================================================ from buster.busterbot import Buster, BusterConfig from buster.completers import ChatGPTCompleter, DocumentAnswerer from buster.formatters.documents import DocumentsFormatterJSON from buster.formatters.prompts import PromptFormatter from buster.llm_utils import get_openai_embedding_constructor from buster.retriever import DeepLakeRetriever, Retriever from buster.tokenizers import GPTTokenizer from buster.validators import Validator # kwargs to pass to OpenAI client client_kwargs = { "timeout": 20, "max_retries": 3, } embedding_fn = get_openai_embedding_constructor(client_kwargs=client_kwargs) buster_cfg = BusterConfig( validator_cfg={ "question_validator_cfg": { "invalid_question_response": "This question does not seem relevant to my current knowledge.", "completion_kwargs": { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, }, "client_kwargs": client_kwargs, "check_question_prompt": """You are a chatbot answering questions on artificial intelligence. Your job is to determine wether or not a question is valid, and should be answered. More general questions are not considered valid, even if you might know the response. A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid. For example: Q: What is backpropagation? true Q: What is the meaning of life? false A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""", }, "answer_validator_cfg": { "unknown_response_templates": [ "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?", ], "unknown_threshold": 0.85, "embedding_fn": embedding_fn, }, "documents_validator_cfg": { "completion_kwargs": { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, }, "client_kwargs": client_kwargs, }, "use_reranking": True, "validate_documents": False, }, retriever_cfg={ "path": "deeplake_store", "top_k": 3, "thresh": 0.7, "embedding_fn": embedding_fn, }, documents_answerer_cfg={ "no_documents_message": "No documents are available for this question.", }, completion_cfg={ "completion_kwargs": { "model": "gpt-3.5-turbo", "stream": True, "temperature": 0, }, "client_kwargs": client_kwargs, }, tokenizer_cfg={ "model_name": "gpt-3.5-turbo", }, documents_formatter_cfg={ "max_tokens": 3500, "columns": ["content", "title", "source"], }, prompt_formatter_cfg={ "max_tokens": 3500, "text_before_docs": ( "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. " "If the answer is in the documentation, summarize it in a helpful way to the user. " "If it isn't, simply reply that you cannot answer the question. " "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. 
" "Here is the documentation: " ), "text_after_docs": ( "REMEMBER:\n" "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." "Here are the rules you must follow:\n" "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n" "2) Make sure to format your answers in Markdown format, including code block and snippets.\n" "3) Do not reference any links, urls or hyperlinks in your answers.\n" "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n" "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'" "For example:\n" "What is the meaning of life for an qa bot?\n" "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?" "Now answer the following question:\n" ), }, ) def setup_buster(buster_cfg: BusterConfig): """initialize buster with a buster_cfg class""" retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg) tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg) document_answerer: DocumentAnswerer = DocumentAnswerer( completer=ChatGPTCompleter(**buster_cfg.completion_cfg), documents_formatter=DocumentsFormatterJSON(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg), prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg), **buster_cfg.documents_answerer_cfg, ) validator: Validator = Validator(**buster_cfg.validator_cfg) buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator) return buster ================================================ FILE: buster/examples/generate_embeddings.py ================================================ import click import pandas as pd from buster.documents_manager import DeepLakeDocumentsManager REQUIRED_COLUMNS = ["url", "title", "content", "source"] @click.command( help="This script processes a CSV file and generates embeddings. The CSV argument specifies the path to the input CSV file." ) @click.argument("csv", metavar="") def main(csv): # Read the csv df = pd.read_csv(csv) # initialize our vector store from scratch dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS) # Generate the embeddings for our documents and store them to the deeplake store dm.add(df, csv_filename="embeddings.csv") # Save it to a zip file dm.to_zip() if __name__ == "__main__": main() ================================================ FILE: buster/examples/gradio_app.py ================================================ import os from typing import Optional, Tuple import cfg import gradio as gr import pandas as pd from cfg import setup_buster from buster.completers import Completion from buster.utils import extract_zip # Check if an openai key is set as an env. variable if os.getenv("OPENAI_API_KEY") is None: print("Warning: No openai key detected. 
You can set it with 'export OPENAI_API_KEY=sk-...'.") # Typehint for chatbot history ChatHistory = list[list[Optional[str], Optional[str]]] extract_zip("deeplake_store.zip", "deeplake_store") buster = setup_buster(cfg.buster_cfg) def add_user_question(user_question: str, chat_history: Optional[ChatHistory] = None) -> ChatHistory: """Adds a user's question to the chat history. If no history is provided, the first element of the history will be the user conversation. """ if chat_history is None: chat_history = [] chat_history.append([user_question, None]) return chat_history def format_sources(matched_documents: pd.DataFrame) -> str: if len(matched_documents) == 0: return "" matched_documents.similarity_to_answer = matched_documents.similarity_to_answer * 100 # drop duplicate pages (by title), keep highest ranking ones matched_documents = matched_documents.sort_values("similarity_to_answer", ascending=False).drop_duplicates( "title", keep="first" ) documents_answer_template: str = ( "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}" ) document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %" documents = "\n".join([document_template.format(document=document) for _, document in matched_documents.iterrows()]) footnote: str = "I'm a bot 🤖 and not always perfect." return documents_answer_template.format(documents=documents, footnote=footnote) def add_sources(history, completion): if completion.answer_relevant: formatted_sources = format_sources(completion.matched_documents) history.append([None, formatted_sources]) return history def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]: """Answer a user's question using retrieval augmented generation.""" # We assume that the question is the user's last interaction user_input = chat_history[-1][0] # Do retrieval + augmented generation with buster completion = buster.process_input(user_input) # Stream tokens one at a time to the user chat_history[-1][1] = "" for token in completion.answer_generator: chat_history[-1][1] += token yield chat_history, completion demo = gr.Blocks() with demo: with gr.Row(): gr.Markdown("
Buster 🤖: A Question-Answering Bot for your documentation")

    chatbot = gr.Chatbot()

    with gr.Row():
        question_textbox = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        send_button = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "How can I perform backpropagation?",
            "How do I deal with noisy data?",
            "How do I deal with noisy data in 2 words?",
        ],
        inputs=question_textbox,
    )

    gr.Markdown("This application uses GPT to search the docs for relevant info and answer questions.")

    gr.HTML("
Created with ❤️ by @jerpint and @hadrienbertrand") response = gr.State() # fmt: off gr.on( triggers=[send_button.click, question_textbox.submit], fn=add_user_question, inputs=[question_textbox], outputs=[chatbot] ).then( chat, inputs=[chatbot], outputs=[chatbot, response] ).then( add_sources, inputs=[chatbot, response], outputs=[chatbot] ) # fmt: on demo.queue() demo.launch(debug=True, share=False) ================================================ FILE: buster/examples/stackoverflow.csv ================================================ ,source,title,content,url 0,stackoverflow,stackoverflow question #1,"""Backprop"" is the same as ""backpropagation"": it's just a shorter way to say it. It is sometimes abbreviated as ""BP"". ",https://ai.stackexchange.com/questions/1 1,stackoverflow,stackoverflow question #2,"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (""true"" vs. artificial). The AI FAQ on ANN gives a good overview. Excerpt: Noise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set. In some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation. ",https://ai.stackexchange.com/questions/2 2,stackoverflow,stackoverflow question #4,"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches. There exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016. ",https://ai.stackexchange.com/questions/4 3,stackoverflow,stackoverflow question #6,"It rather depends on how one defines several of the terms used. For example: Whether the term ""expected"" is interpreted in a formal (i.e. statistical) sense. Whether it's assumed that humans have any kind of utilitarian ""performance measure"". The motivation for this description of ""agent"" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition. However, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans. ",https://ai.stackexchange.com/questions/6 4,stackoverflow,stackoverflow question #7," To put it simply in layman terms, what are the possible threats from AI? Currently, there are no threat. The threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity). However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are. This is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like. If we know that AI is so dangerous why are we still promoting it? Why is it not banned? 
As I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us. ",https://ai.stackexchange.com/questions/7 5,stackoverflow,stackoverflow question #10,"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). It's useful for making calculated decisions when all information needed isn't necessarily available. Wikipedia has a fantastic page for this. ",https://ai.stackexchange.com/questions/10 6,stackoverflow,stackoverflow question #15,"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize. The intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this. ",https://ai.stackexchange.com/questions/15 7,stackoverflow,stackoverflow question #17,"The concept of ""the singularity"" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator. We would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet). The risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans. ",https://ai.stackexchange.com/questions/17 8,stackoverflow,stackoverflow question #26,"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time. I strongly recommend the film Plug & Pray to have an idea of what people are researching in this area. You may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?) 
On the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all? ",https://ai.stackexchange.com/questions/26 9,stackoverflow,stackoverflow question #28,"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say ""yes, genetic algorithms are part of AI"". If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms). One area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks. I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard. ",https://ai.stackexchange.com/questions/28 ================================================ FILE: buster/formatters/documents.py ================================================ import logging from abc import ABC, abstractmethod from dataclasses import dataclass import pandas as pd from buster.tokenizers import Tokenizer logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class DocumentsFormatter(ABC): """ Abstract base class for document formatters. Subclasses are required to implement the `format` method which transforms the input documents into the desired format. """ @abstractmethod def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]: """ Abstract method to format matched documents. Parameters: - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted. Returns: - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a string and the possibly truncated matched documents DataFrame. """ pass @dataclass class DocumentsFormatterHTML(DocumentsFormatter): """ Formatter class to convert matched documents into an HTML format. Attributes: - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents. - max_tokens (int): Maximum allowed tokens for the formatted documents. - formatter (str): String formatter for the document's content. - inner_tag (str): HTML tag that will be used at the document level. - outer_tag (str): HTML tag that will be used at the documents collection level. """ tokenizer: Tokenizer max_tokens: int formatter: str = "{content}" inner_tag: str = "DOCUMENT" outer_tag: str = "DOCUMENTS" def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]: """ Format the matched documents into an HTML format. If the total tokens exceed max_tokens, documents are truncated or omitted to fit within the limit. Parameters: - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted. Returns: - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as an HTML string and the possibly truncated matched documents DataFrame. 
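Example (an illustrative sketch, not from the library's docs; assumes `matched_documents` has a 'content' column and uses the GPTTokenizer defined in buster.tokenizers): tokenizer = GPTTokenizer("gpt-3.5-turbo"); formatter = DocumentsFormatterHTML(tokenizer=tokenizer, max_tokens=100); documents_str, kept_documents = formatter.format(matched_documents) # documents_str is shaped like: <DOCUMENTS><DOCUMENT>...</DOCUMENT></DOCUMENTS>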
""" documents_str = "" total_tokens = 0 max_tokens = self.max_tokens num_total_docs = len(matched_documents) num_preserved_docs = 0 # TODO: uniformize this logic with the DocumentsFormatterJSON for _, row in matched_documents.iterrows(): doc = self.formatter.format_map(row.to_dict()) num_preserved_docs += 1 token_count, encoded = self.tokenizer.num_tokens(doc, return_encoded=True) if total_tokens + token_count <= max_tokens: documents_str += f"<{self.inner_tag}>{doc}<\\{self.inner_tag}>" total_tokens += token_count else: logger.warning("truncating document to fit...") remaining_tokens = max_tokens - total_tokens truncated_doc = self.tokenizer.decode(encoded[:remaining_tokens]) documents_str += f"<{self.inner_tag}>{truncated_doc}<\\{self.inner_tag}>" logger.warning(f"Documents after truncation: {documents_str}") break if num_preserved_docs < (num_total_docs): logger.warning( f"{num_preserved_docs}/{num_total_docs} documents were preserved from the matched documents due to truncation." ) matched_documents = matched_documents.iloc[:num_preserved_docs] documents_str = f"<{self.outer_tag}>{documents_str}<\\{self.outer_tag}>" return documents_str, matched_documents @dataclass class DocumentsFormatterJSON(DocumentsFormatter): """ Formatter class to convert matched documents into a JSON format. Attributes: - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents. - max_tokens (int): Maximum allowed tokens for the formatted documents. - columns (list[str]): List of columns to include in the JSON format. """ tokenizer: Tokenizer max_tokens: int columns: list[str] def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]: """ Format the matched documents into a JSON format. If the total tokens exceed max_tokens, documents are omitted one at a time until it fits the limit. Parameters: - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted. Returns: - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a JSON string and the possibly truncated matched documents DataFrame. """ max_tokens = self.max_tokens documents_str = matched_documents[self.columns].to_json(orient="records") token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True) while token_count > max_tokens: # Truncated too much, no documents left, raise an error if len(matched_documents) == 0: raise ValueError( f"Could not truncate documents to fit {max_tokens=}. Consider increasing max_tokens or decreasing chunk lengths." ) # Too many tokens, drop a document and try again. matched_documents = matched_documents.iloc[:-1] documents_str = matched_documents[self.columns].to_json(orient="records") token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True) # Log a warning with more details logger.warning( f"Truncating documents to fit. 
Remaining documents after truncation: {len(matched_documents)}" ) return documents_str, matched_documents ================================================ FILE: buster/formatters/prompts.py ================================================ import logging from dataclasses import dataclass import pandas as pd from buster.tokenizers import Tokenizer logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @dataclass class PromptFormatter: tokenizer: Tokenizer max_tokens: int text_before_docs: str text_after_docs: str formatter: str = "{text_before_docs}\n{documents}\n{text_after_docs}" def format(self, documents: str) -> str: """Formats the system prompt with prompt engineering. Joins the text before and after documents with the documents provided. Args: documents (str): The already formatted documents to include in the system prompt. Returns: str: The formatted system prompt. Raises: ValueError: If the number of prompt tokens exceeds the maximum allowed tokens. """ system_prompt = self.formatter.format( text_before_docs=self.text_before_docs, documents=documents, text_after_docs=self.text_after_docs ) if self.tokenizer.num_tokens(system_prompt) > self.max_tokens: raise ValueError(f"System prompt tokens > {self.max_tokens=}") return system_prompt def prompt_formatter_factory(tokenizer: Tokenizer, prompt_cfg) -> PromptFormatter: """Creates a PromptFormatter instance. Args: tokenizer (Tokenizer): The tokenizer to use for the PromptFormatter. prompt_cfg: The configuration for the PromptFormatter. Returns: PromptFormatter: The created PromptFormatter instance. """ return PromptFormatter( tokenizer=tokenizer, max_tokens=prompt_cfg["max_tokens"], text_before_docs=prompt_cfg["text_before_documents"], text_after_docs=prompt_cfg["text_before_prompt"], ) ================================================ FILE: buster/llm_utils/__init__.py ================================================ from buster.llm_utils.embeddings import ( BM25, compute_embeddings_parallelized, cosine_similarity, get_openai_embedding, get_openai_embedding_constructor, ) from buster.llm_utils.question_reformulator import QuestionReformulator __all__ = [ "QuestionReformulator", "cosine_similarity", "get_openai_embedding", "compute_embeddings_parallelized", "get_openai_embedding_constructor", "BM25", ] ================================================ FILE: buster/llm_utils/embeddings.py ================================================ import logging from functools import lru_cache from typing import Optional import numpy as np import pandas as pd from openai import OpenAI from pinecone_text.sparse import BM25Encoder from tqdm.contrib.concurrent import thread_map logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def get_openai_embedding_constructor(client_kwargs: Optional[dict] = None, model: str = "text-embedding-ada-002"): if client_kwargs is None: client_kwargs = {} client = OpenAI(**client_kwargs) @lru_cache def embedding_fn(text: str, model: str = model) -> np.array: try: text = text.replace("\n", " ") response = client.embeddings.create( input=text, model=model, ) embedding = response.data[0].embedding return np.array(embedding, dtype="float32") except Exception as e: # This rarely happens with the API but in the off chance it does, will allow us not to lose the progress.
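# Note: a None return here propagates into the output of compute_embeddings_parallelized
# below, so callers may want to drop rows with missing embeddings before storing them.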
logger.exception(e) logger.warning(f"Embedding failed to compute for {text=}") return None return embedding_fn # default embedding function get_openai_embedding = get_openai_embedding_constructor() def cosine_similarity(a, b): return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> list: """Compute the embeddings on the 'content' column of a DataFrame in parallel. This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified embedding function. The 'content' column is expected to contain strings or textual data. The method processes the embeddings in parallel using the number of workers specified. Args: df (pd.DataFrame): The DataFrame containing the data to compute embeddings for. embedding_fn (callable): A function that computes embeddings for a given input string. num_workers (int): The number of parallel workers to use for computing embeddings. Returns: list: A list containing the computed embeddings for each entry in the 'content' column. """ logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}") embeddings = thread_map(embedding_fn, df.content.to_list(), max_workers=num_workers) logger.info("Finished computing embeddings") return embeddings class BM25: def __init__(self, path_to_params: Optional[str] = None) -> None: self.encoder = BM25Encoder() if path_to_params: self.encoder.load(path_to_params) def fit(self, df: pd.DataFrame): self.encoder.fit(df.content.to_list()) def dump_params(self, path: str): self.encoder.dump(path) def get_sparse_embedding_fn(self): def sparse_embedding_fn(query: str): return self.encoder.encode_queries(query) return sparse_embedding_fn ================================================ FILE: buster/llm_utils/question_reformulator.py ================================================ import logging from typing import Optional from buster.completers import ChatGPTCompleter class QuestionReformulator: def __init__( self, system_prompt: Optional[str] = None, completion_kwargs: Optional[dict] = None, client_kwargs: Optional[dict] = None, ): self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs) if completion_kwargs is None: # Default kwargs completion_kwargs = { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, } self.completion_kwargs = completion_kwargs if system_prompt is None: # Default prompt system_prompt = """ Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.""" self.system_prompt = system_prompt def reformulate(self, user_input: str) -> tuple: """Reformulate a user's question. Returns the reformulated question along with the error, if any, returned by the completer.""" reformulated_question, error = self.completer.complete( self.system_prompt, user_input=user_input, completion_kwargs=self.completion_kwargs ) logging.info(f"Reformulated question from {user_input=} to {reformulated_question=}") return reformulated_question, error ================================================ FILE: buster/parsers/__init__.py ================================================ from buster.parsers.parser import HuggingfaceParser, SphinxParser, get_all_documents __all__ = ["get_all_documents", "SphinxParser", "HuggingfaceParser"] ================================================ FILE: buster/parsers/parser.py ================================================ import glob import os import re from abc import ABC, abstractmethod from dataclasses import InitVar, dataclass, field from itertools import takewhile, zip_longest from pathlib import Path from typing import Iterator, Type import bs4 import pandas as pd from bs4 import BeautifulSoup from tqdm import tqdm @dataclass class Section: url: str name: str nodes: InitVar[list[bs4.element.NavigableString]] text: str = field(init=False) def __post_init__(self, nodes: list[bs4.element.NavigableString]): section = [] for node in nodes: if node.name == "table": node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github") elif node.name == "script": continue else: node_text = node.text section.append(node_text) self.text = "\n".join(section).strip() # Remove tabs self.text = self.text.replace("\t", "") # Replace groups of newlines with a single newline self.text = re.sub("\n{2,}", "\n", self.text) # Replace non-breaking spaces with regular spaces self.text = self.text.replace("\xa0", " ") def __len__(self) -> int: return len(self.text) @classmethod def from_text(cls, text: str, url: str, name: str) -> "Section": """Alternate constructor, without parsing.""" section = cls.__new__(cls) # Allocate memory, does not call __init__ # Does the init here. section.text = text section.url = url section.name = name return section def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]: """Split a section into chunks.""" if len(self) > max_length: # Get the number of chunks by dividing and rounding up. # Then, split the section into equal length chunks. # This could result in chunks below the minimum length, # and will truncate the end of the section. n_chunks = (len(self) + max_length - 1) // max_length length = len(self) // n_chunks for chunk in range(n_chunks): start = chunk * length yield Section.from_text(self.text[start : start + length], self.url, self.name) elif len(self) > min_length: yield self return @dataclass class Parser(ABC): soup: BeautifulSoup base_url: str root_dir: str filepath: str min_section_length: int = 100 max_section_length: int = 2000 @property def relative_path(self) -> str: """Gets the relative path of the file to the root dir. This is particularly useful for websites with pages, subdomains, etc. The split is to remove the .html extension """ parent = Path(self.root_dir) son = Path(self.filepath) self._relative_path = str(son.relative_to(parent)).split(".")[0] return self._relative_path @abstractmethod def find_sections(self) -> Iterator[Section]: ...
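# Illustrative sketch (not part of the library): a concrete parser only needs to
# implement find_sections(); parse() below then applies the length constraints.
# For example, a hypothetical parser for pages whose sections hang off <h2>
# headings might look like:
#
#     class H2Parser(Parser):
#         def find_sections(self):
#             for heading in self.soup.find_all("h2"):
#                 nodes = list(heading.find_next_siblings())
#                 yield Section(self.base_url, heading.text.strip(), nodes)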
def parse(self) -> list[Section]: """Parse the documents into sections, respecting the length constraints.""" sections = [] for section in self.find_sections(): sections.extend(section.get_chunks(self.min_section_length, self.max_section_length)) return sections class SphinxParser(Parser): def find_sections(self) -> Iterator[Section]: for section in self.soup.find_all("a", href=True, class_="headerlink"): container = section.parent.parent section_href = container.find_all("a", href=True, class_="headerlink") url = self.build_url(section["href"].strip().replace("\n", "")) name = section.parent.text.strip()[:-1].replace("\n", "") # If the section has subsections, keep only the part before the first subsection if len(section_href) > 1 and container.section is not None: siblings = list(container.section.previous_siblings)[::-1] section = Section(url, name, siblings) else: section = Section(url, name, container.children) yield section return def build_url(self, suffix: str) -> str: return self.base_url + self.relative_path + ".html" + suffix class HuggingfaceParser(Parser): def find_sections(self) -> Iterator[Section]: sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group") for section, next_section in zip_longest(sections, sections[1:]): href = section.find("a", href=True, class_="header-link") nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings())) suffix = href["href"].strip().replace("\n", "") url = self.build_url(suffix) name = section.text.strip().replace("\n", "") yield Section(url, name, nodes) return def build_url(self, suffix: str) -> str: return self.base_url + self.relative_path + suffix def get_document( root_dir: str, file: str, base_url: str, parser_cls: Type[Parser], min_section_length: int = 100, max_section_length: int = 2000, ) -> pd.DataFrame: """Extract all sections from one file. Sections are broken into subsections if they are longer than `max_section_length`. Sections correspond to `section` HTML tags that have a headerlink attached. """ filepath = os.path.join(root_dir, file) with open(filepath, "r") as f: source = f.read() soup = BeautifulSoup(source, "html.parser") parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length) sections = [] urls = [] names = [] for section in parser.parse(): sections.append(section.text) urls.append(section.url) names.append(section.name) documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections}) return documents_df def get_all_documents( root_dir: str, base_url: str, parser_cls: Type[Parser], min_section_length: int = 100, max_section_length: int = 2000, ) -> pd.DataFrame: """Parse all HTML files in `root_dir`, and extract all sections. Sections are broken into subsections if they are longer than `max_section_length`. Sections correspond to `section` HTML tags that have a headerlink attached.
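Example (an illustrative sketch; the paths and base URL are assumptions):

    df = get_all_documents("docs/_build/html", "https://example.org/docs/", SphinxParser)
    # df has one row per extracted section, with columns: title, url, content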
""" files = glob.glob("**/*.html", root_dir=root_dir, recursive=True) dfs = [] for file in tqdm(files): try: df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length) dfs.append(df) except Exception as e: print(f"Skipping {file} due to the following error: {e}") continue documents_df = pd.concat(dfs, ignore_index=True) return documents_df ================================================ FILE: buster/retriever/__init__.py ================================================ from .base import Retriever from .deeplake import DeepLakeRetriever from .service import ServiceRetriever __all__ = [Retriever, ServiceRetriever, DeepLakeRetriever] ================================================ FILE: buster/retriever/base.py ================================================ import logging from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Callable, Optional import numpy as np import pandas as pd from buster.completers import UserInputs from buster.llm_utils import get_openai_embedding ALL_SOURCES = "All" logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @dataclass class Retriever(ABC): def __init__( self, top_k: int, thresh: float, embedding_fn: Callable[[str], np.ndarray] = None, sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None, *args, **kwargs, ): """Initializes a Retriever instance. Args: top_k: The maximum number of documents to retrieve. thresh: The similarity threshold for document retrieval. embedding_fn: The function to compute document embeddings. embedding_fn: (Optional) The function to compute sparse document embeddings. *args, **kwargs: Additional arguments and keyword arguments. """ if embedding_fn is None: embedding_fn = get_openai_embedding self.top_k = top_k self.thresh = thresh self.embedding_fn = embedding_fn self.sparse_embedding_fn = sparse_embedding_fn # Add your access to documents in your own init @abstractmethod def get_documents(self, source: Optional[str] = None) -> pd.DataFrame: """Get all current documents from a given source. Args: source: The source from which to retrieve documents. If None, retrieves documents from all sources. Returns: A pandas DataFrame containing the documents. """ ... @abstractmethod def get_source_display_name(self, source: str) -> str: """Get the display name of a source. Args: source: The source for which to retrieve the display name. Returns: The display name of the source. If source is None, returns all documents. If source does not exist, returns empty dataframe. """ ... @abstractmethod def get_topk_documents(self, query: str, source: Optional[str] = None, top_k: Optional[int] = None) -> pd.DataFrame: """Get the topk documents matching a user's query. Args: query: The user's query. source: The source from which to retrieve documents. If None, retrieves documents from all sources. top_k: The maximum number of documents to retrieve. Returns: A pandas DataFrame containing the topk matched documents. If no matches are found, returns an empty dataframe. """ ... def threshold_documents(self, matched_documents: pd.DataFrame, thresh: float) -> pd.DataFrame: """Filters out matched documents using a similarity threshold. Args: matched_documents: The DataFrame containing the matched documents. thresh: The similarity threshold. Returns: A pandas DataFrame containing the filtered matched documents. 
""" # filter out matched_documents using a threshold return matched_documents[matched_documents.similarity > thresh] def retrieve( self, user_inputs: UserInputs, sources: Optional[list[str]] = None, top_k: Optional[int] = None, thresh: Optional[float] = None, ) -> pd.DataFrame: """Retrieves documents based on user inputs. Args: user_inputs: The user's inputs. sources: The sources from which to retrieve documents. If None, retrieves documents from all sources. top_k: The maximum number of documents to retrieve. thresh: The similarity threshold for document retrieval. Returns: A pandas DataFrame containing the retrieved documents. """ if top_k is None: top_k = self.top_k if thresh is None: thresh = self.thresh query = user_inputs.current_input matched_documents = self.get_topk_documents(query=query, sources=sources, top_k=top_k) # log matched_documents to the console logger.info(f"matched documents before thresh: {matched_documents}") # No matches were found, simply return at this point if len(matched_documents) == 0: return matched_documents # otherwise, make sure we have the minimum required fields assert "similarity" in matched_documents.columns assert "embedding" in matched_documents.columns assert "content" in matched_documents.columns assert "title" in matched_documents.columns # filter out matched_documents using a threshold matched_documents = self.threshold_documents(matched_documents, thresh) logger.info(f"matched documents after thresh: {matched_documents}") return matched_documents ================================================ FILE: buster/retriever/deeplake.py ================================================ import logging import os from typing import Optional import numpy as np import pandas as pd from buster.retriever.base import ALL_SOURCES, Retriever logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def extract_metadata(x: pd.DataFrame, columns) -> pd.DataFrame: """Extracts metadata from deeplake. Args: x: The dataframe containing the metadata. columns: The columns to extract. Returns: The dataframe with the extracted metadata. """ for col in columns: x[col] = x.metadata[col] return x def data_dict_to_df(data: dict) -> pd.DataFrame: """Converts a dictionary of data to a Pandas DataFrame. Args: data: The dictionary containing the data. Returns: The DataFrame containing the data. """ # rename 'score' to 'similarity' data["similarity"] = data.pop("score") data["content"] = data.pop("text") matched_documents = pd.DataFrame(data) if len(matched_documents) == 0: logger.info("No matches found...") return pd.DataFrame() matched_documents = matched_documents.apply(extract_metadata, columns=["source", "title", "url"], axis=1) matched_documents = matched_documents.drop(columns="metadata") return matched_documents def build_tql_query(embedding, sources=None, top_k: int = 3) -> str: """Builds a TQL query. Args: embedding: The embedding vector. sources: The sources to filter by. top_k: The number of top documents to retrieve. Returns: The TQL query. """ # Initialize the where_clause to an empty string. where_clause = "" embedding_string = ",".join([str(item) for item in embedding]) # If sources is provided and it's not empty, build the where clause. 
if sources: conditions = [f"contains(metadata['source'], '{source}')" for source in sources] where_clause = "where " + " or ".join(conditions) # Construct the entire query query = f""" select * from ( select embedding, text, metadata, cosine_similarity(embedding, ARRAY[{embedding_string}]) as score {where_clause} ) order by score desc limit {top_k} """ return query class DeepLakeRetriever(Retriever): def __init__( self, path, exec_option: str = "python", use_tql: bool = False, deep_memory: bool = False, activeloop_token: str = None, **kwargs, ): from deeplake.core.vectorstore import VectorStore super().__init__(**kwargs) self.use_tql = use_tql self.exec_option = exec_option self.deep_memory = deep_memory self.vector_store = VectorStore( path=path, read_only=True, token=activeloop_token, exec_option=exec_option, ) if activeloop_token is None and use_tql: logger.warning( """ No activeloop token detected, enterprise features will not be available. You can set it using: export ACTIVELOOP_TOKEN=... """ ) def get_documents(self, sources: Optional[list[str]] = None) -> pd.DataFrame: """Get all current documents from a given source. Args: sources: The sources to retrieve documents from. Returns: The DataFrame containing the retrieved documents. """ k = len(self.vector_store) # currently this is the only way to retrieve all embeddings in deeplake # generate a dummy embedding and specify top-k equal to the length of the vector store. embedding_dim = self.vector_store.tensors()["embedding"].shape[1] dummy_embedding = np.random.random(embedding_dim) return self.get_topk_documents(query=None, embedding=dummy_embedding, top_k=k, sources=sources) def get_source_display_name(self, source: str) -> str: """Get the display name of a source. Args: source: The name of the source. Returns: The display name of the source. Raises: NotImplementedError: This method is not implemented for the DeepLakeRetriever. """ raise NotImplementedError() def get_topk_documents( self, query: str = None, embedding: np.array = None, sources: Optional[list[str]] = None, top_k: int = None, return_tensors: str = "*", ) -> pd.DataFrame: """Get the topk documents matching a user's query. If no matches are found, returns an empty dataframe. Args: query: The user's query. embedding: The embedding vector. sources: The sources to filter by. top_k: The number of top documents to retrieve. return_tensors: The tensors to include in the result. Returns: The DataFrame containing the matched documents.
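Example (an illustrative sketch; the store path is an assumption):

    retriever = DeepLakeRetriever(path="deeplake_store", top_k=3, thresh=0.7)
    matched = retriever.get_topk_documents(query="What is backpropagation?", top_k=3)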
""" if query is not None: query_embedding = self.embedding_fn(query) elif embedding is not None: query_embedding = embedding else: raise ValueError("must provide either a query or an embedding") if self.use_tql: assert self.exec_option == "compute_engine", "cant use tql without compute_engine" tql_query = build_tql_query(query_embedding, sources=sources, top_k=top_k) data = self.vector_store.search(query=tql_query, deep_memory=self.deep_memory) else: # build the filter clause if sources: def filter(x): return x["metadata"].data()["value"]["source"] in sources else: filter = None data = self.vector_store.search( k=top_k, embedding=query_embedding, exec_option=self.exec_option, return_tensors=return_tensors, filter=filter, ) matched_documents = data_dict_to_df(data) return matched_documents ================================================ FILE: buster/retriever/service.py ================================================ import logging from typing import List, Optional import numpy as np import pandas as pd import pinecone from bson.objectid import ObjectId from pymongo.mongo_client import MongoClient from pymongo.server_api import ServerApi from buster.retriever.base import ALL_SOURCES, Retriever logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class ServiceRetriever(Retriever): def __init__( self, pinecone_api_key: str, pinecone_index: str, pinecone_namespace: str, mongo_uri: str, mongo_db_name: str, **kwargs, ): """ Initializes a ServiceRetriever instance. The ServiceRetriever is a hybrid retrieval combining pinecone and mongodb services. Pinecone is exclusively used as a vector store. The id of the pinecone vectors are used as a key in the mongodb database to store its associated metadata. Args: pinecone_api_key: The API key for Pinecone. pinecone_env: The environment for Pinecone. pinecone_index: The name of the Pinecone index. pinecone_namespace: The namespace for Pinecone. mongo_uri: The URI for MongoDB. mongo_db_name: The name of the MongoDB database. """ super().__init__(**kwargs) pc = pinecone.Pinecone(api_key=pinecone_api_key) self.index = pc.Index(pinecone_index) self.namespace = pinecone_namespace self.client = MongoClient(mongo_uri, server_api=ServerApi("1")) self.db = self.client[mongo_db_name] def get_source_id(self, source: str) -> str: """Get the id of a source. Returns an empty string if the source does not exist. Args: source: The name of the source. Returns: The id of the source. """ source_pointer = self.db.sources.find_one({"name": source}) return "" if source_pointer is None else str(source_pointer["_id"]) def get_documents(self, source: Optional[str] = None) -> pd.DataFrame: """Get all current documents from a given source. Args: source: The name of the source. Defaults to None. Returns: A DataFrame containing all the documents. If the source does not exist, returns an empty DataFrame. """ if source is None: # No source specified, return all documents documents = self.db.documents.find() else: assert isinstance(source, str), "source must be a valid string." source_id = self.get_source_id(source) if source_id == "": logger.warning(f"{source=} not found.") documents = self.db.documents.find({"source_id": source_id}) return pd.DataFrame(list(documents)) def get_source_display_name(self, source: str) -> str: """Get the display name of a source. Args: source: The name of the source. Returns: The display name of the source. 
""" if source is None: return ALL_SOURCES else: display_name = self.db.sources.find_one({"name": source})["display_name"] return display_name def get_topk_documents(self, query: str, sources: Optional[List[str]], top_k: int) -> pd.DataFrame: """Get the top k documents matching a query from the specified sources. Args: query: The query string. sources: The list of source names to search. Defaults to None. top_k: The number of top matches to return. Returns: A DataFrame containing the top k matching documents. """ if sources is None: filter = None else: filter = {"source": {"$in": sources}} source_exists = self.db.sources.find_one({"name": {"$in": sources}}) if source_exists is None: logger.warning(f"Sources {sources} do not exist. Returning empty dataframe.") return pd.DataFrame() query_embedding = self.embedding_fn(query) sparse_query_embedding = self.sparse_embedding_fn(query) if self.sparse_embedding_fn is not None else None if isinstance(query_embedding, np.ndarray): # pinecone expects a list of floats, so convert from ndarray if necessary query_embedding = query_embedding.tolist() # Pinecone retrieval matches = self.index.query( vector=query_embedding, sparse_vector=sparse_query_embedding, top_k=top_k, filter=filter, include_values=True, namespace=self.namespace, )["matches"] matching_ids = [ObjectId(match.id) for match in matches] matching_scores = {match.id: match.score for match in matches} matching_embeddings = {match.id: match.values for match in matches} if len(matching_ids) == 0: return pd.DataFrame() # MongoDB retrieval matched_documents = self.db.documents.find({"_id": {"$in": matching_ids}}) matched_documents = pd.DataFrame(list(matched_documents)) # add additional information from matching matched_documents["similarity"] = matched_documents["_id"].apply(lambda x: matching_scores[str(x)]) matched_documents["embedding"] = matched_documents["_id"].apply(lambda x: matching_embeddings[str(x)]) # sort by similarity matched_documents = matched_documents.sort_values(by="similarity", ascending=False, ignore_index=True) return matched_documents ================================================ FILE: buster/tokenizers/__init__.py ================================================ from .base import Tokenizer from .gpt import GPTTokenizer def tokenizer_factory(tokenizer_cfg: dict) -> Tokenizer: model_name = tokenizer_cfg["model_name"] if model_name in ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]: return GPTTokenizer(model_name) raise ValueError(f"Tokenizer not implemented for {model_name=}") __all__ = [Tokenizer, GPTTokenizer, tokenizer_factory] ================================================ FILE: buster/tokenizers/base.py ================================================ from abc import ABC, abstractmethod from typing import Union class Tokenizer(ABC): """Abstract base class for a tokenizer. Args: model_name: The name of the tokenizer model. Attributes: model_name: The name of the tokenizer model. """ def __init__(self, model_name: str): self.model_name = model_name @abstractmethod def encode(self, string: str) -> list[int]: """Encodes a string into a list of integers. Args: string: The input string to be encoded. Returns: A list of integers representing the encoded string. """ ... @abstractmethod def decode(self, encoded: list[int]) -> str: """Decodes a list of integers into a string. Args: encoded: The list of integers to be decoded. Returns: The decoded string. """ ... 
def num_tokens(self, string: str, return_encoded: bool = False) -> Union[int, tuple[int, list[int]]]: """Returns the number of tokens in a string. Args: string: The input string. return_encoded: Whether or not to return the encoded string along with the number of tokens. Returns: If `return_encoded` is False, returns the number of tokens in the string. If `return_encoded` is True, returns a tuple containing the number of tokens and the encoded string. """ encoded = self.encode(string) if return_encoded: return len(encoded), encoded return len(encoded) ================================================ FILE: buster/tokenizers/gpt.py ================================================ import tiktoken from buster.tokenizers import Tokenizer class GPTTokenizer(Tokenizer): """Tokenizer class for GPT models. This class implements a tokenizer for GPT models using the tiktoken library. Args: model_name (str): The name of the GPT model to be used. Attributes: encoder: The encoder object created using tiktoken.encoding_for_model(). """ def __init__(self, model_name: str): super().__init__(model_name) self.encoder = tiktoken.encoding_for_model(model_name=model_name) def encode(self, string: str): """Encodes a given string using the GPT tokenizer. Args: string (str): The string to be encoded. Returns: list[int]: The encoded representation of the string. """ return self.encoder.encode(string) def decode(self, encoded: list[int]): """Decodes a list of tokens using the GPT tokenizer. Args: encoded (list[int]): The list of tokens to be decoded. Returns: str: The decoded string representation of the tokens. """ return self.encoder.decode(encoded) ================================================ FILE: buster/utils.py ================================================ import os import urllib.request import zipfile def get_file_extension(filepath: str) -> str: return os.path.splitext(filepath)[1] def download_db(db_url: str, output_dir: str): os.makedirs(output_dir, exist_ok=True) fname = os.path.join(output_dir, "documents.db") if not os.path.exists(fname): print(f"Downloading db file from {db_url} to {fname}...") urllib.request.urlretrieve(db_url, fname) print("Downloaded.") else: print("File already exists. Skipping.") return fname def zip_contents(input_path, output_path): """ Zips the entire contents of a given path to a custom output path. Authored by ChatGPT Args: input_path (str): The path of the directory to be zipped. output_path (str): The path where the zip file will be created. Returns: str: The path of the created zip file. """ if not os.path.exists(input_path): raise ValueError("The specified input path does not exist.") zip_file_name = f"{os.path.basename(input_path)}.zip" zip_file_path = os.path.join(output_path, zip_file_name) with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf: for root, _, files in os.walk(input_path): for file in files: file_path = os.path.join(root, file) arcname = os.path.relpath(file_path, input_path) zipf.write(file_path, arcname=arcname) return zip_file_path def extract_zip(zip_file_path, output_path): """ Extracts the contents of a zip file to a custom output path. Authored by ChatGPT Args: zip_file_path (str): The path of the zip file to be extracted. output_path (str): The path where the zip contents will be extracted. Returns: str: The path of the directory where the zip contents are extracted. 
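Example (illustrative; the paths are assumptions):

    zip_path = zip_contents("deeplake_store", ".")  # creates ./deeplake_store.zip
    extract_zip(zip_path, "restored_store")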
""" if not os.path.exists(zip_file_path): raise ValueError("The specified zip file does not exist.") with zipfile.ZipFile(zip_file_path, "r") as zipf: zipf.extractall(output_path) return output_path ================================================ FILE: buster/validators/__init__.py ================================================ from .base import Validator __all__ = [Validator] ================================================ FILE: buster/validators/base.py ================================================ import logging import pandas as pd from buster.llm_utils import cosine_similarity, get_openai_embedding from buster.validators.validators import ( AnswerValidator, DocumentsValidator, QuestionValidator, ) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class Validator: def __init__( self, use_reranking: bool, validate_documents: bool, question_validator_cfg=None, answer_validator_cfg=None, documents_validator_cfg=None, ): """ Initializes the Validator class. Args: use_reranking: A boolean indicating whether to use reranking. validate_documents: A boolean indicating whether to validate documents. question_validator_cfg: A configuration dictionary for the QuestionValidator. answer_validator_cfg: A configuration dictionary for the AnswerValidator. documents_validator_cfg: A configuration dictionary for the DocumentsValidator. """ self.question_validator = ( QuestionValidator(**question_validator_cfg) if question_validator_cfg is not None else QuestionValidator() ) self.answer_validator = ( AnswerValidator(**answer_validator_cfg) if answer_validator_cfg is not None else AnswerValidator() ) self.documents_validator = ( DocumentsValidator(**documents_validator_cfg) if documents_validator_cfg is not None else DocumentsValidator() ) self.use_reranking = use_reranking self.validate_documents = validate_documents def check_question_relevance(self, question: str) -> tuple[bool, str]: """ Checks the relevance of a question. Args: question: The question to be checked. Returns: A tuple containing a boolean indicating the relevance and a string describing the result. """ return self.question_validator.check_question_relevance(question) def check_answer_relevance(self, answer: str) -> bool: """ Checks the relevance of an answer. Args: answer: The answer to be checked. Returns: A boolean indicating the relevance of the answer. """ return self.answer_validator.check_answer_relevance(answer) def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame: """ Checks the relevance of documents. Args: answer: The answer to be checked. matched_documents: The DataFrame containing the matched documents. Returns: A DataFrame containing the relevance of the documents. """ return self.documents_validator.check_documents_relevance(answer, matched_documents) def rerank_docs( self, answer: str, matched_documents: pd.DataFrame, embedding_fn=get_openai_embedding ) -> pd.DataFrame: """ Reranks the matched documents based on answer similarity. Args: answer: The answer for reranking. matched_documents: The DataFrame containing the matched documents. embedding_fn: The function used to calculate document embeddings. Returns: A DataFrame containing the reranked documents. """ """Here we re-rank matched documents according to the answer provided by the llm. This score could be used to determine wether a document was actually relevant to generation. An extra column is added in-place for the similarity score. 
""" if len(matched_documents) == 0: return matched_documents logger.info("Reranking documents based on answer similarity...") answer_embedding = embedding_fn(answer) col = "similarity_to_answer" matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding)) return matched_documents.sort_values(by=col, ascending=False) ================================================ FILE: buster/validators/validators.py ================================================ import concurrent.futures import logging from typing import Callable, List, Optional import numpy as np import pandas as pd from buster.completers import ChatGPTCompleter, Completer from buster.llm_utils import cosine_similarity from buster.llm_utils.embeddings import get_openai_embedding logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) class QuestionValidator: def __init__( self, check_question_prompt: Optional[str] = None, invalid_question_response: Optional[str] = None, completion_kwargs: Optional[dict] = None, client_kwargs: Optional[dict] = None, ): if check_question_prompt is None: check_question_prompt = ( """You are a chatbot answering questions on documentation. Your job is to determine whether or not a question is valid, and should be answered. More general questions are not considered valid, even if you might know the response. A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid. For example: Q: What is backpropagation? true Q: What is the meaning of life? false A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""", ) if completion_kwargs is None: # default completion kwargs completion_kwargs = ( { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, }, ) self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs) self.check_question_prompt = check_question_prompt self.invalid_question_response = invalid_question_response def check_question_relevance(self, question: str) -> tuple[bool, str]: """Determines whether a question is relevant for our given framework.""" try: outputs, _ = self.completer.complete(self.check_question_prompt, user_input=question) outputs = outputs.strip(".").lower() if outputs not in ["true", "false"]: logger.warning(f"the question validation returned an unexpeced value: {outputs=}. Assuming Invalid...") relevance = outputs.strip(".").lower() == "true" response = self.invalid_question_response except Exception as e: logger.exception("Error during question relevance detection.") relevance = False response = "Unable to process your question at the moment, try again soon" return relevance, response class AnswerValidator: def __init__( self, unknown_response_templates: Optional[list[str]] = None, unknown_threshold: Optional[float] = None, embedding_fn: Callable[[str], np.array] = None, ): if unknown_threshold is None: unknown_threshold = 0.85 if embedding_fn is None: embedding_fn = get_openai_embedding if unknown_response_templates is None: unknown_response_templates = [ "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?", ] self.embedding_fn = embedding_fn self.unknown_response_templates = unknown_response_templates self.unknown_threshold = unknown_threshold def check_answer_relevance(self, answer: str) -> bool: """Check if a generated answer is relevant to the chatbot's knowledge.""" if answer == "": raise ValueError("Cannot compute embedding of an empty string.") unknown_embeddings = [ self.embedding_fn(unknown_response) for unknown_response in self.unknown_response_templates ] answer_embedding = self.embedding_fn(answer) unknown_similarity_scores = [ cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings ] # If any score is above the threshold, the answer is considered not relevant return not any(score > self.unknown_threshold for score in unknown_similarity_scores) class DocumentsValidator: def __init__( self, completion_kwargs: Optional[dict] = None, client_kwargs: Optional[dict] = None, system_prompt: Optional[str] = None, user_input_formatter: Optional[str] = None, max_calls: int = 30, ): if system_prompt is None: system_prompt = """ Your goal is to determine if the content of a document can be attributed to a provided answer. This means that if information in the document is found in the answer, it is relevant. Otherwise it is not. Your goal is to determine if the information contained in a document was used to generate an answer. You will be comparing a document to an answer. If the answer can be inferred from the document, return 'true'. Otherwise return 'false'. Only respond with 'true' or 'false'.""" self.system_prompt = system_prompt if user_input_formatter is None: user_input_formatter = """ answer: {answer} document: {document} """ self.user_input_formatter = user_input_formatter if completion_kwargs is None: completion_kwargs = { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, } self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs) self.max_calls = max_calls def check_document_relevance(self, answer: str, document: str) -> bool: user_input = self.user_input_formatter.format(answer=answer, document=document) output, _ = self.completer.complete(prompt=self.system_prompt, user_input=user_input) # remove trailing periods, happens sometimes... output = output.strip(".").lower() if output not in ["true", "false"]: # Default assume it's relevant if the detector didn't give one of [true, false] logger.warning(f"the validation returned an unexpected value: {output}. Assuming valid...") return True return output == "true" def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> list[bool]: """Determines wether a question is relevant or not for our given framework.""" logger.info(f"Checking document relevance of {len(matched_documents)} documents") if len(matched_documents) > self.max_calls: raise ValueError("Max calls exceeded, increase max_calls to allow this.") # Here we parallelize the calls. We introduce a wrapper as a workaround. def _check_documents(args): "Thin wrapper so we can pass args as a Tuple and use ThreadPoolExecutor." 
answer, document = args return self.check_document_relevance(answer=answer, document=document) args_list = [(answer, doc) for doc in matched_documents.content.to_list()] with concurrent.futures.ThreadPoolExecutor() as executor: relevance = list(executor.map(_check_documents, args_list)) logger.info(f"{relevance=}") # add it back to the dataframe matched_documents["relevance"] = relevance return matched_documents ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools", "setuptools-scm"] build-backend = "setuptools.build_meta" [project] name = "buster-doctalk" version = "0.0.1" description = "Buster 🤖: A chatbot for retrieval-augmented generation" readme = "README.md" requires-python = ">=3.10" dynamic = ["dependencies"] [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} [tool.setuptools.packages.find] include = ["buster"] [tool.isort] profile = "black" [tool.black] line-length = 120 [tool.pytest.ini_options] log_cli = true log_cli_level = "INFO" [tool.poetry] name = "buster-doctalk" version = "v0.0.1" description = "Buster 🤖: A chatbot for retrieval-augmented generation" license = "MIT" authors = [ "Jeremy Pinto ", "Hadrien Bertrand ", ] readme = "README.md" repository = "https://github.com/jerpint/buster" packages = [ { include = "buster" }, { include = "buster/**/*.py" }, ] [tool.poetry.dependencies] python = ">=3.10,<3.13" ================================================ FILE: requirements.txt ================================================ bs4 click deeplake gradio>=3.40 matplotlib numpy>=1.25 openai>=1.0 pandas>=2.1.3 pinecone-client>=3.0.2 pinecone-text>=0.6.0 pymongo pytest tabulate tenacity tiktoken ================================================ FILE: tests/test_chatbot.py ================================================ import copy import logging import os from pathlib import Path import numpy as np import pandas as pd import pytest from buster.busterbot import Buster, BusterConfig from buster.completers import ChatGPTCompleter, Completer, Completion, DocumentAnswerer from buster.documents_manager import DeepLakeDocumentsManager from buster.formatters.documents import DocumentsFormatterHTML from buster.formatters.prompts import PromptFormatter from buster.llm_utils import get_openai_embedding from buster.retriever import DeepLakeRetriever, Retriever from buster.tokenizers.gpt import GPTTokenizer from buster.validators import Validator logging.basicConfig(level=logging.INFO) DOCUMENTS_CSV = Path(__file__).resolve().parent.parent / "buster/examples/stackoverflow.csv" UNKNOWN_PROMPT = "I'm sorry but I don't know how to answer." NUM_WORKERS = 1 # default class used by our tests buster_cfg_template = BusterConfig( completion_cfg={ "completion_kwargs": { "model": "gpt-3.5-turbo", "temperature": 0, }, "client_kwargs": { "timeout": 20, "max_retries": 2, }, }, validator_cfg={ "validate_documents": False, "use_reranking": True, "answer_validator_cfg": { "unknown_response_templates": [ "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?", ], "unknown_threshold": 0.85, }, "question_validator_cfg": { "invalid_question_response": "This question does not seem relevant to my current knowledge.", "completion_kwargs": { "model": "gpt-3.5-turbo", "stream": False, "temperature": 0, }, "client_kwargs": { "timeout": 20, "max_retries": 2, }, "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrlevant, respond with 'false'.", }, }, retriever_cfg={ # "db_path": to be set using pytest fixture, "top_k": 3, "thresh": 0.7, "max_tokens": 2000, "embedding_fn": get_openai_embedding, }, prompt_formatter_cfg={ "max_tokens": 3500, "text_after_docs": ("""Now answer the following question:\n"""), "text_before_docs": ( """You are a chatbot assistant answering technical questions about artificial intelligence (AI). """ """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer.""" """Use this response when you cannot answer:\n""" f"""'{UNKNOWN_PROMPT}'\n""" """For example:\n""" """What is the meaning of life?\n""" f"""'{UNKNOWN_PROMPT}'\n""" """Only use these prodived documents as reference:\n""" ), }, documents_formatter_cfg={ "max_tokens": 3000, "formatter": "{content}", }, ) def get_fake_embedding(length=1536): rng = np.random.default_rng() return list(rng.random(length, dtype=np.float32)) class MockAnswerer(Completer): def __init__(self, expected_answer): self.expected_answer = expected_answer def prepare_prompt(self, user_inputs, matched_documents): pass def complete(self): return def get_completion(self, user_inputs, matched_documents, validator, *arg, **kwarg) -> Completion: return Completion( answer_text=self.expected_answer, error=False, user_inputs=user_inputs, matched_documents=matched_documents, validator=validator, ) class MockRetriever(Retriever): def __init__(self, **kwargs): super().__init__(**kwargs) path = kwargs["path"] self.path = path n_samples = 100 self.documents = pd.DataFrame.from_dict( { "title": ["test"] * n_samples, "url": ["http://url.com"] * n_samples, "content": ["cool text"] * n_samples, "embedding": [get_fake_embedding()] * n_samples, "n_tokens": [10] * n_samples, "source": ["fake source"] * n_samples, } ) self.embedding_fn = get_fake_embedding def get_documents(self, source): return self.documents def get_topk_documents(self, query: str, sources: list[str] = None, top_k: int = None) -> pd.DataFrame: documents = self.documents documents["embedding"] = [get_fake_embedding() for _ in range(len(documents))] documents["similarity"] = [np.random.random() for _ in range(len(documents))] return documents def get_source_display_name(self, source): return source class MockValidator: def __init__(self, *args, **kwargs): return def validate(self, completion): completion.answer_relevant = True return completion def check_question_relevance(self, *args, **kwargs): return True, "" def check_answer_relevance(self, *args, **kwargs): return True @pytest.fixture(scope="session") def vector_store_path(tmp_path_factory): # Create a temporary directory and folder for the database manager dm_path = tmp_path_factory.mktemp("data").joinpath("deeplake_store") # Add the documents (will generate embeddings) dm = DeepLakeDocumentsManager(vector_store_path=dm_path) df = pd.read_csv(DOCUMENTS_CSV) dm.add(df, num_workers=NUM_WORKERS) return dm_path def test_chatbot_mock_data(tmp_path, monkeypatch): 
    gpt_expected_answer = "this is GPT answer"

    path = tmp_path / "not_a_real_file.tar.gz"

    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = path
    buster_cfg.completion_cfg = {
        "expected_answer": gpt_expected_answer,
    }

    retriever = MockRetriever(**buster_cfg.retriever_cfg)
    document_answerer = MockAnswerer(**buster_cfg.completion_cfg)
    validator = MockValidator(**buster_cfg.validator_cfg)

    buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
    completion = buster.process_input(user_input="What is a transformer?", sources=["fake_source"])
    assert isinstance(completion.answer_text, str)
    assert completion.answer_text.startswith(gpt_expected_answer)


def test_chatbot_real_data__chatGPT(vector_store_path):
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path

    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)

    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
    completion = buster.process_input("What is backpropagation?")

    assert isinstance(completion.answer_text, str)
    assert completion.question_relevant == True
    assert completion.answer_relevant == True
    assert completion.completion_kwargs == buster_cfg.completion_cfg["completion_kwargs"]


def test_chatbot_real_data__chatGPT_OOD(vector_store_path):
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path
    buster_cfg.prompt_formatter_cfg = {
        "max_tokens": 3500,
        "text_before_docs": (
            """You are a chatbot assistant answering technical questions about artificial intelligence (AI). """
            """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer. """
            """Use this response: """
            f"""'{UNKNOWN_PROMPT}'\n"""
            """For example:\n"""
            """What is the meaning of life?\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """Now answer the following question:\n"""
        ),
        "text_after_docs": "Only use these documents as reference:\n",
    }

    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)

    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
    completion: Completion = buster.process_input("What is a good recipe for broccoli soup?")

    assert isinstance(completion.answer_text, str)
    assert completion.question_relevant == False
    assert completion.answer_relevant == False
    assert completion.completion_kwargs is None


def test_chatbot_real_data__no_docs_found(vector_store_path):
    with pytest.warns():
        buster_cfg = copy.deepcopy(buster_cfg_template)
        buster_cfg.retriever_cfg = {
            "path": vector_store_path,
            "embedding_fn": get_openai_embedding,
            "top_k": 3,
            "thresh": 1,  # Set threshold very high to be sure no docs are matched
            "max_tokens": 3000,
        }
        buster_cfg.documents_answerer_cfg["no_documents_message"] = "No documents available."

        retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
        tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
        document_answerer = DocumentAnswerer(
            completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
            documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
            prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
            **buster_cfg.documents_answerer_cfg,
        )
        validator: Validator = Validator(**buster_cfg.validator_cfg)

        buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
        completion = buster.process_input("What is backpropagation?")

        assert isinstance(completion.answer_text, str)
        assert completion.question_relevant == True
        assert completion.answer_relevant == False
        assert completion.answer_text == "No documents available."


================================================
FILE: tests/test_documents.py
================================================
import os

import numpy as np
import pandas as pd
import pytest

from buster.documents_manager import DeepLakeDocumentsManager
from buster.documents_manager.base import compute_embeddings_parallelized
from buster.llm_utils import get_openai_embedding
from buster.retriever import DeepLakeRetriever

NUM_WORKERS = 1

# A fixed, fake embedding returned in place of real API embeddings
fake_embedding = [-0.005, 0.0018]


def get_fake_embedding(*args, **kwargs):
    return fake_embedding


@pytest.mark.parametrize(
    "documents_manager, retriever",
    [(DeepLakeDocumentsManager, DeepLakeRetriever)],
)
def test_write_read(tmp_path, documents_manager, retriever):
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    dm_path = tmp_path / "tmp_dir_2"
    retriever_cfg["path"] = dm_path

    data = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "source": ["sourceA"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "n_tokens": 5,
        }
    )
    dm = documents_manager(vector_store_path=dm_path)
    dm.add(df=data)

    dm_data = retriever(**retriever_cfg).get_documents(sources=["sourceA"])
    assert dm_data["title"].iloc[0] == data["title"].iloc[0]
    assert dm_data["url"].iloc[0] == data["url"].iloc[0]
    assert dm_data["content"].iloc[0] == data["content"].iloc[0]
    assert dm_data["source"].iloc[0] == data["source"].iloc[0]
    assert np.allclose(dm_data["embedding"].iloc[0], data["embedding"].iloc[0])


@pytest.mark.parametrize(
    "documents_manager, retriever",
    [
        (DeepLakeDocumentsManager, DeepLakeRetriever),
    ],
)
def test_write_write_read(tmp_path, documents_manager, retriever):
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    db_path = tmp_path / "tmp_dir"
    retriever_cfg["path"] = db_path

    db = documents_manager(db_path)

    data_1 = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "source": ["sourceA"],
            "n_tokens": 10,
        }
    )
    db.add(df=data_1, num_workers=NUM_WORKERS)

    data_2 = pd.DataFrame.from_dict(
        {
            "title": ["other"],
            "url": ["http://url.com/page.html"],
            "content": ["lorem ipsum"],
            "embedding": [np.arange(10, dtype=np.float32) / 10 - 2.3],
            "source": ["sourceB"],
            "n_tokens": 5,
        }
    )
    db.add(df=data_2, num_workers=NUM_WORKERS)

    db_data = retriever(**retriever_cfg).get_documents(sources=["sourceB"])

    assert len(db_data) == len(data_2)
    assert db_data["title"].iloc[0] == data_2["title"].iloc[0]
    assert db_data["url"].iloc[0] == data_2["url"].iloc[0]
    assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
    assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])


def test_generate_embeddings(tmp_path, monkeypatch):
    # Create fake data
    df = pd.DataFrame.from_dict(
        {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
    )

    # Generate embeddings, store in a file
    path = tmp_path / "test_document_embeddings"
    dm = DeepLakeDocumentsManager(path)
    dm.add(df, embedding_fn=get_fake_embedding, num_workers=NUM_WORKERS)

    # Read the embeddings from the file
    retriever_cfg = {
        "path": path,
        "top_k": 3,
        "thresh": 0.85,
        "max_tokens": 3000,
        "embedding_fn": get_fake_embedding,
    }
    read_df = DeepLakeRetriever(**retriever_cfg).get_documents("my_source")

    # Check that the stored values match the original dataframe
    assert df["title"].iloc[0] == read_df["title"].iloc[0]
    assert df["url"].iloc[0] == read_df["url"].iloc[0]
    assert df["content"].iloc[0] == read_df["content"].iloc[0]
    assert np.allclose(fake_embedding, read_df["embedding"].iloc[0])


def test_generate_embeddings_parallelized():
    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * 5,
            "url": ["http://url.com"] * 5,
            "content": ["cool text" + str(x) for x in range(5)],
            "source": ["my_source"] * 5,
        }
    )

    embeddings_parallel = compute_embeddings_parallelized(
        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
    )
    embeddings = df.content.apply(get_openai_embedding)

    # embeddings come out as a Series because of the apply, so cast them back to an array
    embeddings_arr = np.array(embeddings.to_list())

    # Not clear why a tolerance needs to be specified, likely because it is computed on different machines
    # since the requests are done in parallel...
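    # (Embeddings returned for identical inputs can also differ slightly between API calls, so exact equality would be too strict.)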
    assert np.allclose(embeddings_parallel, embeddings_arr, atol=1e-2)


def test_add_batches(tmp_path):
    dm_path = tmp_path / "deeplake_store"
    num_samples = 20
    batch_size = 16
    csv_filename = os.path.join(tmp_path, "embedding_")

    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)

    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * num_samples,
            "url": ["http://url.com"] * num_samples,
            "content": ["cool text" + str(x) for x in range(num_samples)],
            "source": ["my_source"] * num_samples,
        }
    )

    dm.batch_add(
        df,
        embedding_fn=get_fake_embedding,
        num_workers=NUM_WORKERS,
        batch_size=batch_size,
        min_time_interval=0,
        csv_filename=csv_filename,
    )
    csv_files = [f for f in os.listdir(tmp_path) if f.endswith(".csv")]

    # check that we registered the right number of documents and that files were generated
    assert len(dm) == num_samples

    df_saved = pd.read_csv(csv_filename)
    assert len(df_saved) == num_samples
    assert "embedding" in df_saved.columns


================================================
FILE: tests/test_formatters.py
================================================
import json

import pandas as pd
import pytest

from buster.formatters.documents import DocumentsFormatterHTML, DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.tokenizers import GPTTokenizer


def test_DocumentsFormatterHTML__simple():
    """In this test, we expect all 3 documents to be matched and returned normally."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    expected_docs_str = (
        "<DOCUMENTS>"
        f"<DOCUMENT>{document_1}<\\DOCUMENT>"
        f"<DOCUMENT>{document_2}<\\DOCUMENT>"
        f"<DOCUMENT>{document_3}<\\DOCUMENT>"
        "<\\DOCUMENTS>"
    )

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # all documents fit, so the matched documents should be unchanged
    assert all(matched_documents.content == matched_documents_new.content)
    assert docs_str == expected_docs_str


def test_DocumentsFormatterJSON__simple():
    """In this test, we expect all 3 documents to be matched and returned normally."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"])

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    source_1 = "source 1"
    source_2 = "source 2"
    source_3 = "source 3"

    data_dict = {
        "content": [document_1, document_2, document_3],
        "source": [source_1, source_2, source_3],
    }

    expected_docs_str = json.dumps(
        [
            {"content": document_1, "source": source_1},
            {"content": document_2, "source": source_2},
            {"content": document_3, "source": source_3},
        ],
        separators=(",", ":"),
    )

    matched_documents = pd.DataFrame(data_dict)

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # all documents fit, so the matched documents should be unchanged
    assert all(matched_documents.content == matched_documents_new.content)
    assert docs_str == expected_docs_str
    # matched_documents.to_json(orient="records")


def test_DocumentsFormatterHTML__doc_too_long():
    """In this test, document_1 doesn't entirely fit. We only expect part of it to be included.
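    The formatter should truncate it at the token limit rather than drop it outright.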
""" tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") documents_formatter = DocumentsFormatterHTML( tokenizer=tokenizer, max_tokens=100, ) long_sentence = "This is a very long document. It is long on purpose." document_1 = long_sentence * 50 document_2 = "This is a very short document." document_3 = "This is also a short document" matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]}) docs_str, matched_documents_new = documents_formatter.format(matched_documents) # less documents and the new document is shorter than the original assert len(matched_documents) == 3 assert len(matched_documents_new) == 1 assert len(docs_str) < len(document_1) # The long document gets truncated, the others don't make it in. assert long_sentence in docs_str assert document_2 not in docs_str assert document_3 not in docs_str def test_DocumentsFormatterJSON__doc_too_long(): """In this test, document_3 doesn't fit. We expect it to be excluded completely. we only expect a part of it to be contained. """ tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"]) long_sentence = "This is a very long document. It is long on purpose." document_1 = "This is a very short document." document_2 = "This is also a short document" document_3 = long_sentence * 50 source_1 = "source 1" source_2 = "source 2" source_3 = "source 3" data_dict = { "content": [document_1, document_2, document_3], "source": [source_1, source_2, source_3], } expected_docs_str = json.dumps( [ {"content": document_1, "source": source_1}, {"content": document_2, "source": source_2}, ], separators=(",", ":"), ) matched_documents = pd.DataFrame(data_dict) docs_str, matched_documents_new = documents_formatter.format(matched_documents) assert docs_str == expected_docs_str # less documents and the new document is shorter than the original assert len(matched_documents) == 3 assert len(matched_documents_new) == 2 # The last document gets ignored completely, the first 2 make it assert document_1 in docs_str assert document_2 in docs_str assert long_sentence not in docs_str def test_DocumentsFormatterHTML__doc_to_long_2(): """In this test, document_2 doesn't entirely fit. we only expect a part of it to be contained, as well as all of document_1, and none of document_3. """ tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") documents_formatter = DocumentsFormatterHTML( tokenizer=tokenizer, max_tokens=100, ) document_1 = "This is a very short document." document_2 = "This is a very long document. It is long on purpose." * 50 document_3 = "This is also a short document" matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]}) docs_str, matched_documents_new = documents_formatter.format(matched_documents) # less documents and the new document is shorter than the original assert len(matched_documents) == 3 assert len(matched_documents_new) == 2 assert document_1 in docs_str assert "This is a very long document. It is long on purpose." in docs_str # at least a subset should be in there assert document_3 not in docs_str def test_DocumentsFormatterHTML__complex_format(): """In this test, we expect all 3 documents to be matched and returned in a particular format.""" tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") documents_formatter = DocumentsFormatterHTML( tokenizer=tokenizer, max_tokens=100, formatter="Title: {title}\n{content}\n", ) document_1 = "This is a very short document." 
document_2 = "This is another very short document." document_3 = "This is also a short document." title_1 = "doc1" title_2 = "doc2" title_3 = "doc3" country_1 = "Canada" country_2 = "France" country_3 = "Germany" expected_docs_str = ( "" f"Title: {title_1}\n{document_1}\n<\\DOCUMENT>" f"Title: {title_2}\n{document_2}\n<\\DOCUMENT>" f"Title: {title_3}\n{document_3}\n<\\DOCUMENT>" "<\\DOCUMENTS>" ) matched_documents = pd.DataFrame( { "content": [document_1, document_2, document_3], "title": [title_1, title_2, title_3], "country": [country_1, country_2, country_3], } ) docs_str, matched_documents_new = documents_formatter.format(matched_documents) # less documents and the new document is shorter than the original assert all(matched_documents.content == matched_documents_new.content) assert docs_str == expected_docs_str def test_system_prompt_formatter(): tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") prompt_formatter = PromptFormatter( tokenizer=tokenizer, max_tokens=200, text_after_docs="After docs.", text_before_docs="Before docs.", formatter="{text_before_docs}\n{documents}\n{text_after_docs}", ) documents = "Here are some docs" prompt = prompt_formatter.format(documents) assert prompt == ("Before docs.\n" "Here are some docs\n" "After docs.") assert documents in prompt def test_system_prompt_formatter__to_long(): tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo") prompt_formatter = PromptFormatter( tokenizer=tokenizer, max_tokens=200, text_after_docs="After docs.", text_before_docs="Before docs.", ) documents = "Here are some documents that are WAY too long." * 100 with pytest.raises(ValueError): prompt_formatter.format(documents) ================================================ FILE: tests/test_read_write.py ================================================ import pandas as pd from buster.completers import Completion, UserInputs class MockValidator: def __init__(self): self.use_reranking = True def check_answer_relevance(self, completion: Completion) -> bool: return True def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> bool: return matched_documents def test_read_write_completion(): n_samples = 3 completion_kwargs = {"param_1": "a"} matched_documents = pd.DataFrame.from_dict( { "title": ["test"] * n_samples, "url": ["http://url.com"] * n_samples, "content": ["cool text"] * n_samples, "embedding": [[0.0] * 1000] * n_samples, "n_tokens": [10] * n_samples, "source": ["fake source"] * n_samples, } ) c = Completion( user_inputs=UserInputs(original_input="What is the meaning of life?"), error=False, answer_text="This is my actual answer", matched_documents=matched_documents, validator=MockValidator(), completion_kwargs=completion_kwargs, ) c_json = c.to_json() c_back = Completion.from_dict(c_json) assert c.error == c_back.error assert c.answer_text == c_back.answer_text assert c.user_inputs == c_back.user_inputs assert c.answer_relevant == c_back.answer_relevant assert c.completion_kwargs == c_back.completion_kwargs for col in c_back.matched_documents.columns.tolist(): assert col in c.matched_documents.columns.tolist() assert c_back.matched_documents[col].tolist() == c.matched_documents[col].tolist() ================================================ FILE: tests/test_validator.py ================================================ import pandas as pd from buster.llm_utils import get_openai_embedding from buster.validators import Validator validator_cfg = { "use_reranking": True, "validate_documents": True, "answer_validator_cfg": { "unknown_response_templates": [ "I 
            "I don't know how to answer your question.",
        ],
        "unknown_threshold": 0.85,
    },
    "question_validator_cfg": {
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
        "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true'; if it is irrelevant, respond with 'false'.",
    },
}
validator = Validator(**validator_cfg)


def test_validator_check_question_relevance():
    question = "What is backpropagation?"
    relevance, _ = validator.check_question_relevance(question)
    assert relevance == True

    question = "How can I make a broccoli soup?"
    relevance, _ = validator.check_question_relevance(question)
    assert relevance == False


def test_validator_check_answer_relevance():
    answer = "Not sure how to answer your question"
    assert validator.check_answer_relevance(answer) == False

    answer = "According to the documentation, the answer should be 2+2 = 4."
    assert validator.check_answer_relevance(answer) == True


def test_validator_check_documents_relevance():
    docs = {
        "content": [
            "A panda is a bear native to China, known for its black and white fur.",
            "An apple is a sweet fruit, often red, green, or yellow in color.",
            "A car is a wheeled vehicle used for transportation, typically powered by an engine.",
        ]
    }
    answer = "Pandas live in China."
    expected_relevance = [True, False, False]

    matched_documents = pd.DataFrame(docs)
    matched_documents = validator.check_documents_relevance(answer=answer, matched_documents=matched_documents)

    assert "relevance" in matched_documents.columns
    assert matched_documents.relevance.to_list() == expected_relevance


def test_validator_rerank_docs():
    documents = [
        "A basketball player practicing",
        "A cat eating an orange",
        "A green apple on the counter",
    ]
    matched_documents = pd.DataFrame({"documents": documents})
    matched_documents["embedding"] = matched_documents.documents.apply(get_openai_embedding)

    answer = "An apple is a delicious fruit."
    reranked_documents = validator.rerank_docs(answer, matched_documents)

    assert reranked_documents.documents.to_list() == [
        "A green apple on the counter",
        "A cat eating an orange",
        "A basketball player practicing",
    ]
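
# Note: an illustrative sketch, not part of the library. `rerank_docs` above is exercised
# as a black box; assuming it embeds the answer (e.g. with get_openai_embedding) and then
# orders documents by cosine similarity to that embedding, the similarity-and-sort step
# would look roughly like the hypothetical helper below.
import numpy as np


def rerank_by_cosine(answer_embedding: list[float], matched_documents: pd.DataFrame) -> pd.DataFrame:
    """Order documents by descending cosine similarity to the answer embedding (sketch only)."""
    doc_matrix = np.array(matched_documents.embedding.to_list())  # shape: (n_docs, dim)
    answer_vec = np.array(answer_embedding)
    # cosine similarity = dot product of the L2-normalized vectors
    sims = doc_matrix @ answer_vec / (np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(answer_vec))
    # highest-similarity documents first
    return matched_documents.assign(similarity=sims).sort_values("similarity", ascending=False)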