Repository: jerpint/buster
Branch: main
Commit: 07b6bb893f47
Files: 44
Total size: 152.0 KB

Directory structure:
gitextract_5l_frr4b/

├── .github/
│   └── workflows/
│       ├── publish_pypi.yaml
│       └── tests.yaml
├── .gitignore
├── LICENSE.md
├── README.md
├── buster/
│   ├── __init__.py
│   ├── busterbot.py
│   ├── completers/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── chatgpt.py
│   │   └── user_inputs.py
│   ├── documents_manager/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── deeplake.py
│   │   └── service.py
│   ├── examples/
│   │   ├── cfg.py
│   │   ├── generate_embeddings.py
│   │   ├── gradio_app.py
│   │   └── stackoverflow.csv
│   ├── formatters/
│   │   ├── documents.py
│   │   └── prompts.py
│   ├── llm_utils/
│   │   ├── __init__.py
│   │   ├── embeddings.py
│   │   └── question_reformulator.py
│   ├── parsers/
│   │   ├── __init__.py
│   │   └── parser.py
│   ├── retriever/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── deeplake.py
│   │   └── service.py
│   ├── tokenizers/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── gpt.py
│   ├── utils.py
│   └── validators/
│       ├── __init__.py
│       ├── base.py
│       └── validators.py
├── pyproject.toml
├── requirements.txt
└── tests/
    ├── test_chatbot.py
    ├── test_documents.py
    ├── test_formatters.py
    ├── test_read_write.py
    └── test_validator.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/publish_pypi.yaml
================================================
name: publish-pypi

on:
  workflow_dispatch:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest
    environment: secrets
    steps:
    - uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install poetry
    - name: Build and publish
      env:
        POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }}
      run: |
        poetry version $(git describe --tags --abbrev=0)
        poetry add $(cat requirements.txt)
        poetry build
        poetry publish


================================================
FILE: .github/workflows/tests.yaml
================================================
name: Tests

on: [pull_request]

jobs:
  tests:
    runs-on: ubuntu-latest
    environment: secrets
    steps:
      - name: Check out repository code
        uses: actions/checkout@v3
      - name: black linter
        uses: psf/black@stable
        with:
          options: "--check --diff --line-length 120"
      - name: isort
        run: |
          pip install isort
          isort --profile black --check-only .
      - name: unit tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python3 -m pip install --upgrade pip
          pip install -e .
          pytest


================================================
FILE: .gitignore
================================================
# database files
*.db

buster/apps/data/
deeplake_store/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Macos
*.DS_Store*

albenchmark/data/

# Ignore notebooks by default
*.ipynb

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# VSCode
.vscode/


================================================
FILE: LICENSE.md
================================================
MIT License

Copyright (c) 2023 Buster dev team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# Buster, the QA documentation chatbot!

<div align="center">

[![GitHub](https://img.shields.io/github/license/jerpint/buster)](https://github.com/jerpint/buster)
[![PyPI](https://img.shields.io/pypi/v/buster-doctalk?logo=pypi)](https://pypi.org/project/buster-doctalk)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Buster%20Demo-blue)](https://huggingface.co/spaces/jerpint/buster)

</div>

Buster is a question-answering chatbot that can be tuned to any source of documentations.

# Demo

In order to view the full abilities of Buster, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster).
We scraped the documentation of [huggingface 🤗 Transformers](https://huggingface.co/docs/transformers/index) and instructed Buster to answer questions related to its usage.

# Quickstart

This section is meant to help you install and run local version of Buster.
First step, install buster:

**Note**: Buster requires python>=3.10

```bash
pip install buster-doctalk
```

Then, go to the examples folder and launch the app.
We've included small sample data off stackoverflow-ai questions that you can test your setup with to try app:

```bash
cd buster/buster/examples
gradio gradio_app.py
```

This will launch the gradio app locally.


**NOTE**: The demo uses chatGPT to generate text and compute embeddings, make sure to set a valid openai API key:
```bash
export OPENAI_API_KEY=sk-...
```

# Generating your own embeddings

Once your local version of Buster is up and running, the next step is for you to be able to import your own data.
We will be using the `stackoverflow.csv` file in the `buster/examples/` folder for this. This is the same data that was used to generate the demo app's embeddings.

You will first ingest the documents to be ready for buster. In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentManager`:


```python
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager

# Read the csv
df = pd.read_csv("stackoverflow.csv")

# Generate the embeddings for our documents and store them in a deeplake format
dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True)
dm.add(df)
```

You can also just simply run the script:

    python generate_embeddings.py --csv stackoverflow.csv


This will generate the embeddings and save them locally in the `deeplake_store`.


**NOTE**: You will need to set a valid openai key for computing embeddings:

```bash
export OPENAI_API_KEY=sk-...
```

You only need to run this operation one time.

In the .csv, we expect columns ["title", "url", "content", "source"] for each row of the csv:

* title: this will be the title of the url to display
* url: the link that clicking the title will redirect to
* source: where the content was originally sourced from (e.g. wikipedia, stackoverflow, etc.)
* content: plaintext of the documents to be embedded. It is your responsibility to chunk your documents appropriately. For better results, we recommend chunks of 400-600 words.

# Additional Configurations

Properly prompting models as well as playing around with various model parameters can lead to different results. We use a `BusterConfig` object to keep track of the various Buster configurations. In the `buster/examples/` folder, the config is stored inside `cfg.py`. Modify this config to update parameters, prompts, etc.

# How does Buster work?

First, we parsed the documentation into snippets. For each snippet, we obtain an embedding by using the [OpenAI API](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings).

Then, when a user asks a question, we compute its embedding, and find the snippets from the doc with the highest cosine similarity to the question.

Finally, we craft the prompt:
- The most relevant snippets from the doc.
- The engineering prompt.
- The user's question.

We send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!

### Currently available models

- For embeddings: "text-embedding-ada-002"
- For completion: We support both "gpt-3.5-turbo" and "gpt-4"

### Livestream

For more information, you can watch the livestream where explain how buster works in detail!

- [Livestream recording](https://youtu.be/LB5g-AhfPG8)

================================================
FILE: buster/__init__.py
================================================


================================================
FILE: buster/busterbot.py
================================================
import logging
from dataclasses import dataclass, field
from typing import Optional

import pandas as pd

from buster.completers import Completion, DocumentAnswerer, UserInputs
from buster.llm_utils import QuestionReformulator, get_openai_embedding
from buster.retriever import Retriever
from buster.validators import Validator

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class BusterConfig:
    """Configuration object for a chatbot."""

    validator_cfg: dict = field(
        default_factory=lambda: {
            "use_reranking": True,
            "validate_documents": False,
        }
    )
    tokenizer_cfg: dict = field(
        default_factory=lambda: {
            "model_name": "gpt-3.5-turbo",
        }
    )
    retriever_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3000,
            "top_k": 3,
            "thresh": 0.7,
            "embedding_fn": get_openai_embedding,
        }
    )
    prompt_formatter_cfg: dict = field(
        default_factory=lambda: {
            "max_tokens": 3500,
            "text_before_docs": "You are a chatbot answering questions.\n",
            "text_after_docs": "Answer the following question:\n",
            "formatter": "{text_before_docs}\n{documents}\n{text_after_docs}",
        }
    )
    documents_formatter_cfg: dict = (
        field(
            default_factory=lambda: {
                "max_tokens": 3500,
                "formatter": "{content}",
            }
        ),
    )
    documents_answerer_cfg: dict = field(
        default_factory=lambda: {
            "no_documents_message": "No documents are available for this question.",
        }
    )
    question_reformulator_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "system_prompt": """
            Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
            Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.""",
        }
    )
    completion_cfg: dict = field(
        default_factory=lambda: {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "temperature": 0,
                "stream": True,
            },
        }
    )


class Buster:
    def __init__(
        self,
        retriever: Retriever,
        document_answerer: DocumentAnswerer,
        validator: Validator,
        question_reformulator: Optional[QuestionReformulator] = None,
    ):
        self.document_answerer = document_answerer
        self.retriever = retriever
        self.validator = validator
        self.question_reformulator = question_reformulator

    def process_input(
        self,
        user_input: str,
        sources: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        reformulate_question: Optional[bool] = False,
    ) -> Completion:
        """
        Main function to process the input question and generate a formatted output.
        """

        logger.info(f"User Input:\n{user_input}")

        # We make sure there is always a newline at the end of the question to avoid completing the question.
        if not user_input.endswith("\n"):
            user_input += "\n"

        user_inputs = UserInputs(original_input=user_input)

        # The returned message is either a generic invalid question message or an error handling message
        question_relevant, irrelevant_question_message = self.validator.check_question_relevance(user_input)

        if question_relevant:
            # question is relevant, get completor to generate completion

            # reformulate the question if a reformulator is defined
            if self.question_reformulator is not None and reformulate_question:
                reformulated_input, reformulation_error = self.question_reformulator.reformulate(
                    user_inputs.original_input
                )
                user_inputs.reformulated_input = reformulated_input

                if reformulation_error:
                    completion = Completion(
                        error=True,
                        user_inputs=user_inputs,
                        matched_documents=pd.DataFrame(),
                        answer_text="Something went wrong reformulating the question. Try again soon.",
                        answer_relevant=False,
                        question_relevant=False,
                        validator=self.validator,
                    )
                    return completion

            # Retrieve and answer
            matched_documents = self.retriever.retrieve(user_inputs, sources=sources, top_k=top_k)
            completion: Completion = self.document_answerer.get_completion(
                user_inputs=user_inputs,
                matched_documents=matched_documents,
                validator=self.validator,
                question_relevant=question_relevant,
            )
            return completion

        else:
            # question was determined irrelevant, so we instead return a generic response set by the user.
            completion = Completion(
                error=False,
                user_inputs=user_inputs,
                matched_documents=pd.DataFrame(),
                answer_text=irrelevant_question_message,
                answer_relevant=False,
                question_relevant=False,
                validator=self.validator,
            )
            return completion


================================================
FILE: buster/completers/__init__.py
================================================
from .base import Completer, Completion, DocumentAnswerer
from .chatgpt import ChatGPTCompleter
from .user_inputs import UserInputs

__all__ = [
    ChatGPTCompleter,
    Completer,
    Completion,
    DocumentAnswerer,
    UserInputs,
]


================================================
FILE: buster/completers/base.py
================================================
import io
import logging
import warnings
from abc import ABC, abstractmethod
from typing import Any, Iterator, Optional

import pandas as pd
from fastapi.encoders import jsonable_encoder

from buster.completers.user_inputs import UserInputs
from buster.formatters.documents import DocumentsFormatter
from buster.formatters.prompts import PromptFormatter

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class Completion:
    """
    A class to represent the completion object of a model's output for a user's question.

    Attributes:
        error (bool): A boolean indicating if an error occurred when generating the completion.
        user_inputs (UserInputs): The inputs from the user.
        matched_documents (pd.DataFrame): The documents that were matched to the user's question.
        answer_generator (Iterator): An optional iterator used to generate the model's answer.
        answer_text (str): An optional answer text.
        answer_relevant (bool): An optional boolean indicating if the answer is relevant.
        question_relevant (bool): An optional boolean indicating if the question is relevant.
        completion_kwargs (dict): Optional arguments for the completion.
        validator (Validator): An optional Validator object.

    Methods:
        __repr__: Outputs a string representation of the object.
        _validate_arguments: Validates answer_generator and answer_text arguments.
        answer_relevant: Determines if the answer is relevant or not.
        question_relevant: Retrieves the relevance of the question.
        answer_text: Retrieves the answer text.
        answer_generator: Retrieves the answer generator.
        postprocess: Postprocesses the results after generating the model's answer.
        to_json: Outputs selected attributes of the object in JSON format.
        from_dict: Creates a Completion object from a dictionary.
    """

    def __init__(
        self,
        error: bool,
        user_inputs: UserInputs,
        matched_documents: pd.DataFrame,
        answer_generator: Optional[Iterator] = None,
        answer_text: Optional[str] = None,
        answer_relevant: Optional[bool] = None,
        question_relevant: Optional[bool] = None,
        completion_kwargs: Optional[dict] = None,
        validator=None,
    ):
        self.error = error
        self.user_inputs = user_inputs
        self.matched_documents = matched_documents
        self.validator = validator
        self.completion_kwargs = completion_kwargs
        self._answer_relevant = answer_relevant
        self._question_relevant = question_relevant

        self._validate_arguments(answer_generator, answer_text)

    def __repr__(self):
        class_name = type(self).__name__
        return (
            f"{class_name}("
            f"user_inputs={self.user_inputs!r}, "
            f"error={self.error!r}, "
            f"matched_documents={self.matched_documents!r}, "
            f"answer_text={self._answer_text!r}, "
            f"answer_generator={self.answer_generator!r}, "
            f"answer_relevant={self._answer_relevant!r}, "
            f"question_relevant={self.question_relevant!r}, "
            f"completion_kwargs={self.completion_kwargs!r}, "
            "),"
        )

    def _validate_arguments(self, answer_generator: Optional[Iterator], answer_text: Optional[str]):
        """Sets answer_generator and answer_text properties depending on the provided inputs.

        Checks that one of either answer_generator or answer_text is not None.
        If answer_text is set, a generator can simply be inferred from answer_text.
        If answer_generator is set, answer_text will be set only once the generator gets called. Set to None for now.
        """
        if (answer_generator is None and answer_text is None) or (
            answer_generator is not None and answer_text is not None
        ):
            raise ValueError("Only one of 'answer_generator' and 'answer_text' must be set.")

        # If text is provided, the genrator can be inferred
        if answer_text is not None:
            assert isinstance(answer_text, str)
            answer_generator = (msg for msg in answer_text)

        self._answer_text = answer_text
        self._answer_generator = answer_generator

    @property
    def answer_relevant(self) -> bool:
        """Property determining the relevance of an answer (bool).

        If an error occured, the relevance is False.
        If no documents were retrieved, the relevance is also False.
        Otherwise, the relevance is computed as defined by the validator (e.g. comparing to embeddings)
        """
        if self.error:
            self._answer_relevant = False
        elif len(self.matched_documents) == 0:
            self._answer_relevant = False
        elif self._answer_relevant is not None:
            return self._answer_relevant
        else:
            # Check the answer relevance by looking at the embeddings
            self._answer_relevant = self.validator.check_answer_relevance(self.answer_text)
        return self._answer_relevant

    @property
    def question_relevant(self):
        """Property determining the relevance of the question asked (bool)."""
        return self._question_relevant

    @property
    def answer_text(self):
        if self._answer_text is None:
            # generates the text if it wasn't already generated
            self._answer_text = "".join([i for i in self.answer_generator])
        return self._answer_text

    @answer_text.setter
    def answer_text(self, value: str) -> None:
        self._answer_text = value

    @property
    def answer_generator(self):
        # keeps track of the yielded text
        self._answer_text = ""
        for token in self._answer_generator:
            self._answer_text += token
            yield token

        self.postprocess()

    @answer_generator.setter
    def answer_generator(self, generator: Iterator) -> None:
        self._answer_generator = generator

    def postprocess(self):
        """Function executed after the answer text is generated by the answer_generator"""

        if self.validator is None:
            # TODO: This should only happen if declaring a Completion using .from_dict() method.
            # This behaviour is not ideal and we may want to remove support for .from_dict() in the future.
            logger.info("No validator was set, skipping postprocessing.")
            return

        if self.validator.use_reranking:
            # rerank docs in order of cosine similarity to the question
            self.matched_documents = self.validator.rerank_docs(
                answer=self.answer_text, matched_documents=self.matched_documents
            )

        if self.validator.validate_documents:
            self.matched_documents = self.validator.check_documents_relevance(
                answer=self.answer_text, matched_documents=self.matched_documents
            )

        # access the property so it gets set if not computed alerady
        self.answer_relevant

    def to_json(self, columns_to_ignore: Optional[list[str]] = None) -> Any:
        """Converts selected attributes of the object to a JSON format.

        Args:
            columns_to_ignore (list[str]): A list of column names to ignore in the csulting matched_documents dataframe.

        Returns:
            Any: The object's attributes encoded as JSON.

        Notes:
            - The 'matched_documents' attribute of type pd.DataFrame is encoded separately
            using a custom encoder.
            - The resulting JSON may exclude specified columns based on the 'columns_to_ignore' parameter.
        """

        def encode_df(df: pd.DataFrame) -> dict:
            if columns_to_ignore is not None:
                df = df.drop(columns=columns_to_ignore, errors="ignore")
            return df.to_json(orient="index")

        custom_encoder = {
            # Converts the matched_documents in the user_responses to json
            pd.DataFrame: encode_df,
        }

        to_encode = {
            "user_inputs": self.user_inputs,
            "answer_text": self.answer_text,
            "matched_documents": self.matched_documents,
            "answer_relevant": self.answer_relevant,
            "question_relevant": self.question_relevant,
            "completion_kwargs": self.completion_kwargs,
            "error": self.error,
        }
        return jsonable_encoder(to_encode, custom_encoder=custom_encoder)

    @classmethod
    def from_dict(cls, completion_dict: dict):
        # Map a dict of user inputs to the UserInputs class
        if isinstance(completion_dict["user_inputs"], dict):
            completion_dict["user_inputs"] = UserInputs(**completion_dict["user_inputs"])

        # Map the matched documents back to a dataframe
        if isinstance(completion_dict["matched_documents"], str):
            # avoids deprecation warning
            json_data = io.StringIO(completion_dict["matched_documents"])

            completion_dict["matched_documents"] = pd.read_json(json_data, orient="index")
        elif isinstance(completion_dict["matched_documents"], dict):
            completion_dict["matched_documents"] = pd.DataFrame(completion_dict["matched_documents"]).T
        else:
            raise ValueError(f"Unknown type for matched_documents: {type(completion_dict['matched_documents'])}")

        return cls(**completion_dict)


class Completer(ABC):
    """
    Abstract base class for completers, which generate an answer to a prompt.

    Methods:
        complete: The method that should be implemented by any child class to provide an answer to a prompt.
    """

    @abstractmethod
    def complete(self, prompt: str, user_input) -> (str | Iterator, bool):
        """Returns the completed message (can be a generator), and a boolean to indicate if an error occured or not."""
        ...


class DocumentAnswerer:
    """
    A class that answers questions based on documents.

    It takes care of formatting the prompts and the documents, and generating the answer when relevant.

    Attributes:
        completer (Completer): Object that actually generates an answer to the prompt.
        documents_formatter (DocumentsFormatter): Object that formats the documents for the prompt.
        prompt_formatter (PromptFormatter): Object that prepares the prompt for the completer.
        no_documents_message (str): Message to display when no documents are found to match the query.
        completion_class (Completion): Class to use for the resulting completion.

    Methods:
        prepare_prompt: Prepares the prompt that will be passed to the completer.
        get_completion: Generates a completion to the user's question based on matched documents.
    """

    def __init__(
        self,
        documents_formatter: DocumentsFormatter,
        prompt_formatter: PromptFormatter,
        completer: Completer,
        completion_class: Completion = Completion,
        no_documents_message: str = "No documents were found that match your question.",
    ):
        self.completer = completer
        self.documents_formatter = documents_formatter
        self.prompt_formatter = prompt_formatter
        self.no_documents_message = no_documents_message
        self.completion_class = completion_class

    def prepare_prompt(self, matched_documents) -> str:
        """Prepare the prompt with prompt engineering.

        A user's question is not included here. We use the documents formatter and prompt formatter to
        compose the prompt itself.
        """

        # format the matched documents, (will truncate them if too long)
        formatted_documents, _ = self.documents_formatter.format(matched_documents)
        prompt = self.prompt_formatter.format(formatted_documents)
        return prompt

    def get_completion(
        self,
        user_inputs: UserInputs,
        matched_documents: pd.DataFrame,
        validator,
        question_relevant: bool = True,
    ) -> Completion:
        """Generate a completion to a user's question based on matched documents.

        It is safe to assume the question_relevance to be True if we made it here."""

        logger.info(f"{user_inputs=}")

        if len(matched_documents) == 0:
            warning_msg = "No documents found during retrieval."
            warnings.warn(warning_msg)
            logger.warning(warning_msg)

            # empty dataframe
            matched_documents = pd.DataFrame(columns=matched_documents.columns)

            # because we are requesting a completion, we assume the question is relevant.
            # However, no documents were found, so we pass the no documents found message instead of generating the answer.
            # The completion does not get triggered, so we do not pass completion kwargs here either.
            completion = self.completion_class(
                user_inputs=user_inputs,
                answer_text=self.no_documents_message,
                error=False,
                matched_documents=matched_documents,
                question_relevant=question_relevant,
                validator=validator,
            )
            return completion

        # prepare the prompt with matched documents
        prompt = self.prepare_prompt(matched_documents)
        logger.info(f"{prompt=}")

        logger.info(f"querying model with parameters: {self.completer.completion_kwargs}...")

        try:
            answer_generator, error = self.completer.complete(prompt=prompt, user_input=user_inputs.current_input)

        except Exception as e:
            error = True
            answer_generator = "Something went wrong with the request, try again soon!"
            logger.exception("Unknown error when attempting to generate response. See traceback:")

        completion = self.completion_class(
            answer_generator=answer_generator,
            error=error,
            matched_documents=matched_documents,
            user_inputs=user_inputs,
            question_relevant=question_relevant,
            validator=validator,
            completion_kwargs=self.completer.completion_kwargs,
        )

        return completion


================================================
FILE: buster/completers/chatgpt.py
================================================
import logging
import os
from typing import Iterator, Optional

import openai
from openai import OpenAI

from buster.completers import Completer

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Check if an API key exists for promptlayer, if it does, use it
promptlayer_api_key = os.environ.get("PROMPTLAYER_API_KEY")
if promptlayer_api_key:
    # TODO: Check if this still works with latest openAI API...
    try:
        import promptlayer

        logger.info("Enabling prompt layer...")
        promptlayer.api_key = promptlayer_api_key

        # replace openai with the promptlayer wrapper
        openai = promptlayer.openai
    except Exception as e:
        logger.exception("Something went wrong enabling promptlayer.")


class ChatGPTCompleter(Completer):
    def __init__(self, completion_kwargs: dict, client_kwargs: Optional[dict] = None):
        """Initialize the ChatGPTCompleter with completion and client keyword arguments.

        Args:
          completion_kwargs: A dictionary of keyword arguments to be used for completions.
          client_kwargs: An optional dictionary of keyword arguments to be used for the OpenAI client.
        """
        # use default client if none passed
        self.completion_kwargs = completion_kwargs

        if client_kwargs is None:
            client_kwargs = {}

        self.client = OpenAI(**client_kwargs)

    def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str | Iterator, bool):
        """Given a prompt and user input, returns the generated message and error flag.

        Args:
          prompt: The prompt containing the formatted documents and instructions.
          user_input: The user input to be responded to.
          completion_kwargs: An optional dictionary of keyword arguments to override the default completion kwargs.

        Returns:
          A tuple containing the completed message and a boolean indicating if an error occurred.

        Raises:
          openai.BadRequestError: If the completion request is invalid.
          openai.RateLimitError: If the OpenAI servers are overloaded.
        """
        # Uses default configuration if not overridden

        if completion_kwargs is None:
            completion_kwargs = self.completion_kwargs

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_input},
        ]

        try:
            error = False
            response = self.client.chat.completions.create(messages=messages, **completion_kwargs)
        except openai.BadRequestError:
            error = True
            logger.exception("Invalid request to OpenAI API. See traceback:")
            error_message = "Something went wrong while connecting with OpenAI, try again soon!"
            return error_message, error

        except openai.RateLimitError:
            error = True
            logger.exception("RateLimit error from OpenAI. See traceback:")
            error_message = "OpenAI servers seem to be overloaded, try again later!"
            return error_message, error

        except Exception as e:
            error = True
            logger.exception("Some kind of error happened trying to generate the response. See traceback:")
            error_message = "Something went wrong with connecting with OpenAI, try again soon!"
            return error_message, error

        if completion_kwargs.get("stream") is True:
            # We are entering streaming mode, so here we're just wrapping the streamed
            # openai response to be easier to handle later
            def answer_generator():
                for chunk in response:
                    token = chunk.choices[0].delta.content

                    # Always stream a string, openAI returns None on last token
                    token = "" if token is None else token

                    yield token

            return answer_generator(), error

        else:
            full_response: str = response.choices[0].message.content
            return full_response, error


================================================
FILE: buster/completers/user_inputs.py
================================================
from dataclasses import dataclass
from typing import Optional


@dataclass
class UserInputs:
    """A class that represents user inputs.

    Attributes:
        original_input: The original user input.
        reformulated_input: The reformulated user input (optional).
    """

    original_input: str
    reformulated_input: Optional[str] = None

    @property
    def current_input(self):
        """Returns the current user input.

        If the reformulated input is not None, it returns the reformulated input.
        Otherwise, it returns the original input.

        Returns:
            The current user input.
        """
        return self.reformulated_input if self.reformulated_input is not None else self.original_input


================================================
FILE: buster/documents_manager/__init__.py
================================================
from .base import DocumentsManager
from .deeplake import DeepLakeDocumentsManager
from .service import DocumentsService

__all__ = [DocumentsManager, DocumentsService, DeepLakeDocumentsManager]


================================================
FILE: buster/documents_manager/base.py
================================================
import logging
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Callable, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

from buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding

tqdm.pandas()

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class DocumentsManager(ABC):
    def __init__(self, required_columns: Optional[list[str]] = None):
        """
        Constructor for DocumentsManager class.

        Args:
            required_columns (Optional[list[str]]): A list of column names that are required for the dataframe to contain.
                                                     If None, no columns are enforced.
        """

        self.required_columns = required_columns

    def _check_required_columns(self, df: pd.DataFrame):
        """Each entry in the df is expected to have the columns in self.required_columns"""
        if not all(col in df.columns for col in self.required_columns):
            raise ValueError(f"DataFrame is missing one or more of {self.required_columns=}")

    def _checkpoint_csv(self, df, csv_filename: str, csv_overwrite: bool = True):
        """
        Saves DataFrame with embeddings to a CSV checkpoint.

        Args:
            df (pd.DataFrame): The DataFrame with embeddings.
            csv_filename (str): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True.
        """
        import os

        if csv_overwrite:
            df.to_csv(csv_filename)
            logger.info(f"Saved DataFrame with embeddings to {csv_filename}")

        else:
            if os.path.exists(csv_filename):
                # append to existing file
                append_df = pd.read_csv(csv_filename)
                append_df = pd.concat([append_df, df])
            else:
                # will create the new file
                append_df = df.copy()
            append_df.to_csv(csv_filename)
            logger.info(f"Appending DataFrame embeddings to {csv_filename}")

    def add(
        self,
        df: pd.DataFrame,
        num_workers: int = 16,
        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,
        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,
        csv_filename: Optional[str] = None,
        csv_overwrite: bool = True,
        **add_kwargs,
    ):
        """Write documents from a DataFrame into the DocumentManager store.

        This method adds documents from the provided DataFrame to the database. It performs the following steps:
        1. Checks if the required columns are present in the DataFrame.
        2. Computes embeddings for the 'content' column if they are not already present.
        3. Optionally saves the DataFrame with computed embeddings to a CSV checkpoint.
        4. Calls the '_add_documents' method to add documents with embeddings to the DocumentsManager.

        Args:
            df (pd.DataFrame): The DataFrame containing the documents to be added.
            num_workers (int, optional): The number of parallel workers to use for computing embeddings. Default is 32.
            embedding_fn (callable, optional): A function that computes embeddings for a given input string.
                Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.
            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
                Default is None. Only use if you want sparse embeddings.
            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True.
            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.
        """

        if self.required_columns is not None:
            self._check_required_columns(df)

        # Check if embeddings are present, computes them if not
        if "embedding" not in df.columns:
            df["embedding"] = compute_embeddings_parallelized(df, embedding_fn=embedding_fn, num_workers=num_workers)
        if "sparse_embedding" not in df.columns and sparse_embedding_fn is not None:
            df["sparse_embedding"] = sparse_embedding_fn(df.content.to_list())

        if csv_filename is not None:
            self._checkpoint_csv(df, csv_filename=csv_filename, csv_overwrite=csv_overwrite)

        self._add_documents(df, **add_kwargs)

    def batch_add(
        self,
        df: pd.DataFrame,
        batch_size: int = 3000,
        min_time_interval: int = 60,
        num_workers: int = 16,
        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,
        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,
        csv_filename: Optional[str] = None,
        csv_overwrite: bool = False,
        **add_kwargs,
    ):
        """
        Adds DataFrame data to a DataManager instance in batches.

        This function takes a DataFrame and adds its data to a DataManager instance in batches.
        It ensures that a minimum time interval is maintained between successive batches
        to prevent timeouts or excessive load. This is useful for APIs like openAI with rate limits.

        Args:
            df (pd.DataFrame): The input DataFrame containing data to be added.
            batch_size (int, optional): The size of each batch. Defaults to 3000.
            min_time_interval (int, optional): The minimum time interval (in seconds) between batches.
                                                Defaults to 60.
            num_workers (int, optional): The number of parallel workers to use when adding data.
                                        Defaults to 32.
            embedding_fn (callable, optional): A function that computes embeddings for a given input string.
                Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.
            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
                Default is None. Only use if you want sparse embeddings.
            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.
            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to False.
                When using batches, set to False to keep all embeddings in the same file. You may want to manually remove the file if experimenting.
            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.
        """

        total_batches = (len(df) // batch_size) + 1

        logger.info(f"Adding {len(df)} documents with {batch_size=} for {total_batches=}")

        for batch_idx in range(total_batches):
            logger.info(f"Processing batch {batch_idx + 1}/{total_batches}")
            start_time = time.time()

            # Calculate batch indices and extract batch DataFrame
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(df))
            batch_df = df.iloc[start_idx:end_idx]

            # Add the batch data to using specified parameters
            self.add(
                batch_df,
                num_workers=num_workers,
                csv_filename=csv_filename,
                csv_overwrite=csv_overwrite,
                embedding_fn=embedding_fn,
                sparse_embedding_fn=sparse_embedding_fn,
                **add_kwargs,
            )

            elapsed_time = time.time() - start_time

            # Sleep to ensure the minimum time interval is maintained
            # Only sleep if it's not the last iteration
            if batch_idx < total_batches - 1:
                sleep_time = max(0, min_time_interval - elapsed_time)
                if sleep_time > 0:
                    logger.info(f"Sleeping for {round(sleep_time)} seconds...")
                    time.sleep(sleep_time)

        logger.info("All batches processed.")

    @abstractmethod
    def _add_documents(self, df: pd.DataFrame, **add_kwargs):
        """Abstract method to be implemented by each inherited member.

        This method should handle the actual process of adding documents to the database.
        """
        ...


================================================
FILE: buster/documents_manager/deeplake.py
================================================
import logging
from typing import Optional

import pandas as pd

from buster.utils import zip_contents

from .base import DocumentsManager

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class DeepLakeDocumentsManager(DocumentsManager):
    def __init__(
        self,
        vector_store_path: str = "deeplake_store",
        required_columns: Optional[list[str]] = None,
        **vector_store_kwargs,
    ):
        """Initialize a DeepLakeDocumentsManager object.

        Args:
            vector_store_path: The path to the vector store.
            required_columns: A list of columns that are required in the dataframe.
            **vector_store_kwargs: Additional keyword arguments to pass to the VectorStore initializer.
        """
        from deeplake.core.vectorstore import VectorStore

        self.vector_store_path = vector_store_path
        self.required_columns = required_columns
        self.vector_store = VectorStore(
            path=self.vector_store_path,
            **vector_store_kwargs,
        )

    def __len__(self):
        """Get the number of documents in the vector store.

        Returns:
            The number of documents in the vector store.
        """
        return len(self.vector_store)

    @classmethod
    def _extract_metadata(cls, df: pd.DataFrame) -> dict:
        """Extract metadata from the dataframe in DeepLake dict format.

        Args:
            df: The dataframe from which to extract metadata.

        Returns:
            The extracted metadata in DeepLake dict format.
        """
        # Ignore the content and embedding column for metadata
        df = df.drop(columns=["content", "embedding"], errors="ignore")

        columns = list(df.columns)

        metadata = df.apply(
            lambda x: {col: x[col] for col in columns},
            axis=1,
        ).to_list()
        return metadata

    def _add_documents(self, df: pd.DataFrame, **add_kwargs):
        """Write all documents from the dataframe into the vector store as a new version.

        Each entry in the dataframe is expected to have at least the following columns:
        ["content", "embedding"]

        Embeddings will have been precomputed in the self.add() method, which calls this one.

        Args:
            df: The dataframe containing the documents to add.
            **add_kwargs: Additional keyword arguments to pass to the add method of the vector store.
        """
        # Embedding should already be computed in the .add method
        assert "embedding" in df.columns, "expected column=embedding in the dataframe"

        # extract the chunked text + metadata
        metadata = self._extract_metadata(df)

        chunked_text = df.content.to_list()

        embeddings = df.embedding.to_list()
        self.vector_store.add(
            text=chunked_text,
            embedding=embeddings,
            metadata=metadata,
            **add_kwargs,
        )

    def to_zip(self, output_path: str = "."):
        """Zip the contents of the vector store path folder to a .zip file in the output path.

        Args:
            output_path: The path where the zip file should be created.

        Returns:
            The path to the created zip file.
        """
        vector_store_path = self.vector_store_path
        logger.info(f"Compressing {vector_store_path}...")
        zip_file_path = zip_contents(input_path=vector_store_path, output_path=output_path)
        logger.info(f"Compressed {vector_store_path} to {zip_file_path}.")
        return zip_file_path


================================================
FILE: buster/documents_manager/service.py
================================================
import logging

import pandas as pd
import pinecone
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from buster.documents_manager.base import DocumentsManager

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class DocumentsService(DocumentsManager):
    """Manager to use in production. Mixed Pinecone and MongoDB backend."""

    def __init__(
        self,
        pinecone_api_key: str,
        pinecone_index: str,
        pinecone_namespace: str,
        mongo_uri: str,
        mongo_db_name: str,
        **kwargs,
    ):
        """Initialize the DocumentsService.

        Args:
            pinecone_api_key: The Pinecone API key.
            pinecone_env: The Pinecone environment.
            pinecone_index: The Pinecone index.
            pinecone_namespace: The Pinecone namespace.
            mongo_uri: The MongoDB URI.
            mongo_db_name: The MongoDB database name.
            **kwargs: Additional keyword arguments to pass to the parent class.
        """
        super().__init__(**kwargs)

        pc = pinecone.Pinecone(api_key=pinecone_api_key)

        self.index = pc.Index(pinecone_index)
        self.namespace = pinecone_namespace

        self.mongo_db_name = mongo_db_name
        self.client = MongoClient(mongo_uri, server_api=ServerApi("1"))
        self.db = self.client[mongo_db_name]

    def __repr__(self):
        """Return a string representation of the DocumentsService."""
        return "DocumentsService"

    def get_source_id(self, source: str) -> str:
        """Get the id of a source.

        Args:
            source: The name of the source.

        Returns:
            The id of the source.
        """
        return str(self.db.sources.find_one({"name": source})["_id"])

    def _add_documents(self, df: pd.DataFrame):
        """Write all documents from the dataframe into the db as a new version.

        Args:
            df: The dataframe containing the documents.
        """
        use_sparse_vector = "sparse_embedding" in df.columns
        if use_sparse_vector:
            logger.info("Uploading sparse embeddings too.")

        for source in df.source.unique():
            source_exists = self.db.sources.find_one({"name": source})
            if source_exists is None:
                self.db.sources.insert_one({"name": source})

            source_id = self.get_source_id(source)

            df_source = df[df.source == source]
            to_upsert = []
            for row in df_source.to_dict(orient="records"):
                embedding = row["embedding"].tolist()
                if use_sparse_vector:
                    sparse_embedding = row["sparse_embedding"]

                document = row.copy()
                document.pop("embedding")
                if use_sparse_vector:
                    document.pop("sparse_embedding")
                document["source_id"] = source_id

                document_id = str(self.db.documents.insert_one(document).inserted_id)
                vector = {"id": document_id, "values": embedding, "metadata": {"source": source}}
                if use_sparse_vector:
                    vector["sparse_values"] = sparse_embedding

                to_upsert.append(vector)

            # Current (February 2024) Pinecone upload rules:
            # - Max 100 vectors per batch
            MAX_PINECONE_BATCH_SIZE = 100
            for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE):
                self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace)

    def update_source(self, source: str, display_name: str = None, note: str = None):
        """Update the display name and/or note of a source. Also create the source if it does not exist.

        Args:
            source: The name of the source.
            display_name: The new display name of the source.
            note: The new note of the source.
        """
        self.db.sources.update_one(
            {"name": source}, {"$set": {"display_name": display_name, "note": note}}, upsert=True
        )

    def delete_source(self, source: str) -> tuple[int, int]:
        """Delete a source and all its documents. Return if the source was deleted and the number of deleted documents.

        Args:
            source: The name of the source.

        Returns:
            A tuple containing the number of deleted sources and the number of deleted documents.
        """
        source_id = self.get_source_id(source)

        # MongoDB
        source_deleted = self.db.sources.delete_one({"name": source}).deleted_count
        documents_deleted = self.db.documents.delete_many({"source_id": source_id}).deleted_count

        # Pinecone
        self.index.delete(filter={"source": source}, namespace=self.namespace)

        return source_deleted, documents_deleted

    def drop_db(self):
        """Drop the currently accessible database.

        For Pinecone, this means deleting everything in the namespace.
        For Mongo DB, this means dropping the database. However this needs to be done manually through the GUI.
        """
        confirmation = input("Dropping the database is irreversible. Are you sure you want to proceed? (y/N): ")

        if confirmation.strip().lower() == "y":
            self.index.delete(namespace=self.namespace, delete_all=True)

            logging.info(f"Deleted all documents from Pinecone namespace: {self.namespace=}")
            logging.info(f"The MongoDB database needs to be dropped manually: {self.mongo_db_name=}")
        else:
            logging.info("Operation cancelled.")


================================================
FILE: buster/examples/cfg.py
================================================
from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding_constructor
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import Validator

# kwargs to pass to OpenAI client
client_kwargs = {
    "timeout": 20,
    "max_retries": 3,
}

embedding_fn = get_openai_embedding_constructor(client_kwargs=client_kwargs)

buster_cfg = BusterConfig(
    validator_cfg={
        "question_validator_cfg": {
            "invalid_question_response": "This question does not seem relevant to my current knowledge.",
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
            "check_question_prompt": """You are a chatbot answering questions on artificial intelligence.
Your job is to determine wether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: What is backpropagation?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        },
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.85,
            "embedding_fn": embedding_fn,
        },
        "documents_validator_cfg": {
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": client_kwargs,
        },
        "use_reranking": True,
        "validate_documents": False,
    },
    retriever_cfg={
        "path": "deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "embedding_fn": embedding_fn,
    },
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
        "client_kwargs": client_kwargs,
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "title", "source"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation: "
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
            "For example:\n"
            "What is the meaning of life for an qa bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg: BusterConfig):
    """initialize buster with a buster_cfg class"""
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
    return buster


================================================
FILE: buster/examples/generate_embeddings.py
================================================
import click
import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager

REQUIRED_COLUMNS = ["url", "title", "content", "source"]


@click.command(
    help="This script processes a CSV file and generates embeddings. The CSV argument specifies the path to the input CSV file."
)
@click.argument("csv", metavar="<path_to_csv_file>")
def main(csv):
    # Read the csv
    df = pd.read_csv(csv)

    # initialize our vector store from scratch
    dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS)

    # Generate the embeddings for our documents and store them to the deeplake store
    dm.add(df, csv_filename="embeddings.csv")

    # Save it to a zip file
    dm.to_zip()


if __name__ == "__main__":
    main()


================================================
FILE: buster/examples/gradio_app.py
================================================
import os
from typing import Optional, Tuple

import cfg
import gradio as gr
import pandas as pd
from cfg import setup_buster

from buster.completers import Completion
from buster.utils import extract_zip

# Check if an openai key is set as an env. variable
if os.getenv("OPENAI_API_KEY") is None:
    print("Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'.")

# Typehint for chatbot history
ChatHistory = list[list[Optional[str], Optional[str]]]

extract_zip("deeplake_store.zip", "deeplake_store")

buster = setup_buster(cfg.buster_cfg)


def add_user_question(user_question: str, chat_history: Optional[ChatHistory] = None) -> ChatHistory:
    """Adds a user's question to the chat history.

    If no history is provided, the first element of the history will be the user conversation.
    """
    if chat_history is None:
        chat_history = []
    chat_history.append([user_question, None])
    return chat_history


def format_sources(matched_documents: pd.DataFrame) -> str:
    if len(matched_documents) == 0:
        return ""

    matched_documents.similarity_to_answer = matched_documents.similarity_to_answer * 100

    # drop duplicate pages (by title), keep highest ranking ones
    matched_documents = matched_documents.sort_values("similarity_to_answer", ascending=False).drop_duplicates(
        "title", keep="first"
    )

    documents_answer_template: str = (
        "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
    )
    document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"

    documents = "\n".join([document_template.format(document=document) for _, document in matched_documents.iterrows()])
    footnote: str = "I'm a bot 🤖 and not always perfect."

    return documents_answer_template.format(documents=documents, footnote=footnote)


def add_sources(history, completion):
    if completion.answer_relevant:
        formatted_sources = format_sources(completion.matched_documents)
        history.append([None, formatted_sources])

    return history


def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
    """Answer a user's question using retrieval augmented generation."""

    # We assume that the question is the user's last interaction
    user_input = chat_history[-1][0]

    # Do retrieval + augmented generation with buster
    completion = buster.process_input(user_input)

    # Stream tokens one at a time to the user
    chat_history[-1][1] = ""
    for token in completion.answer_generator:
        chat_history[-1][1] += token

        yield chat_history, completion


demo = gr.Blocks()

with demo:
    with gr.Row():
        gr.Markdown("<h3><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h3>")

    chatbot = gr.Chatbot()

    with gr.Row():
        question_textbox = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        send_button = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "How can I perform backpropagation?",
            "How do I deal with noisy data?",
            "How do I deal with noisy data in 2 words?",
        ],
        inputs=question_textbox,
    )

    gr.Markdown("This application uses GPT to search the docs for relevant info and answer questions.")

    gr.HTML("️<center> Created with ❤️ by @jerpint and @hadrienbertrand")

    response = gr.State()

    # fmt: off
    gr.on(
        triggers=[send_button.click, question_textbox.submit],
        fn=add_user_question,
        inputs=[question_textbox],
        outputs=[chatbot]
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )

    # fmt: on


demo.queue()
demo.launch(debug=True, share=False)


================================================
FILE: buster/examples/stackoverflow.csv
================================================
,source,title,content,url
0,stackoverflow,stackoverflow question #1,"""Backprop"" is the same as ""backpropagation"": it's just a shorter way to say it. It is sometimes abbreviated as ""BP"".
",https://ai.stackexchange.com/questions/1
1,stackoverflow,stackoverflow question #2,"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (""true"" vs. artificial).
The AI FAQ on ANN gives a good overview. Excerpt:

Noise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set.

In some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation.
",https://ai.stackexchange.com/questions/2
2,stackoverflow,stackoverflow question #4,"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches. 
There exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016.
",https://ai.stackexchange.com/questions/4
3,stackoverflow,stackoverflow question #6,"It rather depends on how one defines several of the terms used. For example:

Whether the term ""expected"" is interpreted in a formal (i.e.
statistical) sense.  
Whether it's assumed that humans have any kind of utilitarian
""performance measure"".

The motivation for this description of ""agent"" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition.
However, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans.
",https://ai.stackexchange.com/questions/6
4,stackoverflow,stackoverflow question #7,"
To put it simply in layman terms, what are the possible threats from AI? 

Currently, there are no threat. 
The threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity).  However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are.
This is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like. 

If we know that AI is so dangerous why are we still promoting it? Why is it not banned?

As I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us. 
",https://ai.stackexchange.com/questions/7
5,stackoverflow,stackoverflow question #10,"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). It's useful for making calculated decisions when all information needed isn't necessarily available.
Wikipedia has a fantastic page for this.
",https://ai.stackexchange.com/questions/10
6,stackoverflow,stackoverflow question #15,"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize.
The intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this. 
",https://ai.stackexchange.com/questions/15
7,stackoverflow,stackoverflow question #17,"The concept of ""the singularity"" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator.
We would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet).
The risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans.
",https://ai.stackexchange.com/questions/17
8,stackoverflow,stackoverflow question #26,"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time.
I strongly recommend the film Plug & Pray to have an idea of what people are researching in this area.
You may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?)
On the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all?
",https://ai.stackexchange.com/questions/26
9,stackoverflow,stackoverflow question #28,"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say ""yes, genetic algorithms are part of AI"".  If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms). 
One area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks.  I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard.
",https://ai.stackexchange.com/questions/28


================================================
FILE: buster/formatters/documents.py
================================================
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass

import pandas as pd

from buster.tokenizers import Tokenizer

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class DocumentsFormatter(ABC):
    """
    Abstract base class for document formatters.

    Subclasses are required to implement the `format` method which transforms the input documents
    into the desired format.
    """

    @abstractmethod
    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """
        Abstract method to format matched documents.

        Parameters:
        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.

        Returns:
        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a string and
                                    the possibly truncated matched documents DataFrame.
        """
        pass


@dataclass
class DocumentsFormatterHTML(DocumentsFormatter):
    """
    Formatter class to convert matched documents into an HTML format.

    Attributes:
    - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents.
    - max_tokens (int): Maximum allowed tokens for the formatted documents.
    - formatter (str): String formatter for the document's content.
    - inner_tag (str): HTML tag that will be used at the document level.
    - outer_tag (str): HTML tag that will be used at the documents collection level.
    """

    tokenizer: Tokenizer
    max_tokens: int
    formatter: str = "{content}"
    inner_tag: str = "DOCUMENT"
    outer_tag: str = "DOCUMENTS"

    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """
        Format the matched documents into an HTML format.

        If the total tokens exceed max_tokens, documents are truncated or omitted to fit within the limit.

        Parameters:
        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.

        Returns:
        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as an HTML string and
                                    the possibly truncated matched documents DataFrame.
        """

        documents_str = ""
        total_tokens = 0
        max_tokens = self.max_tokens

        num_total_docs = len(matched_documents)
        num_preserved_docs = 0
        # TODO: uniformize this logic with the DocumentsFormatterJSON
        for _, row in matched_documents.iterrows():
            doc = self.formatter.format_map(row.to_dict())
            num_preserved_docs += 1
            token_count, encoded = self.tokenizer.num_tokens(doc, return_encoded=True)
            if total_tokens + token_count <= max_tokens:
                documents_str += f"<{self.inner_tag}>{doc}<\\{self.inner_tag}>"
                total_tokens += token_count
            else:
                logger.warning("truncating document to fit...")
                remaining_tokens = max_tokens - total_tokens
                truncated_doc = self.tokenizer.decode(encoded[:remaining_tokens])
                documents_str += f"<{self.inner_tag}>{truncated_doc}<\\{self.inner_tag}>"
                logger.warning(f"Documents after truncation: {documents_str}")
                break

        if num_preserved_docs < (num_total_docs):
            logger.warning(
                f"{num_preserved_docs}/{num_total_docs} documents were preserved from the matched documents due to truncation."
            )
            matched_documents = matched_documents.iloc[:num_preserved_docs]

        documents_str = f"<{self.outer_tag}>{documents_str}<\\{self.outer_tag}>"

        return documents_str, matched_documents


@dataclass
class DocumentsFormatterJSON(DocumentsFormatter):
    """
    Formatter class to convert matched documents into a JSON format.

    Attributes:
    - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents.
    - max_tokens (int): Maximum allowed tokens for the formatted documents.
    - columns (list[str]): List of columns to include in the JSON format.
    """

    tokenizer: Tokenizer
    max_tokens: int
    columns: list[str]

    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:
        """
        Format the matched documents into a JSON format.

        If the total tokens exceed max_tokens, documents are omitted one at a time until it fits the limit.

        Parameters:
        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.

        Returns:
        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a JSON string and
                                    the possibly truncated matched documents DataFrame.
        """

        max_tokens = self.max_tokens
        documents_str = matched_documents[self.columns].to_json(orient="records")
        token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True)

        while token_count > max_tokens:
            # Truncated too much, no documents left, raise an error
            if len(matched_documents) == 0:
                raise ValueError(
                    f"Could not truncate documents to fit {max_tokens=}. Consider increasing max_tokens or decreasing chunk lengths."
                )

            # Too many tokens, drop a document and try again.
            matched_documents = matched_documents.iloc[:-1]
            documents_str = matched_documents[self.columns].to_json(orient="records")
            token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True)

            # Log a warning with more details
            logger.warning(
                f"Truncating documents to fit. Remaining documents after truncation: {len(matched_documents)}"
            )

        return documents_str, matched_documents


================================================
FILE: buster/formatters/prompts.py
================================================
import logging
from dataclasses import dataclass

import pandas as pd

from buster.tokenizers import Tokenizer

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class PromptFormatter:
    tokenizer: Tokenizer
    max_tokens: int
    text_before_docs: str
    text_after_docs: str
    formatter: str = "{text_before_docs}\n{documents}\n{text_after_docs}"

    def format(self, documents: str) -> str:
        """Formats the system prompt with prompt engineering.

        Joins the text before and after documents with the documents provided.

        Args:
            documents (str): The already formatted documents to include in the system prompt.

        Returns:
            str: The formatted system prompt.

        Raises:
            ValueError: If the number of prompt tokens exceeds the maximum allowed tokens.
        """
        system_prompt = self.formatter.format(
            text_before_docs=self.text_before_docs, documents=documents, text_after_docs=self.text_after_docs
        )

        if self.tokenizer.num_tokens(system_prompt) > self.max_tokens:
            raise ValueError(f"System prompt tokens > {self.max_tokens=}")
        return system_prompt


def prompt_formatter_factory(tokenizer: Tokenizer, prompt_cfg) -> PromptFormatter:
    """Creates a PromptFormatter instance.

    Args:
        tokenizer (Tokenizer): The tokenizer to use for the PromptFormatter.
        prompt_cfg: The configuration for the PromptFormatter.

    Returns:
        PromptFormatter: The created PromptFormatter instance.
    """
    return PromptFormatter(
        tokenizer=tokenizer,
        max_tokens=prompt_cfg["max_tokens"],
        text_before_docs=prompt_cfg["text_before_documents"],
        text_after_docs=prompt_cfg["text_before_prompt"],
    )


================================================
FILE: buster/llm_utils/__init__.py
================================================
from buster.llm_utils.embeddings import (
    BM25,
    compute_embeddings_parallelized,
    cosine_similarity,
    get_openai_embedding,
    get_openai_embedding_constructor,
)
from buster.llm_utils.question_reformulator import QuestionReformulator

__all__ = [
    QuestionReformulator,
    cosine_similarity,
    get_openai_embedding,
    compute_embeddings_parallelized,
    get_openai_embedding_constructor,
    BM25,
]


================================================
FILE: buster/llm_utils/embeddings.py
================================================
import logging
from functools import lru_cache
from typing import Optional

import numpy as np
import pandas as pd
from openai import OpenAI
from pinecone_text.sparse import BM25Encoder
from tqdm.contrib.concurrent import thread_map

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def get_openai_embedding_constructor(client_kwargs: Optional[dict] = None, model: str = "text-embedding-ada-002"):
    if client_kwargs is None:
        client_kwargs = {}
    client = OpenAI(**client_kwargs)

    @lru_cache
    def embedding_fn(text: str, model: str = model) -> np.array:
        try:
            text = text.replace("\n", " ")
            response = client.embeddings.create(
                input=text,
                model=model,
            )
            embedding = response.data[0].embedding
            return np.array(embedding, dtype="float32")
        except Exception as e:
            # This rarely happens with the API but in the off chance it does, will allow us not to loose the progress.
            logger.exception(e)
            logger.warning(f"Embedding failed to compute for {text=}")
            return None

    return embedding_fn


# default embedding function
get_openai_embedding = get_openai_embedding_constructor()


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:
    """Compute the embeddings on the 'content' column of a DataFrame in parallel.

    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified
    embedding function. The 'content' column is expected to contain strings or textual data. The method processes the
    embeddings in parallel using the number of workers specified.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.
        embedding_fn (callable): A function that computes embeddings for a given input string.
        num_workers (int): The number of parallel workers to use for computing embeddings.

    Returns:
        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.
    """

    logger.info(f"Computing embeddings of {len(df)} chunks. Using {num_workers=}")
    embeddings = thread_map(embedding_fn, df.content.to_list(), max_workers=num_workers)

    logger.info(f"Finished computing embeddings")
    return embeddings


class BM25:
    def __init__(self, path_to_params: str = None) -> None:
        self.encoder = BM25Encoder()

        if path_to_params:
            self.encoder.load(path_to_params)

    def fit(self, df: pd.DataFrame):
        self.encoder.fit(df.content.to_list())

    def dump_params(self, path: str):
        self.encoder.dump(path)

    def get_sparse_embedding_fn(self):
        def sparse_embedding_fn(query: str):
            return self.encoder.encode_queries(query)

        return sparse_embedding_fn


================================================
FILE: buster/llm_utils/question_reformulator.py
================================================
import logging
from typing import Optional

from buster.completers import ChatGPTCompleter


class QuestionReformulator:
    def __init__(
        self,
        system_prompt: Optional[str] = None,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
    ):
        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)

        if completion_kwargs is None:
            # Default kwargs
            completion_kwargs = {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            }
        self.completion_kwargs = completion_kwargs

        if system_prompt is None:
            # Default prompt
            system_prompt = """
            Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.
            Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval."""
        self.system_prompt = system_prompt

    def reformulate(self, user_input: str) -> str:
        """Reformulate a user's question"""
        reformulated_question, error = self.completer.complete(
            self.system_prompt, user_input=user_input, completion_kwargs=self.completion_kwargs
        )
        logging.info(f"Reformulated question from {user_input=} to {reformulated_question=}")
        return reformulated_question, error


================================================
FILE: buster/parsers/__init__.py
================================================
from buster.parsers.parser import HuggingfaceParser, SphinxParser, get_all_documents

__all__ = [get_all_documents, SphinxParser, HuggingfaceParser]


================================================
FILE: buster/parsers/parser.py
================================================
import glob
import os
import re
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from itertools import takewhile, zip_longest
from pathlib import Path
from typing import Iterator, Type

import bs4
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


@dataclass
class Section:
    url: str
    name: str
    nodes: InitVar[list[bs4.element.NavigableString]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.NavigableString]):
        section = []
        for node in nodes:
            if node.name == "table":
                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "\n".join(section).strip()

        # Remove tabs
        self.text = self.text.replace("\t", "")

        # Replace group of newlines with a single newline
        self.text = re.sub("\n{2,}", "\n", self.text)

        # Replace non-breaking spaces with regular spaces
        self.text = self.text.replace("\xa0", " ")

    def __len__(self) -> int:
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor, without parsing."""
        section = cls.__new__(cls)  # Allocate memory, does not call __init__
        # Does the init here.
        section.text = text
        section.url = url
        section.name = name

        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks."""
        if len(self) > max_length:
            # Get the number of chunk, by dividing and rounding up.
            # Then, split the section into equal lenght chunks.
            # This could results in chunks below the minimum length,
            # and will truncate the end of the section.
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
            yield self
        return


@dataclass
class Parser(ABC):
    soup: BeautifulSoup
    base_url: str
    root_dir: str
    filepath: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @property
    def relative_path(self) -> str:
        """Gets the relative path of the file to the root dir.

        This is particularly useful for websites with pages, subdomains, etc.
        The split is to remove the .html extension
        """
        parent = Path(self.root_dir)
        son = Path(self.filepath)
        self._relative_path = str(son.relative_to(parent)).split(".")[0]
        return self._relative_path

    @abstractmethod
    def find_sections(self) -> Iterator[Section]: ...

    def parse(self) -> list[Section]:
        """Parse the documents into sections, respecting the lenght constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections


class SphinxParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")

            url = self.build_url(section["href"].strip().replace("\n", ""))
            name = section.parent.text.strip()[:-1].replace("\n", "")

            # If sections has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.relative_path + ".html" + suffix


class HuggingfaceParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
        for section, next_section in zip_longest(sections, sections[1:]):
            href = section.find("a", href=True, class_="header-link")
            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

            suffix = href["href"].strip().replace("\n", "")
            url = self.build_url(suffix)
            name = section.text.strip().replace("\n", "")
            yield Section(url, name, nodes)
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.relative_path + suffix


def get_document(
    root_dir: str,
    file: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Extract all sections from one file.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    filepath = os.path.join(root_dir, file)
    with open(filepath, "r") as f:
        source = f.read()

    soup = BeautifulSoup(source, "html.parser")
    parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)

    sections = []
    urls = []
    names = []
    for section in parser.parse():
        sections.append(section.text)
        urls.append(section.url)
        names.append(section.name)

    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})

    return documents_df


def get_all_documents(
    root_dir: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

    dfs = []
    for file in tqdm(files):
        try:
            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)
            dfs.append(df)
        except Exception as e:
            print(f"Skipping {file} due to the following error: {e}")
            continue

    documents_df = pd.concat(dfs, ignore_index=True)

    return documents_df


================================================
FILE: buster/retriever/__init__.py
================================================
from .base import Retriever
from .deeplake import DeepLakeRetriever
from .service import ServiceRetriever

__all__ = [Retriever, ServiceRetriever, DeepLakeRetriever]


================================================
FILE: buster/retriever/base.py
================================================
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Callable, Optional

import numpy as np
import pandas as pd

from buster.completers import UserInputs
from buster.llm_utils import get_openai_embedding

ALL_SOURCES = "All"

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


@dataclass
class Retriever(ABC):
    def __init__(
        self,
        top_k: int,
        thresh: float,
        embedding_fn: Callable[[str], np.ndarray] = None,
        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,
        *args,
        **kwargs,
    ):
        """Initializes a Retriever instance.

        Args:
          top_k: The maximum number of documents to retrieve.
          thresh: The similarity threshold for document retrieval.
          embedding_fn: The function to compute document embeddings.
          embedding_fn: (Optional) The function to compute sparse document embeddings.
          *args, **kwargs: Additional arguments and keyword arguments.
        """
        if embedding_fn is None:
            embedding_fn = get_openai_embedding

        self.top_k = top_k
        self.thresh = thresh
        self.embedding_fn = embedding_fn
        self.sparse_embedding_fn = sparse_embedding_fn

        # Add your access to documents in your own init

    @abstractmethod
    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
          source: The source from which to retrieve documents. If None, retrieves documents from all sources.

        Returns:
          A pandas DataFrame containing the documents.
        """
        ...

    @abstractmethod
    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
          source: The source for which to retrieve the display name.

        Returns:
          The display name of the source.

        If source is None, returns all documents. If source does not exist, returns empty dataframe.
        """
        ...

    @abstractmethod
    def get_topk_documents(self, query: str, source: Optional[str] = None, top_k: Optional[int] = None) -> pd.DataFrame:
        """Get the topk documents matching a user's query.

        Args:
          query: The user's query.
          source: The source from which to retrieve documents. If None, retrieves documents from all sources.
          top_k: The maximum number of documents to retrieve.

        Returns:
          A pandas DataFrame containing the topk matched documents.

        If no matches are found, returns an empty dataframe.
        """
        ...

    def threshold_documents(self, matched_documents: pd.DataFrame, thresh: float) -> pd.DataFrame:
        """Filters out matched documents using a similarity threshold.

        Args:
          matched_documents: The DataFrame containing the matched documents.
          thresh: The similarity threshold.

        Returns:
          A pandas DataFrame containing the filtered matched documents.
        """
        # filter out matched_documents using a threshold
        return matched_documents[matched_documents.similarity > thresh]

    def retrieve(
        self,
        user_inputs: UserInputs,
        sources: Optional[list[str]] = None,
        top_k: Optional[int] = None,
        thresh: Optional[float] = None,
    ) -> pd.DataFrame:
        """Retrieves documents based on user inputs.

        Args:
          user_inputs: The user's inputs.
          sources: The sources from which to retrieve documents. If None, retrieves documents from all sources.
          top_k: The maximum number of documents to retrieve.
          thresh: The similarity threshold for document retrieval.

        Returns:
          A pandas DataFrame containing the retrieved documents.
        """
        if top_k is None:
            top_k = self.top_k
        if thresh is None:
            thresh = self.thresh

        query = user_inputs.current_input

        matched_documents = self.get_topk_documents(query=query, sources=sources, top_k=top_k)

        # log matched_documents to the console
        logger.info(f"matched documents before thresh: {matched_documents}")

        # No matches were found, simply return at this point
        if len(matched_documents) == 0:
            return matched_documents

        # otherwise, make sure we have the minimum required fields
        assert "similarity" in matched_documents.columns
        assert "embedding" in matched_documents.columns
        assert "content" in matched_documents.columns
        assert "title" in matched_documents.columns

        # filter out matched_documents using a threshold
        matched_documents = self.threshold_documents(matched_documents, thresh)

        logger.info(f"matched documents after thresh: {matched_documents}")

        return matched_documents


================================================
FILE: buster/retriever/deeplake.py
================================================
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd

from buster.retriever.base import ALL_SOURCES, Retriever

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def extract_metadata(x: pd.DataFrame, columns) -> pd.DataFrame:
    """Extracts metadata from deeplake.

    Args:
      x: The dataframe containing the metadata.
      columns: The columns to extract.

    Returns:
      The dataframe with the extracted metadata.
    """
    for col in columns:
        x[col] = x.metadata[col]
    return x


def data_dict_to_df(data: dict) -> pd.DataFrame:
    """Converts a dictionary of data to a Pandas DataFrame.

    Args:
      data: The dictionary containing the data.

    Returns:
      The DataFrame containing the data.
    """
    # rename 'score' to 'similarity'
    data["similarity"] = data.pop("score")
    data["content"] = data.pop("text")

    matched_documents = pd.DataFrame(data)

    if len(matched_documents) == 0:
        logger.info("No matches found...")
        return pd.DataFrame()

    matched_documents = matched_documents.apply(extract_metadata, columns=["source", "title", "url"], axis=1)
    matched_documents = matched_documents.drop(columns="metadata")

    return matched_documents


def build_tql_query(embedding, sources=None, top_k: int = 3) -> str:
    """Builds a TQL query.

    Args:
      embedding: The embedding vector.
      sources: The sources to filter by.
      top_k: The number of top documents to retrieve.

    Returns:
      The TQL query.
    """
    # Initialize the where_clause to an empty string.
    where_clause = ""

    embedding_string = ",".join([str(item) for item in embedding])

    # If sources is provided and it's not empty, build the where clause.
    if sources:
        conditions = [f"contains(metadata['source'], '{source}')" for source in sources]
        where_clause = "where " + " or ".join(conditions)

    # Construct the entire query
    query = f"""
select * from (
    select embedding, text, metadata, cosine_similarity(embedding, ARRAY[{embedding_string}]) as score
    {where_clause}
)
order by score desc limit {top_k}
"""
    return query


class DeepLakeRetriever(Retriever):
    def __init__(
        self,
        path,
        exec_option: str = "python",
        use_tql: bool = False,
        deep_memory: bool = False,
        activeloop_token: str = None,
        **kwargs,
    ):
        from deeplake.core.vectorstore import VectorStore

        super().__init__(**kwargs)
        self.use_tql = use_tql
        self.exec_option = exec_option
        self.deep_memory = deep_memory
        self.vector_store = VectorStore(
            path=path,
            read_only=True,
            token=activeloop_token,
            exec_option=exec_option,
        )

        if activeloop_token is None and use_tql:
            logger.warning(
                """
                No activeloop token detected, enterprise features will not be available.
                You can set it using: export ACTIVELOOP_TOKEN=...
                """
            )

    def get_documents(self, sources: Optional[list[str]] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
          sources: The sources to retrieve documents from.

        Returns:
          The DataFrame containing the retrieved documents.
        """
        k = len(self.vector_store)

        # currently this is the only way to retrieve all embeddings in deeplake
        # generate a dummy embedding and specify top-k equals the length of the vector store.
        embedding_dim = self.vector_store.tensors()["embedding"].shape[1]
        dummy_embedding = np.random.random(embedding_dim)

        return self.get_topk_documents(query=None, embedding=dummy_embedding, top_k=k, sources=sources)

    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
          source: The name of the source.

        Returns:
          The display name of the source.

        Raises:
          NotImplementedError: If the method is not implemented.
        """
        raise NotImplementedError()

    def get_topk_documents(
        self,
        query: str = None,
        embedding: np.array = None,
        sources: Optional[list[str]] = None,
        top_k: int = None,
        return_tensors: str = "*",
    ) -> pd.DataFrame:
        """Get the topk documents matching a user's query.

        If no matches are found, returns an empty dataframe.

        Args:
          query: The user's query.
          embedding: The embedding vector.
          sources: The sources to filter by.
          top_k: The number of top documents to retrieve.
          return_tensors: The tensors to include in the result.

        Returns:
          The DataFrame containing the matched documents.
        """
        if query is not None:
            query_embedding = self.embedding_fn(query)
        elif embedding is not None:
            query_embedding = embedding
        else:
            raise ValueError("must provide either a query or an embedding")

        if self.use_tql:
            assert self.exec_option == "compute_engine", "cant use tql without compute_engine"
            tql_query = build_tql_query(query_embedding, sources=sources, top_k=top_k)
            data = self.vector_store.search(query=tql_query, deep_memory=self.deep_memory)
        else:
            # build the filter clause
            if sources:

                def filter(x):
                    return x["metadata"].data()["value"]["source"] in sources

            else:
                filter = None

            data = self.vector_store.search(
                k=top_k,
                embedding=query_embedding,
                exec_option=self.exec_option,
                return_tensors=return_tensors,
                filter=filter,
            )

        matched_documents = data_dict_to_df(data)
        return matched_documents


================================================
FILE: buster/retriever/service.py
================================================
import logging
from typing import List, Optional

import numpy as np
import pandas as pd
import pinecone
from bson.objectid import ObjectId
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from buster.retriever.base import ALL_SOURCES, Retriever

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class ServiceRetriever(Retriever):
    def __init__(
        self,
        pinecone_api_key: str,
        pinecone_index: str,
        pinecone_namespace: str,
        mongo_uri: str,
        mongo_db_name: str,
        **kwargs,
    ):
        """
        Initializes a ServiceRetriever instance.

        The ServiceRetriever is a hybrid retrieval combining pinecone and mongodb services.

        Pinecone is exclusively used as a vector store.
        The id of the pinecone vectors are used as a key in the mongodb database to store its associated metadata.

        Args:
            pinecone_api_key: The API key for Pinecone.
            pinecone_env: The environment for Pinecone.
            pinecone_index: The name of the Pinecone index.
            pinecone_namespace: The namespace for Pinecone.
            mongo_uri: The URI for MongoDB.
            mongo_db_name: The name of the MongoDB database.
        """
        super().__init__(**kwargs)

        pc = pinecone.Pinecone(api_key=pinecone_api_key)

        self.index = pc.Index(pinecone_index)
        self.namespace = pinecone_namespace

        self.client = MongoClient(mongo_uri, server_api=ServerApi("1"))
        self.db = self.client[mongo_db_name]

    def get_source_id(self, source: str) -> str:
        """Get the id of a source. Returns an empty string if the source does not exist.

        Args:
            source: The name of the source.

        Returns:
            The id of the source.
        """
        source_pointer = self.db.sources.find_one({"name": source})
        return "" if source_pointer is None else str(source_pointer["_id"])

    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:
        """Get all current documents from a given source.

        Args:
            source: The name of the source. Defaults to None.

        Returns:
            A DataFrame containing all the documents. If the source does not exist, returns an empty DataFrame.
        """
        if source is None:
            # No source specified, return all documents
            documents = self.db.documents.find()
        else:
            assert isinstance(source, str), "source must be a valid string."
            source_id = self.get_source_id(source)

            if source_id == "":
                logger.warning(f"{source=} not found.")

            documents = self.db.documents.find({"source_id": source_id})

        return pd.DataFrame(list(documents))

    def get_source_display_name(self, source: str) -> str:
        """Get the display name of a source.

        Args:
            source: The name of the source.

        Returns:
            The display name of the source.
        """
        if source is None:
            return ALL_SOURCES
        else:
            display_name = self.db.sources.find_one({"name": source})["display_name"]
            return display_name

    def get_topk_documents(self, query: str, sources: Optional[List[str]], top_k: int) -> pd.DataFrame:
        """Get the top k documents matching a query from the specified sources.

        Args:
            query: The query string.
            sources: The list of source names to search. Defaults to None.
            top_k: The number of top matches to return.

        Returns:
            A DataFrame containing the top k matching documents.
        """
        if sources is None:
            filter = None
        else:
            filter = {"source": {"$in": sources}}
            source_exists = self.db.sources.find_one({"name": {"$in": sources}})
            if source_exists is None:
                logger.warning(f"Sources {sources} do not exist. Returning empty dataframe.")
                return pd.DataFrame()

        query_embedding = self.embedding_fn(query)
        sparse_query_embedding = self.sparse_embedding_fn(query) if self.sparse_embedding_fn is not None else None

        if isinstance(query_embedding, np.ndarray):
            # pinecone expects a list of floats, so convert from ndarray if necessary
            query_embedding = query_embedding.tolist()

        # Pinecone retrieval
        matches = self.index.query(
            vector=query_embedding,
            sparse_vector=sparse_query_embedding,
            top_k=top_k,
            filter=filter,
            include_values=True,
            namespace=self.namespace,
        )["matches"]
        matching_ids = [ObjectId(match.id) for match in matches]
        matching_scores = {match.id: match.score for match in matches}
        matching_embeddings = {match.id: match.values for match in matches}

        if len(matching_ids) == 0:
            return pd.DataFrame()

        # MongoDB retrieval
        matched_documents = self.db.documents.find({"_id": {"$in": matching_ids}})
        matched_documents = pd.DataFrame(list(matched_documents))

        # add additional information from matching
        matched_documents["similarity"] = matched_documents["_id"].apply(lambda x: matching_scores[str(x)])
        matched_documents["embedding"] = matched_documents["_id"].apply(lambda x: matching_embeddings[str(x)])

        # sort by similarity
        matched_documents = matched_documents.sort_values(by="similarity", ascending=False, ignore_index=True)

        return matched_documents


================================================
FILE: buster/tokenizers/__init__.py
================================================
from .base import Tokenizer
from .gpt import GPTTokenizer


def tokenizer_factory(tokenizer_cfg: dict) -> Tokenizer:
    model_name = tokenizer_cfg["model_name"]
    if model_name in ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]:
        return GPTTokenizer(model_name)

    raise ValueError(f"Tokenizer not implemented for {model_name=}")


__all__ = [Tokenizer, GPTTokenizer, tokenizer_factory]


================================================
FILE: buster/tokenizers/base.py
================================================
from abc import ABC, abstractmethod
from typing import Union


class Tokenizer(ABC):
    """Abstract base class for a tokenizer.

    Args:
      model_name: The name of the tokenizer model.

    Attributes:
      model_name: The name of the tokenizer model.

    """

    def __init__(self, model_name: str):
        self.model_name = model_name

    @abstractmethod
    def encode(self, string: str) -> list[int]:
        """Encodes a string into a list of integers.

        Args:
          string: The input string to be encoded.

        Returns:
          A list of integers representing the encoded string.

        """

        ...

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        """Decodes a list of integers into a string.

        Args:
          encoded: The list of integers to be decoded.

        Returns:
          The decoded string.

        """

        ...

    def num_tokens(self, string: str, return_encoded: bool = False) -> Union[int, tuple[int, list[int]]]:
        """Returns the number of tokens in a string.

        Args:
          string: The input string.
          return_encoded: Whether or not to return the encoded string along with the number of tokens.

        Returns:
          If `return_encoded` is False, returns the number of tokens in the string.
          If `return_encoded` is True, returns a tuple containing the number of tokens and the encoded string.

        """

        encoded = self.encode(string)
        if return_encoded:
            return len(encoded), encoded
        return len(encoded)


================================================
FILE: buster/tokenizers/gpt.py
================================================
import tiktoken

from buster.tokenizers import Tokenizer


class GPTTokenizer(Tokenizer):
    """Tokenizer class for GPT models.

    This class implements a tokenizer for GPT models using the tiktoken library.

    Args:
        model_name (str): The name of the GPT model to be used.

    Attributes:
        encoder: The encoder object created using tiktoken.encoding_for_model().

    """

    def __init__(self, model_name: str):
        super().__init__(model_name)
        self.encoder = tiktoken.encoding_for_model(model_name=model_name)

    def encode(self, string: str):
        """Encodes a given string using the GPT tokenizer.

        Args:
            string (str): The string to be encoded.

        Returns:
            list[int]: The encoded representation of the string.

        """
        return self.encoder.encode(string)

    def decode(self, encoded: list[int]):
        """Decodes a list of tokens using the GPT tokenizer.

        Args:
            encoded (list[int]): The list of tokens to be decoded.

        Returns:
            str: The decoded string representation of the tokens.

        """
        return self.encoder.decode(encoded)


================================================
FILE: buster/utils.py
================================================
import os
import urllib.request
import zipfile


def get_file_extension(filepath: str) -> str:
    return os.path.splitext(filepath)[1]


def download_db(db_url: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)
    fname = os.path.join(output_dir, "documents.db")
    if not os.path.exists(fname):
        print(f"Downloading db file from {db_url} to {fname}...")
        urllib.request.urlretrieve(db_url, fname)
        print("Downloaded.")
    else:
        print("File already exists. Skipping.")
    return fname


def zip_contents(input_path, output_path):
    """
    Zips the entire contents of a given path to a custom output path.

    Authored by ChatGPT

    Args:
        input_path (str): The path of the directory to be zipped.
        output_path (str): The path where the zip file will be created.

    Returns:
        str: The path of the created zip file.
    """
    if not os.path.exists(input_path):
        raise ValueError("The specified input path does not exist.")

    zip_file_name = f"{os.path.basename(input_path)}.zip"
    zip_file_path = os.path.join(output_path, zip_file_name)

    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(input_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, input_path)
                zipf.write(file_path, arcname=arcname)

    return zip_file_path


def extract_zip(zip_file_path, output_path):
    """
    Extracts the contents of a zip file to a custom output path.

    Authored by ChatGPT

    Args:
        zip_file_path (str): The path of the zip file to be extracted.
        output_path (str): The path where the zip contents will be extracted.

    Returns:
        str: The path of the directory where the zip contents are extracted.
    """
    if not os.path.exists(zip_file_path):
        raise ValueError("The specified zip file does not exist.")

    with zipfile.ZipFile(zip_file_path, "r") as zipf:
        zipf.extractall(output_path)

    return output_path


================================================
FILE: buster/validators/__init__.py
================================================
from .base import Validator

__all__ = [Validator]


================================================
FILE: buster/validators/base.py
================================================
import logging

import pandas as pd

from buster.llm_utils import cosine_similarity, get_openai_embedding
from buster.validators.validators import (
    AnswerValidator,
    DocumentsValidator,
    QuestionValidator,
)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class Validator:
    def __init__(
        self,
        use_reranking: bool,
        validate_documents: bool,
        question_validator_cfg=None,
        answer_validator_cfg=None,
        documents_validator_cfg=None,
    ):
        """
        Initializes the Validator class.

        Args:
          use_reranking: A boolean indicating whether to use reranking.
          validate_documents: A boolean indicating whether to validate documents.
          question_validator_cfg: A configuration dictionary for the QuestionValidator.
          answer_validator_cfg: A configuration dictionary for the AnswerValidator.
          documents_validator_cfg: A configuration dictionary for the DocumentsValidator.
        """
        self.question_validator = (
            QuestionValidator(**question_validator_cfg) if question_validator_cfg is not None else QuestionValidator()
        )
        self.answer_validator = (
            AnswerValidator(**answer_validator_cfg) if answer_validator_cfg is not None else AnswerValidator()
        )
        self.documents_validator = (
            DocumentsValidator(**documents_validator_cfg)
            if documents_validator_cfg is not None
            else DocumentsValidator()
        )
        self.use_reranking = use_reranking
        self.validate_documents = validate_documents

    def check_question_relevance(self, question: str) -> tuple[bool, str]:
        """
        Checks the relevance of a question.

        Args:
          question: The question to be checked.

        Returns:
          A tuple containing a boolean indicating the relevance and a string describing the result.
        """
        return self.question_validator.check_question_relevance(question)

    def check_answer_relevance(self, answer: str) -> bool:
        """
        Checks the relevance of an answer.

        Args:
          answer: The answer to be checked.

        Returns:
          A boolean indicating the relevance of the answer.
        """
        return self.answer_validator.check_answer_relevance(answer)

    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:
        """
        Checks the relevance of documents.

        Args:
          answer: The answer to be checked.
          matched_documents: The DataFrame containing the matched documents.

        Returns:
          A DataFrame containing the relevance of the documents.
        """
        return self.documents_validator.check_documents_relevance(answer, matched_documents)

    def rerank_docs(
        self, answer: str, matched_documents: pd.DataFrame, embedding_fn=get_openai_embedding
    ) -> pd.DataFrame:
        """
        Reranks the matched documents based on answer similarity.

        Args:
          answer: The answer for reranking.
          matched_documents: The DataFrame containing the matched documents.
          embedding_fn: The function used to calculate document embeddings.

        Returns:
          A DataFrame containing the reranked documents.
        """
        """Here we re-rank matched documents according to the answer provided by the llm.

        This score could be used to determine wether a document was actually relevant to generation.
        An extra column is added in-place for the similarity score.
        """
        if len(matched_documents) == 0:
            return matched_documents
        logger.info("Reranking documents based on answer similarity...")

        answer_embedding = embedding_fn(answer)

        col = "similarity_to_answer"
        matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding))

        return matched_documents.sort_values(by=col, ascending=False)


================================================
FILE: buster/validators/validators.py
================================================
import concurrent.futures
import logging
from typing import Callable, List, Optional

import numpy as np
import pandas as pd

from buster.completers import ChatGPTCompleter, Completer
from buster.llm_utils import cosine_similarity
from buster.llm_utils.embeddings import get_openai_embedding

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class QuestionValidator:
    def __init__(
        self,
        check_question_prompt: Optional[str] = None,
        invalid_question_response: Optional[str] = None,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
    ):
        if check_question_prompt is None:
            check_question_prompt = (
                """You are a chatbot answering questions on documentation.
Your job is to determine whether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: What is backpropagation?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
            )

        if completion_kwargs is None:
            # default completion kwargs
            completion_kwargs = (
                {
                    "model": "gpt-3.5-turbo",
                    "stream": False,
                    "temperature": 0,
                },
            )

        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)
        self.check_question_prompt = check_question_prompt
        self.invalid_question_response = invalid_question_response

    def check_question_relevance(self, question: str) -> tuple[bool, str]:
        """Determines whether a question is relevant for our given framework."""
        try:
            outputs, _ = self.completer.complete(self.check_question_prompt, user_input=question)
            outputs = outputs.strip(".").lower()
            if outputs not in ["true", "false"]:
                logger.warning(f"the question validation returned an unexpeced value: {outputs=}. Assuming Invalid...")
            relevance = outputs.strip(".").lower() == "true"
            response = self.invalid_question_response

        except Exception as e:
            logger.exception("Error during question relevance detection.")
            relevance = False
            response = "Unable to process your question at the moment, try again soon"

        return relevance, response


class AnswerValidator:
    def __init__(
        self,
        unknown_response_templates: Optional[list[str]] = None,
        unknown_threshold: Optional[float] = None,
        embedding_fn: Callable[[str], np.array] = None,
    ):
        if unknown_threshold is None:
            unknown_threshold = 0.85

        if embedding_fn is None:
            embedding_fn = get_openai_embedding

        if unknown_response_templates is None:
            unknown_response_templates = [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ]

        self.embedding_fn = embedding_fn
        self.unknown_response_templates = unknown_response_templates
        self.unknown_threshold = unknown_threshold

    def check_answer_relevance(self, answer: str) -> bool:
        """Check if a generated answer is relevant to the chatbot's knowledge."""
        if answer == "":
            raise ValueError("Cannot compute embedding of an empty string.")

        unknown_embeddings = [
            self.embedding_fn(unknown_response) for unknown_response in self.unknown_response_templates
        ]

        answer_embedding = self.embedding_fn(answer)
        unknown_similarity_scores = [
            cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings
        ]

        # If any score is above the threshold, the answer is considered not relevant
        return not any(score > self.unknown_threshold for score in unknown_similarity_scores)


class DocumentsValidator:
    def __init__(
        self,
        completion_kwargs: Optional[dict] = None,
        client_kwargs: Optional[dict] = None,
        system_prompt: Optional[str] = None,
        user_input_formatter: Optional[str] = None,
        max_calls: int = 30,
    ):
        if system_prompt is None:
            system_prompt = """
            Your goal is to determine if the content of a document can be attributed to a provided answer.
            This means that if information in the document is found in the answer, it is relevant. Otherwise it is not.
            Your goal is to determine if the information contained in a document was used to generate an answer.
            You will be comparing a document to an answer. If the answer can be inferred from the document, return 'true'. Otherwise return 'false'.
            Only respond with 'true' or 'false'."""
        self.system_prompt = system_prompt

        if user_input_formatter is None:
            user_input_formatter = """
            answer: {answer}
            document: {document}
        """
        self.user_input_formatter = user_input_formatter

        if completion_kwargs is None:
            completion_kwargs = {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            }

        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)

        self.max_calls = max_calls

    def check_document_relevance(self, answer: str, document: str) -> bool:
        user_input = self.user_input_formatter.format(answer=answer, document=document)
        output, _ = self.completer.complete(prompt=self.system_prompt, user_input=user_input)

        # remove trailing periods, happens sometimes...
        output = output.strip(".").lower()

        if output not in ["true", "false"]:
            # Default assume it's relevant if the detector didn't give one of [true, false]
            logger.warning(f"the validation returned an unexpected value: {output}. Assuming valid...")
            return True
        return output == "true"

    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> list[bool]:
        """Determines wether a question is relevant or not for our given framework."""

        logger.info(f"Checking document relevance of {len(matched_documents)} documents")

        if len(matched_documents) > self.max_calls:
            raise ValueError("Max calls exceeded, increase max_calls to allow this.")

        # Here we parallelize the calls. We introduce a wrapper as a workaround.
        def _check_documents(args):
            "Thin wrapper so we can pass args as a Tuple and use ThreadPoolExecutor."
            answer, document = args
            return self.check_document_relevance(answer=answer, document=document)

        args_list = [(answer, doc) for doc in matched_documents.content.to_list()]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            relevance = list(executor.map(_check_documents, args_list))

        logger.info(f"{relevance=}")
        # add it back to the dataframe
        matched_documents["relevance"] = relevance
        return matched_documents


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "buster-doctalk"
version = "0.0.1"
description = "Buster 🤖: A chatbot for retrieval-augmented generation"
readme = "README.md"
requires-python = ">=3.10"
dynamic = ["dependencies"]

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[tool.setuptools.packages.find]
include = ["buster"]

[tool.isort]
profile = "black"

[tool.black]
line-length = 120

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"

[tool.poetry]
name = "buster-doctalk"
version = "v0.0.1"
description = "Buster 🤖: A chatbot for retrieval-augmented generation"
license = "MIT"
authors = [
    "Jeremy Pinto <jerpint@gmail.com>",
    "Hadrien Bertrand <bertrand.hadrien@gmail.com>",
]
readme = "README.md"
repository = "https://github.com/jerpint/buster"

packages = [
    { include = "buster" },
    { include = "buster/**/*.py" },
]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"

================================================
FILE: requirements.txt
================================================
bs4
click
deeplake
gradio>=3.40
matplotlib
numpy>=1.25
openai>=1.0
pandas>=2.1.3
pinecone-client>=3.0.2
pinecone-text>=0.6.0
pymongo
pytest
tabulate
tenacity
tiktoken


================================================
FILE: tests/test_chatbot.py
================================================
import copy
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, Completer, Completion, DocumentAnswerer
from buster.documents_manager import DeepLakeDocumentsManager
from buster.formatters.documents import DocumentsFormatterHTML
from buster.formatters.prompts import PromptFormatter
from buster.llm_utils import get_openai_embedding
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers.gpt import GPTTokenizer
from buster.validators import Validator

logging.basicConfig(level=logging.INFO)


DOCUMENTS_CSV = Path(__file__).resolve().parent.parent / "buster/examples/stackoverflow.csv"
UNKNOWN_PROMPT = "I'm sorry but I don't know how to answer."
NUM_WORKERS = 1

# default class used by our tests
buster_cfg_template = BusterConfig(
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "temperature": 0,
        },
        "client_kwargs": {
            "timeout": 20,
            "max_retries": 2,
        },
    },
    validator_cfg={
        "validate_documents": False,
        "use_reranking": True,
        "answer_validator_cfg": {
            "unknown_response_templates": [
                "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
            ],
            "unknown_threshold": 0.85,
        },
        "question_validator_cfg": {
            "invalid_question_response": "This question does not seem relevant to my current knowledge.",
            "completion_kwargs": {
                "model": "gpt-3.5-turbo",
                "stream": False,
                "temperature": 0,
            },
            "client_kwargs": {
                "timeout": 20,
                "max_retries": 2,
            },
            "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrlevant, respond with 'false'.",
        },
    },
    retriever_cfg={
        # "db_path": to be set using pytest fixture,
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_after_docs": ("""Now answer the following question:\n"""),
        "text_before_docs": (
            """You are a chatbot assistant answering technical questions about artificial intelligence (AI). """
            """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer."""
            """Use this response when you cannot answer:\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """For example:\n"""
            """What is the meaning of life?\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """Only use these prodived documents as reference:\n"""
        ),
    },
    documents_formatter_cfg={
        "max_tokens": 3000,
        "formatter": "{content}",
    },
)


def get_fake_embedding(length=1536):
    rng = np.random.default_rng()
    return list(rng.random(length, dtype=np.float32))


class MockAnswerer(Completer):
    def __init__(self, expected_answer):
        self.expected_answer = expected_answer

    def prepare_prompt(self, user_inputs, matched_documents):
        pass

    def complete(self):
        return

    def get_completion(self, user_inputs, matched_documents, validator, *arg, **kwarg) -> Completion:
        return Completion(
            answer_text=self.expected_answer,
            error=False,
            user_inputs=user_inputs,
            matched_documents=matched_documents,
            validator=validator,
        )


class MockRetriever(Retriever):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        path = kwargs["path"]

        self.path = path

        n_samples = 100
        self.documents = pd.DataFrame.from_dict(
            {
                "title": ["test"] * n_samples,
                "url": ["http://url.com"] * n_samples,
                "content": ["cool text"] * n_samples,
                "embedding": [get_fake_embedding()] * n_samples,
                "n_tokens": [10] * n_samples,
                "source": ["fake source"] * n_samples,
            }
        )

        self.embedding_fn = get_fake_embedding

    def get_documents(self, source):
        return self.documents

    def get_topk_documents(self, query: str, sources: list[str] = None, top_k: int = None) -> pd.DataFrame:
        documents = self.documents
        documents["embedding"] = [get_fake_embedding() for _ in range(len(documents))]
        documents["similarity"] = [np.random.random() for _ in range(len(documents))]
        return documents

    def get_source_display_name(self, source):
        return source


class MockValidator:
    def __init__(self, *args, **kwargs):
        return

    def validate(self, completion):
        completion.answer_relevant = True
        return completion

    def check_question_relevance(self, *args, **kwargs):
        return True, ""

    def check_answer_relevance(self, *args, **kwargs):
        return True


@pytest.fixture(scope="session")
def vector_store_path(tmp_path_factory):
    # Create a temporary directory and folder for the database manager
    dm_path = tmp_path_factory.mktemp("data").joinpath("deeplake_store")

    # Add the documents (will generate embeddings)
    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)
    df = pd.read_csv(DOCUMENTS_CSV)
    dm.add(df, num_workers=NUM_WORKERS)
    return dm_path


def test_chatbot_mock_data(tmp_path, monkeypatch):
    gpt_expected_answer = "this is GPT answer"

    path = tmp_path / "not_a_real_file.tar.gz"

    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = path
    buster_cfg.completion_cfg = {
        "expected_answer": gpt_expected_answer,
    }

    retriever = MockRetriever(**buster_cfg.retriever_cfg)
    document_answerer = MockAnswerer(**buster_cfg.completion_cfg)
    validator = MockValidator(**buster_cfg.validator_cfg)
    buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)
    completion = buster.process_input(user_input="What is a transformer?", sources=["fake_source"])
    assert isinstance(completion.answer_text, str)
    assert completion.answer_text.startswith(gpt_expected_answer)


def test_chatbot_real_data__chatGPT(vector_store_path):
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path

    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

    completion = buster.process_input("What is backpropagation?")
    assert isinstance(completion.answer_text, str)

    assert completion.question_relevant == True
    assert completion.answer_relevant == True

    assert completion.completion_kwargs == buster_cfg.completion_cfg["completion_kwargs"]


def test_chatbot_real_data__chatGPT_OOD(vector_store_path):
    buster_cfg = copy.deepcopy(buster_cfg_template)
    buster_cfg.retriever_cfg["path"] = vector_store_path
    buster_cfg.prompt_formatter_cfg = {
        "max_tokens": 3500,
        "text_before_docs": (
            """You are a chatbot assistant answering technical questions about artificial intelligence (AI)."""
            """If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer."""
            """Use this response: """
            f"""'{UNKNOWN_PROMPT}'\n"""
            """For example:\n"""
            """What is the meaning of life?\n"""
            f"""'{UNKNOWN_PROMPT}'\n"""
            """Now answer the following question:\n"""
        ),
        "text_after_docs": "Only use these documents as reference:\n",
    }

    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
    )
    validator: Validator = Validator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

    completion: Completion = buster.process_input("What is a good recipe for brocolli soup?")
    assert isinstance(completion.answer_text, str)

    assert completion.question_relevant == False
    assert completion.answer_relevant == False

    assert completion.completion_kwargs is None


def test_chatbot_real_data__no_docs_found(vector_store_path):
    with pytest.warns():
        buster_cfg = copy.deepcopy(buster_cfg_template)
        buster_cfg.retriever_cfg = {
            "path": vector_store_path,
            "embedding_fn": get_openai_embedding,
            "top_k": 3,
            "thresh": 1,  # Set threshold very high to be sure no docs are matched
            "max_tokens": 3000,
        }
        buster_cfg.documents_answerer_cfg["no_documents_message"] = "No documents available."
        retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
        tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
        document_answerer = DocumentAnswerer(
            completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
            documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),
            prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),
            **buster_cfg.documents_answerer_cfg,
        )
        validator: Validator = Validator(**buster_cfg.validator_cfg)
        buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)

        completion = buster.process_input("What is backpropagation?")
        assert isinstance(completion.answer_text, str)

        assert completion.question_relevant == True
        assert completion.answer_relevant == False
        assert completion.answer_text == "No documents available."


================================================
FILE: tests/test_documents.py
================================================
import os

import numpy as np
import pandas as pd
import pytest

from buster.documents_manager import DeepLakeDocumentsManager
from buster.documents_manager.base import compute_embeddings_parallelized
from buster.llm_utils import get_openai_embedding
from buster.retriever import DeepLakeRetriever

# Patch the get_embedding function to return a fixed, fake embedding
NUM_WORKERS = 1
fake_embedding = [-0.005, 0.0018]


def get_fake_embedding(*arg, **kwargs):
    return fake_embedding


@pytest.mark.parametrize(
    "documents_manager, retriever",
    [(DeepLakeDocumentsManager, DeepLakeRetriever)],
)
def test_write_read(tmp_path, documents_manager, retriever):
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    dm_path = tmp_path / "tmp_dir_2"
    retriever_cfg["path"] = dm_path

    data = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "source": ["sourceA"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "n_tokens": 5,
        }
    )

    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)

    dm.add(df=data)
    dm_data = retriever(**retriever_cfg).get_documents(sources=["sourceA"])

    assert dm_data["title"].iloc[0] == data["title"].iloc[0]
    assert dm_data["url"].iloc[0] == data["url"].iloc[0]
    assert dm_data["content"].iloc[0] == data["content"].iloc[0]
    assert dm_data["source"].iloc[0] == data["source"].iloc[0]
    assert np.allclose(dm_data["embedding"].iloc[0], data["embedding"].iloc[0])


@pytest.mark.parametrize(
    "documents_manager, retriever",
    [
        (DeepLakeDocumentsManager, DeepLakeRetriever),
    ],
)
def test_write_write_read(tmp_path, documents_manager, retriever):
    retriever_cfg = {
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_fn": get_openai_embedding,
    }
    db_path = tmp_path / "tmp_dir"
    retriever_cfg["path"] = db_path

    db = documents_manager(db_path)

    data_1 = pd.DataFrame.from_dict(
        {
            "title": ["test"],
            "url": ["http://url.com"],
            "content": ["cool text"],
            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
            "source": ["sourceA"],
            "n_tokens": 10,
        }
    )
    db.add(df=data_1, num_workers=NUM_WORKERS)

    data_2 = pd.DataFrame.from_dict(
        {
            "title": ["other"],
            "url": ["http://url.com/page.html"],
            "content": ["lorem ipsum"],
            "embedding": [np.arange(10, dtype=np.float32) / 10 - 2.3],
            "source": ["sourceB"],
            "n_tokens": 5,
        }
    )
    db.add(df=data_2, num_workers=NUM_WORKERS)

    db_data = retriever(**retriever_cfg).get_documents(sources=["sourceB"])

    assert len(db_data) == len(data_2)
    assert db_data["title"].iloc[0] == data_2["title"].iloc[0]
    assert db_data["url"].iloc[0] == data_2["url"].iloc[0]
    assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
    assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])


def test_generate_embeddings(tmp_path, monkeypatch):
    # Create fake data
    df = pd.DataFrame.from_dict(
        {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
    )

    # Generate embeddings, store in a file
    path = tmp_path / f"test_document_embeddings"
    dm = DeepLakeDocumentsManager(path)
    dm.add(df, embedding_fn=get_fake_embedding, num_workers=NUM_WORKERS)

    # Read the embeddings from the file
    retriever_cfg = {
        "path": path,
        "top_k": 3,
        "thresh": 0.85,
        "max_tokens": 3000,
        "embedding_fn": get_fake_embedding,
    }
    read_df = DeepLakeRetriever(**retriever_cfg).get_documents("my_source")

    # Check all the values are correct across the files
    assert df["title"].iloc[0] == df["title"].iloc[0] == read_df["title"].iloc[0]
    assert df["url"].iloc[0] == df["url"].iloc[0] == read_df["url"].iloc[0]
    assert df["content"].iloc[0] == df["content"].iloc[0] == read_df["content"].iloc[0]
    assert np.allclose(fake_embedding, read_df["embedding"].iloc[0])


def test_generate_embeddings_parallelized():
    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * 5,
            "url": ["http://url.com"] * 5,
            "content": ["cool text" + str(x) for x in range(5)],
            "source": ["my_source"] * 5,
        }
    )

    embeddings_parallel = compute_embeddings_parallelized(
        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS
    )
    embeddings = df.content.apply(get_openai_embedding)

    # embeddings comes out as a series because of the apply, so cast it back to an array
    embeddings_arr = np.array(embeddings.to_list())

    # Not clear why a tolerance needs to be specified, likely because it is computed on different machines
    # since the requests are done in parallel...
    assert np.allclose(embeddings_parallel, embeddings_arr, atol=1e-2)


def test_add_batches(tmp_path):
    dm_path = tmp_path / "deeplake_store"
    num_samples = 20
    batch_size = 16
    csv_filename = os.path.join(tmp_path, "embedding_")

    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)

    # Create fake data
    df = pd.DataFrame.from_dict(
        {
            "title": ["test"] * num_samples,
            "url": ["http://url.com"] * num_samples,
            "content": ["cool text" + str(x) for x in range(num_samples)],
            "source": ["my_source"] * num_samples,
        }
    )

    dm.batch_add(
        df,
        embedding_fn=get_fake_embedding,
        num_workers=NUM_WORKERS,
        batch_size=batch_size,
        min_time_interval=0,
        csv_filename=csv_filename,
    )

    csv_files = [f for f in os.listdir(tmp_path) if f.endswith(".csv")]

    # check that we registered the good number of doucments and that files were generated
    assert len(dm) == num_samples

    df_saved = pd.read_csv(csv_filename)
    assert len(df_saved) == num_samples
    assert "embedding" in df_saved.columns


================================================
FILE: tests/test_formatters.py
================================================
import json

import pandas as pd
import pytest

from buster.formatters.documents import DocumentsFormatterHTML, DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.tokenizers import GPTTokenizer


def test_DocumentsDormatterHTML__simple():
    """In this test, we expect all 3 documents to be matched and returned normally."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    expected_docs_str = (
        "<DOCUMENTS>"
        f"<DOCUMENT>{document_1}<\\DOCUMENT>"
        f"<DOCUMENT>{document_2}<\\DOCUMENT>"
        f"<DOCUMENT>{document_3}<\\DOCUMENT>"
        "<\\DOCUMENTS>"
    )

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # less documents and the new document is shorter than the original
    assert all(matched_documents.content == matched_documents_new.content)

    assert docs_str == expected_docs_str


def test_DocumentsDormatterJSON__simple():
    """In this test, we expect all 3 documents to be matched and returned normally."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"])

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    source_1 = "source 1"
    source_2 = "source 2"
    source_3 = "source 3"

    data_dict = {
        "content": [document_1, document_2, document_3],
        "source": [source_1, source_2, source_3],
    }

    expected_docs_str = json.dumps(
        [
            {"content": document_1, "source": source_1},
            {"content": document_2, "source": source_2},
            {"content": document_3, "source": source_3},
        ],
        separators=(",", ":"),
    )

    matched_documents = pd.DataFrame(data_dict)

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # less documents and the new document is shorter than the original
    assert all(matched_documents.content == matched_documents_new.content)

    assert docs_str == expected_docs_str  # matched_documents.to_json(orient="records")


def test_DocumentsFormatterHTML__doc_to_long():
    """In this test, document_1 doesn't entirely fit.

    we only expect a part of it to be contained.
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    long_sentence = "This is a very long document. It is long on purpose."
    document_1 = long_sentence * 50
    document_2 = "This is a very short document."
    document_3 = "This is also a short document"

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # less documents and the new document is shorter than the original
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 1
    assert len(docs_str) < len(document_1)

    # The long document gets truncated, the others don't make it in.
    assert long_sentence in docs_str
    assert document_2 not in docs_str
    assert document_3 not in docs_str


def test_DocumentsFormatterJSON__doc_too_long():
    """In this test, document_3 doesn't fit.
    We expect it to be excluded completely.

    we only expect a part of it to be contained.
    """
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=["content", "source"])

    long_sentence = "This is a very long document. It is long on purpose."

    document_1 = "This is a very short document."
    document_2 = "This is also a short document"
    document_3 = long_sentence * 50

    source_1 = "source 1"
    source_2 = "source 2"
    source_3 = "source 3"

    data_dict = {
        "content": [document_1, document_2, document_3],
        "source": [source_1, source_2, source_3],
    }

    expected_docs_str = json.dumps(
        [
            {"content": document_1, "source": source_1},
            {"content": document_2, "source": source_2},
        ],
        separators=(",", ":"),
    )

    matched_documents = pd.DataFrame(data_dict)

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)
    assert docs_str == expected_docs_str

    # less documents and the new document is shorter than the original
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 2

    # The last document gets ignored completely, the first 2 make it
    assert document_1 in docs_str
    assert document_2 in docs_str
    assert long_sentence not in docs_str


def test_DocumentsFormatterHTML__doc_to_long_2():
    """In this test, document_2 doesn't entirely fit.

    we only expect a part of it to be contained, as well as all of document_1, and none of document_3.
    """

    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
    )

    document_1 = "This is a very short document."
    document_2 = "This is a very long document. It is long on purpose." * 50
    document_3 = "This is also a short document"

    matched_documents = pd.DataFrame({"content": [document_1, document_2, document_3]})

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # less documents and the new document is shorter than the original
    assert len(matched_documents) == 3
    assert len(matched_documents_new) == 2

    assert document_1 in docs_str
    assert "This is a very long document. It is long on purpose." in docs_str  # at least a subset should be in there
    assert document_3 not in docs_str


def test_DocumentsFormatterHTML__complex_format():
    """In this test, we expect all 3 documents to be matched and returned in a particular format."""
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    documents_formatter = DocumentsFormatterHTML(
        tokenizer=tokenizer,
        max_tokens=100,
        formatter="Title: {title}\n{content}\n",
    )

    document_1 = "This is a very short document."
    document_2 = "This is another very short document."
    document_3 = "This is also a short document."

    title_1 = "doc1"
    title_2 = "doc2"
    title_3 = "doc3"

    country_1 = "Canada"
    country_2 = "France"
    country_3 = "Germany"

    expected_docs_str = (
        "<DOCUMENTS>"
        f"<DOCUMENT>Title: {title_1}\n{document_1}\n<\\DOCUMENT>"
        f"<DOCUMENT>Title: {title_2}\n{document_2}\n<\\DOCUMENT>"
        f"<DOCUMENT>Title: {title_3}\n{document_3}\n<\\DOCUMENT>"
        "<\\DOCUMENTS>"
    )

    matched_documents = pd.DataFrame(
        {
            "content": [document_1, document_2, document_3],
            "title": [title_1, title_2, title_3],
            "country": [country_1, country_2, country_3],
        }
    )

    docs_str, matched_documents_new = documents_formatter.format(matched_documents)

    # less documents and the new document is shorter than the original
    assert all(matched_documents.content == matched_documents_new.content)

    assert docs_str == expected_docs_str


def test_system_prompt_formatter():
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer,
        max_tokens=200,
        text_after_docs="After docs.",
        text_before_docs="Before docs.",
        formatter="{text_before_docs}\n{documents}\n{text_after_docs}",
    )

    documents = "Here are some docs"

    prompt = prompt_formatter.format(documents)

    assert prompt == ("Before docs.\n" "Here are some docs\n" "After docs.")

    assert documents in prompt


def test_system_prompt_formatter__to_long():
    tokenizer = GPTTokenizer(model_name="gpt-3.5-turbo")
    prompt_formatter = PromptFormatter(
        tokenizer=tokenizer,
        max_tokens=200,
        text_after_docs="After docs.",
        text_before_docs="Before docs.",
    )

    documents = "Here are some documents that are WAY too long." * 100

    with pytest.raises(ValueError):
        prompt_formatter.format(documents)


================================================
FILE: tests/test_read_write.py
================================================
import pandas as pd

from buster.completers import Completion, UserInputs


class MockValidator:
    def __init__(self):
        self.use_reranking = True

    def check_answer_relevance(self, completion: Completion) -> bool:
        return True

    def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> bool:
        return matched_documents


def test_read_write_completion():
    n_samples = 3
    completion_kwargs = {"param_1": "a"}
    matched_documents = pd.DataFrame.from_dict(
        {
            "title": ["test"] * n_samples,
            "url": ["http://url.com"] * n_samples,
            "content": ["cool text"] * n_samples,
            "embedding": [[0.0] * 1000] * n_samples,
            "n_tokens": [10] * n_samples,
            "source": ["fake source"] * n_samples,
        }
    )
    c = Completion(
        user_inputs=UserInputs(original_input="What is the meaning of life?"),
        error=False,
        answer_text="This is my actual answer",
        matched_documents=matched_documents,
        validator=MockValidator(),
        completion_kwargs=completion_kwargs,
    )

    c_json = c.to_json()
    c_back = Completion.from_dict(c_json)

    assert c.error == c_back.error
    assert c.answer_text == c_back.answer_text
    assert c.user_inputs == c_back.user_inputs
    assert c.answer_relevant == c_back.answer_relevant
    assert c.completion_kwargs == c_back.completion_kwargs
    for col in c_back.matched_documents.columns.tolist():
        assert col in c.matched_documents.columns.tolist()
        assert c_back.matched_documents[col].tolist() == c.matched_documents[col].tolist()


================================================
FILE: tests/test_validator.py
================================================
import pandas as pd

from buster.llm_utils import get_openai_embedding
from buster.validators import Validator

validator_cfg = {
    "use_reranking": True,
    "validate_documents": True,
    "answer_validator_cfg": {
        "unknown_response_templates": [
            "I Don't know how to answer your question.",
        ],
        "unknown_threshold": 0.85,
    },
    "question_validator_cfg": {
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
        "check_question_prompt": "You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrlevant, respond with 'false'.",
    },
}
validator = Validator(**validator_cfg)


def test_validator_check_question_relevance():
    question = "What is backpropagation?"
    relevance, _ = validator.check_question_relevance(question)
    assert relevance == True

    question = "How can I make a broccoli soup?"
    relevance, _ = validator.check_question_relevance(question)
    assert relevance == False


def test_validator_check_answer_relevance():
    answer = "Not sure how to answer your question"
    assert validator.check_answer_relevance(answer) == False

    answer = "According to the documentation, the answer should be 2+2 = 4."
    assert validator.check_answer_relevance(answer) == True


def test_validator_check_documents_relevance():
    docs = {
        "content": [
            "A panda is a bear native to China, known for its black and white fur.",
            "An apple is a sweet fruit, often red, green, or yellow in color.",
            "A car is a wheeled vehicle used for transportation, typically powered by an engine.",
        ]
    }

    answer = "Pandas live in China."
    expected_relevance = [True, False, False]

    matched_documents = pd.DataFrame(docs)
    matched_documents = validator.check_documents_relevance(answer=answer, matched_documents=matched_documents)

    assert "relevance" in matched_documents.columns
    assert matched_documents.relevance.to_list() == expected_relevance


def test_validator_rerank_docs():
    documents = [
        "A basketball player practicing",
        "A cat eating an orange",
        "A green apple on the counter",
    ]
    matched_documents = pd.DataFrame({"documents": documents})
    matched_documents["embedding"] = matched_documents.documents.apply(lambda x: get_openai_embedding(x))

    answer = "An apple is a delicious fruit."
    reranked_documents = validator.rerank_docs(answer, matched_documents)

    assert reranked_documents.documents.to_list() == [
        "A green apple on the counter",
        "A cat eating an orange",
        "A basketball player practicing",
    ]