[
  {
    "path": ".github/workflows/publish_pypi.yaml",
    "content": "name: publish-pypi\n\non:\n  workflow_dispatch:\n  release:\n    types: [created]\n\njobs:\n  deploy:\n\n    runs-on: ubuntu-latest\n    environment: secrets\n    steps:\n    - uses: actions/checkout@v4\n      with:\n        fetch-depth: 0\n    - name: Set up Python\n      uses: actions/setup-python@v4\n      with:\n        python-version: '3.10'\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install poetry\n    - name: Build and publish\n      env:\n        POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }}\n      run: |\n        poetry version $(git describe --tags --abbrev=0)\n        poetry add $(cat requirements.txt)\n        poetry build\n        poetry publish\n"
  },
  {
    "path": ".github/workflows/tests.yaml",
    "content": "name: Tests\n\non: [pull_request]\n\njobs:\n  tests:\n    runs-on: ubuntu-latest\n    environment: secrets\n    steps:\n      - name: Check out repository code\n        uses: actions/checkout@v3\n      - name: black linter\n        uses: psf/black@stable\n        with:\n          options: \"--check --diff --line-length 120\"\n      - name: isort\n        run: |\n          pip install isort\n          isort --profile black --check-only .\n      - name: unit tests\n        env:\n          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}\n        run: |\n          python3 -m pip install --upgrade pip\n          pip install -e .\n          pytest\n"
  },
  {
    "path": ".gitignore",
    "content": "# database files\n*.db\n\nbuster/apps/data/\ndeeplake_store/\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# Macos\n*.DS_Store*\n\nalbenchmark/data/\n\n# Ignore notebooks by default\n*.ipynb\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# VSCode\n.vscode/\n"
  },
  {
    "path": "LICENSE.md",
    "content": "MIT License\n\nCopyright (c) 2023 Buster dev team\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Buster, the QA documentation chatbot!\n\n<div align=\"center\">\n\n[![GitHub](https://img.shields.io/github/license/jerpint/buster)](https://github.com/jerpint/buster)\n[![PyPI](https://img.shields.io/pypi/v/buster-doctalk?logo=pypi)](https://pypi.org/project/buster-doctalk)\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n[![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Buster%20Demo-blue)](https://huggingface.co/spaces/jerpint/buster)\n\n</div>\n\nBuster is a question-answering chatbot that can be tuned to any source of documentations.\n\n# Demo\n\nIn order to view the full abilities of Buster, you can play with our [live demo here](https://huggingface.co/spaces/jerpint/buster).\nWe scraped the documentation of [huggingface 🤗 Transformers](https://huggingface.co/docs/transformers/index) and instructed Buster to answer questions related to its usage.\n\n# Quickstart\n\nThis section is meant to help you install and run local version of Buster.\nFirst step, install buster:\n\n**Note**: Buster requires python>=3.10\n\n```bash\npip install buster-doctalk\n```\n\nThen, go to the examples folder and launch the app.\nWe've included small sample data off stackoverflow-ai questions that you can test your setup with to try app:\n\n```bash\ncd buster/buster/examples\ngradio gradio_app.py\n```\n\nThis will launch the gradio app locally.\n\n\n**NOTE**: The demo uses chatGPT to generate text and compute embeddings, make sure to set a valid openai API key:\n```bash\nexport OPENAI_API_KEY=sk-...\n```\n\n# Generating your own embeddings\n\nOnce your local version of Buster is up and running, the next step is for you to be able to import your own data.\nWe will be using the `stackoverflow.csv` file in the `buster/examples/` folder for this. This is the same data that was used to generate the demo app's embeddings.\n\nYou will first ingest the documents to be ready for buster. 
In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentManager`:\n\n\n```python\nimport pandas as pd\nfrom buster.documents_manager import DeepLakeDocumentsManager\n\n# Read the csv\ndf = pd.read_csv(\"stackoverflow.csv\")\n\n# Generate the embeddings for our documents and store them in a deeplake format\ndm = DeepLakeDocumentsManager(vector_store_path=\"deeplake_store\", overwrite=True)\ndm.add(df)\n```\n\nYou can also just simply run the script:\n\n    python generate_embeddings.py --csv stackoverflow.csv\n\n\nThis will generate the embeddings and save them locally in the `deeplake_store`.\n\n\n**NOTE**: You will need to set a valid openai key for computing embeddings:\n\n```bash\nexport OPENAI_API_KEY=sk-...\n```\n\nYou only need to run this operation one time.\n\nIn the .csv, we expect columns [\"title\", \"url\", \"content\", \"source\"] for each row of the csv:\n\n* title: this will be the title of the url to display\n* url: the link that clicking the title will redirect to\n* source: where the content was originally sourced from (e.g. wikipedia, stackoverflow, etc.)\n* content: plaintext of the documents to be embedded. It is your responsibility to chunk your documents appropriately. For better results, we recommend chunks of 400-600 words.\n\n# Additional Configurations\n\nProperly prompting models as well as playing around with various model parameters can lead to different results. We use a `BusterConfig` object to keep track of the various Buster configurations. In the `buster/examples/` folder, the config is stored inside `cfg.py`. Modify this config to update parameters, prompts, etc.\n\n# How does Buster work?\n\nFirst, we parsed the documentation into snippets. 
For each snippet, we obtain an embedding by using the [OpenAI API](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings).\n\nThen, when a user asks a question, we compute its embedding, and find the snippets from the doc with the highest cosine similarity to the question.\n\nFinally, we craft the prompt:\n- The most relevant snippets from the doc.\n- The engineering prompt.\n- The user's question.\n\nWe send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!\n\n### Currently available models\n\n- For embeddings: \"text-embedding-ada-002\"\n- For completion: We support both \"gpt-3.5-turbo\" and \"gpt-4\"\n\n### Livestream\n\nFor more information, you can watch the livestream where we explain how buster works in detail!\n\n- [Livestream recording](https://youtu.be/LB5g-AhfPG8)"
  },
  {
    "path": "buster/__init__.py",
    "content": ""
  },
  {
    "path": "buster/busterbot.py",
    "content": "import logging\nfrom dataclasses import dataclass, field\nfrom typing import Optional\n\nimport pandas as pd\n\nfrom buster.completers import Completion, DocumentAnswerer, UserInputs\nfrom buster.llm_utils import QuestionReformulator, get_openai_embedding\nfrom buster.retriever import Retriever\nfrom buster.validators import Validator\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\n@dataclass\nclass BusterConfig:\n    \"\"\"Configuration object for a chatbot.\"\"\"\n\n    validator_cfg: dict = field(\n        default_factory=lambda: {\n            \"use_reranking\": True,\n            \"validate_documents\": False,\n        }\n    )\n    tokenizer_cfg: dict = field(\n        default_factory=lambda: {\n            \"model_name\": \"gpt-3.5-turbo\",\n        }\n    )\n    retriever_cfg: dict = field(\n        default_factory=lambda: {\n            \"max_tokens\": 3000,\n            \"top_k\": 3,\n            \"thresh\": 0.7,\n            \"embedding_fn\": get_openai_embedding,\n        }\n    )\n    prompt_formatter_cfg: dict = field(\n        default_factory=lambda: {\n            \"max_tokens\": 3500,\n            \"text_before_docs\": \"You are a chatbot answering questions.\\n\",\n            \"text_after_docs\": \"Answer the following question:\\n\",\n            \"formatter\": \"{text_before_docs}\\n{documents}\\n{text_after_docs}\",\n        }\n    )\n    documents_formatter_cfg: dict = (\n        field(\n            default_factory=lambda: {\n                \"max_tokens\": 3500,\n                \"formatter\": \"{content}\",\n            }\n        ),\n    )\n    documents_answerer_cfg: dict = field(\n        default_factory=lambda: {\n            \"no_documents_message\": \"No documents are available for this question.\",\n        }\n    )\n    question_reformulator_cfg: dict = field(\n        default_factory=lambda: {\n            \"completion_kwargs\": {\n                \"model\": \"gpt-3.5-turbo\",\n 
               \"stream\": False,\n                \"temperature\": 0,\n            },\n            \"system_prompt\": \"\"\"\n            Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.\n            Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.\"\"\",\n        }\n    )\n    completion_cfg: dict = field(\n        default_factory=lambda: {\n            \"completion_kwargs\": {\n                \"model\": \"gpt-3.5-turbo\",\n                \"temperature\": 0,\n                \"stream\": True,\n            },\n        }\n    )\n\n\nclass Buster:\n    def __init__(\n        self,\n        retriever: Retriever,\n        document_answerer: DocumentAnswerer,\n        validator: Validator,\n        question_reformulator: Optional[QuestionReformulator] = None,\n    ):\n        self.document_answerer = document_answerer\n        self.retriever = retriever\n        self.validator = validator\n        self.question_reformulator = question_reformulator\n\n    def process_input(\n        self,\n        user_input: str,\n        sources: Optional[list[str]] = None,\n        top_k: Optional[int] = None,\n        reformulate_question: Optional[bool] = False,\n    ) -> Completion:\n        \"\"\"\n        Main function to process the input question and generate a formatted output.\n        \"\"\"\n\n        logger.info(f\"User Input:\\n{user_input}\")\n\n        # We make sure there is always a newline at the end of the question to avoid completing the question.\n        if not user_input.endswith(\"\\n\"):\n            user_input += \"\\n\"\n\n        user_inputs = UserInputs(original_input=user_input)\n\n        # The returned message is either a generic invalid question message or an error handling message\n        question_relevant, irrelevant_question_message = 
self.validator.check_question_relevance(user_input)\n\n        if question_relevant:\n            # question is relevant, get completor to generate completion\n\n            # reformulate the question if a reformulator is defined\n            if self.question_reformulator is not None and reformulate_question:\n                reformulated_input, reformulation_error = self.question_reformulator.reformulate(\n                    user_inputs.original_input\n                )\n                user_inputs.reformulated_input = reformulated_input\n\n                if reformulation_error:\n                    completion = Completion(\n                        error=True,\n                        user_inputs=user_inputs,\n                        matched_documents=pd.DataFrame(),\n                        answer_text=\"Something went wrong reformulating the question. Try again soon.\",\n                        answer_relevant=False,\n                        question_relevant=False,\n                        validator=self.validator,\n                    )\n                    return completion\n\n            # Retrieve and answer\n            matched_documents = self.retriever.retrieve(user_inputs, sources=sources, top_k=top_k)\n            completion: Completion = self.document_answerer.get_completion(\n                user_inputs=user_inputs,\n                matched_documents=matched_documents,\n                validator=self.validator,\n                question_relevant=question_relevant,\n            )\n            return completion\n\n        else:\n            # question was determined irrelevant, so we instead return a generic response set by the user.\n            completion = Completion(\n                error=False,\n                user_inputs=user_inputs,\n                matched_documents=pd.DataFrame(),\n                answer_text=irrelevant_question_message,\n                answer_relevant=False,\n                question_relevant=False,\n                
validator=self.validator,\n            )\n            return completion\n"
  },
  {
    "path": "buster/completers/__init__.py",
    "content": "from .base import Completer, Completion, DocumentAnswerer\nfrom .chatgpt import ChatGPTCompleter\nfrom .user_inputs import UserInputs\n\n__all__ = [\n    ChatGPTCompleter,\n    Completer,\n    Completion,\n    DocumentAnswerer,\n    UserInputs,\n]\n"
  },
  {
    "path": "buster/completers/base.py",
    "content": "import io\nimport logging\nimport warnings\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Iterator, Optional\n\nimport pandas as pd\nfrom fastapi.encoders import jsonable_encoder\n\nfrom buster.completers.user_inputs import UserInputs\nfrom buster.formatters.documents import DocumentsFormatter\nfrom buster.formatters.prompts import PromptFormatter\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass Completion:\n    \"\"\"\n    A class to represent the completion object of a model's output for a user's question.\n\n    Attributes:\n        error (bool): A boolean indicating if an error occurred when generating the completion.\n        user_inputs (UserInputs): The inputs from the user.\n        matched_documents (pd.DataFrame): The documents that were matched to the user's question.\n        answer_generator (Iterator): An optional iterator used to generate the model's answer.\n        answer_text (str): An optional answer text.\n        answer_relevant (bool): An optional boolean indicating if the answer is relevant.\n        question_relevant (bool): An optional boolean indicating if the question is relevant.\n        completion_kwargs (dict): Optional arguments for the completion.\n        validator (Validator): An optional Validator object.\n\n    Methods:\n        __repr__: Outputs a string representation of the object.\n        _validate_arguments: Validates answer_generator and answer_text arguments.\n        answer_relevant: Determines if the answer is relevant or not.\n        question_relevant: Retrieves the relevance of the question.\n        answer_text: Retrieves the answer text.\n        answer_generator: Retrieves the answer generator.\n        postprocess: Postprocesses the results after generating the model's answer.\n        to_json: Outputs selected attributes of the object in JSON format.\n        from_dict: Creates a Completion object from a dictionary.\n    \"\"\"\n\n    def 
__init__(\n        self,\n        error: bool,\n        user_inputs: UserInputs,\n        matched_documents: pd.DataFrame,\n        answer_generator: Optional[Iterator] = None,\n        answer_text: Optional[str] = None,\n        answer_relevant: Optional[bool] = None,\n        question_relevant: Optional[bool] = None,\n        completion_kwargs: Optional[dict] = None,\n        validator=None,\n    ):\n        self.error = error\n        self.user_inputs = user_inputs\n        self.matched_documents = matched_documents\n        self.validator = validator\n        self.completion_kwargs = completion_kwargs\n        self._answer_relevant = answer_relevant\n        self._question_relevant = question_relevant\n\n        self._validate_arguments(answer_generator, answer_text)\n\n    def __repr__(self):\n        class_name = type(self).__name__\n        return (\n            f\"{class_name}(\"\n            f\"user_inputs={self.user_inputs!r}, \"\n            f\"error={self.error!r}, \"\n            f\"matched_documents={self.matched_documents!r}, \"\n            f\"answer_text={self._answer_text!r}, \"\n            f\"answer_generator={self.answer_generator!r}, \"\n            f\"answer_relevant={self._answer_relevant!r}, \"\n            f\"question_relevant={self.question_relevant!r}, \"\n            f\"completion_kwargs={self.completion_kwargs!r}, \"\n            \"),\"\n        )\n\n    def _validate_arguments(self, answer_generator: Optional[Iterator], answer_text: Optional[str]):\n        \"\"\"Sets answer_generator and answer_text properties depending on the provided inputs.\n\n        Checks that one of either answer_generator or answer_text is not None.\n        If answer_text is set, a generator can simply be inferred from answer_text.\n        If answer_generator is set, answer_text will be set only once the generator gets called. 
Set to None for now.\n        \"\"\"\n        if (answer_generator is None and answer_text is None) or (\n            answer_generator is not None and answer_text is not None\n        ):\n            raise ValueError(\"Only one of 'answer_generator' and 'answer_text' must be set.\")\n\n        # If text is provided, the genrator can be inferred\n        if answer_text is not None:\n            assert isinstance(answer_text, str)\n            answer_generator = (msg for msg in answer_text)\n\n        self._answer_text = answer_text\n        self._answer_generator = answer_generator\n\n    @property\n    def answer_relevant(self) -> bool:\n        \"\"\"Property determining the relevance of an answer (bool).\n\n        If an error occured, the relevance is False.\n        If no documents were retrieved, the relevance is also False.\n        Otherwise, the relevance is computed as defined by the validator (e.g. comparing to embeddings)\n        \"\"\"\n        if self.error:\n            self._answer_relevant = False\n        elif len(self.matched_documents) == 0:\n            self._answer_relevant = False\n        elif self._answer_relevant is not None:\n            return self._answer_relevant\n        else:\n            # Check the answer relevance by looking at the embeddings\n            self._answer_relevant = self.validator.check_answer_relevance(self.answer_text)\n        return self._answer_relevant\n\n    @property\n    def question_relevant(self):\n        \"\"\"Property determining the relevance of the question asked (bool).\"\"\"\n        return self._question_relevant\n\n    @property\n    def answer_text(self):\n        if self._answer_text is None:\n            # generates the text if it wasn't already generated\n            self._answer_text = \"\".join([i for i in self.answer_generator])\n        return self._answer_text\n\n    @answer_text.setter\n    def answer_text(self, value: str) -> None:\n        self._answer_text = value\n\n    @property\n    
def answer_generator(self):\n        # keeps track of the yielded text\n        self._answer_text = \"\"\n        for token in self._answer_generator:\n            self._answer_text += token\n            yield token\n\n        self.postprocess()\n\n    @answer_generator.setter\n    def answer_generator(self, generator: Iterator) -> None:\n        self._answer_generator = generator\n\n    def postprocess(self):\n        \"\"\"Function executed after the answer text is generated by the answer_generator\"\"\"\n\n        if self.validator is None:\n            # TODO: This should only happen if declaring a Completion using .from_dict() method.\n            # This behaviour is not ideal and we may want to remove support for .from_dict() in the future.\n            logger.info(\"No validator was set, skipping postprocessing.\")\n            return\n\n        if self.validator.use_reranking:\n            # rerank docs in order of cosine similarity to the question\n            self.matched_documents = self.validator.rerank_docs(\n                answer=self.answer_text, matched_documents=self.matched_documents\n            )\n\n        if self.validator.validate_documents:\n            self.matched_documents = self.validator.check_documents_relevance(\n                answer=self.answer_text, matched_documents=self.matched_documents\n            )\n\n        # access the property so it gets set if not computed alerady\n        self.answer_relevant\n\n    def to_json(self, columns_to_ignore: Optional[list[str]] = None) -> Any:\n        \"\"\"Converts selected attributes of the object to a JSON format.\n\n        Args:\n            columns_to_ignore (list[str]): A list of column names to ignore in the csulting matched_documents dataframe.\n\n        Returns:\n            Any: The object's attributes encoded as JSON.\n\n        Notes:\n            - The 'matched_documents' attribute of type pd.DataFrame is encoded separately\n            using a custom encoder.\n            - 
The resulting JSON may exclude specified columns based on the 'columns_to_ignore' parameter.\n        \"\"\"\n\n        def encode_df(df: pd.DataFrame) -> dict:\n            if columns_to_ignore is not None:\n                df = df.drop(columns=columns_to_ignore, errors=\"ignore\")\n            return df.to_json(orient=\"index\")\n\n        custom_encoder = {\n            # Converts the matched_documents in the user_responses to json\n            pd.DataFrame: encode_df,\n        }\n\n        to_encode = {\n            \"user_inputs\": self.user_inputs,\n            \"answer_text\": self.answer_text,\n            \"matched_documents\": self.matched_documents,\n            \"answer_relevant\": self.answer_relevant,\n            \"question_relevant\": self.question_relevant,\n            \"completion_kwargs\": self.completion_kwargs,\n            \"error\": self.error,\n        }\n        return jsonable_encoder(to_encode, custom_encoder=custom_encoder)\n\n    @classmethod\n    def from_dict(cls, completion_dict: dict):\n        # Map a dict of user inputs to the UserInputs class\n        if isinstance(completion_dict[\"user_inputs\"], dict):\n            completion_dict[\"user_inputs\"] = UserInputs(**completion_dict[\"user_inputs\"])\n\n        # Map the matched documents back to a dataframe\n        if isinstance(completion_dict[\"matched_documents\"], str):\n            # avoids deprecation warning\n            json_data = io.StringIO(completion_dict[\"matched_documents\"])\n\n            completion_dict[\"matched_documents\"] = pd.read_json(json_data, orient=\"index\")\n        elif isinstance(completion_dict[\"matched_documents\"], dict):\n            completion_dict[\"matched_documents\"] = pd.DataFrame(completion_dict[\"matched_documents\"]).T\n        else:\n            raise ValueError(f\"Unknown type for matched_documents: {type(completion_dict['matched_documents'])}\")\n\n        return cls(**completion_dict)\n\n\nclass Completer(ABC):\n    \"\"\"\n    
Abstract base class for completers, which generate an answer to a prompt.\n\n    Methods:\n        complete: The method that should be implemented by any child class to provide an answer to a prompt.\n    \"\"\"\n\n    @abstractmethod\n    def complete(self, prompt: str, user_input) -> (str | Iterator, bool):\n        \"\"\"Returns the completed message (can be a generator), and a boolean to indicate if an error occured or not.\"\"\"\n        ...\n\n\nclass DocumentAnswerer:\n    \"\"\"\n    A class that answers questions based on documents.\n\n    It takes care of formatting the prompts and the documents, and generating the answer when relevant.\n\n    Attributes:\n        completer (Completer): Object that actually generates an answer to the prompt.\n        documents_formatter (DocumentsFormatter): Object that formats the documents for the prompt.\n        prompt_formatter (PromptFormatter): Object that prepares the prompt for the completer.\n        no_documents_message (str): Message to display when no documents are found to match the query.\n        completion_class (Completion): Class to use for the resulting completion.\n\n    Methods:\n        prepare_prompt: Prepares the prompt that will be passed to the completer.\n        get_completion: Generates a completion to the user's question based on matched documents.\n    \"\"\"\n\n    def __init__(\n        self,\n        documents_formatter: DocumentsFormatter,\n        prompt_formatter: PromptFormatter,\n        completer: Completer,\n        completion_class: Completion = Completion,\n        no_documents_message: str = \"No documents were found that match your question.\",\n    ):\n        self.completer = completer\n        self.documents_formatter = documents_formatter\n        self.prompt_formatter = prompt_formatter\n        self.no_documents_message = no_documents_message\n        self.completion_class = completion_class\n\n    def prepare_prompt(self, matched_documents) -> str:\n        
\"\"\"Prepare the prompt with prompt engineering.\n\n        A user's question is not included here. We use the documents formatter and prompt formatter to\n        compose the prompt itself.\n        \"\"\"\n\n        # format the matched documents, (will truncate them if too long)\n        formatted_documents, _ = self.documents_formatter.format(matched_documents)\n        prompt = self.prompt_formatter.format(formatted_documents)\n        return prompt\n\n    def get_completion(\n        self,\n        user_inputs: UserInputs,\n        matched_documents: pd.DataFrame,\n        validator,\n        question_relevant: bool = True,\n    ) -> Completion:\n        \"\"\"Generate a completion to a user's question based on matched documents.\n\n        It is safe to assume the question_relevance to be True if we made it here.\"\"\"\n\n        logger.info(f\"{user_inputs=}\")\n\n        if len(matched_documents) == 0:\n            warning_msg = \"No documents found during retrieval.\"\n            warnings.warn(warning_msg)\n            logger.warning(warning_msg)\n\n            # empty dataframe\n            matched_documents = pd.DataFrame(columns=matched_documents.columns)\n\n            # because we are requesting a completion, we assume the question is relevant.\n            # However, no documents were found, so we pass the no documents found message instead of generating the answer.\n            # The completion does not get triggered, so we do not pass completion kwargs here either.\n            completion = self.completion_class(\n                user_inputs=user_inputs,\n                answer_text=self.no_documents_message,\n                error=False,\n                matched_documents=matched_documents,\n                question_relevant=question_relevant,\n                validator=validator,\n            )\n            return completion\n\n        # prepare the prompt with matched documents\n        prompt = self.prepare_prompt(matched_documents)\n        
logger.info(f\"{prompt=}\")\n\n        logger.info(f\"querying model with parameters: {self.completer.completion_kwargs}...\")\n\n        try:\n            answer_generator, error = self.completer.complete(prompt=prompt, user_input=user_inputs.current_input)\n\n        except Exception as e:\n            error = True\n            answer_generator = \"Something went wrong with the request, try again soon!\"\n            logger.exception(\"Unknown error when attempting to generate response. See traceback:\")\n\n        completion = self.completion_class(\n            answer_generator=answer_generator,\n            error=error,\n            matched_documents=matched_documents,\n            user_inputs=user_inputs,\n            question_relevant=question_relevant,\n            validator=validator,\n            completion_kwargs=self.completer.completion_kwargs,\n        )\n\n        return completion\n"
  },
  {
    "path": "buster/completers/chatgpt.py",
    "content": "import logging\nimport os\nfrom typing import Iterator, Optional\n\nimport openai\nfrom openai import OpenAI\n\nfrom buster.completers import Completer\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n# Check if an API key exists for promptlayer, if it does, use it\npromptlayer_api_key = os.environ.get(\"PROMPTLAYER_API_KEY\")\nif promptlayer_api_key:\n    # TODO: Check if this still works with latest openAI API...\n    try:\n        import promptlayer\n\n        logger.info(\"Enabling prompt layer...\")\n        promptlayer.api_key = promptlayer_api_key\n\n        # replace openai with the promptlayer wrapper\n        openai = promptlayer.openai\n    except Exception as e:\n        logger.exception(\"Something went wrong enabling promptlayer.\")\n\n\nclass ChatGPTCompleter(Completer):\n    def __init__(self, completion_kwargs: dict, client_kwargs: Optional[dict] = None):\n        \"\"\"Initialize the ChatGPTCompleter with completion and client keyword arguments.\n\n        Args:\n          completion_kwargs: A dictionary of keyword arguments to be used for completions.\n          client_kwargs: An optional dictionary of keyword arguments to be used for the OpenAI client.\n        \"\"\"\n        # use default client if none passed\n        self.completion_kwargs = completion_kwargs\n\n        if client_kwargs is None:\n            client_kwargs = {}\n\n        self.client = OpenAI(**client_kwargs)\n\n    def complete(self, prompt: str, user_input: str, completion_kwargs=None) -> (str | Iterator, bool):\n        \"\"\"Given a prompt and user input, returns the generated message and error flag.\n\n        Args:\n          prompt: The prompt containing the formatted documents and instructions.\n          user_input: The user input to be responded to.\n          completion_kwargs: An optional dictionary of keyword arguments to override the default completion kwargs.\n\n        Returns:\n          A tuple containing 
the completed message and a boolean indicating if an error occurred.\n\n        Raises:\n          openai.BadRequestError: If the completion request is invalid.\n          openai.RateLimitError: If the OpenAI servers are overloaded.\n        \"\"\"\n        # Uses default configuration if not overridden\n\n        if completion_kwargs is None:\n            completion_kwargs = self.completion_kwargs\n\n        messages = [\n            {\"role\": \"system\", \"content\": prompt},\n            {\"role\": \"user\", \"content\": user_input},\n        ]\n\n        try:\n            error = False\n            response = self.client.chat.completions.create(messages=messages, **completion_kwargs)\n        except openai.BadRequestError:\n            error = True\n            logger.exception(\"Invalid request to OpenAI API. See traceback:\")\n            error_message = \"Something went wrong while connecting with OpenAI, try again soon!\"\n            return error_message, error\n\n        except openai.RateLimitError:\n            error = True\n            logger.exception(\"RateLimit error from OpenAI. See traceback:\")\n            error_message = \"OpenAI servers seem to be overloaded, try again later!\"\n            return error_message, error\n\n        except Exception as e:\n            error = True\n            logger.exception(\"Some kind of error happened trying to generate the response. 
See traceback:\")\n            error_message = \"Something went wrong with connecting with OpenAI, try again soon!\"\n            return error_message, error\n\n        if completion_kwargs.get(\"stream\") is True:\n            # We are entering streaming mode, so here we're just wrapping the streamed\n            # openai response to be easier to handle later\n            def answer_generator():\n                for chunk in response:\n                    token = chunk.choices[0].delta.content\n\n                    # Always stream a string, openAI returns None on last token\n                    token = \"\" if token is None else token\n\n                    yield token\n\n            return answer_generator(), error\n\n        else:\n            full_response: str = response.choices[0].message.content\n            return full_response, error\n"
  },
  {
    "path": "buster/completers/user_inputs.py",
    "content": "from dataclasses import dataclass\nfrom typing import Optional\n\n\n@dataclass\nclass UserInputs:\n    \"\"\"A class that represents user inputs.\n\n    Attributes:\n        original_input: The original user input.\n        reformulated_input: The reformulated user input (optional).\n    \"\"\"\n\n    original_input: str\n    reformulated_input: Optional[str] = None\n\n    @property\n    def current_input(self):\n        \"\"\"Returns the current user input.\n\n        If the reformulated input is not None, it returns the reformulated input.\n        Otherwise, it returns the original input.\n\n        Returns:\n            The current user input.\n        \"\"\"\n        return self.reformulated_input if self.reformulated_input is not None else self.original_input\n"
  },
  {
    "path": "buster/documents_manager/__init__.py",
    "content": "from .base import DocumentsManager\nfrom .deeplake import DeepLakeDocumentsManager\nfrom .service import DocumentsService\n\n__all__ = [DocumentsManager, DocumentsService, DeepLakeDocumentsManager]\n"
  },
  {
    "path": "buster/documents_manager/base.py",
    "content": "import logging\nimport time\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom typing import Callable, Optional\n\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\n\nfrom buster.llm_utils import compute_embeddings_parallelized, get_openai_embedding\n\ntqdm.pandas()\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\n@dataclass\nclass DocumentsManager(ABC):\n    def __init__(self, required_columns: Optional[list[str]] = None):\n        \"\"\"\n        Constructor for DocumentsManager class.\n\n        Args:\n            required_columns (Optional[list[str]]): A list of column names that are required for the dataframe to contain.\n                                                     If None, no columns are enforced.\n        \"\"\"\n\n        self.required_columns = required_columns\n\n    def _check_required_columns(self, df: pd.DataFrame):\n        \"\"\"Each entry in the df is expected to have the columns in self.required_columns\"\"\"\n        if not all(col in df.columns for col in self.required_columns):\n            raise ValueError(f\"DataFrame is missing one or more of {self.required_columns=}\")\n\n    def _checkpoint_csv(self, df, csv_filename: str, csv_overwrite: bool = True):\n        \"\"\"\n        Saves DataFrame with embeddings to a CSV checkpoint.\n\n        Args:\n            df (pd.DataFrame): The DataFrame with embeddings.\n            csv_filename (str): Path to save a copy of the dataframe with computed embeddings for later use.\n            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. 
Defaults to True.\n        \"\"\"\n        import os\n\n        if csv_overwrite:\n            df.to_csv(csv_filename)\n            logger.info(f\"Saved DataFrame with embeddings to {csv_filename}\")\n\n        else:\n            if os.path.exists(csv_filename):\n                # append to existing file\n                append_df = pd.read_csv(csv_filename)\n                append_df = pd.concat([append_df, df])\n            else:\n                # will create the new file\n                append_df = df.copy()\n            append_df.to_csv(csv_filename)\n            logger.info(f\"Appending DataFrame embeddings to {csv_filename}\")\n\n    def add(\n        self,\n        df: pd.DataFrame,\n        num_workers: int = 16,\n        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,\n        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,\n        csv_filename: Optional[str] = None,\n        csv_overwrite: bool = True,\n        **add_kwargs,\n    ):\n        \"\"\"Write documents from a DataFrame into the DocumentManager store.\n\n        This method adds documents from the provided DataFrame to the database. It performs the following steps:\n        1. Checks if the required columns are present in the DataFrame.\n        2. Computes embeddings for the 'content' column if they are not already present.\n        3. Optionally saves the DataFrame with computed embeddings to a CSV checkpoint.\n        4. Calls the '_add_documents' method to add documents with embeddings to the DocumentsManager.\n\n        Args:\n            df (pd.DataFrame): The DataFrame containing the documents to be added.\n            num_workers (int, optional): The number of parallel workers to use for computing embeddings. 
Default is 32.\n            embedding_fn (callable, optional): A function that computes embeddings for a given input string.\n                Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.\n            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.\n                Default is None. Only use if you want sparse embeddings.\n            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.\n            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to True.\n            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.\n        \"\"\"\n\n        if self.required_columns is not None:\n            self._check_required_columns(df)\n\n        # Check if embeddings are present, computes them if not\n        if \"embedding\" not in df.columns:\n            df[\"embedding\"] = compute_embeddings_parallelized(df, embedding_fn=embedding_fn, num_workers=num_workers)\n        if \"sparse_embedding\" not in df.columns and sparse_embedding_fn is not None:\n            df[\"sparse_embedding\"] = sparse_embedding_fn(df.content.to_list())\n\n        if csv_filename is not None:\n            self._checkpoint_csv(df, csv_filename=csv_filename, csv_overwrite=csv_overwrite)\n\n        self._add_documents(df, **add_kwargs)\n\n    def batch_add(\n        self,\n        df: pd.DataFrame,\n        batch_size: int = 3000,\n        min_time_interval: int = 60,\n        num_workers: int = 16,\n        embedding_fn: Callable[[str], np.ndarray] = get_openai_embedding,\n        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,\n        csv_filename: Optional[str] = None,\n        csv_overwrite: bool = False,\n        **add_kwargs,\n    ):\n        \"\"\"\n        Adds DataFrame data to a DataManager instance in batches.\n\n        This function takes a 
DataFrame and adds its data to a DataManager instance in batches.\n        It ensures that a minimum time interval is maintained between successive batches\n        to prevent timeouts or excessive load. This is useful for APIs like openAI with rate limits.\n\n        Args:\n            df (pd.DataFrame): The input DataFrame containing data to be added.\n            batch_size (int, optional): The size of each batch. Defaults to 3000.\n            min_time_interval (int, optional): The minimum time interval (in seconds) between batches.\n                                                Defaults to 60.\n            num_workers (int, optional): The number of parallel workers to use when adding data.\n                                        Defaults to 32.\n            embedding_fn (callable, optional): A function that computes embeddings for a given input string.\n                Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.\n            sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.\n                Default is None. Only use if you want sparse embeddings.\n            csv_filename (str, optional): Path to save a copy of the dataframe with computed embeddings for later use.\n            csv_overwrite (bool, optional): Whether to overwrite the file with a new file. Defaults to False.\n                When using batches, set to False to keep all embeddings in the same file. 
You may want to manually remove the file if experimenting.\n            **add_kwargs: Additional keyword arguments to be passed to the '_add_documents' method.\n        \"\"\"\n\n        total_batches = (len(df) // batch_size) + 1\n\n        logger.info(f\"Adding {len(df)} documents with {batch_size=} for {total_batches=}\")\n\n        for batch_idx in range(total_batches):\n            logger.info(f\"Processing batch {batch_idx + 1}/{total_batches}\")\n            start_time = time.time()\n\n            # Calculate batch indices and extract batch DataFrame\n            start_idx = batch_idx * batch_size\n            end_idx = min((batch_idx + 1) * batch_size, len(df))\n            batch_df = df.iloc[start_idx:end_idx]\n\n            # Add the batch data to using specified parameters\n            self.add(\n                batch_df,\n                num_workers=num_workers,\n                csv_filename=csv_filename,\n                csv_overwrite=csv_overwrite,\n                embedding_fn=embedding_fn,\n                sparse_embedding_fn=sparse_embedding_fn,\n                **add_kwargs,\n            )\n\n            elapsed_time = time.time() - start_time\n\n            # Sleep to ensure the minimum time interval is maintained\n            # Only sleep if it's not the last iteration\n            if batch_idx < total_batches - 1:\n                sleep_time = max(0, min_time_interval - elapsed_time)\n                if sleep_time > 0:\n                    logger.info(f\"Sleeping for {round(sleep_time)} seconds...\")\n                    time.sleep(sleep_time)\n\n        logger.info(\"All batches processed.\")\n\n    @abstractmethod\n    def _add_documents(self, df: pd.DataFrame, **add_kwargs):\n        \"\"\"Abstract method to be implemented by each inherited member.\n\n        This method should handle the actual process of adding documents to the database.\n        \"\"\"\n        ...\n"
  },
  {
    "path": "buster/documents_manager/deeplake.py",
    "content": "import logging\nfrom typing import Optional\n\nimport pandas as pd\n\nfrom buster.utils import zip_contents\n\nfrom .base import DocumentsManager\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass DeepLakeDocumentsManager(DocumentsManager):\n    def __init__(\n        self,\n        vector_store_path: str = \"deeplake_store\",\n        required_columns: Optional[list[str]] = None,\n        **vector_store_kwargs,\n    ):\n        \"\"\"Initialize a DeepLakeDocumentsManager object.\n\n        Args:\n            vector_store_path: The path to the vector store.\n            required_columns: A list of columns that are required in the dataframe.\n            **vector_store_kwargs: Additional keyword arguments to pass to the VectorStore initializer.\n        \"\"\"\n        from deeplake.core.vectorstore import VectorStore\n\n        self.vector_store_path = vector_store_path\n        self.required_columns = required_columns\n        self.vector_store = VectorStore(\n            path=self.vector_store_path,\n            **vector_store_kwargs,\n        )\n\n    def __len__(self):\n        \"\"\"Get the number of documents in the vector store.\n\n        Returns:\n            The number of documents in the vector store.\n        \"\"\"\n        return len(self.vector_store)\n\n    @classmethod\n    def _extract_metadata(cls, df: pd.DataFrame) -> dict:\n        \"\"\"Extract metadata from the dataframe in DeepLake dict format.\n\n        Args:\n            df: The dataframe from which to extract metadata.\n\n        Returns:\n            The extracted metadata in DeepLake dict format.\n        \"\"\"\n        # Ignore the content and embedding column for metadata\n        df = df.drop(columns=[\"content\", \"embedding\"], errors=\"ignore\")\n\n        columns = list(df.columns)\n\n        metadata = df.apply(\n            lambda x: {col: x[col] for col in columns},\n            axis=1,\n        ).to_list()\n        
return metadata\n\n    def _add_documents(self, df: pd.DataFrame, **add_kwargs):\n        \"\"\"Write all documents from the dataframe into the vector store as a new version.\n\n        Each entry in the dataframe is expected to have at least the following columns:\n        [\"content\", \"embedding\"]\n\n        Embeddings will have been precomputed in the self.add() method, which calls this one.\n\n        Args:\n            df: The dataframe containing the documents to add.\n            **add_kwargs: Additional keyword arguments to pass to the add method of the vector store.\n        \"\"\"\n        # Embedding should already be computed in the .add method\n        assert \"embedding\" in df.columns, \"expected column=embedding in the dataframe\"\n\n        # extract the chunked text + metadata\n        metadata = self._extract_metadata(df)\n\n        chunked_text = df.content.to_list()\n\n        embeddings = df.embedding.to_list()\n        self.vector_store.add(\n            text=chunked_text,\n            embedding=embeddings,\n            metadata=metadata,\n            **add_kwargs,\n        )\n\n    def to_zip(self, output_path: str = \".\"):\n        \"\"\"Zip the contents of the vector store path folder to a .zip file in the output path.\n\n        Args:\n            output_path: The path where the zip file should be created.\n\n        Returns:\n            The path to the created zip file.\n        \"\"\"\n        vector_store_path = self.vector_store_path\n        logger.info(f\"Compressing {vector_store_path}...\")\n        zip_file_path = zip_contents(input_path=vector_store_path, output_path=output_path)\n        logger.info(f\"Compressed {vector_store_path} to {zip_file_path}.\")\n        return zip_file_path\n"
  },
  {
    "path": "buster/documents_manager/service.py",
    "content": "import logging\n\nimport pandas as pd\nimport pinecone\nfrom pymongo.mongo_client import MongoClient\nfrom pymongo.server_api import ServerApi\n\nfrom buster.documents_manager.base import DocumentsManager\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass DocumentsService(DocumentsManager):\n    \"\"\"Manager to use in production. Mixed Pinecone and MongoDB backend.\"\"\"\n\n    def __init__(\n        self,\n        pinecone_api_key: str,\n        pinecone_index: str,\n        pinecone_namespace: str,\n        mongo_uri: str,\n        mongo_db_name: str,\n        **kwargs,\n    ):\n        \"\"\"Initialize the DocumentsService.\n\n        Args:\n            pinecone_api_key: The Pinecone API key.\n            pinecone_env: The Pinecone environment.\n            pinecone_index: The Pinecone index.\n            pinecone_namespace: The Pinecone namespace.\n            mongo_uri: The MongoDB URI.\n            mongo_db_name: The MongoDB database name.\n            **kwargs: Additional keyword arguments to pass to the parent class.\n        \"\"\"\n        super().__init__(**kwargs)\n\n        pc = pinecone.Pinecone(api_key=pinecone_api_key)\n\n        self.index = pc.Index(pinecone_index)\n        self.namespace = pinecone_namespace\n\n        self.mongo_db_name = mongo_db_name\n        self.client = MongoClient(mongo_uri, server_api=ServerApi(\"1\"))\n        self.db = self.client[mongo_db_name]\n\n    def __repr__(self):\n        \"\"\"Return a string representation of the DocumentsService.\"\"\"\n        return \"DocumentsService\"\n\n    def get_source_id(self, source: str) -> str:\n        \"\"\"Get the id of a source.\n\n        Args:\n            source: The name of the source.\n\n        Returns:\n            The id of the source.\n        \"\"\"\n        return str(self.db.sources.find_one({\"name\": source})[\"_id\"])\n\n    def _add_documents(self, df: pd.DataFrame):\n        \"\"\"Write all documents 
from the dataframe into the db as a new version.\n\n        Args:\n            df: The dataframe containing the documents.\n        \"\"\"\n        use_sparse_vector = \"sparse_embedding\" in df.columns\n        if use_sparse_vector:\n            logger.info(\"Uploading sparse embeddings too.\")\n\n        for source in df.source.unique():\n            source_exists = self.db.sources.find_one({\"name\": source})\n            if source_exists is None:\n                self.db.sources.insert_one({\"name\": source})\n\n            source_id = self.get_source_id(source)\n\n            df_source = df[df.source == source]\n            to_upsert = []\n            for row in df_source.to_dict(orient=\"records\"):\n                embedding = row[\"embedding\"].tolist()\n                if use_sparse_vector:\n                    sparse_embedding = row[\"sparse_embedding\"]\n\n                document = row.copy()\n                document.pop(\"embedding\")\n                if use_sparse_vector:\n                    document.pop(\"sparse_embedding\")\n                document[\"source_id\"] = source_id\n\n                document_id = str(self.db.documents.insert_one(document).inserted_id)\n                vector = {\"id\": document_id, \"values\": embedding, \"metadata\": {\"source\": source}}\n                if use_sparse_vector:\n                    vector[\"sparse_values\"] = sparse_embedding\n\n                to_upsert.append(vector)\n\n            # Current (February 2024) Pinecone upload rules:\n            # - Max 100 vectors per batch\n            MAX_PINECONE_BATCH_SIZE = 100\n            for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE):\n                self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace)\n\n    def update_source(self, source: str, display_name: str = None, note: str = None):\n        \"\"\"Update the display name and/or note of a source. 
Also create the source if it does not exist.\n\n        Args:\n            source: The name of the source.\n            display_name: The new display name of the source.\n            note: The new note of the source.\n        \"\"\"\n        self.db.sources.update_one(\n            {\"name\": source}, {\"$set\": {\"display_name\": display_name, \"note\": note}}, upsert=True\n        )\n\n    def delete_source(self, source: str) -> tuple[int, int]:\n        \"\"\"Delete a source and all its documents. Return if the source was deleted and the number of deleted documents.\n\n        Args:\n            source: The name of the source.\n\n        Returns:\n            A tuple containing the number of deleted sources and the number of deleted documents.\n        \"\"\"\n        source_id = self.get_source_id(source)\n\n        # MongoDB\n        source_deleted = self.db.sources.delete_one({\"name\": source}).deleted_count\n        documents_deleted = self.db.documents.delete_many({\"source_id\": source_id}).deleted_count\n\n        # Pinecone\n        self.index.delete(filter={\"source\": source}, namespace=self.namespace)\n\n        return source_deleted, documents_deleted\n\n    def drop_db(self):\n        \"\"\"Drop the currently accessible database.\n\n        For Pinecone, this means deleting everything in the namespace.\n        For Mongo DB, this means dropping the database. However this needs to be done manually through the GUI.\n        \"\"\"\n        confirmation = input(\"Dropping the database is irreversible. Are you sure you want to proceed? (y/N): \")\n\n        if confirmation.strip().lower() == \"y\":\n            self.index.delete(namespace=self.namespace, delete_all=True)\n\n            logging.info(f\"Deleted all documents from Pinecone namespace: {self.namespace=}\")\n            logging.info(f\"The MongoDB database needs to be dropped manually: {self.mongo_db_name=}\")\n        else:\n            logging.info(\"Operation cancelled.\")\n"
  },
  {
    "path": "buster/examples/cfg.py",
    "content": "from buster.busterbot import Buster, BusterConfig\nfrom buster.completers import ChatGPTCompleter, DocumentAnswerer\nfrom buster.formatters.documents import DocumentsFormatterJSON\nfrom buster.formatters.prompts import PromptFormatter\nfrom buster.llm_utils import get_openai_embedding_constructor\nfrom buster.retriever import DeepLakeRetriever, Retriever\nfrom buster.tokenizers import GPTTokenizer\nfrom buster.validators import Validator\n\n# kwargs to pass to OpenAI client\nclient_kwargs = {\n    \"timeout\": 20,\n    \"max_retries\": 3,\n}\n\nembedding_fn = get_openai_embedding_constructor(client_kwargs=client_kwargs)\n\nbuster_cfg = BusterConfig(\n    validator_cfg={\n        \"question_validator_cfg\": {\n            \"invalid_question_response\": \"This question does not seem relevant to my current knowledge.\",\n            \"completion_kwargs\": {\n                \"model\": \"gpt-3.5-turbo\",\n                \"stream\": False,\n                \"temperature\": 0,\n            },\n            \"client_kwargs\": client_kwargs,\n            \"check_question_prompt\": \"\"\"You are a chatbot answering questions on artificial intelligence.\nYour job is to determine whether or not a question is valid, and should be answered.\nMore general questions are not considered valid, even if you might know the response.\nA user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.\n\nFor example:\n\nQ: What is backpropagation?\ntrue\n\nQ: What is the meaning of life?\nfalse\n\nA user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.\"\"\",\n        },\n        \"answer_validator_cfg\": {\n            \"unknown_response_templates\": [\n                \"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?\",\n            ],\n            \"unknown_threshold\": 0.85,\n            \"embedding_fn\": embedding_fn,\n        },\n        \"documents_validator_cfg\": {\n            \"completion_kwargs\": {\n                \"model\": \"gpt-3.5-turbo\",\n                \"stream\": False,\n                \"temperature\": 0,\n            },\n            \"client_kwargs\": client_kwargs,\n        },\n        \"use_reranking\": True,\n        \"validate_documents\": False,\n    },\n    retriever_cfg={\n        \"path\": \"deeplake_store\",\n        \"top_k\": 3,\n        \"thresh\": 0.7,\n        \"embedding_fn\": embedding_fn,\n    },\n    documents_answerer_cfg={\n        \"no_documents_message\": \"No documents are available for this question.\",\n    },\n    completion_cfg={\n        \"completion_kwargs\": {\n            \"model\": \"gpt-3.5-turbo\",\n            \"stream\": True,\n            \"temperature\": 0,\n        },\n        \"client_kwargs\": client_kwargs,\n    },\n    tokenizer_cfg={\n        \"model_name\": \"gpt-3.5-turbo\",\n    },\n    documents_formatter_cfg={\n        \"max_tokens\": 3500,\n        \"columns\": [\"content\", \"title\", \"source\"],\n    },\n    prompt_formatter_cfg={\n        \"max_tokens\": 3500,\n        \"text_before_docs\": (\n            \"You are a chatbot assistant answering technical questions about artificial intelligence (AI).\"\n            \"You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. \"\n            \"If the answer is in the documentation, summarize it in a helpful way to the user. \"\n            \"If it isn't, simply reply that you cannot answer the question. \"\n            \"Do not refer to the documentation directly, but use the instructions provided within it to answer questions. 
\"\n            \"Here is the documentation: \"\n        ),\n        \"text_after_docs\": (\n            \"REMEMBER:\\n\"\n            \"You are a chatbot assistant answering technical questions about artificial intelligence (AI).\"\n            \"Here are the rules you must follow:\\n\"\n            \"1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\\n\"\n            \"2) Make sure to format your answers in Markdown format, including code block and snippets.\\n\"\n            \"3) Do not reference any links, urls or hyperlinks in your answers.\\n\"\n            \"4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\\n\"\n            \"5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. \"\n            \"'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\"\n            \"For example:\\n\"\n            \"What is the meaning of life for a qa bot?\\n\"\n            \"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?\"\n            \"Now answer the following question:\\n\"\n        ),\n    },\n)\n\n\ndef setup_buster(buster_cfg: BusterConfig):\n    \"\"\"initialize buster with a buster_cfg class\"\"\"\n    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)\n    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)\n    document_answerer: DocumentAnswerer = DocumentAnswerer(\n        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),\n        documents_formatter=DocumentsFormatterJSON(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),\n        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),\n        **buster_cfg.documents_answerer_cfg,\n    )\n    validator: Validator = Validator(**buster_cfg.validator_cfg)\n    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)\n    return buster\n"
  },
  {
    "path": "buster/examples/generate_embeddings.py",
    "content": "import click\nimport pandas as pd\n\nfrom buster.documents_manager import DeepLakeDocumentsManager\n\nREQUIRED_COLUMNS = [\"url\", \"title\", \"content\", \"source\"]\n\n\n@click.command(\n    help=\"This script processes a CSV file and generates embeddings. The CSV argument specifies the path to the input CSV file.\"\n)\n@click.argument(\"csv\", metavar=\"<path_to_csv_file>\")\ndef main(csv):\n    # Read the csv\n    df = pd.read_csv(csv)\n\n    # initialize our vector store from scratch\n    dm = DeepLakeDocumentsManager(vector_store_path=\"deeplake_store\", overwrite=True, required_columns=REQUIRED_COLUMNS)\n\n    # Generate the embeddings for our documents and store them to the deeplake store\n    dm.add(df, csv_filename=\"embeddings.csv\")\n\n    # Save it to a zip file\n    dm.to_zip()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "buster/examples/gradio_app.py",
    "content": "import os\nfrom typing import Optional, Tuple\n\nimport cfg\nimport gradio as gr\nimport pandas as pd\nfrom cfg import setup_buster\n\nfrom buster.completers import Completion\nfrom buster.utils import extract_zip\n\n# Check if an openai key is set as an env. variable\nif os.getenv(\"OPENAI_API_KEY\") is None:\n    print(\"Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'.\")\n\n# Typehint for chatbot history\nChatHistory = list[list[Optional[str], Optional[str]]]\n\nextract_zip(\"deeplake_store.zip\", \"deeplake_store\")\n\nbuster = setup_buster(cfg.buster_cfg)\n\n\ndef add_user_question(user_question: str, chat_history: Optional[ChatHistory] = None) -> ChatHistory:\n    \"\"\"Adds a user's question to the chat history.\n\n    If no history is provided, the first element of the history will be the user conversation.\n    \"\"\"\n    if chat_history is None:\n        chat_history = []\n    chat_history.append([user_question, None])\n    return chat_history\n\n\ndef format_sources(matched_documents: pd.DataFrame) -> str:\n    if len(matched_documents) == 0:\n        return \"\"\n\n    matched_documents.similarity_to_answer = matched_documents.similarity_to_answer * 100\n\n    # drop duplicate pages (by title), keep highest ranking ones\n    matched_documents = matched_documents.sort_values(\"similarity_to_answer\", ascending=False).drop_duplicates(\n        \"title\", keep=\"first\"\n    )\n\n    documents_answer_template: str = (\n        \"📝 Here are the sources I used to answer your question:\\n\\n{documents}\\n\\n{footnote}\"\n    )\n    document_template: str = \"[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %\"\n\n    documents = \"\\n\".join([document_template.format(document=document) for _, document in matched_documents.iterrows()])\n    footnote: str = \"I'm a bot 🤖 and not always perfect.\"\n\n    return documents_answer_template.format(documents=documents, 
footnote=footnote)\n\n\ndef add_sources(history, completion):\n    if completion.answer_relevant:\n        formatted_sources = format_sources(completion.matched_documents)\n        history.append([None, formatted_sources])\n\n    return history\n\n\ndef chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:\n    \"\"\"Answer a user's question using retrieval augmented generation.\"\"\"\n\n    # We assume that the question is the user's last interaction\n    user_input = chat_history[-1][0]\n\n    # Do retrieval + augmented generation with buster\n    completion = buster.process_input(user_input)\n\n    # Stream tokens one at a time to the user\n    chat_history[-1][1] = \"\"\n    for token in completion.answer_generator:\n        chat_history[-1][1] += token\n\n        yield chat_history, completion\n\n\ndemo = gr.Blocks()\n\nwith demo:\n    with gr.Row():\n        gr.Markdown(\"<h3><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h3>\")\n\n    chatbot = gr.Chatbot()\n\n    with gr.Row():\n        question_textbox = gr.Textbox(\n            label=\"What's your question?\",\n            placeholder=\"Type your question here...\",\n            lines=1,\n        )\n        send_button = gr.Button(value=\"Send\", variant=\"secondary\")\n\n    examples = gr.Examples(\n        examples=[\n            \"How can I perform backpropagation?\",\n            \"How do I deal with noisy data?\",\n            \"How do I deal with noisy data in 2 words?\",\n        ],\n        inputs=question_textbox,\n    )\n\n    gr.Markdown(\"This application uses GPT to search the docs for relevant info and answer questions.\")\n\n    gr.HTML(\"️<center> Created with ❤️ by @jerpint and @hadrienbertrand\")\n\n    response = gr.State()\n\n    # fmt: off\n    gr.on(\n        triggers=[send_button.click, question_textbox.submit],\n        fn=add_user_question,\n        inputs=[question_textbox],\n        outputs=[chatbot]\n    ).then(\n        chat,\n        
inputs=[chatbot],\n        outputs=[chatbot, response]\n    ).then(\n        add_sources,\n        inputs=[chatbot, response],\n        outputs=[chatbot]\n    )\n\n    # fmt: on\n\n\ndemo.queue()\ndemo.launch(debug=True, share=False)\n"
  },
  {
    "path": "buster/examples/stackoverflow.csv",
    "content": ",source,title,content,url\n0,stackoverflow,stackoverflow question #1,\"\"\"Backprop\"\" is the same as \"\"backpropagation\"\": it's just a shorter way to say it. It is sometimes abbreviated as \"\"BP\"\".\n\",https://ai.stackexchange.com/questions/1\n1,stackoverflow,stackoverflow question #2,\"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (\"\"true\"\" vs. artificial).\nThe AI FAQ on ANN gives a good overview. Excerpt:\n\nNoise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set.\n\nIn some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation.\n\",https://ai.stackexchange.com/questions/2\n2,stackoverflow,stackoverflow question #4,\"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches. \nThere exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016.\n\",https://ai.stackexchange.com/questions/4\n3,stackoverflow,stackoverflow question #6,\"It rather depends on how one defines several of the terms used. For example:\n\nWhether the term \"\"expected\"\" is interpreted in a formal (i.e.\nstatistical) sense.  
\nWhether it's assumed that humans have any kind of utilitarian\n\"\"performance measure\"\".\n\nThe motivation for this description of \"\"agent\"\" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition.\nHowever, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans.\n\",https://ai.stackexchange.com/questions/6\n4,stackoverflow,stackoverflow question #7,\"\nTo put it simply in layman terms, what are the possible threats from AI? \n\nCurrently, there are no threat. \nThe threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity).  However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are.\nThis is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like. \n\nIf we know that AI is so dangerous why are we still promoting it? Why is it not banned?\n\nAs I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us. \n\",https://ai.stackexchange.com/questions/7\n5,stackoverflow,stackoverflow question #10,\"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). 
It's useful for making calculated decisions when all information needed isn't necessarily available.\nWikipedia has a fantastic page for this.\n\",https://ai.stackexchange.com/questions/10\n6,stackoverflow,stackoverflow question #15,\"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize.\nThe intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this. \n\",https://ai.stackexchange.com/questions/15\n7,stackoverflow,stackoverflow question #17,\"The concept of \"\"the singularity\"\" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator.\nWe would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet).\nThe risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. 
connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans.\n\",https://ai.stackexchange.com/questions/17\n8,stackoverflow,stackoverflow question #26,\"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time.\nI strongly recommend the film Plug & Pray to have an idea of what people are researching in this area.\nYou may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?)\nOn the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all?\n\",https://ai.stackexchange.com/questions/26\n9,stackoverflow,stackoverflow question #28,\"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say \"\"yes, genetic algorithms are part of AI\"\".  If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms). \nOne area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks.  I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard.\n\",https://ai.stackexchange.com/questions/28\n"
  },
  {
    "path": "buster/formatters/documents.py",
    "content": "import logging\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\n\nimport pandas as pd\n\nfrom buster.tokenizers import Tokenizer\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass DocumentsFormatter(ABC):\n    \"\"\"\n    Abstract base class for document formatters.\n\n    Subclasses are required to implement the `format` method which transforms the input documents\n    into the desired format.\n    \"\"\"\n\n    @abstractmethod\n    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:\n        \"\"\"\n        Abstract method to format matched documents.\n\n        Parameters:\n        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.\n\n        Returns:\n        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a string and\n                                    the possibly truncated matched documents DataFrame.\n        \"\"\"\n        pass\n\n\n@dataclass\nclass DocumentsFormatterHTML(DocumentsFormatter):\n    \"\"\"\n    Formatter class to convert matched documents into an HTML format.\n\n    Attributes:\n    - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents.\n    - max_tokens (int): Maximum allowed tokens for the formatted documents.\n    - formatter (str): String formatter for the document's content.\n    - inner_tag (str): HTML tag that will be used at the document level.\n    - outer_tag (str): HTML tag that will be used at the documents collection level.\n    \"\"\"\n\n    tokenizer: Tokenizer\n    max_tokens: int\n    formatter: str = \"{content}\"\n    inner_tag: str = \"DOCUMENT\"\n    outer_tag: str = \"DOCUMENTS\"\n\n    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:\n        \"\"\"\n        Format the matched documents into an HTML format.\n\n        If the total tokens exceed max_tokens, documents are truncated or 
omitted to fit within the limit.\n\n        Parameters:\n        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.\n\n        Returns:\n        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as an HTML string and\n                                    the possibly truncated matched documents DataFrame.\n        \"\"\"\n\n        documents_str = \"\"\n        total_tokens = 0\n        max_tokens = self.max_tokens\n\n        num_total_docs = len(matched_documents)\n        num_preserved_docs = 0\n        # TODO: uniformize this logic with the DocumentsFormatterJSON\n        for _, row in matched_documents.iterrows():\n            doc = self.formatter.format_map(row.to_dict())\n            num_preserved_docs += 1\n            token_count, encoded = self.tokenizer.num_tokens(doc, return_encoded=True)\n            if total_tokens + token_count <= max_tokens:\n                documents_str += f\"<{self.inner_tag}>{doc}<\\\\{self.inner_tag}>\"\n                total_tokens += token_count\n            else:\n                logger.warning(\"truncating document to fit...\")\n                remaining_tokens = max_tokens - total_tokens\n                truncated_doc = self.tokenizer.decode(encoded[:remaining_tokens])\n                documents_str += f\"<{self.inner_tag}>{truncated_doc}<\\\\{self.inner_tag}>\"\n                logger.warning(f\"Documents after truncation: {documents_str}\")\n                break\n\n        if num_preserved_docs < (num_total_docs):\n            logger.warning(\n                f\"{num_preserved_docs}/{num_total_docs} documents were preserved from the matched documents due to truncation.\"\n            )\n            matched_documents = matched_documents.iloc[:num_preserved_docs]\n\n        documents_str = f\"<{self.outer_tag}>{documents_str}<\\\\{self.outer_tag}>\"\n\n        return documents_str, matched_documents\n\n\n@dataclass\nclass 
DocumentsFormatterJSON(DocumentsFormatter):\n    \"\"\"\n    Formatter class to convert matched documents into a JSON format.\n\n    Attributes:\n    - tokenizer (Tokenizer): Tokenizer instance to count tokens in the documents.\n    - max_tokens (int): Maximum allowed tokens for the formatted documents.\n    - columns (list[str]): List of columns to include in the JSON format.\n    \"\"\"\n\n    tokenizer: Tokenizer\n    max_tokens: int\n    columns: list[str]\n\n    def format(self, matched_documents: pd.DataFrame) -> tuple[str, pd.DataFrame]:\n        \"\"\"\n        Format the matched documents into a JSON format.\n\n        If the total tokens exceed max_tokens, documents are omitted one at a time until it fits the limit.\n\n        Parameters:\n        - matched_documents (pd.DataFrame): DataFrame containing the matched documents to be formatted.\n\n        Returns:\n        - tuple[str, pd.DataFrame]: A tuple containing the formatted documents as a JSON string and\n                                    the possibly truncated matched documents DataFrame.\n        \"\"\"\n\n        max_tokens = self.max_tokens\n        documents_str = matched_documents[self.columns].to_json(orient=\"records\")\n        token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True)\n\n        while token_count > max_tokens:\n            # Truncated too much, no documents left, raise an error\n            if len(matched_documents) == 0:\n                raise ValueError(\n                    f\"Could not truncate documents to fit {max_tokens=}. 
Consider increasing max_tokens or decreasing chunk lengths.\"\n                )\n\n            # Too many tokens, drop a document and try again.\n            matched_documents = matched_documents.iloc[:-1]\n            documents_str = matched_documents[self.columns].to_json(orient=\"records\")\n            token_count, _ = self.tokenizer.num_tokens(documents_str, return_encoded=True)\n\n            # Log a warning with more details\n            logger.warning(\n                f\"Truncating documents to fit. Remaining documents after truncation: {len(matched_documents)}\"\n            )\n\n        return documents_str, matched_documents\n"
  },
  {
    "path": "buster/formatters/prompts.py",
    "content": "import logging\nfrom dataclasses import dataclass\n\nimport pandas as pd\n\nfrom buster.tokenizers import Tokenizer\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\n@dataclass\nclass PromptFormatter:\n    tokenizer: Tokenizer\n    max_tokens: int\n    text_before_docs: str\n    text_after_docs: str\n    formatter: str = \"{text_before_docs}\\n{documents}\\n{text_after_docs}\"\n\n    def format(self, documents: str) -> str:\n        \"\"\"Formats the system prompt with prompt engineering.\n\n        Joins the text before and after documents with the documents provided.\n\n        Args:\n            documents (str): The already formatted documents to include in the system prompt.\n\n        Returns:\n            str: The formatted system prompt.\n\n        Raises:\n            ValueError: If the number of prompt tokens exceeds the maximum allowed tokens.\n        \"\"\"\n        system_prompt = self.formatter.format(\n            text_before_docs=self.text_before_docs, documents=documents, text_after_docs=self.text_after_docs\n        )\n\n        if self.tokenizer.num_tokens(system_prompt) > self.max_tokens:\n            raise ValueError(f\"System prompt tokens > {self.max_tokens=}\")\n        return system_prompt\n\n\ndef prompt_formatter_factory(tokenizer: Tokenizer, prompt_cfg) -> PromptFormatter:\n    \"\"\"Creates a PromptFormatter instance.\n\n    Args:\n        tokenizer (Tokenizer): The tokenizer to use for the PromptFormatter.\n        prompt_cfg: The configuration for the PromptFormatter.\n\n    Returns:\n        PromptFormatter: The created PromptFormatter instance.\n    \"\"\"\n    return PromptFormatter(\n        tokenizer=tokenizer,\n        max_tokens=prompt_cfg[\"max_tokens\"],\n        text_before_docs=prompt_cfg[\"text_before_documents\"],\n        text_after_docs=prompt_cfg[\"text_before_prompt\"],\n    )\n"
  },
  {
    "path": "buster/llm_utils/__init__.py",
    "content": "from buster.llm_utils.embeddings import (\n    BM25,\n    compute_embeddings_parallelized,\n    cosine_similarity,\n    get_openai_embedding,\n    get_openai_embedding_constructor,\n)\nfrom buster.llm_utils.question_reformulator import QuestionReformulator\n\n__all__ = [\n    QuestionReformulator,\n    cosine_similarity,\n    get_openai_embedding,\n    compute_embeddings_parallelized,\n    get_openai_embedding_constructor,\n    BM25,\n]\n"
  },
  {
    "path": "buster/llm_utils/embeddings.py",
    "content": "import logging\nfrom functools import lru_cache\nfrom typing import Optional\n\nimport numpy as np\nimport pandas as pd\nfrom openai import OpenAI\nfrom pinecone_text.sparse import BM25Encoder\nfrom tqdm.contrib.concurrent import thread_map\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\ndef get_openai_embedding_constructor(client_kwargs: Optional[dict] = None, model: str = \"text-embedding-ada-002\"):\n    if client_kwargs is None:\n        client_kwargs = {}\n    client = OpenAI(**client_kwargs)\n\n    @lru_cache\n    def embedding_fn(text: str, model: str = model) -> np.array:\n        try:\n            text = text.replace(\"\\n\", \" \")\n            response = client.embeddings.create(\n                input=text,\n                model=model,\n            )\n            embedding = response.data[0].embedding\n            return np.array(embedding, dtype=\"float32\")\n        except Exception as e:\n            # This rarely happens with the API but in the off chance it does, will allow us not to lose the progress.\n            logger.exception(e)\n            logger.warning(f\"Embedding failed to compute for {text=}\")\n            return None\n\n    return embedding_fn\n\n\n# default embedding function\nget_openai_embedding = get_openai_embedding_constructor()\n\n\ndef cosine_similarity(a, b):\n    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\n\n\ndef compute_embeddings_parallelized(df: pd.DataFrame, embedding_fn: callable, num_workers: int) -> pd.Series:\n    \"\"\"Compute the embeddings on the 'content' column of a DataFrame in parallel.\n\n    This method calculates embeddings for the entries in the 'content' column of the provided DataFrame using the specified\n    embedding function. The 'content' column is expected to contain strings or textual data. 
The method processes the\n    embeddings in parallel using the number of workers specified.\n\n    Args:\n        df (pd.DataFrame): The DataFrame containing the data to compute embeddings for.\n        embedding_fn (callable): A function that computes embeddings for a given input string.\n        num_workers (int): The number of parallel workers to use for computing embeddings.\n\n    Returns:\n        pd.Series: A Series containing the computed embeddings for each entry in the 'content' column.\n    \"\"\"\n\n    logger.info(f\"Computing embeddings of {len(df)} chunks. Using {num_workers=}\")\n    embeddings = thread_map(embedding_fn, df.content.to_list(), max_workers=num_workers)\n\n    logger.info(f\"Finished computing embeddings\")\n    return embeddings\n\n\nclass BM25:\n    def __init__(self, path_to_params: str = None) -> None:\n        self.encoder = BM25Encoder()\n\n        if path_to_params:\n            self.encoder.load(path_to_params)\n\n    def fit(self, df: pd.DataFrame):\n        self.encoder.fit(df.content.to_list())\n\n    def dump_params(self, path: str):\n        self.encoder.dump(path)\n\n    def get_sparse_embedding_fn(self):\n        def sparse_embedding_fn(query: str):\n            return self.encoder.encode_queries(query)\n\n        return sparse_embedding_fn\n"
  },
  {
    "path": "buster/llm_utils/question_reformulator.py",
    "content": "import logging\nfrom typing import Optional\n\nfrom buster.completers import ChatGPTCompleter\n\n\nclass QuestionReformulator:\n    def __init__(\n        self,\n        system_prompt: Optional[str] = None,\n        completion_kwargs: Optional[dict] = None,\n        client_kwargs: Optional[dict] = None,\n    ):\n        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)\n\n        if completion_kwargs is None:\n            # Default kwargs\n            completion_kwargs = {\n                \"model\": \"gpt-3.5-turbo\",\n                \"stream\": False,\n                \"temperature\": 0,\n            }\n        self.completion_kwargs = completion_kwargs\n\n        if system_prompt is None:\n            # Default prompt\n            system_prompt = \"\"\"\n            Your role is to reformat a user's input into a question that is useful in the context of a semantic retrieval system.\n            Reformulate the question in a way that captures the original essence of the question while also adding more relevant details that can be useful in the context of semantic retrieval.\"\"\"\n        self.system_prompt = system_prompt\n\n    def reformulate(self, user_input: str) -> str:\n        \"\"\"Reformulate a user's question\"\"\"\n        reformulated_question, error = self.completer.complete(\n            self.system_prompt, user_input=user_input, completion_kwargs=self.completion_kwargs\n        )\n        logging.info(f\"Reformulated question from {user_input=} to {reformulated_question=}\")\n        return reformulated_question, error\n"
  },
  {
    "path": "buster/parsers/__init__.py",
    "content": "from buster.parsers.parser import HuggingfaceParser, SphinxParser, get_all_documents\n\n__all__ = [get_all_documents, SphinxParser, HuggingfaceParser]\n"
  },
  {
    "path": "buster/parsers/parser.py",
    "content": "import glob\nimport os\nimport re\nfrom abc import ABC, abstractmethod\nfrom dataclasses import InitVar, dataclass, field\nfrom itertools import takewhile, zip_longest\nfrom pathlib import Path\nfrom typing import Iterator, Type\n\nimport bs4\nimport pandas as pd\nfrom bs4 import BeautifulSoup\nfrom tqdm import tqdm\n\n\n@dataclass\nclass Section:\n    url: str\n    name: str\n    nodes: InitVar[list[bs4.element.NavigableString]]\n    text: str = field(init=False)\n\n    def __post_init__(self, nodes: list[bs4.element.NavigableString]):\n        section = []\n        for node in nodes:\n            if node.name == \"table\":\n                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt=\"github\")\n            elif node.name == \"script\":\n                continue\n            else:\n                node_text = node.text\n            section.append(node_text)\n        self.text = \"\\n\".join(section).strip()\n\n        # Remove tabs\n        self.text = self.text.replace(\"\\t\", \"\")\n\n        # Replace group of newlines with a single newline\n        self.text = re.sub(\"\\n{2,}\", \"\\n\", self.text)\n\n        # Replace non-breaking spaces with regular spaces\n        self.text = self.text.replace(\"\\xa0\", \" \")\n\n    def __len__(self) -> int:\n        return len(self.text)\n\n    @classmethod\n    def from_text(cls, text: str, url: str, name: str) -> \"Section\":\n        \"\"\"Alternate constructor, without parsing.\"\"\"\n        section = cls.__new__(cls)  # Allocate memory, does not call __init__\n        # Does the init here.\n        section.text = text\n        section.url = url\n        section.name = name\n\n        return section\n\n    def get_chunks(self, min_length: int, max_length: int) -> Iterator[\"Section\"]:\n        \"\"\"Split a section into chunks.\"\"\"\n        if len(self) > max_length:\n            # Get the number of chunk, by dividing and rounding up.\n            # Then, 
split the section into equal length chunks.\n            # This could result in chunks below the minimum length,\n            # and will truncate the end of the section.\n            n_chunks = (len(self) + max_length - 1) // max_length\n            length = len(self) // n_chunks\n            for chunk in range(n_chunks):\n                start = chunk * length\n                yield Section.from_text(self.text[start : start + length], self.url, self.name)\n        elif len(self) > min_length:\n            yield self\n        return\n\n\n@dataclass\nclass Parser(ABC):\n    soup: BeautifulSoup\n    base_url: str\n    root_dir: str\n    filepath: str\n    min_section_length: int = 100\n    max_section_length: int = 2000\n\n    @property\n    def relative_path(self) -> str:\n        \"\"\"Gets the relative path of the file to the root dir.\n\n        This is particularly useful for websites with pages, subdomains, etc.\n        The split is to remove the .html extension\n        \"\"\"\n        parent = Path(self.root_dir)\n        son = Path(self.filepath)\n        self._relative_path = str(son.relative_to(parent)).split(\".\")[0]\n        return self._relative_path\n\n    @abstractmethod\n    def find_sections(self) -> Iterator[Section]: ...\n\n    def parse(self) -> list[Section]:\n        \"\"\"Parse the documents into sections, respecting the length constraints.\"\"\"\n        sections = []\n        for section in self.find_sections():\n            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))\n        return sections\n\n\nclass SphinxParser(Parser):\n    def find_sections(self) -> Iterator[Section]:\n        for section in self.soup.find_all(\"a\", href=True, class_=\"headerlink\"):\n            container = section.parent.parent\n            section_href = container.find_all(\"a\", href=True, class_=\"headerlink\")\n\n            url = self.build_url(section[\"href\"].strip().replace(\"\\n\", \"\"))\n            name = 
section.parent.text.strip()[:-1].replace(\"\\n\", \"\")\n\n            # If sections has subsections, keep only the part before the first subsection\n            if len(section_href) > 1 and container.section is not None:\n                siblings = list(container.section.previous_siblings)[::-1]\n                section = Section(url, name, siblings)\n            else:\n                section = Section(url, name, container.children)\n            yield section\n        return\n\n    def build_url(self, suffix: str) -> str:\n        return self.base_url + self.relative_path + \".html\" + suffix\n\n\nclass HuggingfaceParser(Parser):\n    def find_sections(self) -> Iterator[Section]:\n        sections = self.soup.find_all([\"h1\", \"h2\", \"h3\"], class_=\"relative group\")\n        for section, next_section in zip_longest(sections, sections[1:]):\n            href = section.find(\"a\", href=True, class_=\"header-link\")\n            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))\n\n            suffix = href[\"href\"].strip().replace(\"\\n\", \"\")\n            url = self.build_url(suffix)\n            name = section.text.strip().replace(\"\\n\", \"\")\n            yield Section(url, name, nodes)\n        return\n\n    def build_url(self, suffix: str) -> str:\n        return self.base_url + self.relative_path + suffix\n\n\ndef get_document(\n    root_dir: str,\n    file: str,\n    base_url: str,\n    parser_cls: Type[Parser],\n    min_section_length: int = 100,\n    max_section_length: int = 2000,\n) -> pd.DataFrame:\n    \"\"\"Extract all sections from one file.\n\n    Sections are broken into subsections if they are longer than `max_section_length`.\n    Sections correspond to `section` HTML tags that have a headerlink attached.\n    \"\"\"\n    filepath = os.path.join(root_dir, file)\n    with open(filepath, \"r\") as f:\n        source = f.read()\n\n    soup = BeautifulSoup(source, \"html.parser\")\n    parser = 
parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)\n\n    sections = []\n    urls = []\n    names = []\n    for section in parser.parse():\n        sections.append(section.text)\n        urls.append(section.url)\n        names.append(section.name)\n\n    documents_df = pd.DataFrame.from_dict({\"title\": names, \"url\": urls, \"content\": sections})\n\n    return documents_df\n\n\ndef get_all_documents(\n    root_dir: str,\n    base_url: str,\n    parser_cls: Type[Parser],\n    min_section_length: int = 100,\n    max_section_length: int = 2000,\n) -> pd.DataFrame:\n    \"\"\"Parse all HTML files in `root_dir`, and extract all sections.\n\n    Sections are broken into subsections if they are longer than `max_section_length`.\n    Sections correspond to `section` HTML tags that have a headerlink attached.\n    \"\"\"\n    files = glob.glob(\"**/*.html\", root_dir=root_dir, recursive=True)\n\n    dfs = []\n    for file in tqdm(files):\n        try:\n            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)\n            dfs.append(df)\n        except Exception as e:\n            print(f\"Skipping {file} due to the following error: {e}\")\n            continue\n\n    documents_df = pd.concat(dfs, ignore_index=True)\n\n    return documents_df\n"
  },
  {
    "path": "buster/retriever/__init__.py",
    "content": "from .base import Retriever\nfrom .deeplake import DeepLakeRetriever\nfrom .service import ServiceRetriever\n\n__all__ = [Retriever, ServiceRetriever, DeepLakeRetriever]\n"
  },
  {
    "path": "buster/retriever/base.py",
    "content": "import logging\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom typing import Callable, Optional\n\nimport numpy as np\nimport pandas as pd\n\nfrom buster.completers import UserInputs\nfrom buster.llm_utils import get_openai_embedding\n\nALL_SOURCES = \"All\"\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\n@dataclass\nclass Retriever(ABC):\n    def __init__(\n        self,\n        top_k: int,\n        thresh: float,\n        embedding_fn: Callable[[str], np.ndarray] = None,\n        sparse_embedding_fn: Callable[[str], dict[str, list[float]]] = None,\n        *args,\n        **kwargs,\n    ):\n        \"\"\"Initializes a Retriever instance.\n\n        Args:\n          top_k: The maximum number of documents to retrieve.\n          thresh: The similarity threshold for document retrieval.\n          embedding_fn: The function to compute document embeddings.\n          embedding_fn: (Optional) The function to compute sparse document embeddings.\n          *args, **kwargs: Additional arguments and keyword arguments.\n        \"\"\"\n        if embedding_fn is None:\n            embedding_fn = get_openai_embedding\n\n        self.top_k = top_k\n        self.thresh = thresh\n        self.embedding_fn = embedding_fn\n        self.sparse_embedding_fn = sparse_embedding_fn\n\n        # Add your access to documents in your own init\n\n    @abstractmethod\n    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:\n        \"\"\"Get all current documents from a given source.\n\n        Args:\n          source: The source from which to retrieve documents. 
If None, retrieves documents from all sources.\n\n        Returns:\n          A pandas DataFrame containing the documents.\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get_source_display_name(self, source: str) -> str:\n        \"\"\"Get the display name of a source.\n\n        Args:\n          source: The source for which to retrieve the display name.\n\n        Returns:\n          The display name of the source.\n\n        If source is None, returns all documents. If source does not exist, returns empty dataframe.\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get_topk_documents(self, query: str, source: Optional[str] = None, top_k: Optional[int] = None) -> pd.DataFrame:\n        \"\"\"Get the topk documents matching a user's query.\n\n        Args:\n          query: The user's query.\n          source: The source from which to retrieve documents. If None, retrieves documents from all sources.\n          top_k: The maximum number of documents to retrieve.\n\n        Returns:\n          A pandas DataFrame containing the topk matched documents.\n\n        If no matches are found, returns an empty dataframe.\n        \"\"\"\n        ...\n\n    def threshold_documents(self, matched_documents: pd.DataFrame, thresh: float) -> pd.DataFrame:\n        \"\"\"Filters out matched documents using a similarity threshold.\n\n        Args:\n          matched_documents: The DataFrame containing the matched documents.\n          thresh: The similarity threshold.\n\n        Returns:\n          A pandas DataFrame containing the filtered matched documents.\n        \"\"\"\n        # filter out matched_documents using a threshold\n        return matched_documents[matched_documents.similarity > thresh]\n\n    def retrieve(\n        self,\n        user_inputs: UserInputs,\n        sources: Optional[list[str]] = None,\n        top_k: Optional[int] = None,\n        thresh: Optional[float] = None,\n    ) -> pd.DataFrame:\n        \"\"\"Retrieves documents 
based on user inputs.\n\n        Args:\n          user_inputs: The user's inputs.\n          sources: The sources from which to retrieve documents. If None, retrieves documents from all sources.\n          top_k: The maximum number of documents to retrieve.\n          thresh: The similarity threshold for document retrieval.\n\n        Returns:\n          A pandas DataFrame containing the retrieved documents.\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n        if thresh is None:\n            thresh = self.thresh\n\n        query = user_inputs.current_input\n\n        matched_documents = self.get_topk_documents(query=query, sources=sources, top_k=top_k)\n\n        # log matched_documents to the console\n        logger.info(f\"matched documents before thresh: {matched_documents}\")\n\n        # No matches were found, simply return at this point\n        if len(matched_documents) == 0:\n            return matched_documents\n\n        # otherwise, make sure we have the minimum required fields\n        assert \"similarity\" in matched_documents.columns\n        assert \"embedding\" in matched_documents.columns\n        assert \"content\" in matched_documents.columns\n        assert \"title\" in matched_documents.columns\n\n        # filter out matched_documents using a threshold\n        matched_documents = self.threshold_documents(matched_documents, thresh)\n\n        logger.info(f\"matched documents after thresh: {matched_documents}\")\n\n        return matched_documents\n"
  },
  {
    "path": "buster/retriever/deeplake.py",
    "content": "import logging\nimport os\nfrom typing import Optional\n\nimport numpy as np\nimport pandas as pd\n\nfrom buster.retriever.base import ALL_SOURCES, Retriever\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\ndef extract_metadata(x: pd.DataFrame, columns) -> pd.DataFrame:\n    \"\"\"Extracts metadata from deeplake.\n\n    Args:\n      x: The dataframe containing the metadata.\n      columns: The columns to extract.\n\n    Returns:\n      The dataframe with the extracted metadata.\n    \"\"\"\n    for col in columns:\n        x[col] = x.metadata[col]\n    return x\n\n\ndef data_dict_to_df(data: dict) -> pd.DataFrame:\n    \"\"\"Converts a dictionary of data to a Pandas DataFrame.\n\n    Args:\n      data: The dictionary containing the data.\n\n    Returns:\n      The DataFrame containing the data.\n    \"\"\"\n    # rename 'score' to 'similarity'\n    data[\"similarity\"] = data.pop(\"score\")\n    data[\"content\"] = data.pop(\"text\")\n\n    matched_documents = pd.DataFrame(data)\n\n    if len(matched_documents) == 0:\n        logger.info(\"No matches found...\")\n        return pd.DataFrame()\n\n    matched_documents = matched_documents.apply(extract_metadata, columns=[\"source\", \"title\", \"url\"], axis=1)\n    matched_documents = matched_documents.drop(columns=\"metadata\")\n\n    return matched_documents\n\n\ndef build_tql_query(embedding, sources=None, top_k: int = 3) -> str:\n    \"\"\"Builds a TQL query.\n\n    Args:\n      embedding: The embedding vector.\n      sources: The sources to filter by.\n      top_k: The number of top documents to retrieve.\n\n    Returns:\n      The TQL query.\n    \"\"\"\n    # Initialize the where_clause to an empty string.\n    where_clause = \"\"\n\n    embedding_string = \",\".join([str(item) for item in embedding])\n\n    # If sources is provided and it's not empty, build the where clause.\n    if sources:\n        conditions = [f\"contains(metadata['source'], 
'{source}')\" for source in sources]\n        where_clause = \"where \" + \" or \".join(conditions)\n\n    # Construct the entire query\n    query = f\"\"\"\nselect * from (\n    select embedding, text, metadata, cosine_similarity(embedding, ARRAY[{embedding_string}]) as score\n    {where_clause}\n)\norder by score desc limit {top_k}\n\"\"\"\n    return query\n\n\nclass DeepLakeRetriever(Retriever):\n    def __init__(\n        self,\n        path,\n        exec_option: str = \"python\",\n        use_tql: bool = False,\n        deep_memory: bool = False,\n        activeloop_token: str = None,\n        **kwargs,\n    ):\n        from deeplake.core.vectorstore import VectorStore\n\n        super().__init__(**kwargs)\n        self.use_tql = use_tql\n        self.exec_option = exec_option\n        self.deep_memory = deep_memory\n        self.vector_store = VectorStore(\n            path=path,\n            read_only=True,\n            token=activeloop_token,\n            exec_option=exec_option,\n        )\n\n        if activeloop_token is None and use_tql:\n            logger.warning(\n                \"\"\"\n                No activeloop token detected, enterprise features will not be available.\n                You can set it using: export ACTIVELOOP_TOKEN=...\n                \"\"\"\n            )\n\n    def get_documents(self, sources: Optional[list[str]] = None) -> pd.DataFrame:\n        \"\"\"Get all current documents from a given source.\n\n        Args:\n          sources: The sources to retrieve documents from.\n\n        Returns:\n          The DataFrame containing the retrieved documents.\n        \"\"\"\n        k = len(self.vector_store)\n\n        # currently this is the only way to retrieve all embeddings in deeplake\n        # generate a dummy embedding and specify top-k equals the length of the vector store.\n        embedding_dim = self.vector_store.tensors()[\"embedding\"].shape[1]\n        dummy_embedding = np.random.random(embedding_dim)\n\n        
return self.get_topk_documents(query=None, embedding=dummy_embedding, top_k=k, sources=sources)\n\n    def get_source_display_name(self, source: str) -> str:\n        \"\"\"Get the display name of a source.\n\n        Args:\n          source: The name of the source.\n\n        Returns:\n          The display name of the source.\n\n        Raises:\n          NotImplementedError: If the method is not implemented.\n        \"\"\"\n        raise NotImplementedError()\n\n    def get_topk_documents(\n        self,\n        query: str = None,\n        embedding: np.array = None,\n        sources: Optional[list[str]] = None,\n        top_k: int = None,\n        return_tensors: str = \"*\",\n    ) -> pd.DataFrame:\n        \"\"\"Get the topk documents matching a user's query.\n\n        If no matches are found, returns an empty dataframe.\n\n        Args:\n          query: The user's query.\n          embedding: The embedding vector.\n          sources: The sources to filter by.\n          top_k: The number of top documents to retrieve.\n          return_tensors: The tensors to include in the result.\n\n        Returns:\n          The DataFrame containing the matched documents.\n        \"\"\"\n        if query is not None:\n            query_embedding = self.embedding_fn(query)\n        elif embedding is not None:\n            query_embedding = embedding\n        else:\n            raise ValueError(\"must provide either a query or an embedding\")\n\n        if self.use_tql:\n            assert self.exec_option == \"compute_engine\", \"cant use tql without compute_engine\"\n            tql_query = build_tql_query(query_embedding, sources=sources, top_k=top_k)\n            data = self.vector_store.search(query=tql_query, deep_memory=self.deep_memory)\n        else:\n            # build the filter clause\n            if sources:\n\n                def filter(x):\n                    return x[\"metadata\"].data()[\"value\"][\"source\"] in sources\n\n            else:\n         
       filter = None\n\n            data = self.vector_store.search(\n                k=top_k,\n                embedding=query_embedding,\n                exec_option=self.exec_option,\n                return_tensors=return_tensors,\n                filter=filter,\n            )\n\n        matched_documents = data_dict_to_df(data)\n        return matched_documents\n"
  },
  {
    "path": "buster/retriever/service.py",
    "content": "import logging\nfrom typing import List, Optional\n\nimport numpy as np\nimport pandas as pd\nimport pinecone\nfrom bson.objectid import ObjectId\nfrom pymongo.mongo_client import MongoClient\nfrom pymongo.server_api import ServerApi\n\nfrom buster.retriever.base import ALL_SOURCES, Retriever\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass ServiceRetriever(Retriever):\n    def __init__(\n        self,\n        pinecone_api_key: str,\n        pinecone_index: str,\n        pinecone_namespace: str,\n        mongo_uri: str,\n        mongo_db_name: str,\n        **kwargs,\n    ):\n        \"\"\"\n        Initializes a ServiceRetriever instance.\n\n        The ServiceRetriever is a hybrid retrieval combining pinecone and mongodb services.\n\n        Pinecone is exclusively used as a vector store.\n        The id of the pinecone vectors are used as a key in the mongodb database to store its associated metadata.\n\n        Args:\n            pinecone_api_key: The API key for Pinecone.\n            pinecone_env: The environment for Pinecone.\n            pinecone_index: The name of the Pinecone index.\n            pinecone_namespace: The namespace for Pinecone.\n            mongo_uri: The URI for MongoDB.\n            mongo_db_name: The name of the MongoDB database.\n        \"\"\"\n        super().__init__(**kwargs)\n\n        pc = pinecone.Pinecone(api_key=pinecone_api_key)\n\n        self.index = pc.Index(pinecone_index)\n        self.namespace = pinecone_namespace\n\n        self.client = MongoClient(mongo_uri, server_api=ServerApi(\"1\"))\n        self.db = self.client[mongo_db_name]\n\n    def get_source_id(self, source: str) -> str:\n        \"\"\"Get the id of a source. 
Returns an empty string if the source does not exist.\n\n        Args:\n            source: The name of the source.\n\n        Returns:\n            The id of the source.\n        \"\"\"\n        source_pointer = self.db.sources.find_one({\"name\": source})\n        return \"\" if source_pointer is None else str(source_pointer[\"_id\"])\n\n    def get_documents(self, source: Optional[str] = None) -> pd.DataFrame:\n        \"\"\"Get all current documents from a given source.\n\n        Args:\n            source: The name of the source. Defaults to None.\n\n        Returns:\n            A DataFrame containing all the documents. If the source does not exist, returns an empty DataFrame.\n        \"\"\"\n        if source is None:\n            # No source specified, return all documents\n            documents = self.db.documents.find()\n        else:\n            assert isinstance(source, str), \"source must be a valid string.\"\n            source_id = self.get_source_id(source)\n\n            if source_id == \"\":\n                logger.warning(f\"{source=} not found.\")\n\n            documents = self.db.documents.find({\"source_id\": source_id})\n\n        return pd.DataFrame(list(documents))\n\n    def get_source_display_name(self, source: str) -> str:\n        \"\"\"Get the display name of a source.\n\n        Args:\n            source: The name of the source.\n\n        Returns:\n            The display name of the source.\n        \"\"\"\n        if source is None:\n            return ALL_SOURCES\n        else:\n            display_name = self.db.sources.find_one({\"name\": source})[\"display_name\"]\n            return display_name\n\n    def get_topk_documents(self, query: str, sources: Optional[List[str]], top_k: int) -> pd.DataFrame:\n        \"\"\"Get the top k documents matching a query from the specified sources.\n\n        Args:\n            query: The query string.\n            sources: The list of source names to search. 
Defaults to None.\n            top_k: The number of top matches to return.\n\n        Returns:\n            A DataFrame containing the top k matching documents.\n        \"\"\"\n        if sources is None:\n            filter = None\n        else:\n            filter = {\"source\": {\"$in\": sources}}\n            source_exists = self.db.sources.find_one({\"name\": {\"$in\": sources}})\n            if source_exists is None:\n                logger.warning(f\"Sources {sources} do not exist. Returning empty dataframe.\")\n                return pd.DataFrame()\n\n        query_embedding = self.embedding_fn(query)\n        sparse_query_embedding = self.sparse_embedding_fn(query) if self.sparse_embedding_fn is not None else None\n\n        if isinstance(query_embedding, np.ndarray):\n            # pinecone expects a list of floats, so convert from ndarray if necessary\n            query_embedding = query_embedding.tolist()\n\n        # Pinecone retrieval\n        matches = self.index.query(\n            vector=query_embedding,\n            sparse_vector=sparse_query_embedding,\n            top_k=top_k,\n            filter=filter,\n            include_values=True,\n            namespace=self.namespace,\n        )[\"matches\"]\n        matching_ids = [ObjectId(match.id) for match in matches]\n        matching_scores = {match.id: match.score for match in matches}\n        matching_embeddings = {match.id: match.values for match in matches}\n\n        if len(matching_ids) == 0:\n            return pd.DataFrame()\n\n        # MongoDB retrieval\n        matched_documents = self.db.documents.find({\"_id\": {\"$in\": matching_ids}})\n        matched_documents = pd.DataFrame(list(matched_documents))\n\n        # add additional information from matching\n        matched_documents[\"similarity\"] = matched_documents[\"_id\"].apply(lambda x: matching_scores[str(x)])\n        matched_documents[\"embedding\"] = matched_documents[\"_id\"].apply(lambda x: 
matching_embeddings[str(x)])\n\n        # sort by similarity\n        matched_documents = matched_documents.sort_values(by=\"similarity\", ascending=False, ignore_index=True)\n\n        return matched_documents\n"
  },
  {
    "path": "buster/tokenizers/__init__.py",
    "content": "from .base import Tokenizer\nfrom .gpt import GPTTokenizer\n\n\ndef tokenizer_factory(tokenizer_cfg: dict) -> Tokenizer:\n    model_name = tokenizer_cfg[\"model_name\"]\n    if model_name in [\"text-davinci-003\", \"gpt-3.5-turbo\", \"gpt-4\"]:\n        return GPTTokenizer(model_name)\n\n    raise ValueError(f\"Tokenizer not implemented for {model_name=}\")\n\n\n__all__ = [Tokenizer, GPTTokenizer, tokenizer_factory]\n"
  },
  {
    "path": "buster/tokenizers/base.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Union\n\n\nclass Tokenizer(ABC):\n    \"\"\"Abstract base class for a tokenizer.\n\n    Args:\n      model_name: The name of the tokenizer model.\n\n    Attributes:\n      model_name: The name of the tokenizer model.\n\n    \"\"\"\n\n    def __init__(self, model_name: str):\n        self.model_name = model_name\n\n    @abstractmethod\n    def encode(self, string: str) -> list[int]:\n        \"\"\"Encodes a string into a list of integers.\n\n        Args:\n          string: The input string to be encoded.\n\n        Returns:\n          A list of integers representing the encoded string.\n\n        \"\"\"\n\n        ...\n\n    @abstractmethod\n    def decode(self, encoded: list[int]) -> str:\n        \"\"\"Decodes a list of integers into a string.\n\n        Args:\n          encoded: The list of integers to be decoded.\n\n        Returns:\n          The decoded string.\n\n        \"\"\"\n\n        ...\n\n    def num_tokens(self, string: str, return_encoded: bool = False) -> Union[int, tuple[int, list[int]]]:\n        \"\"\"Returns the number of tokens in a string.\n\n        Args:\n          string: The input string.\n          return_encoded: Whether or not to return the encoded string along with the number of tokens.\n\n        Returns:\n          If `return_encoded` is False, returns the number of tokens in the string.\n          If `return_encoded` is True, returns a tuple containing the number of tokens and the encoded string.\n\n        \"\"\"\n\n        encoded = self.encode(string)\n        if return_encoded:\n            return len(encoded), encoded\n        return len(encoded)\n"
  },
  {
    "path": "buster/tokenizers/gpt.py",
    "content": "import tiktoken\n\nfrom buster.tokenizers import Tokenizer\n\n\nclass GPTTokenizer(Tokenizer):\n    \"\"\"Tokenizer class for GPT models.\n\n    This class implements a tokenizer for GPT models using the tiktoken library.\n\n    Args:\n        model_name (str): The name of the GPT model to be used.\n\n    Attributes:\n        encoder: The encoder object created using tiktoken.encoding_for_model().\n\n    \"\"\"\n\n    def __init__(self, model_name: str):\n        super().__init__(model_name)\n        self.encoder = tiktoken.encoding_for_model(model_name=model_name)\n\n    def encode(self, string: str):\n        \"\"\"Encodes a given string using the GPT tokenizer.\n\n        Args:\n            string (str): The string to be encoded.\n\n        Returns:\n            list[int]: The encoded representation of the string.\n\n        \"\"\"\n        return self.encoder.encode(string)\n\n    def decode(self, encoded: list[int]):\n        \"\"\"Decodes a list of tokens using the GPT tokenizer.\n\n        Args:\n            encoded (list[int]): The list of tokens to be decoded.\n\n        Returns:\n            str: The decoded string representation of the tokens.\n\n        \"\"\"\n        return self.encoder.decode(encoded)\n"
  },
  {
    "path": "buster/utils.py",
    "content": "import os\nimport urllib.request\nimport zipfile\n\n\ndef get_file_extension(filepath: str) -> str:\n    return os.path.splitext(filepath)[1]\n\n\ndef download_db(db_url: str, output_dir: str):\n    os.makedirs(output_dir, exist_ok=True)\n    fname = os.path.join(output_dir, \"documents.db\")\n    if not os.path.exists(fname):\n        print(f\"Downloading db file from {db_url} to {fname}...\")\n        urllib.request.urlretrieve(db_url, fname)\n        print(\"Downloaded.\")\n    else:\n        print(\"File already exists. Skipping.\")\n    return fname\n\n\ndef zip_contents(input_path, output_path):\n    \"\"\"\n    Zips the entire contents of a given path to a custom output path.\n\n    Authored by ChatGPT\n\n    Args:\n        input_path (str): The path of the directory to be zipped.\n        output_path (str): The path where the zip file will be created.\n\n    Returns:\n        str: The path of the created zip file.\n    \"\"\"\n    if not os.path.exists(input_path):\n        raise ValueError(\"The specified input path does not exist.\")\n\n    zip_file_name = f\"{os.path.basename(input_path)}.zip\"\n    zip_file_path = os.path.join(output_path, zip_file_name)\n\n    with zipfile.ZipFile(zip_file_path, \"w\", zipfile.ZIP_DEFLATED) as zipf:\n        for root, _, files in os.walk(input_path):\n            for file in files:\n                file_path = os.path.join(root, file)\n                arcname = os.path.relpath(file_path, input_path)\n                zipf.write(file_path, arcname=arcname)\n\n    return zip_file_path\n\n\ndef extract_zip(zip_file_path, output_path):\n    \"\"\"\n    Extracts the contents of a zip file to a custom output path.\n\n    Authored by ChatGPT\n\n    Args:\n        zip_file_path (str): The path of the zip file to be extracted.\n        output_path (str): The path where the zip contents will be extracted.\n\n    Returns:\n        str: The path of the directory where the zip contents are extracted.\n    \"\"\"\n    
if not os.path.exists(zip_file_path):\n        raise ValueError(\"The specified zip file does not exist.\")\n\n    with zipfile.ZipFile(zip_file_path, \"r\") as zipf:\n        zipf.extractall(output_path)\n\n    return output_path\n"
  },
  {
    "path": "buster/validators/__init__.py",
    "content": "from .base import Validator\n\n__all__ = [\"Validator\"]\n"
  },
  {
    "path": "buster/validators/base.py",
    "content": "import logging\n\nimport pandas as pd\n\nfrom buster.llm_utils import cosine_similarity, get_openai_embedding\nfrom buster.validators.validators import (\n    AnswerValidator,\n    DocumentsValidator,\n    QuestionValidator,\n)\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass Validator:\n    def __init__(\n        self,\n        use_reranking: bool,\n        validate_documents: bool,\n        question_validator_cfg=None,\n        answer_validator_cfg=None,\n        documents_validator_cfg=None,\n    ):\n        \"\"\"\n        Initializes the Validator class.\n\n        Args:\n          use_reranking: A boolean indicating whether to use reranking.\n          validate_documents: A boolean indicating whether to validate documents.\n          question_validator_cfg: A configuration dictionary for the QuestionValidator.\n          answer_validator_cfg: A configuration dictionary for the AnswerValidator.\n          documents_validator_cfg: A configuration dictionary for the DocumentsValidator.\n        \"\"\"\n        self.question_validator = (\n            QuestionValidator(**question_validator_cfg) if question_validator_cfg is not None else QuestionValidator()\n        )\n        self.answer_validator = (\n            AnswerValidator(**answer_validator_cfg) if answer_validator_cfg is not None else AnswerValidator()\n        )\n        self.documents_validator = (\n            DocumentsValidator(**documents_validator_cfg)\n            if documents_validator_cfg is not None\n            else DocumentsValidator()\n        )\n        self.use_reranking = use_reranking\n        self.validate_documents = validate_documents\n\n    def check_question_relevance(self, question: str) -> tuple[bool, str]:\n        \"\"\"\n        Checks the relevance of a question.\n\n        Args:\n          question: The question to be checked.\n\n        Returns:\n          A tuple containing a boolean indicating the relevance and a 
string describing the result.\n        \"\"\"\n        return self.question_validator.check_question_relevance(question)\n\n    def check_answer_relevance(self, answer: str) -> bool:\n        \"\"\"\n        Checks the relevance of an answer.\n\n        Args:\n          answer: The answer to be checked.\n\n        Returns:\n          A boolean indicating the relevance of the answer.\n        \"\"\"\n        return self.answer_validator.check_answer_relevance(answer)\n\n    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\"\n        Checks the relevance of documents.\n\n        Args:\n          answer: The answer to be checked.\n          matched_documents: The DataFrame containing the matched documents.\n\n        Returns:\n          A DataFrame containing the relevance of the documents.\n        \"\"\"\n        return self.documents_validator.check_documents_relevance(answer, matched_documents)\n\n    def rerank_docs(\n        self, answer: str, matched_documents: pd.DataFrame, embedding_fn=get_openai_embedding\n    ) -> pd.DataFrame:\n        \"\"\"\n        Reranks the matched documents based on answer similarity.\n\n        Args:\n          answer: The answer for reranking.\n          matched_documents: The DataFrame containing the matched documents.\n          embedding_fn: The function used to calculate document embeddings.\n\n        Returns:\n          A DataFrame containing the reranked documents.\n        \"\"\"\n        \"\"\"Here we re-rank matched documents according to the answer provided by the llm.\n\n        This score could be used to determine wether a document was actually relevant to generation.\n        An extra column is added in-place for the similarity score.\n        \"\"\"\n        if len(matched_documents) == 0:\n            return matched_documents\n        logger.info(\"Reranking documents based on answer similarity...\")\n\n        answer_embedding = 
embedding_fn(answer)\n\n        col = \"similarity_to_answer\"\n        matched_documents[col] = matched_documents.embedding.apply(lambda x: cosine_similarity(x, answer_embedding))\n\n        return matched_documents.sort_values(by=col, ascending=False)\n"
  },
  {
    "path": "buster/validators/validators.py",
    "content": "import concurrent.futures\nimport logging\nfrom typing import Callable, List, Optional\n\nimport numpy as np\nimport pandas as pd\n\nfrom buster.completers import ChatGPTCompleter, Completer\nfrom buster.llm_utils import cosine_similarity\nfrom buster.llm_utils.embeddings import get_openai_embedding\n\nlogger = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO)\n\n\nclass QuestionValidator:\n    def __init__(\n        self,\n        check_question_prompt: Optional[str] = None,\n        invalid_question_response: Optional[str] = None,\n        completion_kwargs: Optional[dict] = None,\n        client_kwargs: Optional[dict] = None,\n    ):\n        if check_question_prompt is None:\n            check_question_prompt = (\n                \"\"\"You are a chatbot answering questions on documentation.\nYour job is to determine whether or not a question is valid, and should be answered.\nMore general questions are not considered valid, even if you might know the response.\nA user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.\n\nFor example:\n\nQ: What is backpropagation?\ntrue\n\nQ: What is the meaning of life?\nfalse\n\nA user will submit a question. 
Respond 'true' if it is valid, respond 'false' if it is invalid.\"\"\"\n            )\n\n        if completion_kwargs is None:\n            # default completion kwargs\n            completion_kwargs = (\n                {\n                    \"model\": \"gpt-3.5-turbo\",\n                    \"stream\": False,\n                    \"temperature\": 0,\n                }\n            )\n\n        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)\n        self.check_question_prompt = check_question_prompt\n        self.invalid_question_response = invalid_question_response\n\n    def check_question_relevance(self, question: str) -> tuple[bool, str]:\n        \"\"\"Determines whether a question is relevant for our given framework.\"\"\"\n        try:\n            outputs, _ = self.completer.complete(self.check_question_prompt, user_input=question)\n            outputs = outputs.strip(\".\").lower()\n            if outputs not in [\"true\", \"false\"]:\n                logger.warning(f\"the question validation returned an unexpected value: {outputs=}. 
Assuming Invalid...\")\n            relevance = outputs.strip(\".\").lower() == \"true\"\n            response = self.invalid_question_response\n\n        except Exception as e:\n            logger.exception(\"Error during question relevance detection.\")\n            relevance = False\n            response = \"Unable to process your question at the moment, try again soon\"\n\n        return relevance, response\n\n\nclass AnswerValidator:\n    def __init__(\n        self,\n        unknown_response_templates: Optional[list[str]] = None,\n        unknown_threshold: Optional[float] = None,\n        embedding_fn: Callable[[str], np.array] = None,\n    ):\n        if unknown_threshold is None:\n            unknown_threshold = 0.85\n\n        if embedding_fn is None:\n            embedding_fn = get_openai_embedding\n\n        if unknown_response_templates is None:\n            unknown_response_templates = [\n                \"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?\",\n            ]\n\n        self.embedding_fn = embedding_fn\n        self.unknown_response_templates = unknown_response_templates\n        self.unknown_threshold = unknown_threshold\n\n    def check_answer_relevance(self, answer: str) -> bool:\n        \"\"\"Check if a generated answer is relevant to the chatbot's knowledge.\"\"\"\n        if answer == \"\":\n            raise ValueError(\"Cannot compute embedding of an empty string.\")\n\n        unknown_embeddings = [\n            self.embedding_fn(unknown_response) for unknown_response in self.unknown_response_templates\n        ]\n\n        answer_embedding = self.embedding_fn(answer)\n        unknown_similarity_scores = [\n            cosine_similarity(answer_embedding, unknown_embedding) for unknown_embedding in unknown_embeddings\n        ]\n\n        # If any score is above the threshold, the answer is considered not relevant\n        return not any(score > self.unknown_threshold for score in unknown_similarity_scores)\n\n\nclass DocumentsValidator:\n    def __init__(\n        self,\n        completion_kwargs: Optional[dict] = None,\n        client_kwargs: Optional[dict] = None,\n        system_prompt: Optional[str] = None,\n        user_input_formatter: Optional[str] = None,\n        max_calls: int = 30,\n    ):\n        if system_prompt is None:\n            system_prompt = \"\"\"\n            Your goal is to determine if the content of a document can be attributed to a provided answer.\n            This means that if information in the document is found in the answer, it is relevant. Otherwise it is not.\n            Your goal is to determine if the information contained in a document was used to generate an answer.\n            You will be comparing a document to an answer. If the answer can be inferred from the document, return 'true'. 
Otherwise return 'false'.\n            Only respond with 'true' or 'false'.\"\"\"\n        self.system_prompt = system_prompt\n\n        if user_input_formatter is None:\n            user_input_formatter = \"\"\"\n            answer: {answer}\n            document: {document}\n        \"\"\"\n        self.user_input_formatter = user_input_formatter\n\n        if completion_kwargs is None:\n            completion_kwargs = {\n                \"model\": \"gpt-3.5-turbo\",\n                \"stream\": False,\n                \"temperature\": 0,\n            }\n\n        self.completer = ChatGPTCompleter(completion_kwargs=completion_kwargs, client_kwargs=client_kwargs)\n\n        self.max_calls = max_calls\n\n    def check_document_relevance(self, answer: str, document: str) -> bool:\n        user_input = self.user_input_formatter.format(answer=answer, document=document)\n        output, _ = self.completer.complete(prompt=self.system_prompt, user_input=user_input)\n\n        # remove trailing periods, happens sometimes...\n        output = output.strip(\".\").lower()\n\n        if output not in [\"true\", \"false\"]:\n            # Default assume it's relevant if the detector didn't give one of [true, false]\n            logger.warning(f\"the validation returned an unexpected value: {output}. Assuming valid...\")\n            return True\n        return output == \"true\"\n\n    def check_documents_relevance(self, answer: str, matched_documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\"Determines whether each matched document is relevant or not for our given framework.\"\"\"\n\n        logger.info(f\"Checking document relevance of {len(matched_documents)} documents\")\n\n        if len(matched_documents) > self.max_calls:\n            raise ValueError(\"Max calls exceeded, increase max_calls to allow this.\")\n\n        # Here we parallelize the calls. 
We introduce a wrapper as a workaround.\n        def _check_documents(args):\n            \"Thin wrapper so we can pass args as a Tuple and use ThreadPoolExecutor.\"\n            answer, document = args\n            return self.check_document_relevance(answer=answer, document=document)\n\n        args_list = [(answer, doc) for doc in matched_documents.content.to_list()]\n        with concurrent.futures.ThreadPoolExecutor() as executor:\n            relevance = list(executor.map(_check_documents, args_list))\n\n        logger.info(f\"{relevance=}\")\n        # add it back to the dataframe\n        matched_documents[\"relevance\"] = relevance\n        return matched_documents\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools\", \"setuptools-scm\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"buster-doctalk\"\nversion = \"0.0.1\"\ndescription = \"Buster 🤖: A chatbot for retrieval-augmented generation\"\nreadme = \"README.md\"\nrequires-python = \">=3.10\"\ndynamic = [\"dependencies\"]\n\n[tool.setuptools.dynamic]\ndependencies = {file = [\"requirements.txt\"]}\n\n[tool.setuptools.packages.find]\ninclude = [\"buster\"]\n\n[tool.isort]\nprofile = \"black\"\n\n[tool.black]\nline-length = 120\n\n[tool.pytest.ini_options]\nlog_cli = true\nlog_cli_level = \"INFO\"\n\n[tool.poetry]\nname = \"buster-doctalk\"\nversion = \"v0.0.1\"\ndescription = \"Buster 🤖: A chatbot for retrieval-augmented generation\"\nlicense = \"MIT\"\nauthors = [\n    \"Jeremy Pinto <jerpint@gmail.com>\",\n    \"Hadrien Bertrand <bertrand.hadrien@gmail.com>\",\n]\nreadme = \"README.md\"\nrepository = \"https://github.com/jerpint/buster\"\n\npackages = [\n    { include = \"buster\" },\n    { include = \"buster/**/*.py\" },\n]\n\n[tool.poetry.dependencies]\npython = \">=3.10,<3.13\""
  },
  {
    "path": "requirements.txt",
    "content": "bs4\nclick\ndeeplake\ngradio>=3.40\nmatplotlib\nnumpy>=1.25\nopenai>=1.0\npandas>=2.1.3\npinecone-client>=3.0.2\npinecone-text>=0.6.0\npymongo\npytest\ntabulate\ntenacity\ntiktoken\n"
  },
  {
    "path": "tests/test_chatbot.py",
    "content": "import copy\nimport logging\nimport os\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\nimport pytest\n\nfrom buster.busterbot import Buster, BusterConfig\nfrom buster.completers import ChatGPTCompleter, Completer, Completion, DocumentAnswerer\nfrom buster.documents_manager import DeepLakeDocumentsManager\nfrom buster.formatters.documents import DocumentsFormatterHTML\nfrom buster.formatters.prompts import PromptFormatter\nfrom buster.llm_utils import get_openai_embedding\nfrom buster.retriever import DeepLakeRetriever, Retriever\nfrom buster.tokenizers.gpt import GPTTokenizer\nfrom buster.validators import Validator\n\nlogging.basicConfig(level=logging.INFO)\n\n\nDOCUMENTS_CSV = Path(__file__).resolve().parent.parent / \"buster/examples/stackoverflow.csv\"\nUNKNOWN_PROMPT = \"I'm sorry but I don't know how to answer.\"\nNUM_WORKERS = 1\n\n# default class used by our tests\nbuster_cfg_template = BusterConfig(\n    completion_cfg={\n        \"completion_kwargs\": {\n            \"model\": \"gpt-3.5-turbo\",\n            \"temperature\": 0,\n        },\n        \"client_kwargs\": {\n            \"timeout\": 20,\n            \"max_retries\": 2,\n        },\n    },\n    validator_cfg={\n        \"validate_documents\": False,\n        \"use_reranking\": True,\n        \"answer_validator_cfg\": {\n            \"unknown_response_templates\": [\n                \"I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. 
Is there anything else I can assist you with?\",\n            ],\n            \"unknown_threshold\": 0.85,\n        },\n        \"question_validator_cfg\": {\n            \"invalid_question_response\": \"This question does not seem relevant to my current knowledge.\",\n            \"completion_kwargs\": {\n                \"model\": \"gpt-3.5-turbo\",\n                \"stream\": False,\n                \"temperature\": 0,\n            },\n            \"client_kwargs\": {\n                \"timeout\": 20,\n                \"max_retries\": 2,\n            },\n            \"check_question_prompt\": \"You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrelevant, respond with 'false'.\",\n        },\n    },\n    retriever_cfg={\n        # \"db_path\": to be set using pytest fixture,\n        \"top_k\": 3,\n        \"thresh\": 0.7,\n        \"max_tokens\": 2000,\n        \"embedding_fn\": get_openai_embedding,\n    },\n    prompt_formatter_cfg={\n        \"max_tokens\": 3500,\n        \"text_after_docs\": (\"\"\"Now answer the following question:\\n\"\"\"),\n        \"text_before_docs\": (\n            \"\"\"You are a chatbot assistant answering technical questions about artificial intelligence (AI). 
\"\"\"\n            \"\"\"If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer.\"\"\"\n            \"\"\"Use this response when you cannot answer:\\n\"\"\"\n            f\"\"\"'{UNKNOWN_PROMPT}'\\n\"\"\"\n            \"\"\"For example:\\n\"\"\"\n            \"\"\"What is the meaning of life?\\n\"\"\"\n            f\"\"\"'{UNKNOWN_PROMPT}'\\n\"\"\"\n            \"\"\"Only use these provided documents as reference:\\n\"\"\"\n        ),\n    },\n    documents_formatter_cfg={\n        \"max_tokens\": 3000,\n        \"formatter\": \"{content}\",\n    },\n)\n\n\ndef get_fake_embedding(length=1536):\n    rng = np.random.default_rng()\n    return list(rng.random(length, dtype=np.float32))\n\n\nclass MockAnswerer(Completer):\n    def __init__(self, expected_answer):\n        self.expected_answer = expected_answer\n\n    def prepare_prompt(self, user_inputs, matched_documents):\n        pass\n\n    def complete(self):\n        return\n\n    def get_completion(self, user_inputs, matched_documents, validator, *arg, **kwarg) -> Completion:\n        return Completion(\n            answer_text=self.expected_answer,\n            error=False,\n            user_inputs=user_inputs,\n            matched_documents=matched_documents,\n            validator=validator,\n        )\n\n\nclass MockRetriever(Retriever):\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        path = kwargs[\"path\"]\n\n        self.path = path\n\n        n_samples = 100\n        self.documents = pd.DataFrame.from_dict(\n            {\n                \"title\": [\"test\"] * n_samples,\n                \"url\": [\"http://url.com\"] * n_samples,\n                \"content\": [\"cool text\"] * n_samples,\n                \"embedding\": [get_fake_embedding()] * n_samples,\n                \"n_tokens\": [10] * n_samples,\n                \"source\": [\"fake source\"] * n_samples,\n     
       }\n        )\n\n        self.embedding_fn = get_fake_embedding\n\n    def get_documents(self, source):\n        return self.documents\n\n    def get_topk_documents(self, query: str, sources: list[str] = None, top_k: int = None) -> pd.DataFrame:\n        documents = self.documents\n        documents[\"embedding\"] = [get_fake_embedding() for _ in range(len(documents))]\n        documents[\"similarity\"] = [np.random.random() for _ in range(len(documents))]\n        return documents\n\n    def get_source_display_name(self, source):\n        return source\n\n\nclass MockValidator:\n    def __init__(self, *args, **kwargs):\n        return\n\n    def validate(self, completion):\n        completion.answer_relevant = True\n        return completion\n\n    def check_question_relevance(self, *args, **kwargs):\n        return True, \"\"\n\n    def check_answer_relevance(self, *args, **kwargs):\n        return True\n\n\n@pytest.fixture(scope=\"session\")\ndef vector_store_path(tmp_path_factory):\n    # Create a temporary directory and folder for the database manager\n    dm_path = tmp_path_factory.mktemp(\"data\").joinpath(\"deeplake_store\")\n\n    # Add the documents (will generate embeddings)\n    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)\n    df = pd.read_csv(DOCUMENTS_CSV)\n    dm.add(df, num_workers=NUM_WORKERS)\n    return dm_path\n\n\ndef test_chatbot_mock_data(tmp_path, monkeypatch):\n    gpt_expected_answer = \"this is GPT answer\"\n\n    path = tmp_path / \"not_a_real_file.tar.gz\"\n\n    buster_cfg = copy.deepcopy(buster_cfg_template)\n    buster_cfg.retriever_cfg[\"path\"] = path\n    buster_cfg.completion_cfg = {\n        \"expected_answer\": gpt_expected_answer,\n    }\n\n    retriever = MockRetriever(**buster_cfg.retriever_cfg)\n    document_answerer = MockAnswerer(**buster_cfg.completion_cfg)\n    validator = MockValidator(**buster_cfg.validator_cfg)\n    buster = Buster(retriever=retriever, document_answerer=document_answerer, 
validator=validator)\n    completion = buster.process_input(user_input=\"What is a transformer?\", sources=[\"fake_source\"])\n    assert isinstance(completion.answer_text, str)\n    assert completion.answer_text.startswith(gpt_expected_answer)\n\n\ndef test_chatbot_real_data__chatGPT(vector_store_path):\n    buster_cfg = copy.deepcopy(buster_cfg_template)\n    buster_cfg.retriever_cfg[\"path\"] = vector_store_path\n\n    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)\n    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)\n    document_answerer = DocumentAnswerer(\n        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),\n        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),\n        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),\n    )\n    validator: Validator = Validator(**buster_cfg.validator_cfg)\n    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)\n\n    completion = buster.process_input(\"What is backpropagation?\")\n    assert isinstance(completion.answer_text, str)\n\n    assert completion.question_relevant == True\n    assert completion.answer_relevant == True\n\n    assert completion.completion_kwargs == buster_cfg.completion_cfg[\"completion_kwargs\"]\n\n\ndef test_chatbot_real_data__chatGPT_OOD(vector_store_path):\n    buster_cfg = copy.deepcopy(buster_cfg_template)\n    buster_cfg.retriever_cfg[\"path\"] = vector_store_path\n    buster_cfg.prompt_formatter_cfg = {\n        \"max_tokens\": 3500,\n        \"text_before_docs\": (\n            \"\"\"You are a chatbot assistant answering technical questions about artificial intelligence (AI).\"\"\"\n            \"\"\"If you do not know the answer to a question, or if it is completely irrelevant to your domain knowledge of AI library usage, let the user know you cannot answer.\"\"\"\n            \"\"\"Use this response: 
\"\"\"\n            f\"\"\"'{UNKNOWN_PROMPT}'\\n\"\"\"\n            \"\"\"For example:\\n\"\"\"\n            \"\"\"What is the meaning of life?\\n\"\"\"\n            f\"\"\"'{UNKNOWN_PROMPT}'\\n\"\"\"\n            \"\"\"Now answer the following question:\\n\"\"\"\n        ),\n        \"text_after_docs\": \"Only use these documents as reference:\\n\",\n    }\n\n    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)\n    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)\n    document_answerer = DocumentAnswerer(\n        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),\n        documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),\n        prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),\n    )\n    validator: Validator = Validator(**buster_cfg.validator_cfg)\n    buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)\n\n    completion: Completion = buster.process_input(\"What is a good recipe for brocolli soup?\")\n    assert isinstance(completion.answer_text, str)\n\n    assert completion.question_relevant == False\n    assert completion.answer_relevant == False\n\n    assert completion.completion_kwargs is None\n\n\ndef test_chatbot_real_data__no_docs_found(vector_store_path):\n    with pytest.warns():\n        buster_cfg = copy.deepcopy(buster_cfg_template)\n        buster_cfg.retriever_cfg = {\n            \"path\": vector_store_path,\n            \"embedding_fn\": get_openai_embedding,\n            \"top_k\": 3,\n            \"thresh\": 1,  # Set threshold very high to be sure no docs are matched\n            \"max_tokens\": 3000,\n        }\n        buster_cfg.documents_answerer_cfg[\"no_documents_message\"] = \"No documents available.\"\n        retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)\n        tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)\n        
document_answerer = DocumentAnswerer(\n            completer=ChatGPTCompleter(**buster_cfg.completion_cfg),\n            documents_formatter=DocumentsFormatterHTML(tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg),\n            prompt_formatter=PromptFormatter(tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg),\n            **buster_cfg.documents_answerer_cfg,\n        )\n        validator: Validator = Validator(**buster_cfg.validator_cfg)\n        buster: Buster = Buster(retriever=retriever, document_answerer=document_answerer, validator=validator)\n\n        completion = buster.process_input(\"What is backpropagation?\")\n        assert isinstance(completion.answer_text, str)\n\n        assert completion.question_relevant == True\n        assert completion.answer_relevant == False\n        assert completion.answer_text == \"No documents available.\"\n"
  },
  {
    "path": "tests/test_documents.py",
    "content": "import os\n\nimport numpy as np\nimport pandas as pd\nimport pytest\n\nfrom buster.documents_manager import DeepLakeDocumentsManager\nfrom buster.documents_manager.base import compute_embeddings_parallelized\nfrom buster.llm_utils import get_openai_embedding\nfrom buster.retriever import DeepLakeRetriever\n\n# Patch the get_embedding function to return a fixed, fake embedding\nNUM_WORKERS = 1\nfake_embedding = [-0.005, 0.0018]\n\n\ndef get_fake_embedding(*arg, **kwargs):\n    return fake_embedding\n\n\n@pytest.mark.parametrize(\n    \"documents_manager, retriever\",\n    [(DeepLakeDocumentsManager, DeepLakeRetriever)],\n)\ndef test_write_read(tmp_path, documents_manager, retriever):\n    retriever_cfg = {\n        \"top_k\": 3,\n        \"thresh\": 0.7,\n        \"max_tokens\": 2000,\n        \"embedding_fn\": get_openai_embedding,\n    }\n    dm_path = tmp_path / \"tmp_dir_2\"\n    retriever_cfg[\"path\"] = dm_path\n\n    data = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"test\"],\n            \"url\": [\"http://url.com\"],\n            \"content\": [\"cool text\"],\n            \"source\": [\"sourceA\"],\n            \"embedding\": [np.arange(10, dtype=np.float32) - 0.3],\n            \"n_tokens\": 5,\n        }\n    )\n\n    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)\n\n    dm.add(df=data)\n    dm_data = retriever(**retriever_cfg).get_documents(sources=[\"sourceA\"])\n\n    assert dm_data[\"title\"].iloc[0] == data[\"title\"].iloc[0]\n    assert dm_data[\"url\"].iloc[0] == data[\"url\"].iloc[0]\n    assert dm_data[\"content\"].iloc[0] == data[\"content\"].iloc[0]\n    assert dm_data[\"source\"].iloc[0] == data[\"source\"].iloc[0]\n    assert np.allclose(dm_data[\"embedding\"].iloc[0], data[\"embedding\"].iloc[0])\n\n\n@pytest.mark.parametrize(\n    \"documents_manager, retriever\",\n    [\n        (DeepLakeDocumentsManager, DeepLakeRetriever),\n    ],\n)\ndef test_write_write_read(tmp_path, documents_manager, 
retriever):\n    retriever_cfg = {\n        \"top_k\": 3,\n        \"thresh\": 0.7,\n        \"max_tokens\": 2000,\n        \"embedding_fn\": get_openai_embedding,\n    }\n    db_path = tmp_path / \"tmp_dir\"\n    retriever_cfg[\"path\"] = db_path\n\n    db = documents_manager(db_path)\n\n    data_1 = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"test\"],\n            \"url\": [\"http://url.com\"],\n            \"content\": [\"cool text\"],\n            \"embedding\": [np.arange(10, dtype=np.float32) - 0.3],\n            \"source\": [\"sourceA\"],\n            \"n_tokens\": 10,\n        }\n    )\n    db.add(df=data_1, num_workers=NUM_WORKERS)\n\n    data_2 = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"other\"],\n            \"url\": [\"http://url.com/page.html\"],\n            \"content\": [\"lorem ipsum\"],\n            \"embedding\": [np.arange(10, dtype=np.float32) / 10 - 2.3],\n            \"source\": [\"sourceB\"],\n            \"n_tokens\": 5,\n        }\n    )\n    db.add(df=data_2, num_workers=NUM_WORKERS)\n\n    db_data = retriever(**retriever_cfg).get_documents(sources=[\"sourceB\"])\n\n    assert len(db_data) == len(data_2)\n    assert db_data[\"title\"].iloc[0] == data_2[\"title\"].iloc[0]\n    assert db_data[\"url\"].iloc[0] == data_2[\"url\"].iloc[0]\n    assert db_data[\"content\"].iloc[0] == data_2[\"content\"].iloc[0]\n    assert np.allclose(db_data[\"embedding\"].iloc[0], data_2[\"embedding\"].iloc[0])\n\n\ndef test_generate_embeddings(tmp_path, monkeypatch):\n    # Create fake data\n    df = pd.DataFrame.from_dict(\n        {\"title\": [\"test\"], \"url\": [\"http://url.com\"], \"content\": [\"cool text\"], \"source\": [\"my_source\"]}\n    )\n\n    # Generate embeddings, store in a file\n    path = tmp_path / f\"test_document_embeddings\"\n    dm = DeepLakeDocumentsManager(path)\n    dm.add(df, embedding_fn=get_fake_embedding, num_workers=NUM_WORKERS)\n\n    # Read the embeddings from the file\n    retriever_cfg 
= {\n        \"path\": path,\n        \"top_k\": 3,\n        \"thresh\": 0.85,\n        \"max_tokens\": 3000,\n        \"embedding_fn\": get_fake_embedding,\n    }\n    read_df = DeepLakeRetriever(**retriever_cfg).get_documents(\"my_source\")\n\n    # Check all the values are correct across the files\n    assert df[\"title\"].iloc[0] == df[\"title\"].iloc[0] == read_df[\"title\"].iloc[0]\n    assert df[\"url\"].iloc[0] == df[\"url\"].iloc[0] == read_df[\"url\"].iloc[0]\n    assert df[\"content\"].iloc[0] == df[\"content\"].iloc[0] == read_df[\"content\"].iloc[0]\n    assert np.allclose(fake_embedding, read_df[\"embedding\"].iloc[0])\n\n\ndef test_generate_embeddings_parallelized():\n    # Create fake data\n    df = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"test\"] * 5,\n            \"url\": [\"http://url.com\"] * 5,\n            \"content\": [\"cool text\" + str(x) for x in range(5)],\n            \"source\": [\"my_source\"] * 5,\n        }\n    )\n\n    embeddings_parallel = compute_embeddings_parallelized(\n        df, embedding_fn=get_openai_embedding, num_workers=NUM_WORKERS\n    )\n    embeddings = df.content.apply(get_openai_embedding)\n\n    # embeddings comes out as a series because of the apply, so cast it back to an array\n    embeddings_arr = np.array(embeddings.to_list())\n\n    # Not clear why a tolerance needs to be specified, likely because it is computed on different machines\n    # since the requests are done in parallel...\n    assert np.allclose(embeddings_parallel, embeddings_arr, atol=1e-2)\n\n\ndef test_add_batches(tmp_path):\n    dm_path = tmp_path / \"deeplake_store\"\n    num_samples = 20\n    batch_size = 16\n    csv_filename = os.path.join(tmp_path, \"embedding_\")\n\n    dm = DeepLakeDocumentsManager(vector_store_path=dm_path)\n\n    # Create fake data\n    df = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"test\"] * num_samples,\n            \"url\": [\"http://url.com\"] * num_samples,\n            
\"content\": [\"cool text\" + str(x) for x in range(num_samples)],\n            \"source\": [\"my_source\"] * num_samples,\n        }\n    )\n\n    dm.batch_add(\n        df,\n        embedding_fn=get_fake_embedding,\n        num_workers=NUM_WORKERS,\n        batch_size=batch_size,\n        min_time_interval=0,\n        csv_filename=csv_filename,\n    )\n\n    csv_files = [f for f in os.listdir(tmp_path) if f.endswith(\".csv\")]\n\n    # check that we registered the correct number of documents and that files were generated\n    assert len(dm) == num_samples\n\n    df_saved = pd.read_csv(csv_filename)\n    assert len(df_saved) == num_samples\n    assert \"embedding\" in df_saved.columns\n"
  },
  {
    "path": "tests/test_formatters.py",
    "content": "import json\n\nimport pandas as pd\nimport pytest\n\nfrom buster.formatters.documents import DocumentsFormatterHTML, DocumentsFormatterJSON\nfrom buster.formatters.prompts import PromptFormatter\nfrom buster.tokenizers import GPTTokenizer\n\n\ndef test_DocumentsDormatterHTML__simple():\n    \"\"\"In this test, we expect all 3 documents to be matched and returned normally.\"\"\"\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterHTML(\n        tokenizer=tokenizer,\n        max_tokens=100,\n    )\n\n    document_1 = \"This is a very short document.\"\n    document_2 = \"This is another very short document.\"\n    document_3 = \"This is also a short document.\"\n\n    expected_docs_str = (\n        \"<DOCUMENTS>\"\n        f\"<DOCUMENT>{document_1}<\\\\DOCUMENT>\"\n        f\"<DOCUMENT>{document_2}<\\\\DOCUMENT>\"\n        f\"<DOCUMENT>{document_3}<\\\\DOCUMENT>\"\n        \"<\\\\DOCUMENTS>\"\n    )\n\n    matched_documents = pd.DataFrame({\"content\": [document_1, document_2, document_3]})\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n\n    # less documents and the new document is shorter than the original\n    assert all(matched_documents.content == matched_documents_new.content)\n\n    assert docs_str == expected_docs_str\n\n\ndef test_DocumentsDormatterJSON__simple():\n    \"\"\"In this test, we expect all 3 documents to be matched and returned normally.\"\"\"\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=[\"content\", \"source\"])\n\n    document_1 = \"This is a very short document.\"\n    document_2 = \"This is another very short document.\"\n    document_3 = \"This is also a short document.\"\n\n    source_1 = \"source 1\"\n    source_2 = \"source 2\"\n    source_3 = \"source 3\"\n\n    data_dict = {\n        \"content\": [document_1, 
document_2, document_3],\n        \"source\": [source_1, source_2, source_3],\n    }\n\n    expected_docs_str = json.dumps(\n        [\n            {\"content\": document_1, \"source\": source_1},\n            {\"content\": document_2, \"source\": source_2},\n            {\"content\": document_3, \"source\": source_3},\n        ],\n        separators=(\",\", \":\"),\n    )\n\n    matched_documents = pd.DataFrame(data_dict)\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n\n    # less documents and the new document is shorter than the original\n    assert all(matched_documents.content == matched_documents_new.content)\n\n    assert docs_str == expected_docs_str  # matched_documents.to_json(orient=\"records\")\n\n\ndef test_DocumentsFormatterHTML__doc_to_long():\n    \"\"\"In this test, document_1 doesn't entirely fit.\n\n    we only expect a part of it to be contained.\n    \"\"\"\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterHTML(\n        tokenizer=tokenizer,\n        max_tokens=100,\n    )\n\n    long_sentence = \"This is a very long document. 
It is long on purpose.\"\n    document_1 = long_sentence * 50\n    document_2 = \"This is a very short document.\"\n    document_3 = \"This is also a short document\"\n\n    matched_documents = pd.DataFrame({\"content\": [document_1, document_2, document_3]})\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n\n    # less documents and the new document is shorter than the original\n    assert len(matched_documents) == 3\n    assert len(matched_documents_new) == 1\n    assert len(docs_str) < len(document_1)\n\n    # The long document gets truncated, the others don't make it in.\n    assert long_sentence in docs_str\n    assert document_2 not in docs_str\n    assert document_3 not in docs_str\n\n\ndef test_DocumentsFormatterJSON__doc_too_long():\n    \"\"\"In this test, document_3 doesn't fit.\n    We expect it to be excluded completely.\n\n    we only expect a part of it to be contained.\n    \"\"\"\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterJSON(tokenizer=tokenizer, max_tokens=100, columns=[\"content\", \"source\"])\n\n    long_sentence = \"This is a very long document. 
It is long on purpose.\"\n\n    document_1 = \"This is a very short document.\"\n    document_2 = \"This is also a short document\"\n    document_3 = long_sentence * 50\n\n    source_1 = \"source 1\"\n    source_2 = \"source 2\"\n    source_3 = \"source 3\"\n\n    data_dict = {\n        \"content\": [document_1, document_2, document_3],\n        \"source\": [source_1, source_2, source_3],\n    }\n\n    expected_docs_str = json.dumps(\n        [\n            {\"content\": document_1, \"source\": source_1},\n            {\"content\": document_2, \"source\": source_2},\n        ],\n        separators=(\",\", \":\"),\n    )\n\n    matched_documents = pd.DataFrame(data_dict)\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n    assert docs_str == expected_docs_str\n\n    # less documents and the new document is shorter than the original\n    assert len(matched_documents) == 3\n    assert len(matched_documents_new) == 2\n\n    # The last document gets ignored completely, the first 2 make it\n    assert document_1 in docs_str\n    assert document_2 in docs_str\n    assert long_sentence not in docs_str\n\n\ndef test_DocumentsFormatterHTML__doc_to_long_2():\n    \"\"\"In this test, document_2 doesn't entirely fit.\n\n    we only expect a part of it to be contained, as well as all of document_1, and none of document_3.\n    \"\"\"\n\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterHTML(\n        tokenizer=tokenizer,\n        max_tokens=100,\n    )\n\n    document_1 = \"This is a very short document.\"\n    document_2 = \"This is a very long document. 
It is long on purpose.\" * 50\n    document_3 = \"This is also a short document\"\n\n    matched_documents = pd.DataFrame({\"content\": [document_1, document_2, document_3]})\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n\n    # less documents and the new document is shorter than the original\n    assert len(matched_documents) == 3\n    assert len(matched_documents_new) == 2\n\n    assert document_1 in docs_str\n    assert \"This is a very long document. It is long on purpose.\" in docs_str  # at least a subset should be in there\n    assert document_3 not in docs_str\n\n\ndef test_DocumentsFormatterHTML__complex_format():\n    \"\"\"In this test, we expect all 3 documents to be matched and returned in a particular format.\"\"\"\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    documents_formatter = DocumentsFormatterHTML(\n        tokenizer=tokenizer,\n        max_tokens=100,\n        formatter=\"Title: {title}\\n{content}\\n\",\n    )\n\n    document_1 = \"This is a very short document.\"\n    document_2 = \"This is another very short document.\"\n    document_3 = \"This is also a short document.\"\n\n    title_1 = \"doc1\"\n    title_2 = \"doc2\"\n    title_3 = \"doc3\"\n\n    country_1 = \"Canada\"\n    country_2 = \"France\"\n    country_3 = \"Germany\"\n\n    expected_docs_str = (\n        \"<DOCUMENTS>\"\n        f\"<DOCUMENT>Title: {title_1}\\n{document_1}\\n<\\\\DOCUMENT>\"\n        f\"<DOCUMENT>Title: {title_2}\\n{document_2}\\n<\\\\DOCUMENT>\"\n        f\"<DOCUMENT>Title: {title_3}\\n{document_3}\\n<\\\\DOCUMENT>\"\n        \"<\\\\DOCUMENTS>\"\n    )\n\n    matched_documents = pd.DataFrame(\n        {\n            \"content\": [document_1, document_2, document_3],\n            \"title\": [title_1, title_2, title_3],\n            \"country\": [country_1, country_2, country_3],\n        }\n    )\n\n    docs_str, matched_documents_new = documents_formatter.format(matched_documents)\n\n    # less 
documents and the new document is shorter than the original\n    assert all(matched_documents.content == matched_documents_new.content)\n\n    assert docs_str == expected_docs_str\n\n\ndef test_system_prompt_formatter():\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    prompt_formatter = PromptFormatter(\n        tokenizer=tokenizer,\n        max_tokens=200,\n        text_after_docs=\"After docs.\",\n        text_before_docs=\"Before docs.\",\n        formatter=\"{text_before_docs}\\n{documents}\\n{text_after_docs}\",\n    )\n\n    documents = \"Here are some docs\"\n\n    prompt = prompt_formatter.format(documents)\n\n    assert prompt == (\"Before docs.\\n\" \"Here are some docs\\n\" \"After docs.\")\n\n    assert documents in prompt\n\n\ndef test_system_prompt_formatter__to_long():\n    tokenizer = GPTTokenizer(model_name=\"gpt-3.5-turbo\")\n    prompt_formatter = PromptFormatter(\n        tokenizer=tokenizer,\n        max_tokens=200,\n        text_after_docs=\"After docs.\",\n        text_before_docs=\"Before docs.\",\n    )\n\n    documents = \"Here are some documents that are WAY too long.\" * 100\n\n    with pytest.raises(ValueError):\n        prompt_formatter.format(documents)\n"
  },
  {
    "path": "tests/test_read_write.py",
    "content": "import pandas as pd\n\nfrom buster.completers import Completion, UserInputs\n\n\nclass MockValidator:\n    def __init__(self):\n        self.use_reranking = True\n\n    def check_answer_relevance(self, completion: Completion) -> bool:\n        return True\n\n    def rerank_docs(self, answer: str, matched_documents: pd.DataFrame) -> bool:\n        return matched_documents\n\n\ndef test_read_write_completion():\n    n_samples = 3\n    completion_kwargs = {\"param_1\": \"a\"}\n    matched_documents = pd.DataFrame.from_dict(\n        {\n            \"title\": [\"test\"] * n_samples,\n            \"url\": [\"http://url.com\"] * n_samples,\n            \"content\": [\"cool text\"] * n_samples,\n            \"embedding\": [[0.0] * 1000] * n_samples,\n            \"n_tokens\": [10] * n_samples,\n            \"source\": [\"fake source\"] * n_samples,\n        }\n    )\n    c = Completion(\n        user_inputs=UserInputs(original_input=\"What is the meaning of life?\"),\n        error=False,\n        answer_text=\"This is my actual answer\",\n        matched_documents=matched_documents,\n        validator=MockValidator(),\n        completion_kwargs=completion_kwargs,\n    )\n\n    c_json = c.to_json()\n    c_back = Completion.from_dict(c_json)\n\n    assert c.error == c_back.error\n    assert c.answer_text == c_back.answer_text\n    assert c.user_inputs == c_back.user_inputs\n    assert c.answer_relevant == c_back.answer_relevant\n    assert c.completion_kwargs == c_back.completion_kwargs\n    for col in c_back.matched_documents.columns.tolist():\n        assert col in c.matched_documents.columns.tolist()\n        assert c_back.matched_documents[col].tolist() == c.matched_documents[col].tolist()\n"
  },
  {
    "path": "tests/test_validator.py",
    "content": "import pandas as pd\n\nfrom buster.llm_utils import get_openai_embedding\nfrom buster.validators import Validator\n\nvalidator_cfg = {\n    \"use_reranking\": True,\n    \"validate_documents\": True,\n    \"answer_validator_cfg\": {\n        \"unknown_response_templates\": [\n            \"I Don't know how to answer your question.\",\n        ],\n        \"unknown_threshold\": 0.85,\n    },\n    \"question_validator_cfg\": {\n        \"invalid_question_response\": \"This question does not seem relevant to my current knowledge.\",\n        \"completion_kwargs\": {\n            \"model\": \"gpt-3.5-turbo\",\n            \"stream\": False,\n            \"temperature\": 0,\n        },\n        \"check_question_prompt\": \"You are validating if questions are related to AI. If a question is relevant, respond with 'true', if it is irrlevant, respond with 'false'.\",\n    },\n}\nvalidator = Validator(**validator_cfg)\n\n\ndef test_validator_check_question_relevance():\n    question = \"What is backpropagation?\"\n    relevance, _ = validator.check_question_relevance(question)\n    assert relevance == True\n\n    question = \"How can I make a broccoli soup?\"\n    relevance, _ = validator.check_question_relevance(question)\n    assert relevance == False\n\n\ndef test_validator_check_answer_relevance():\n    answer = \"Not sure how to answer your question\"\n    assert validator.check_answer_relevance(answer) == False\n\n    answer = \"According to the documentation, the answer should be 2+2 = 4.\"\n    assert validator.check_answer_relevance(answer) == True\n\n\ndef test_validator_check_documents_relevance():\n    docs = {\n        \"content\": [\n            \"A panda is a bear native to China, known for its black and white fur.\",\n            \"An apple is a sweet fruit, often red, green, or yellow in color.\",\n            \"A car is a wheeled vehicle used for transportation, typically powered by an engine.\",\n        ]\n    }\n\n    answer = \"Pandas 
live in China.\"\n    expected_relevance = [True, False, False]\n\n    matched_documents = pd.DataFrame(docs)\n    matched_documents = validator.check_documents_relevance(answer=answer, matched_documents=matched_documents)\n\n    assert \"relevance\" in matched_documents.columns\n    assert matched_documents.relevance.to_list() == expected_relevance\n\n\ndef test_validator_rerank_docs():\n    documents = [\n        \"A basketball player practicing\",\n        \"A cat eating an orange\",\n        \"A green apple on the counter\",\n    ]\n    matched_documents = pd.DataFrame({\"documents\": documents})\n    matched_documents[\"embedding\"] = matched_documents.documents.apply(lambda x: get_openai_embedding(x))\n\n    answer = \"An apple is a delicious fruit.\"\n    reranked_documents = validator.rerank_docs(answer, matched_documents)\n\n    assert reranked_documents.documents.to_list() == [\n        \"A green apple on the counter\",\n        \"A cat eating an orange\",\n        \"A basketball player practicing\",\n    ]\n"
  }
]