[
  {
    "path": ".github/ISSUE_TEMPLATE/default_issue.yml",
    "content": "name: Default Issue\ndescription: Raise an issue that wouldn't be covered by the other templates.\ntitle: \"Issue: <Please write a comprehensive title after the 'Issue: ' prefix>\"\nlabels: [Default Issue Template]\n\nbody:\n  - type: textarea\n    attributes:\n      label: \"Issue you'd like to raise.\"\n      description: >\n        Please describe the issue you'd like to raise as clearly as possible.\n        Make sure to include any relevant links or references.\n\n  - type: textarea\n    attributes:\n      label: \"Suggestion:\"\n      description: >\n        Please outline a suggestion to improve the issue here.\n"
  },
  {
    "path": ".gitignore",
    "content": ".chroma/\ndata/\nExternal_Data_Pipeline/\nPDF/Omren\nconfig.py\nmain.py\nnote.md\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\nPipfile.lock\nPipfile\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/#use-with-ide\n.pdm.toml\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n"
  },
  {
    "path": ".streamlit/config.toml",
    "content": "[server]\nenableStaticServing = true"
  },
  {
    "path": "Dockerfile",
    "content": "FROM python:3.9-slim\n\n# Set the working directory in the container.\nWORKDIR /app\n\n# Copy the project's requirements file into the container\nCOPY requirements.txt ./requirements.txt\n# Upgrade pip for the latest features and install the project's Python dependencies.\nRUN pip install --upgrade pip && pip install -r requirements.txt\n\n# Copy the entire project into the container.\n# This may include all code, assets, and configuration files required to run the application.\nCOPY . /app\n\n# Expose port 8501\nEXPOSE 8501\n\n# Define the default command to run the app using Python's module mode.\nCMD [\"streamlit\", \"run\", \"app.py\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2023 JunXiang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "<p align=\"center\">\n    <img style=\"width: 50%; height: auto;\" src=\"./static/img/repos_logo.png\" alt=\"Chatbot Image\">\n</p>\n\n\n[English](./README.md) | [中文版](./README.zh-TW.md)\n\nFree `docGPT` allows you to chat with your documents (`.pdf`, `.docx`, `.csv`, `.txt`), without the need for any keys or fees.\n\nAdditionally, you can deploy the app anywhere based on the document.\n\n- Table of Contents\n    - [Introduction](#introduction)\n    - [Features](#🧨features)\n    - [What's LangChain?](#whats-langchain)\n    - [How to Use docGPT?](#how-to-use-docgpt)\n    - [How to Develop a docGPT with Streamlit?](#how-to-develop-a-docgpt-with-streamlit)\n    - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain)\n\n* Main Development Software and Packages:\n    * `Python 3.10.11`\n    * `Langchain 0.0.218`\n    * `Streamlit 1.22.0`\n    * [more](./requirements.txt)\n\nIf you like this project, please give it a ⭐`Star` to support the developers~\n\n### 📚Introduction\n\n* Upload a Document link from your local device (`.pdf`, `.docx`, `.csv`, `.txt`) and query `docGPT` about the content of the Document. For example, you can ask GPT to summarize an article.\n\n* Provide two models:\n  * `gpt4free`\n    * **Completely free, allowing users to use the application without the need for API keys or payments.**\n    * Select the `Provider`. For more details about `gpt4free`, please refer to the [source project](https://github.com/xtekky/gpt4free).\n  * `openai`\n    * **Requires an `openai_api_key`, which you can obtain from [this link](https://platform.openai.com/).**\n    * If you have a `serpapi_key`, AI responses can include Google search results.\n\n<p align=\"center\">\n<img src=\"static/img/2023-09-06-14-56-20.png\" width=\"80%\">\n</p>\n\n---\n\n### 🧨Features\n\n- **`gpt4free` Integration**: Everyone can use `docGPT` for **free** without needing an OpenAI API key.\n- **Support docx, pdf, csv, txt file**: Users can upload PDF, Word, CSV, txt file.\n- **Direct Document URL Input**: Users can input Document `URL` links for parsing without uploading document files(see the demo).\n- **Langchain Agent**: Enables AI to answer current questions and achieve Google search-like functionality.\n- **User-Friendly Environment**: Easy-to-use interface for simple operations.\n\n---\n\n### 🦜️What's LangChain?\n\n* LangChain is a framework for developing applications powered by language models. It supports the following applications:\n    1. Connecting LLM models with external data sources.\n    2. Interactive communication with LLM models.\n\n* For more details about LangChain, refer to the [official documentation](https://github.com/hwchase17/langchain).\n\n**For questions that ChatGPT can't answer, turn to LangChain!**\n\nLangChain fills in the gaps left by ChatGPT. Through the following example, you can understand the power of LangChain:\n\n> In cases where ChatGPT can't solve mathematical problems or answer questions about events after 2020 (e.g., \"Who is the president in 2023?\"):\n>\n> * For mathematical problems: There's a math-LLM model dedicated to handling math queries.\n> * For modern topics: You can use Google search.\n>\n> To create a comprehensive AI model, we need to combine \"ChatGPT,\" \"math-LLM,\" and \"Google search\" tools.\n>\n> In the non-AI era, we used `if...else...` to categorize user queries and had users select the question type through UI.\n>\n> In the AI era, users should be able to directly ask questions without preselecting the question type. With LangChain's agent:\n>  * We provide tools to the agent, e.g., `tools = ['chatgpt', 'math-llm', 'google-search']`.\n>  * Tools can include chains designed using LangChain, such as using a retrievalQA chain to answer questions from documents.\n>  * **The agent automatically decides which tool to use based on user queries** (fully automated).\n\nThrough LangChain, you can create a universal AI model or tailor it for business applications.\n\n\n---\n\n### 🚩How to Use docGPT?\n\n1. 🎬Visit the [application](https://docgpt-app.streamlit.app/).\n\n2. 🔑Enter your `API_KEY` (optional in Version 3, as you can use the `gpt4free` free model):\n   - `OpenAI API KEY`: Ensure you have available usage.\n   - `SERPAPI API KEY`: Required if you want to query content not present in the Document.\n\n3. 📁Upload a Document file (choose one method)\n    * Method 1: Browse and upload your own `.pdf`, `.docx`, `.csv`, `.txt` file from your local machine.\n    * Method 2: Enter the Document `URL` link directly.\n\n4. 🚀Start asking questions!\n\n![docGPT](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true)\n\n> [!WARNING]\n> Due to resource limitations in the free version of Streamlit Cloud, the application may experience crashes when used by multiple users simultaneously ([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2)). If you encounter this problem, feel free to report it in the issue tracker, and the developers will restart the application.\n\n---\n\n### 🧠How to Develop a docGPT with Streamlit?\n\nA step-by-step tutorial to quickly build your own chatGPT!\n\nFirst, clone the repository using `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git`.\n\nThere are a few methods:\n\n* **Local development without docker**:\n    * Download the required packages for development.\n        ```\n        pip install -r requirements.txt\n        ```\n\n    * Start the service in the project's root directory.\n        ```\n        streamlit run ./app.py\n        ```\n\n    * Start exploring! Your server will now be running at `http://localhost:8501`.\n\n* **Local development with docker**:\n    * Start the service using Docker Compose\n        ```\n        docker-compose up\n        ```\n\n        Your server will now be running at `http://localhost:8501`. You can interact with the `docGPT` or run your tests as you would normally.\n    \n    * To stop the Docker containers, simply run:\n        ```\n        docker-compose down\n        ```\n\n* **Streamlit Community Cloud for free** deployment, management, and sharing of applications:\n   - Place your application in a public GitHub repository (ensure you have `requirements.txt`).\n   - Log in to [share.streamlit.io](https://share.streamlit.io/).\n   - Click \"Deploy an App,\" then paste your GitHub URL.\n   - Complete deployment and share your [application](https://docgpt-app.streamlit.app//).\n\nDue to the limitations of the free version of Streamlit Cloud and its reliance on server resources, `docGPT` may experience some latency. We recommend users to consider deploying it locally for a smoother experience\n\n---\n\n### 💬Advanced - How to build a better model in langchain\n\nTo build a powerful docGPT model in LangChain, consider these tips to enhance performance:\n\n1. **Language Model**\n\n    Select an appropriate LLM model, such as OpenAI's `gpt-3.5-turbo` or other models. Experiment with different models to find the best fit for your use case.\n\n    ```python\n    # ./docGPT/docGPT.py\n    llm = ChatOpenAI(\n    temperature=0.2,\n    max_tokens=2000,\n    model_name='gpt-3.5-turbo'\n    )\n    ```\n\n    Please note that there is no best or worst model. You need to try multiple models to find the one that suits your use case the best. For more OpenAI models, please refer to the [documentation](https://platform.openai.com/docs/models).\n    \n    (Some models support up to 16,000 tokens!)\n\n2. **PDF Loader**\n\n    Choose a suitable PDF loader. Consider using `PyMuPDF` for fast text extraction and `PDFPlumber` for extracting text from tables.\n    \n    ([official Langchain documentation](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf))\n\n    * `PyPDF`: Simple and easy to use.\n    * `PyMuPDF`: Reads the document very **quickly** and provides additional metadata such as page numbers and document dates.\n    * `PDFPlumber`: Can **extract text within tables**. Similar to PyMuPDF, it provides metadata but takes longer to parse.\n\n    If your document contains multiple tables and important information is within those tables, it is recommended to try `PDFPlumber`, which may give you unexpected results!\n\n    Please do not overlook this detail, as without correctly parsing the text from the document, even the most powerful LLM model would be useless!\n\n3. **Tracking Token Usage**\n\n    Implement token usage tracking with callbacks in LangChain to monitor token and API key usage during the QA chain process.\n\n    When using `chain.run`, you can try using the [method](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking) provided by Langchain to track token usage here:\n\n    ```python\n    from langchain.callbacks import get_openai_callback\n\n    with get_openai_callback() as callback:\n        response = self.qa_chain.run(query)\n\n    print(callback)\n\n    # Result of print\n    \"\"\"\n    chain...\n    ...\n    > Finished chain.\n    Total Tokens: 1506\n    Prompt Tokens: 1350\n    Completion Tokens: 156\n    Total Cost (USD): $0.03012\n    \"\"\"\n    ```\n\n<a href=\"#top\">Back to top</a>\n "
  },
  {
    "path": "README.zh-TW.md",
    "content": "<p align=\"center\">\n    <img style=\"width: 50%; height: auto;\" src=\"./static/img/repos_logo.png\" alt=\"Chatbot Image\">\n</p>\n\n[English](./README.md) | [中文版](./README.zh-TW.md)\n\n免費的`docGPT`允許您與您的文件 (`.pdf`, `.docx`, `.csv`, `.txt`) 進行對話，無需任何金鑰或費用。\n\n此外，您也可以根據該文件操作，將程序部屬在任何地方。\n\n- 目錄\n    - [Introduction](#introduction)\n    - [Features](#🧨features)\n    - [What's LangChain?](#whats-langchain)\n    - [How to Use docGPT?](#how-to-use-docgpt)\n    - [How to develope a docGPT with streamlit?](#how-to-develope-a-docgpt-with-streamlit)\n    - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain)\n\n* 主要開發軟體與套件:\n    * `Python 3.10.11`\n    * `Langchain 0.0.218`\n    * `Streamlit 1.22.0`\n    * [more](./requirements.txt)\n\n如果您喜歡這個專案，請給予⭐`Star`以支持開發者~\n\n### 📚Introduction\n\n* 上傳來自本地的 Document 連結 (`.pdf`, `.docx`, `.csv`, `.txt`)，並且向 `docGPT` 詢問有關 Document 內容。例如: 您可以請 GPT 幫忙總結文章\n* 提供兩種模型選擇:\n  * `gpt4free`\n    * **完全免費，\"允許使用者在無需輸入 API 金鑰或付款的情況下使用該應用程序\"**\n    * 需選擇 `Provider`。有關 `gpt4free` 的更多詳細信息，請參閱[源專案](https://github.com/xtekky/gpt4free)\n  * `openai`\n    * **須具備** `openai_api_key`，您可以從此[鏈接](https://platform.openai.com/)獲取金鑰\n    * 若具備 `serpapi_key`，AI 的回應可以包括 Google 搜索結果\n\n<p align=\"center\">\n<img src=\"static/img/2023-09-06-14-56-20.png\" width=\"80%\">\n</p>\n\n---\n\n### 🧨Features\n\n- **`gpt4free` 整合**：任何人都可以免費使用 GPT4，無需輸入 OpenAI API 金鑰。\n- **支援 docx, pdf, csv, txt 檔案**: 可以上傳 PDF, Word, CSV, txt 檔\n- **直接輸入 Document 網址**：使用者可以直接輸入 Document URL 進行解析，無需從本地上傳檔案(如下方demo所示)。\n- **Langchain Agent**：AI 能夠回答當前問題，實現類似 Google 搜尋功能。\n- **簡易操作環境**：友善的界面，操作簡便\n\n---\n\n### 🦜️What's LangChain?\n\n* LangChain 是一個用於**開發由語言模型支持的應用程序的框架**。它支持以下應用程序\n    1. 將 LLM 模型與外部數據源進行連接\n    2. 允許與 LLM 模型進行交互\n\n* 有關 langchain 的介紹，建議查看官方文件、[Github源專案](https://github.com/hwchase17/langchain)\n\n\n**ChatGPT 無法回答的問題，交給 Langchain 實現!**\n\nLangChain 填補了 ChatGPT 的不足之處。通過以下示例，您可以理解 LangChain 的威力：\n\n> 在 ChatGPT 無法解答數學問題或回答 2020 年以後的問題（例如“2023 年的總統是誰？”）的情況下：\n>\n> * 數學問題: 有專門處理數學問題的 math-LLM 模型\n> * 現今問題: 使用 Google 搜索\n>\n> 要創建一個全面的 AI 模型，我們需要結合 \"ChatGPT\"、\"math-LLM\" 和 \"Google 搜索\" 工具。\n>\n> 在非 AI 時代，我們將使用 `if...else...` 將用戶查詢進行分類，讓用戶選擇問題類型（通過 UI）。\n>\n> 在 AI 時代，用戶應能夠直接提問。通過 LangChain 的 agent：\n>\n>  * 我們向 agent 提供工具，例如 `tools = ['chatgpt', 'math-llm', 'google-search']`\n>  * 工具可以包括使用 LangChain 設計的 chains，例如使用 `retrievalQA chain` 回答來自文檔的問題。\n>  * agent 根據用戶查詢自動決定使用哪個工具（完全自動化）。\n\n通過 LangChain，您可以創建通用的 AI 模型，也可以為**商業應用**量身定制。\n\n---\n\n### 🚩How to Use docGPT?\n\n1. 🎬前往[應用程序](https://docgpt-app.streamlit.app/)\n\n2. 🔑輸入您的 `API_KEY` (在版本 3 中為可選，您可以使用 `gpt4free` 免費模型):\n    * `OpenAI API KEY`: 確保還有可用的使用次數。\n    * `SERPAPI API KEY`: 如果您要查詢 Document 中不存在的內容，則需要使用此金鑰。\n\n3. 📁上傳來自本地的 Document 檔案 (選擇一個方法)\n    * 方法一: 從本地機瀏覽並上傳自己的 `.pdf`, `.docx`, `.csv` or `.txt` 檔\n    * 方法二: 輸入 Document URL 連結\n\n4. 🚀開始提問 ! \n\n![RGB_cleanup](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true)\n\n> [!WARNING]\n> 由於免費版 streamlit cloud 資源限制，該程序在多人同時使用時，容易引發崩潰([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2))，若遇上該問題歡迎到 Issue 提醒開發者，開發者會重啟程序。\n\n\n---\n\n### 🧠How to develope a docGPT with streamlit?\n\n手把手教學，讓您快速建立一個屬於自己的 chatGPT !\n\n首先請進行 `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git`\n\n方法有如下幾種方法:\n\n* 於**本地開發方式(不使用docker)**:\n    * 下載開發需求套件\n        ```\n        pip install -r requirements.txt\n        ```\n\n    * 於專案根目錄啟動服務\n        ```\n        streamlit run ./app.py\n        ```\n\n    * 開始體驗! 您的服務會運行在 `http://localhost:8501`.\n\n* 於**本地開發方式(使用docker)**:\n    * 使用 Docker Compose 啟動服務\n        ```\n        docker-compose up\n        ```\n\n        您的服務會運行在 `http://localhost:8501`. 您可以開始使用 `docGPT` 應用程序\n    \n    * 停止服務運行\n        ```\n        docker-compose down\n        ```\n\n* 使用 Streamlit Community **Cloud 免費部屬**、管理和共享應用程序\n    * 將您的應用程序放在公共 GitHub 存儲庫中（確保有 `requirements.txt`！）\n    * 登錄[share.streamlit.io](https://share.streamlit.io/)\n    * 單擊“部署應用程序”，然後粘貼您的 GitHub URL\n    * 完成部屬[應用程序](https://docgpt-app.streamlit.app//)\n\n由於 `docGPT` 是使用 streamlit cloud 免費版部屬，受限於設備關係會有不少延遲，建議使用者可以使用本地部屬方式來體驗。\n\n---\n\n### 💬Advanced - How to build a better model in langchain\n\n要在 LangChain 中構建功能強大的 docGPT 模型，請考慮以下技巧以改進性能\n\n1. **Language Model**\n   \n   使用適當的 LLM Model，會讓您事半功倍，例如您可以選擇使用 OpenAI 的 `gpt-3.5-turbo` (預設是 `text-davinci-003`):\n\n   ```python\n   # ./docGPT/docGPT.py\n   llm = ChatOpenAI(\n    temperature=0.2,\n    max_tokens=2000,\n    model_name='gpt-3.5-turbo'\n   ) \n   ```\n\n   請注意，模型之間並沒有最好與最壞，您需要多試幾個模型，才會發現最適合自己案例的模型，更多 OpenAI model 請[參考](https://platform.openai.com/docs/models)\n   \n   (部分模型可以使用 16,000 tokens!)\n\n2. **PDF Loader**\n\n    在 Python 中有許多解析 PDF 文字的 Loader，每個 Loader 各有優缺點，以下整理三個作者用過的\n    \n    ([Langchain官方介紹](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf)):\n\n    * `PyPDF`: 簡單易用\n    * `PyMuPDF`: 讀取文件**速度非常快速**，除了能解析文字，還能取得頁數、文檔日期...等 MetaData。\n    * `PDFPlumber`: 能夠解析出**表格內部文字**，使用方面與 `PyMuPDF` 相似，皆能取得 MetaData，但是解析時間較長。\n\n    如果您的文件具有多個表格，且重要資訊存在表格中，建議您嘗試 `PDFPlumber`，它會給您意想不到的結果!\n    請不要忽略這個細節，因為沒有正確解析出文件中的文字，即使 LLM 模型再強大也無用! \n\n3. **Tracking Token Usage**\n\n    這個並不能讓模型強大，但是能讓您清楚知道 QA Chain 的過程中，您使用的 tokens、openai api key 的使用量。\n\n    當您使用 `chain.run` 時，可以嘗試用 langchain 提供的 [方法](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking):\n\n    ```python\n    from langchain.callbacks import get_openai_callback\n\n    with get_openai_callback() as callback:\n        response = self.qa_chain.run(query)\n\n    print(callback)\n\n    # Result of print\n    \"\"\"\n    chain...\n    ...\n    > Finished chain.\n    Total Tokens: 1506\n    Prompt Tokens: 1350\n    Completion Tokens: 156\n    Total Cost (USD): $0.03012\n    \"\"\"\n    ```\n\n<a href=\"#top\">Back to top</a>\n"
  },
  {
    "path": "app.py",
    "content": "import os\n\nos.chdir(os.path.dirname(os.path.abspath(__file__)))\nos.environ['SERPAPI_API_KEY'] = ''\n\nimport streamlit as st\nfrom streamlit import logger\nfrom streamlit_chat import message\n\nfrom components import get_response, side_bar, theme, upload_and_process_document\nfrom docGPT import create_doc_gpt\n\nOPENAI_API_KEY = ''\nSERPAPI_API_KEY = ''\nmodel = None\n\nst.session_state.openai_api_key = None\nst.session_state.serpapi_api_key = None\nst.session_state.g4f_provider = None\nst.session_state.button_clicked = None\n\n\nif 'response' not in st.session_state:\n    st.session_state['response'] = ['How can I help you?']\n\nif 'query' not in st.session_state:\n    st.session_state['query'] = ['Hi']\n\napp_logger = logger.get_logger(__name__)\n\n\ndef main():\n    global model\n    theme()\n    side_bar()\n\n    doc_container = st.container()\n    with doc_container:\n        docs = upload_and_process_document()\n\n        if docs:\n            model = create_doc_gpt(\n                docs,\n                {k: v for k, v in docs[0].metadata.items() if k not in ['source', 'file_path']},\n                st.session_state.g4f_provider\n            )\n            app_logger.info(f'{__file__}: Created model: {model}')\n            del docs\n        st.write('---')\n\n    user_container = st.container()\n    response_container = st.container()\n    with user_container:\n        query = st.text_input(\n            \"#### Question:\",\n            placeholder='Enter your question'\n        )\n\n        if model and query and query != '' and not st.session_state.button_clicked:\n            response = get_response(query, model)\n            st.session_state.query.append(query)\n            st.session_state.response.append(response) \n\n    with response_container:\n        if st.session_state['response']:\n            for i in range(len(st.session_state['response'])-1, -1, -1):\n                message(\n                    st.session_state[\"response\"][i], key=str(i),\n                    logo=(\n                        'https://github.com/Lin-jun-xiang/docGPT-streamlit/'\n                        'blob/main/static/img/chatbot_v2.png?raw=true'\n                    )\n                )\n                message(\n                    st.session_state['query'][i], is_user=True, key=str(i) + '_user',\n                    logo=(\n                        'https://api.dicebear.com/6.x/adventurer/svg?'\n                        'hair=short16&hairColor=85c2c6&'\n                        'eyes=variant12&size=100&'\n                        'mouth=variant26&skinColor=f2d3b1'\n                    )\n                )\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "components/__init__.py",
    "content": "from .sidebar import side_bar\nfrom .document_processor import upload_and_process_document\nfrom .response_handler import get_response\nfrom .theme import theme\n\n__all__ = [\n    'get_response',\n    'side_bar',\n    'theme',\n    'upload_and_process_document'\n]\n"
  },
  {
    "path": "components/document_processor.py",
    "content": "import os\nimport tempfile\n\nimport streamlit as st\n\nfrom model import DocumentLoader\n\n\ndef upload_and_process_document() -> list:\n    st.write('#### Upload a Document file')\n    browse, url_link = st.tabs(\n        ['Drag and drop file (Browse files)', 'Enter document URL link']\n    )\n    with browse:\n        upload_file = st.file_uploader(\n            'Browse file (.pdf, .docx, .csv, `.txt`)',\n            type=['pdf', 'docx', 'csv', 'txt'],\n            label_visibility='hidden'\n        )\n        filetype = os.path.splitext(upload_file.name)[1].lower() if upload_file else None\n        upload_file = upload_file.read() if upload_file else None\n\n    with url_link:\n        doc_url = st.text_input(\n            \"Enter document URL Link (.pdf, .docx, .csv, .txt)\",\n            placeholder='https://www.xxx/uploads/file.pdf',\n            label_visibility='hidden'\n        )\n        if doc_url:\n            upload_file, filetype = DocumentLoader.crawl_file(doc_url)\n\n    if upload_file and filetype:\n        temp_file = tempfile.NamedTemporaryFile(delete=False)\n        temp_file.write(upload_file)\n        temp_file_path = temp_file.name\n\n        docs = DocumentLoader.load_documents(temp_file_path, filetype)\n        docs = DocumentLoader.split_documents(\n            docs, chunk_size=2000,\n            chunk_overlap=200\n        )\n\n        temp_file.close()\n        if temp_file_path:\n            os.remove(temp_file_path)\n\n        return docs\n"
  },
  {
    "path": "components/response_handler.py",
    "content": "from streamlit import logger\n\napp_logger = logger.get_logger(__name__)\n\ndef get_response(query: str, model) -> str:\n    app_logger.info(f'\\033[36mUser Query: {query}\\033[0m')\n    try:\n        if model is not None and query:\n            response = model.run(query)\n            app_logger.info(f'\\033[36mLLM Response: {response}\\033[0m')\n            return response\n        return (\n            'Your model still not created.\\n'\n            '1. If you are using gpt4free model, '\n            'try to re-select a provider. '\n            '(Click the \"Show Available Providers\" button in sidebar)\\n'\n            '2. If you are using openai model, '\n            'try to re-pass openai api key.\\n'\n            '3. Or you did not pass the file successfully.\\n'\n            '4. Try to Refresh the page (F5).'\n        )\n    except Exception as e:\n        app_logger.info(f'{__file__}: {e}')\n        return (\n            'Something wrong in docGPT...\\n'\n            '1. If you are using gpt4free model, '\n            'try to select the different provider. '\n            '(Click the \"Show Available Providers\" button in sidebar)\\n'\n            '2. If you are using openai model, '\n            'check your usage for openai api key.\\n'\n            '3. Try to Refresh the page (F5).'\n        )\n"
  },
  {
    "path": "components/sidebar.py",
    "content": "import asyncio\nimport os\n\nimport streamlit as st\n\nfrom docGPT import GPT4Free\n\n\ndef side_bar() -> None:\n    with st.sidebar:\n        with st.expander(':orange[How to use?]'):\n            st.markdown(\n                \"\"\"\n                1. Enter your API keys: (You can use the `gpt4free` free model **without API keys**)\n                    * `OpenAI API Key`: Make sure you still have usage left\n                    * `SERPAPI API Key`: Optional. If you want to ask questions about content not appearing in the PDF document, you need this key.\n                2. **Upload a Document** file (choose one method):\n                    * method1: Browse and upload your own document file from your local machine.\n                    * method2: Enter the document URL link directly.\n                    \n                    (**support documents**: `.pdf`, `.docx`, `.csv`, `.txt`)\n                3. Start asking questions!\n                4. More details.(https://github.com/Lin-jun-xiang/docGPT-streamlit)\n                5. If you have any questions, feel free to leave comments and engage in discussions.(https://github.com/Lin-jun-xiang/docGPT-streamlit/issues)\n                \"\"\"\n            )\n\n    with st.sidebar:\n        if st.session_state.openai_api_key:\n            OPENAI_API_KEY = st.session_state.openai_api_key\n            st.sidebar.success('API key loaded form previous input')\n        else:\n            OPENAI_API_KEY = st.sidebar.text_input(\n                label='#### Your OpenAI API Key 👇',\n                placeholder=\"sk-...\",\n                type=\"password\",\n                key='OPENAI_API_KEY'\n            )\n            st.session_state.openai_api_key = OPENAI_API_KEY\n\n        os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY\n\n    with st.sidebar:\n        if st.session_state.serpapi_api_key:\n            SERPAPI_API_KEY = st.session_state.serpapi_api_key\n            st.sidebar.success('API key loaded form previous input')\n        else:\n            SERPAPI_API_KEY = st.sidebar.text_input(\n                label='#### Your SERPAPI API Key 👇',\n                placeholder=\"...\",\n                type=\"password\",\n                key='SERPAPI_API_KEY'\n            )\n            st.session_state.serpapi_api_key = SERPAPI_API_KEY\n\n        os.environ['SERPAPI_API_KEY'] = SERPAPI_API_KEY\n\n    with st.sidebar:\n        gpt4free = GPT4Free()\n        st.session_state.g4f_provider = st.selectbox(\n            (\n                \"#### Select a provider if you want to use free model. \"\n                \"([details](https://github.com/xtekky/gpt4free#models))\"\n            ),\n            (['BestProvider'] + list(gpt4free.providers_table.keys()))\n        )\n\n        st.session_state.button_clicked = st.button(\n            'Show Available Providers',\n            help='Click to test which providers are currently available.',\n            type='primary'\n        )\n        if st.session_state.button_clicked:\n            available_providers = asyncio.run(gpt4free.show_available_providers())\n            st.session_state.query.append('What are the available providers right now?')\n            st.session_state.response.append(\n                'The current available providers are:\\n'\n                f'{available_providers}'\n            )\n"
  },
  {
    "path": "components/theme.py",
    "content": "import streamlit as st\n\n\ndef theme() -> None:\n    st.set_page_config(page_title=\"Document GPT\")\n    st.image('./static/img/chatbot_v2.png', width=150)\n"
  },
  {
    "path": "docGPT/__init__.py",
    "content": "import os\n\nimport openai\nimport streamlit as st\nfrom langchain.chat_models import ChatOpenAI\nfrom streamlit import logger\n\nfrom .agent import AgentHelper\nfrom .check_api_key import OpenAiAPI, SerpAPI\nfrom .docGPT import DocGPT, GPT4Free\n\nopenai.api_key = os.getenv('OPENAI_API_KEY')\nos.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')\nmodule_logger = logger.get_logger(__name__)\n\n\n@st.cache_resource(ttl=1200, max_entries=3)\ndef create_doc_gpt(\n    _docs: list,\n    doc_metadata: str,\n    g4f_provider: str\n) -> DocGPT:\n    docGPT = DocGPT(docs=_docs)\n\n    try:\n        if OpenAiAPI.is_valid():\n            # Use openai llm model with agent\n            docGPT_tool, calculate_tool, search_tool, llm_tool = [None] * 4\n            agent_ = AgentHelper()\n\n            llm_model = ChatOpenAI(\n                temperature=0.2,\n                max_tokens=6000,\n                model_name='gpt-3.5-turbo-16k'\n            )\n            docGPT.llm = llm_model\n            agent_.llm = llm_model\n\n            docGPT.create_qa_chain(chain_type='refine', verbose=False)\n            docGPT_tool = agent_.create_doc_chat(docGPT)\n            calculate_tool = agent_.get_calculate_chain\n            # llm_tool = agent_.create_llm_chain()\n\n            module_logger.info('\\033[43mUsing OpenAI model...\\033[0m')\n\n            if SerpAPI.is_valid():\n                search_tool = agent_.get_searp_chain\n\n                tools = [\n                    docGPT_tool,\n                    search_tool,\n                    # llm_tool, # This will cause agent confuse\n                    calculate_tool\n                ]\n                agent_.initialize(tools)\n                return agent_ if agent_ is not None else None\n            else:\n                return docGPT\n        else:\n            # Use gpt4free llm model without agent\n            llm_model = GPT4Free(provider=g4f_provider)\n            docGPT.llm = llm_model\n            docGPT.create_qa_chain(chain_type='refine', verbose=False)\n            module_logger.info('\\033[43mUsing Gpt4free model...\\033[0m')\n            return docGPT\n\n    except Exception as e:\n        print(e)\n        module_logger.info(f'{__file__}: {e}')\n"
  },
  {
    "path": "docGPT/agent.py",
    "content": "import os\nfrom typing import Optional\n\nimport openai\nfrom langchain.agents import AgentType, Tool, initialize_agent\nfrom langchain.callbacks import get_openai_callback\nfrom langchain.chains import LLMChain\nfrom langchain.prompts import PromptTemplate\n\nopenai.api_key = os.getenv('OPENAI_API_KEY')\nos.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')\n\n\nclass AgentHelper:\n    \"\"\"Add agent to help docGPT can be perfonm better.\"\"\"\n    def __init__(self) -> None:\n        self._llm = None\n        self.agent_ = None\n        self.tools = []\n\n    @property\n    def llm(self):\n        return self._llm\n\n    @llm.setter\n    def llm(self, llm) -> None:\n        self._llm = llm\n\n    @property\n    def get_calculate_chain(self) -> Tool:\n        from langchain import LLMMathChain\n\n        llm_math_chain = LLMMathChain.from_llm(llm=self.llm, verbose=True)\n        tool = Tool(\n            name='Calculator',\n            func=llm_math_chain.run,\n            description='useful for when you need to answer questions about math'\n        )\n        return tool\n\n    @property\n    def get_searp_chain(self) -> Tool:\n        from langchain import SerpAPIWrapper\n\n        search = SerpAPIWrapper()\n        tool = Tool(\n            name='Search',\n            func=search.run,\n            description='useful for when you need to answer questions about current events'\n        )\n        return tool\n\n    def create_doc_chat(self, docGPT) -> Tool:\n        \"\"\"Add a custom docGPT tool\"\"\"\n        tool = Tool(\n            name='DocumentGPT',\n            func=docGPT.run,\n            description=\"\"\"\n            useful for when you need to answer questions from the context of PDF\n            \"\"\"\n        )\n        return tool\n\n    def create_llm_chain(self) -> Tool:\n        \"\"\"Add a llm tool\"\"\"\n        prompt = PromptTemplate(\n            input_variables = ['query'],\n            template = '{query}'\n   
     )\n        llm_chain = LLMChain(llm=self.llm, prompt=prompt)\n\n        tool = Tool(\n            name='LLM',\n            func=llm_chain.run,\n            description='useful for general purpose queries and logic.'\n        )\n        return tool\n\n    def initialize(self, tools):\n        for tool in tools:\n            if isinstance(tool, Tool):\n                self.tools.append(tool)\n\n        self.agent_ = initialize_agent(\n            self.tools,\n            self.llm,\n            agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n            verbose=True\n        )\n\n    def run(self, query: str) -> Optional[str]:\n        response = None\n        with get_openai_callback() as callback:\n            try:\n                response = self.agent_.run(query)\n            except ValueError as e:\n                response = 'Something wrong in agent: ' + str(e)\n                if not response.startswith(\"Could not parse LLM output: `\"):\n                    raise e\n\n            print(callback)\n        return response\n"
  },
  {
    "path": "docGPT/check_api_key.py",
    "content": "import os\nfrom abc import ABC, abstractmethod\n\nimport openai\nimport streamlit as st\n\n\nclass ApiKey(ABC):\n    \"\"\"Check the Api key is valid or not\"\"\"\n    query = 'This is a test.'\n\n    @classmethod\n    @abstractmethod\n    def is_valid(cls):\n        pass\n\n\nclass OpenAiAPI(ApiKey):\n    @classmethod\n    def is_valid(cls) -> str:\n        if not st.session_state['openai_api_key']:\n            st.error('⚠️ :red[You have not pass OpenAI API key.] Use default model')\n            return\n\n        openai.api_key = os.getenv('OPENAI_API_KEY')\n        try:\n            response = openai.Completion.create(\n                engine='davinci',\n                prompt=cls.query,\n                max_tokens=5\n            )\n            return response\n        except Exception as e:\n            st.error(\n                '🚨 :red[Your OpenAI API key has a problem.] '\n                '[Check your usage](https://platform.openai.com/account/usage)'\n            )\n            print(f'Test error\\n{e}')\n\n\nclass SerpAPI(ApiKey):\n    @classmethod\n    def is_valid(cls) -> str:\n        if not st.session_state['serpapi_api_key']:\n            st.warning('⚠️ You have not pass SerpAPI key. (You cannot ask current events.)')\n            return\n        from langchain import SerpAPIWrapper\n\n        os.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')\n        try:\n            search = SerpAPIWrapper()\n            response = search.run(cls.query)\n            return response\n        except Exception as e:\n            st.error(\n                '🚨 :red[Your SerpAPI key has a problem.] '\n                '[Check your usage](https://serpapi.com/dashboard)'\n            )\n            print(f'Test error\\n{e}')\n"
  },
  {
    "path": "docGPT/docGPT.py",
    "content": "import asyncio\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import List, Optional\n\nimport g4f\nimport openai\nfrom langchain.callbacks import get_openai_callback\nfrom langchain.callbacks.manager import CallbackManagerForLLMRun\nfrom langchain.chains import RetrievalQA\nfrom langchain.embeddings import HuggingFaceEmbeddings\nfrom langchain.embeddings.openai import OpenAIEmbeddings\nfrom langchain.llms.base import LLM\nfrom langchain.prompts import PromptTemplate\nfrom langchain.vectorstores import FAISS\nfrom streamlit import logger\n\nopenai.api_key = os.getenv('OPENAI_API_KEY')\nmodule_logger = logger.get_logger(__name__)\n\n\nclass BaseQaChain(ABC):\n    def __init__(\n        self,\n        chain_type: str,\n        retriever,\n        llm\n    ) -> None:\n        self.chain_type = chain_type\n        self.retriever = retriever\n        self.llm = llm\n\n    @abstractmethod\n    def create_qa_chain(self):\n        pass\n\n\nclass RChain(BaseQaChain):\n    def __init__(\n        self,\n        chain_type: str,\n        retriever,\n        llm,\n        chain_type_kwargs: dict\n    ) -> None:\n        super().__init__(chain_type, retriever, llm)\n        self.chain_type_kwargs = chain_type_kwargs\n\n    @property\n    def create_qa_chain(self) -> RetrievalQA:\n        qa_chain = RetrievalQA.from_chain_type(\n            llm=self.llm,\n            chain_type=self.chain_type,\n            retriever=self.retriever,\n            chain_type_kwargs=self.chain_type_kwargs\n        )\n        return qa_chain\n\n\nclass CRChain(BaseQaChain):\n    def __init__(\n        self,\n        chain_type: str,\n        retriever,\n        llm,\n    ) -> None:\n        super().__init__(chain_type, retriever, llm)\n\n    @property\n    def create_qa_chain(self):\n        # TODO: cannot use conversation qa chain\n        from langchain.chains import ConversationalRetrievalChain\n        from langchain.memory import ConversationBufferMemory\n\n        
memory = ConversationBufferMemory(\n            memory_key='chat_history',\n            return_messages=True\n        )\n        qa_chain = ConversationalRetrievalChain.from_llm(\n            llm=self.llm,\n            chain_type=self.chain_type,\n            retriever=self.retriever,\n            memory=memory\n        )\n        return qa_chain    \n\n\nclass DocGPT:\n    def __init__(self, docs):\n        self.docs = docs\n        self.qa_chain = None\n        self._llm = None\n\n        self.prompt_template = (\n            \"Only answer what is asked. Answer step-by-step.\\n\"\n            \"If the content has sections, please summarize them \"\n            \"in order and present them in a bulleted format.\\n\"\n            \"Utilize line breaks for better readability.\\n\"\n            \"For example, sequentially summarize the \"\n            \"introduction, methods, results, and so on.\\n\"\n            \"Please use Python's newline symbols appropriately to \"\n            \"enhance the readability of the response, \"\n            \"but don't use two newline symbols consecutive.\\n\\n\"\n            \"{context}\\n\\n\"\n            \"Question: {question}\\n\"\n        )\n        self.prompt = PromptTemplate(\n            template=self.prompt_template,\n            input_variables=['context', 'question']\n        )\n\n        self.refine_prompt_template = (\n            \"The original question is as follows: {question}\\n\"\n            \"We have provided an existing answer: {existing_answer}\\n\"\n            \"We have the opportunity to refine the existing answer\"\n            \"(only if needed) with some more context below.\\n\"\n            \"------------\\n\"\n            \"{context_str}\\n\"\n            \"------------\\n\"\n            \"Given the new context, refine the original answer to better \"\n            \"answer the question. 
\"\n            \"If the context isn't useful, return the original answer.\\n\"\n            \"Please use Python's newline symbols \"\n            \"appropriately to enhance the readability of the response, \"\n            \"but don't use two newline symbols consecutive.\\n\"\n        )\n        self.refine_prompt = PromptTemplate(\n            template=self.refine_prompt_template,\n            input_variables=['question', 'existing_answer', 'context_str']\n        )\n\n    @property\n    def llm(self):\n        return self._llm\n\n    @llm.setter\n    def llm(self, llm) -> None:\n        self._llm = llm\n\n    def _helper_prompt(self, chain_type: str) -> None:\n        # TODO: Bug helper\n        if chain_type == 'refine':\n            self.prompt_template = self.prompt_template.replace(\n                '{context}', '{context_str}'\n            )\n            self.prompt.template = self.prompt_template\n            for i in range(len(self.prompt.input_variables)):\n                if self.prompt.input_variables[i] == 'context':\n                    self.prompt.input_variables[i] = 'context_str'\n\n    def _embeddings(self):\n        try:\n            # If have openai api\n            embeddings = OpenAIEmbeddings()\n        except:\n            embeddings = HuggingFaceEmbeddings(\n                model_name=(\n                    'sentence-transformers/'\n                    'multi-qa-MiniLM-L6-cos-v1'\n                )\n            )\n\n        db = FAISS.from_documents(\n            documents=self.docs,\n            embedding=embeddings\n        )\n        module_logger.info('embedded...')\n        return db\n\n    def create_qa_chain(\n        self,\n        chain_type: str ='stuff',\n        verbose: bool = True\n    ) -> BaseQaChain:\n        # TODO: Bug helper\n        self._helper_prompt(chain_type)\n        chain_type_kwargs = {\n            'question_prompt': self.prompt,\n            'verbose': verbose,\n            'refine_prompt': 
self.refine_prompt\n        }\n\n        db = self._embeddings()\n        retriever = db.as_retriever()\n\n        self.qa_chain = RChain(\n            chain_type=chain_type,\n            retriever=retriever,\n            llm=self._llm,\n            chain_type_kwargs=chain_type_kwargs\n        ).create_qa_chain\n\n    def run(self, query: str) -> str:\n        response = 'Nothing...'\n        with get_openai_callback() as callback:\n            if isinstance(self.qa_chain, RetrievalQA):\n                response = self.qa_chain.run(query)\n            module_logger.info(callback)\n        return response\n\n\nclass GPT4Free(LLM):\n    providers_table = {\n        f'g4f.Provider.{provider}': getattr(g4f.Provider, provider)\n        for provider in g4f.Provider.__all__\n    }\n    provider: str = 'g4f.Provider.DeepAi'\n\n    @property\n    def _llm_type(self) -> str:\n        return 'gpt4free model'\n\n    def _call(\n        self,\n        prompt: str,\n        stop: Optional[List[str]] = None,\n        run_manager: Optional[CallbackManagerForLLMRun] = None,\n    ) -> str:\n        try:\n            # print(f'\\033[36mPromopt: {prompt}\\033[0m')\n            provider = self.providers_table.get(self.provider, None)\n            module_logger.info(\n                f'\\033[36mProvider: {provider}\\033[0m'\n            )\n            return g4f.ChatCompletion.create(\n                model=\"gpt-3.5-turbo\",\n                messages=[{\"role\": \"user\", \"content\": prompt}],\n                provider=provider,\n                ignored=[\"ChatBase\"]\n            )\n        except Exception as e:\n            module_logger.info(f'{__file__}: call gpt4free error - {e}')\n\n    async def _test_provider(self, provider: g4f.Provider) -> str:\n        provider_name = provider.__name__\n        try:\n            await g4f.ChatCompletion.create_async(\n                model=\"gpt-3.5-turbo\",\n                messages=[{\"role\": \"user\", \"content\": 'Hi, this is 
test'}],\n                provider=provider,\n                ignored=[\"ChatBase\"]\n            )\n            return provider_name\n        except Exception as e:\n            print(f'{provider_name}: {e}')\n\n    async def show_available_providers(self) -> list:\n        \"\"\"Test all the providers then find out which are available\"\"\"\n        tasks = [\n            self._test_provider(provider)\n            for provider in self.providers_table.values()    \n        ]\n        available_providers = await asyncio.gather(*tasks)\n\n        return [\n            available_provider for available_provider in available_providers\n            if available_provider is not None\n        ]\n"
  },
  {
    "path": "docker-compose.yml",
    "content": "version: '3'\n\nservices:\n  docgpt:\n    build:\n      context: .\n      dockerfile: Dockerfile\n    ports:\n      - '8501:8501'\n"
  },
  {
    "path": "model/__init__.py",
    "content": "from .data_connection import (\n    DocumentLoader\n)\n"
  },
  {
    "path": "model/data_connection.py",
    "content": "import os\nfrom typing import Iterator, Union\n\nimport requests\nimport streamlit as st\nfrom langchain.document_loaders import (\n    CSVLoader,\n    Docx2txtLoader,\n    PyMuPDFLoader,\n    TextLoader,\n)\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\n\nclass DocumentLoader:\n    @staticmethod\n    def get_files(path: str, filetype: str = '.pdf') -> Iterator[str]:\n        try:\n            yield from [\n                file_name for file_name in os.listdir(f'{path}')\n                if file_name.endswith(filetype)\n            ]\n        except FileNotFoundError as e:\n            print(f'\\033[31m{e}')\n\n    @staticmethod\n    def load_documents(\n        file: str,\n        filetype: str = '.pdf'\n    ) -> Union[CSVLoader, Docx2txtLoader, PyMuPDFLoader, TextLoader]:\n        \"\"\"Loading PDF, Docx, CSV\"\"\"\n        try:\n            if filetype == '.pdf':\n                loader = PyMuPDFLoader(file)\n            elif filetype == '.docx':\n                loader = Docx2txtLoader(file)\n            elif filetype == '.csv':\n                loader = CSVLoader(file, encoding='utf-8')\n            elif filetype == '.txt':\n                loader = TextLoader(file, encoding='utf-8')\n\n            return loader.load()\n\n        except Exception as e:\n            print(f'\\033[31m{e}')\n            return []\n\n    @staticmethod\n    def split_documents(\n        document: Union[CSVLoader, Docx2txtLoader, PyMuPDFLoader, TextLoader],\n        chunk_size: int=2000,\n        chunk_overlap: int=0\n    ) -> list:\n        splitter = RecursiveCharacterTextSplitter(\n            chunk_size=chunk_size,\n            chunk_overlap=chunk_overlap\n        )\n\n        return splitter.split_documents(document)\n\n    @staticmethod\n    def crawl_file(url: str) -> str:\n        try:\n            response = requests.get(url)\n            filetype = os.path.splitext(url)[1]\n            if response.status_code == 200 and (\n           
     any(ext in filetype for ext in ['.pdf', '.docx', '.csv', '.txt'])\n            ):\n                return response.content, filetype\n            else:\n                st.warning('Url cannot parse correctly.')\n        except:\n            st.warning('Url cannot parse correctly.')\n"
  },
  {
    "path": "requirements.txt",
    "content": "g4f\nlangchain==0.0.218\nopenai==0.27.8\nstreamlit==1.26.0\nstreamlit_chat==0.1.1\npymupdf==1.22.5\nfaiss-cpu==1.7.4\ntiktoken==0.4.0\ntenacity==8.1.0\ngoogle-search-results==2.4.2\nsentence_transformers\nrequests\nhttpx\ndocx2txt\n"
  }
]