Repository: JinghaoZhao/GPT-Code-Learner
Branch: main
Commit: 0ea836d08b72
Files: 13
Total size: 40.3 KB

Directory structure:
gitextract_h5l5fq57/
├── .gitignore
├── LICENSE
├── README.md
├── code_learner.py
├── code_searcher.py
├── docs/
│   ├── KnowledgeBase.md
│   └── LocalLLM.md
├── knowledge_base.py
├── repo_parser.py
├── requirements.txt
├── run.py
├── tool_planner.py
└── util.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

*.pkl
models/

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 Jinghao Zhao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================
# GPT-Code-Learner

Learn a repo interactively with GPT. Ask questions and let GPT explain the code to you.

![GPT-Code-Learner.jpg](docs%2FGPT-Code-Learner.jpg)

### Try it out in your browser

![GUI.jpg](docs%2FGUI.jpg)

## Local LLM Support (Experimental)

GPT-Code-Learner supports running the LLM models locally. In general, GPT-Code-Learner uses [LocalAI](https://github.com/go-skynet/LocalAI) for a local, private LLM and [Sentence Transformers](https://huggingface.co/sentence-transformers) for local embeddings. Please refer to [Local LLM](docs/LocalLLM.md) for more details.

Note: Due to the current capability of local LLMs, the performance of GPT-Code-Learner is not as good as the online version.

## Installation

1. Clone this repository and install the required packages:

```
git clone https://github.com/JinghaoZhao/GPT-Code-Learner.git
pip install -r requirements.txt
```

2. Create a `.env` file for your API key:

```
OPENAI_API_KEY=sk-xxxxxx
LLM_TYPE="OpenAI"
EMBEDDING_TYPE="OpenAI"
```

If you want to run the whole program locally, change the following lines in the `.env` file:

```
LLM_TYPE="local"
EMBEDDING_TYPE="local"
```

3. Put the repo URL (e.g., a GitHub link) in the `Repo Link` textbox and click the `Analyze Code Repo` button in the GUI. Or manually clone the repo you want to learn into the `code_repo` folder:

```
cd code_repo
git clone <repo-url>
```

4. Run GPT-Code-Learner. If you use local LLM models, start the local model before running GPT-Code-Learner. Please refer to [Local LLM](docs/LocalLLM.md) for more details.

```
python run.py
```

5. Open your web browser at http://127.0.0.1:7860 and ask any questions about your repo.

## Knowledge Base

GPT-Code-Learner generates a vector database from the code repo and uses it as a knowledge base to answer repo-related questions. By default, it uses the source code as the knowledge base. More details can be found in [Knowledge Base](docs/KnowledgeBase.md).

## Tool Planner

The core of GPT-Code-Learner is the tool planner. It leverages the available tools to preprocess the input and provide context. Currently, the tool planner supports the following tools:

- **Code_Searcher**: This tool searches the code repository for keywords (e.g., specific functions or variables) extracted from the user query.
- **Repo_Parser**: This tool performs a fuzzy search over the vector database of the code repo. It provides context for questions about the general procedures in the repo.

More tools are under development. Feel free to contribute to this project!
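As a quick illustration, the planner can also be driven directly from Python once a repo has been cloned into `code_repo/` (a minimal sketch; it assumes your `.env` is configured as above):

```
from dotenv import load_dotenv, find_dotenv
import tool_planner

load_dotenv(find_dotenv())  # loads OPENAI_API_KEY, LLM_TYPE, EMBEDDING_TYPE

# Selects Code_Searcher, Repo_Parser, or No_Tool, then returns the
# question augmented with any retrieved context.
prompt = tool_planner.user_input_handler("Which function launches the application?")
print(prompt)
```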
================================================
FILE: code_learner.py
================================================
import gradio as gr
import json
import requests
import os
from termcolor import colored
from repo_parser import clone_repo, generate_or_load_knowledge_from_repo
import tool_planner

llm_type = os.environ.get('LLM_TYPE', "local")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "null")

if llm_type == "local":
    API_URL = "http://localhost:8080/v1/chat/completions"
    model = "ggml-gpt4all-j"
else:
    API_URL = "https://api.openai.com/v1/chat/completions"
    model = "gpt-3.5-turbo"

code_repo_path = "./code_repo"

init_system_prompt = """Now you are an expert programmer and teacher of a code repository.
You will be asked to explain the code for a specific task in the repo.
You will be provided with some code snippets or documents related to the question.
Please think through the explanation step-by-step.
Please answer the questions based on your knowledge, and you can also refer to the provided related code snippets.
The README.md file and the repo structure are also available for your reference.
If you need any details clarified, please ask questions until all issues are clarified.
\n\n
"""

system_prompt = init_system_prompt


def generate_response(system_msg, inputs, top_p, temperature, chat_counter, chatbot=[], history=[]):
    orig_inputs = inputs
    # Inputs are pre-processed with extra tools
    inputs = tool_planner.user_input_handler(inputs)
    print("Inputs Length: ", len(inputs))

    # Truncate the input so it fits in the GPT model's context window
    if llm_type == "local":
        token_limit = 2000
    else:
        token_limit = 8000
    if len(inputs) > token_limit:
        inputs = inputs[:token_limit]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }

    if system_msg.strip() == '':
        initial_message = [{"role": "user", "content": f"{inputs}"}]
        multi_turn_message = []
    else:
        initial_message = [{"role": "system", "content": system_msg},
                           {"role": "user", "content": f"{inputs}"}]
        multi_turn_message = [{"role": "system", "content": init_system_prompt}]

    if chat_counter == 0:
        payload = {
            "model": model,
            "messages": initial_message,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }
    else:
        # Rebuild the full conversation from the chatbot history
        messages = multi_turn_message
        for data in chatbot:
            user = {"role": "user", "content": data[0]}
            assistant = {"role": "assistant", "content": data[1]}
            messages.extend([user, assistant])
        temp = {"role": "user", "content": inputs}
        messages.append(temp)
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }

    chat_counter += 1
    history.append(orig_inputs)
    print(colored("Orig input from the user: ", "green"), colored(orig_inputs, "green"))
    print(colored("Input with tools: ", "blue"), colored(inputs, "blue"))

    response = requests.post(API_URL, headers=headers, json=payload, stream=True)
    token_counter = 0
    partial_words = ""
    response_complete = False
    counter = 0
    for chunk in response.iter_lines():
        # Skip the first chunk, which carries no response content
        if counter == 0:
            counter += 1
            continue
        if response_complete:
            print(colored("Response: ", "yellow"), colored(partial_words, "yellow"))
        if chunk.decode():
            chunk = chunk.decode()
            if chunk.startswith("error:"):
                print(colored("Chunk: ", "red"), colored(chunk, "red"))
            # Check if the chatbot is done generating the response
            try:
                if len(chunk) > 12 and "finish_reason" in json.loads(chunk[6:])['choices'][0]:
                    response_complete = json.loads(chunk[6:])['choices'][0].get("finish_reason", None) == "stop"
            except (json.JSONDecodeError, KeyError, IndexError):
                print("Error in response_complete check")
            try:
                if len(chunk) > 12 and "content" in json.loads(chunk[6:])['choices'][0]['delta']:
                    partial_words = partial_words + json.loads(chunk[6:])['choices'][0]["delta"]["content"]
                    if token_counter == 0:
                        history.append(" " + partial_words)
                    else:
                        history[-1] = partial_words
                    chat = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)]
                    token_counter += 1
                    yield chat, history, chat_counter
            except (json.JSONDecodeError, KeyError, IndexError):
                print("Error in partial_words check")


def reset_textbox():
    return gr.update(value='')


def set_visible_false():
    return gr.update(visible=False)


def set_visible_true():
    return gr.update(visible=True)


def analyze_repo(repo_url, progress=gr.Progress()):
    progress(0, desc="Starting")
    repo_information = clone_repo(repo_url, progress)
    progress(0.6, desc="Building Knowledge Base")
    generate_or_load_knowledge_from_repo()
    if repo_information is not None:
        return init_system_prompt + repo_information, "Analysis completed"
    else:
        return init_system_prompt, "Analysis failed"


def main():
    title = """<h1 align="center">GPT-Code-Learner</h1>"""
    system_msg_info = """A conversation could begin with a system message to gently instruct the assistant."""

    theme = gr.themes.Soft(text_size=gr.themes.sizes.text_md)

    with gr.Blocks(
            css="""#col_container { margin-left: auto; margin-right: auto;} #chatbot {height: 520px; overflow: auto;}""",
            theme=theme,
            title="GPT-Code-Learner",
    ) as demo:
        gr.HTML(title)
        with gr.Column(elem_id="col_container"):
            with gr.Accordion(label="System message:", open=False):
                system_msg = gr.Textbox(
                    label="Instruct the AI Assistant to set its behaviour",
                    info=system_msg_info,
                    value=system_prompt
                )
                accordion_msg = gr.HTML(
                    value="Refresh the app to reset system message",
                    visible=False
                )

            # Text box for the repo link, with a submit button
            with gr.Row():
                with gr.Column(scale=6):
                    repo_url = gr.Textbox(
                        placeholder="Repo Link",
                        lines=1,
                        label="Repo Link"
                    )
                with gr.Column(scale=2):
                    repo_link_btn = gr.Button("Analyze Code Repo").style(full_width=True)
                with gr.Column(scale=2):
                    analyze_progress = gr.Textbox(label="Status")
                repo_link_btn.click(analyze_repo, [repo_url], [system_msg, analyze_progress])

            with gr.Row():
                with gr.Column(scale=10):
                    chatbot = gr.Chatbot(
                        label='GPT-Code-Learner',
                        elem_id="chatbot"
                    )
            state = gr.State([])

            with gr.Row():
                with gr.Column(scale=8):
                    inputs = gr.Textbox(
                        placeholder="What questions do you have for the repo?",
                        lines=1,
                        label="Type an input and press Enter"
                    )
                with gr.Column(scale=2):
                    b1 = gr.Button().style(full_width=True)

            with gr.Accordion(label="Examples", open=True):
                gr.Examples(
                    examples=[
                        ["What is the usage of this repo?"],
                        ["Which function launches the application in the repo?"],
                    ],
                    inputs=inputs)

            with gr.Accordion("Parameters", open=False):
                top_p = gr.Slider(minimum=0, maximum=1.0, value=0.5, step=0.05, interactive=True,
                                  label="Top-p (nucleus sampling)", )
                temperature = gr.Slider(minimum=0, maximum=5.0, value=0.5, step=0.1, interactive=True,
                                        label="Temperature", )
                chat_counter = gr.Number(value=0, visible=True, precision=0)

        inputs.submit(generate_response,
                      [system_msg, inputs, top_p, temperature, chat_counter, chatbot, state],
                      [chatbot, state, chat_counter], )
        b1.click(generate_response,
                 [system_msg, inputs, top_p, temperature, chat_counter, chatbot, state],
                 [chatbot, state, chat_counter], )

        inputs.submit(set_visible_false, [], [system_msg])
        b1.click(set_visible_false, [], [system_msg])
        inputs.submit(set_visible_true, [], [accordion_msg])
        b1.click(set_visible_true, [], [accordion_msg])

        b1.click(reset_textbox, [], [inputs])
        inputs.submit(reset_textbox, [], [inputs])

    demo.queue(max_size=99, concurrency_count=20).launch(debug=True)


if __name__ == "__main__":
    main()
f"{function_name}", # The search pattern search_dir ] # Run the command and capture the output result = subprocess.run(command, capture_output=True, text=True) # Split the output by lines output_lines = result.stdout.splitlines() # Group the lines by occurrence occurrences = [] current_filename = None current_start_line = None current_lines = [] for line in output_lines: if line.startswith("--"): # This line separates occurrences if current_filename is not None: occurrences.append((current_filename, current_start_line, "\n".join(current_lines))) current_lines = [] else: current_filename, line_number, line_text = extract_grep_output(line) if function_name in line_text: current_start_line = line_number + ":" + line_text current_lines.append(line_text) # Add the last occurrence if there is one if current_filename is not None: occurrences.append((current_filename, current_start_line, "\n".join(current_lines))) return occurrences def get_function_context(function_name): results = search_function_with_context(function_name) output = "" for filename, start_line, context in results: output += f"Filename: {filename}\n" output += f"Start line: {start_line}\n" output += "Context:\n" output += context output += "\n\n" return output if __name__ == "__main__": function_name = "set_visible_true" results = search_function_with_context(function_name) for filename, start_line, context in results: print(f"Filename: {filename}") print(f"Start line: {start_line}") print("Context:") print(context) print() ================================================ FILE: docs/KnowledgeBase.md ================================================ # Knowledge Base GPT-Code-Learner supports using a knowledge base to answer questions. By default, it will use the codebase as the knowledge base. The knowledge base is powered by a vector database. GPT-Code-Learner supports two types of vector databases: local or cloud. By default, it will use the local version. The local version uses [FAISS](https://github.com/facebookresearch/faiss), while the cloud version utilizes [Supabase](https://app.supabase.com/). ## Supabase Setup For the Supabase version, create a Supabase account and project at https://app.supabase.com/sign-in. Next, add your Supabase URL and key to the `.env` file. You can find them in the portal under Project/API. ``` SUPABASE_URL=https://xxxxxx.supabase.co SUPABASE_KEY=xxxxxx ``` Create the default document table using the following SQL, which follows the format of the [langchain example](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/supabase.html). 
================================================
FILE: docs/KnowledgeBase.md
================================================
# Knowledge Base

GPT-Code-Learner supports using a knowledge base to answer questions. By default, it will use the codebase as the knowledge base.

The knowledge base is powered by a vector database. GPT-Code-Learner supports two types of vector databases: local or cloud. By default, it will use the local version. The local version uses [FAISS](https://github.com/facebookresearch/faiss), while the cloud version utilizes [Supabase](https://app.supabase.com/).

## Supabase Setup

For the Supabase version, create a Supabase account and project at https://app.supabase.com/sign-in. Next, add your Supabase URL and key to the `.env` file. You can find them in the portal under Project/API.

```
SUPABASE_URL=https://xxxxxx.supabase.co
SUPABASE_KEY=xxxxxx
```

Create the default document table using the following SQL, which follows the format of the [langchain example](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/supabase.html).

```postgresql
-- Enable the pgvector extension to work with embedding vectors
create extension vector;

-- Create a table to store your documents
create table documents (
    id bigserial primary key,
    content text,           -- corresponds to Document.pageContent
    metadata jsonb,         -- corresponds to Document.metadata
    embedding vector(1536)  -- 1536 works for OpenAI embeddings, change if needed
);

CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)
    RETURNS TABLE(
        id bigint,
        content text,
        metadata jsonb,
        -- we return matched vectors to enable maximal marginal relevance searches
        embedding vector(1536),
        similarity float)
    LANGUAGE plpgsql
AS $$
    #variable_conflict use_column
BEGIN
    RETURN query
    SELECT
        id,
        content,
        metadata,
        embedding,
        1 - (documents.embedding <=> query_embedding) AS similarity
    FROM documents
    ORDER BY documents.embedding <=> query_embedding
    LIMIT match_count;
END;
$$;
```

The [knowledge_base.py](..%2Fknowledge_base.py) file provides examples of how to use the knowledge base.
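Once the table and function exist, the `match_documents` RPC can also be exercised directly with `supabase-py` (a minimal sketch; it assumes the question is embedded with the same 1536-dimensional model used at indexing time):

```python
import os
from supabase import create_client
from langchain.embeddings.openai import OpenAIEmbeddings

supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])

# Embed the question with the same 1536-dimensional model used at indexing time
query_embedding = OpenAIEmbeddings().embed_query("What is the usage of this repo?")

response = supabase.rpc(
    "match_documents",
    {"query_embedding": query_embedding, "match_count": 4},
).execute()

for row in response.data:
    print(row["similarity"], row["content"][:80])
```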
================================================
FILE: docs/LocalLLM.md
================================================
# Using Local LLM Models

GPT-Code-Learner uses [LocalAI](https://github.com/go-skynet/LocalAI) to run the LLM models locally.

## Installation

Here are the general steps for installation on Mac. Please refer to [LocalAI](https://github.com/go-skynet/LocalAI) for more details.

```shell
# Install build dependencies
brew install cmake
brew install go

# Clone the repo
git clone https://github.com/go-skynet/LocalAI.git
cd LocalAI

# Build the binary
make build

# Download gpt4all-j to models/
wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

# Use a template from the examples
cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/

# Run LocalAI
./local-ai --models-path ./models/ --debug

# Now the API is accessible at localhost:8080
curl http://localhost:8080/v1/models

curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "ggml-gpt4all-j",
    "messages": [{"role": "user", "content": "How are you?"}],
    "temperature": 0.9
}'
```

## Running GPT-Code-Learner with Local LLM Models

Before running GPT-Code-Learner, please make sure LocalAI is running at localhost:8080.

```shell
./local-ai --models-path ./models/ --debug
```

Then, change the following line in the `.env` file:

```
LLM_TYPE="local"
```

Finally, run GPT-Code-Learner:

```
python run.py
```

## Known Issues

- The accuracy of the local LLM models is not as good as the online version. We are still working on improving their performance.
- The first message of a conversation is often blocked when using local LLM models. Restarting GPT-Code-Learner may resolve this issue.
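As a quick connectivity check, the OpenAI-compatible endpoint can also be exercised from Python (a sketch using the legacy `openai` 0.x interface that this project's code targets):

```python
import openai

openai.api_base = "http://localhost:8080/v1"
openai.api_key = "null"  # LocalAI does not validate the key

response = openai.ChatCompletion.create(
    model="ggml-gpt4all-j",
    messages=[{"role": "user", "content": "How are you?"}],
    temperature=0.9,
)
print(response["choices"][0]["message"]["content"])
```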
os.environ.get("OPENAI_API_KEY", "null") query = "What is the usage of this repo?" files = ["./README.md"] urls = ["https://github.com/JinghaoZhao/GPT-Code-Learner"] known_docs = load_documents(files) known_pages, metadatas = load_urls(urls) knowledge_base = {"known_docs": known_docs, "known_text": {"pages": known_pages, "metadatas": metadatas}} faiss_store = local_vdb(knowledge_base) matched_docs = faiss_store.similarity_search(query) for doc in matched_docs: print("------------------------\n", doc) supabase_store = supabase_vdb(knowledge_base) matched_docs = supabase_store.similarity_search(query) for doc in matched_docs: print("------------------------\n", doc) chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=faiss_store) result = chain({"question": query}) print("FAISS result", result) chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=supabase_store) result = chain({"question": query}) print("Supabase result", result) ================================================ FILE: repo_parser.py ================================================ import os import json import openai from termcolor import colored from dotenv import load_dotenv, find_dotenv from knowledge_base import load_documents, load_code_chunks, supabase_vdb, local_vdb, load_local_vdb from collections import deque from pathlib import Path import util import subprocess import gradio as gr def clone_repo(git_url, progress=gr.Progress(), code_repo_path="./code_repo"): print(progress(0.1, desc="Cloning the repo...")) print("Cloning the repo: ", git_url) # Check if directory exists if not os.path.exists(code_repo_path): os.makedirs(code_repo_path) try: subprocess.check_call(['git', 'clone', git_url], cwd=code_repo_path) print(f"Successfully cloned {git_url} into {code_repo_path}") except subprocess.CalledProcessError as e: print(f"Error: {e.output}") print(progress(0.3, desc="Summarizing the repo...")) readme_info = get_readme(code_repo_path) if readme_info is not None: readme_info = """The README.md file is as follows: """ + readme_info + "\n\n" print(progress(0.4, desc="Parsing repo structure...")) repo_structure = get_repo_structure(code_repo_path) if repo_structure is not None: repo_structure = """The repo structure is as follows: """ + get_repo_structure(code_repo_path) + "\n\n" return readme_info + repo_structure def generate_knowledge_from_repo(dir_path, ignore_list): knowledge = {"known_docs": [], "known_text": {"pages": [], "metadatas": []}} for root, dirs, files in os.walk(dir_path): dirs[:] = [d for d in dirs if d not in ignore_list] # modify dirs in-place for file in files: if file in ignore_list: continue filepath = os.path.join(root, file) try: # Using a more general way for code file parsing knowledge["known_docs"].extend(load_documents([filepath])) except Exception as e: print(f"Failed to process {filepath} due to error: {str(e)}") return knowledge # Find the Readme.md file from the code repo in the code_repo folder def find_repo_folder(directory): # Find the name of the folder in the specified directory folder_name = None for item in os.listdir(directory): item_path = os.path.join(directory, item) if os.path.isdir(item_path): folder_name = item break return os.path.join(directory, folder_name) def find_readme(repo_folder): # Search for the README file within the found folder for filename in os.listdir(repo_folder): if filename.lower().startswith('readme'): readme_path = os.path.join(repo_folder, filename) print("README found in folder:", repo_folder) 
================================================
FILE: repo_parser.py
================================================
import os
import json
import openai
from termcolor import colored
from dotenv import load_dotenv, find_dotenv
from knowledge_base import load_documents, load_code_chunks, supabase_vdb, local_vdb, load_local_vdb
from collections import deque
from pathlib import Path
import util
import subprocess
import gradio as gr


def clone_repo(git_url, progress=gr.Progress(), code_repo_path="./code_repo"):
    print(progress(0.1, desc="Cloning the repo..."))
    print("Cloning the repo: ", git_url)
    # Check if the directory exists
    if not os.path.exists(code_repo_path):
        os.makedirs(code_repo_path)
    try:
        subprocess.check_call(['git', 'clone', git_url], cwd=code_repo_path)
        print(f"Successfully cloned {git_url} into {code_repo_path}")
    except subprocess.CalledProcessError as e:
        print(f"Error: {e.output}")

    print(progress(0.3, desc="Summarizing the repo..."))
    readme_info = get_readme(code_repo_path)
    if readme_info is not None:
        readme_info = """The README.md file is as follows: """ + readme_info + "\n\n"

    print(progress(0.4, desc="Parsing repo structure..."))
    repo_structure = get_repo_structure(code_repo_path)
    if repo_structure is not None:
        repo_structure = """The repo structure is as follows: """ + repo_structure + "\n\n"

    return readme_info + repo_structure


def generate_knowledge_from_repo(dir_path, ignore_list):
    knowledge = {"known_docs": [], "known_text": {"pages": [], "metadatas": []}}
    for root, dirs, files in os.walk(dir_path):
        dirs[:] = [d for d in dirs if d not in ignore_list]  # modify dirs in-place
        for file in files:
            if file in ignore_list:
                continue
            filepath = os.path.join(root, file)
            try:
                # Use the general document loader for code file parsing
                knowledge["known_docs"].extend(load_documents([filepath]))
            except Exception as e:
                print(f"Failed to process {filepath} due to error: {str(e)}")
    return knowledge


# Find the README file of the repo cloned into the code_repo folder
def find_repo_folder(directory):
    # Find the name of the first folder in the specified directory
    folder_name = None
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isdir(item_path):
            folder_name = item
            break
    return os.path.join(directory, folder_name)


def find_readme(repo_folder):
    # Search for the README file within the found folder
    for filename in os.listdir(repo_folder):
        if filename.lower().startswith('readme'):
            readme_path = os.path.join(repo_folder, filename)
            print("README found in folder:", repo_folder)
            return readme_path
    print("README not found in folder:", repo_folder)
    return None


# Summarize the README file
def summarize_readme(readme_path):
    if readme_path:
        print(colored("Summarizing README...", "green"))
        system_prompt = """You are an expert developer and programmer.
        Please infer the programming languages from the README.
        You are asked to summarize the README file of the code repository in detail.
        Provide enough information about the code repository.
        Please also mention the framework used in the code repository.
        """
        with open(readme_path, "r") as f:
            readme_content = f.read()
        user_prompt = f'Here is the README content: {readme_content}'
        return util.get_chat_response(system_prompt, user_prompt)


def bfs_folder_search(text_length_limit=4000, folder_path="./code_repo"):
    if not Path(folder_path).is_dir():
        return "Invalid directory path"

    root = Path(folder_path).resolve()
    file_structure = {str(root): {}}
    queue = deque([(root, file_structure[str(root)])])

    while queue:
        current_dir, parent_node = queue.popleft()
        try:
            for path in current_dir.iterdir():
                if path.is_dir():
                    if str(path.name) == ".git":
                        continue
                    parent_node[str(path.name)] = {"files": []}
                    queue.append((path, parent_node[str(path.name)]))
                else:
                    if "files" not in parent_node:
                        parent_node["files"] = []
                    parent_node["files"].append(str(path.name))

                # Check if we've exceeded the text length limit
                file_structure_text = json.dumps(file_structure)
                if len(file_structure_text) >= text_length_limit:
                    return file_structure_text
        except PermissionError:
            # This can happen for directories the user doesn't have permission to read
            continue

    return json.dumps(file_structure)


def get_readme(code_repo_path="./code_repo"):
    repo_folder = find_repo_folder(code_repo_path)
    print(colored("Repo folder: " + repo_folder, "green"))
    readme_path = find_readme(repo_folder)
    if readme_path is None:
        return "README not found"
    else:
        summary = summarize_readme(readme_path)
        print(colored("README Summary: ", "green"), colored(summary, "green"))
        return summary


def get_repo_structure(code_repo_path="./code_repo"):
    return bfs_folder_search(4000, code_repo_path)


def get_repo_names(dir_path):
    folder_names = [name for name in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, name))]
    concatenated_names = "-".join(folder_names)
    return concatenated_names


def generate_or_load_knowledge_from_repo(dir_path="./code_repo"):
    vdb_path = "./vdb-" + get_repo_names(dir_path) + ".pkl"
    # Check if vdb_path exists
    if os.path.isfile(vdb_path):
        print(colored("Local VDB found! Loading VDB from file...", "green"))
        vdb = load_local_vdb(vdb_path)
    else:
        print(colored("Generating VDB from repo...", "green"))
        ignore_list = ['.git', 'node_modules', '__pycache__', '.idea', '.vscode']
        knowledge = generate_knowledge_from_repo(dir_path, ignore_list)
        vdb = local_vdb(knowledge, vdb_path=vdb_path)
        print(colored("VDB generated!", "green"))
    return vdb


def get_repo_context(query, vdb):
    matched_docs = vdb.similarity_search(query, k=10)
    output = ""
    for idx, docs in enumerate(matched_docs):
        output += f"Context {idx}:\n"
        output += str(docs)
        output += "\n\n"
    return output


if __name__ == '__main__':
    code_repo_path = "./code_repo"
    load_dotenv(find_dotenv())
    openai.api_key = os.environ.get("OPENAI_API_KEY", "null")
    print(get_repo_names(code_repo_path))

    # Basic repo information
    get_readme(code_repo_path)
    print(colored(bfs_folder_search(4000, code_repo_path), "yellow"))

    # Generate knowledge base
    vdb = generate_or_load_knowledge_from_repo("./code_repo")

    # Search the knowledge base
    query = "How to use the knowledge base?"
    context = get_repo_context(query, vdb)
    print(context)
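For reference, `bfs_folder_search` walks the tree breadth-first, so a result truncated at the length limit still covers every top-level folder. For a freshly cloned repo, the JSON looks roughly like this (hypothetical, abridged):

```json
{"/abs/path/code_repo": {"GPT-Code-Learner": {"files": ["README.md", "run.py"], "docs": {"files": ["KnowledgeBase.md", "LocalLLM.md"]}}}}
```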
================================================
FILE: requirements.txt
================================================
python-dotenv
hupper
gradio
termcolor
openai
tenacity
supabase
langchain
tiktoken
beautifulsoup4
faiss-cpu
pypdf
chardet
sentence-transformers

================================================
FILE: run.py
================================================
import hupper
from dotenv import load_dotenv, find_dotenv

if __name__ == '__main__':
    load_dotenv(find_dotenv())
    # Start the app with hot reloading: hupper re-runs code_learner.main()
    # whenever a source file changes.
    reloader = hupper.start_reloader('code_learner.main')

================================================
FILE: tool_planner.py
================================================
from code_searcher import get_function_context
from repo_parser import generate_or_load_knowledge_from_repo, get_repo_context
from termcolor import colored
import util


def tool_selection(input):
    system_prompt = """You are an expert developer and programmer. """
    user_prompt = """
    You need to act as a tool recommender according to the user's questions.
    You are given a user question about the code repository.
    Choose one of the following tools to help you answer the question.
    Your answer should be the name of the tool. No other words or symbols are allowed.
    The tools are defined as follows:

    - Code_Searcher: This module is designed to search for specific keywords in a code repository that are derived from a user's query. It is particularly beneficial when the user's question pertains to particular functions or variables. As an illustration, this tool could answer queries such as "How do I utilize the function named 'extract_function_name'?" or "How should I apply the function 'def supabase_vdb()'?".

    - Repo_Parser: This module conducts a fuzzy search within a code repository, offering context for inquiries concerning general procedures and operations in the repository. The inquiries may be high-level, potentially involving multiple source code files and documents. For instance, this tool could handle queries like "Which function is in charge of processing incoming messages?" or "How does the code manage the knowledge base?".

    - No_Tool: This is the default module that comes into play when the user's query doesn't have a direct connection to the code repository or when other tools can't provide a suitable answer. This module is particularly useful for handling generic programming queries that aren't specific to the codebase in question. For instance, it could address questions like "How is the 'asyncio' library used in Python?" or "Can you explain the workings of smart pointers in C++?".

    Below are some example questions and answers:
    - Question: How to use the function extract_function_name?
    - Code_Searcher
    - Question: How to use the function def supabase_vdb(knowledge_base):?
    - Code_Searcher
    - Question: How to create a knowledge base?
    - Repo_Parser
    - Question: How to use the knowledge base?
    - Repo_Parser
    - Question: How does this repo generate the UI interface?
    - Repo_Parser
    - Question: How to use Text Splitters in this repo?
    - Repo_Parser
    - Question: How to use the python asyncio library?
    - No_Tool
    """ + f'Here is the user input: {input}'
    return util.get_chat_response(system_prompt, user_prompt)


def extract_function_name(input):
    system_prompt = """You are an expert developer and programmer. """
    user_prompt = """
    You will handle user questions about the code repository.
    Please extract the function or variable name that appears in the question.
    Only respond with the single name, without parameters or any other words.
    If both function and variable names are mentioned, only extract the function name.
    Below are some examples:
    - Question: How to use the function extract_function_name?
    - Answer: extract_function_name
    - Question: How to use the function def supabase_vdb(query, knowledge_base):?
    - Answer: supabase_vdb
    - Question: What is the usage of vdb?
    - Answer: vdb
    """ + f'Here is the user input: {input}'
    return util.get_chat_response(system_prompt, user_prompt)


def user_input_handler(input):
    tool = tool_selection(input)
    print(colored(f"Tool selected: {tool}", "green"))
    if tool == "Code_Searcher":
        # Extract the function or variable name from the input
        function_name = extract_function_name(input)
        print(function_name)
        if function_name:
            # Search the function with context
            context = get_function_context(function_name)
            prompt = input + "\n\n" + \
                     f"Here are some contexts of the function or variable {function_name}: \n\n" + context
            return prompt
    elif tool == "Repo_Parser":
        vdb = generate_or_load_knowledge_from_repo()
        context = get_repo_context(input, vdb)
        prompt = input + "\n\n" + \
                 f"Here are some contexts about the question, ranked by relevance to the question: \n\n" + context
        return prompt
    else:
        print("No tool is selected.")
    # Fall back to the raw input if no tool produced a prompt
    return input


if __name__ == "__main__":
    # results = user_input_handler("What is the usage of the function traffic_interval?")
    # print(results)
    results = user_input_handler("How to build a knowledge base?")
    print(results)
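The selection prompt above is expected to return a bare tool name, which `user_input_handler` compares against string literals. A sketch of the round trip (outputs depend on the LLM following the prompt):

```python
from tool_planner import tool_selection, extract_function_name

print(tool_selection("How to use the function get_repo_context?"))          # expected: Code_Searcher
print(extract_function_name("How to use the function get_repo_context?"))   # expected: get_repo_context
print(tool_selection("How does this repo generate the UI interface?"))      # expected: Repo_Parser
```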
================================================
FILE: util.py
================================================
import os
from dotenv import load_dotenv, find_dotenv
from termcolor import colored
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

load_dotenv(find_dotenv())


def get_chat_response(system_prompt, user_prompt):
    # By default, use the local LLM
    llm_type = os.environ.get('LLM_TYPE', "local")
    if llm_type == "local":
        return get_local_llm_response(system_prompt, user_prompt)
    else:
        return get_openai_response(system_prompt, user_prompt)


def get_local_llm_response(system_prompt, user_prompt, model="ggml-gpt4all-j", temperature=0.9):
    base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')
    model_name = os.environ.get('MODEL_NAME', model)
    llm = OpenAI(temperature=temperature,
                 openai_api_base=base_path,
                 model_name=model_name,
                 openai_api_key="null")
    text = system_prompt + "\n\n" + user_prompt + "\n\n"
    response = llm(text)
    print(response)
    return response


def get_openai_response(system_prompt, user_prompt, model="gpt-3.5-turbo", temperature=0):
    chat = ChatOpenAI(model_name=model, temperature=temperature)
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_prompt)
    ]
    response = chat(messages)
    print(response)
    return response.content
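`util.get_chat_response` is the single entry point the tools use for LLM calls, so it can be exercised on its own (a sketch, assuming `.env` provides an API key or a local model is running):

```python
import util

answer = util.get_chat_response(
    "You are an expert developer and programmer.",
    "In one sentence, what does a vector database do?",
)
print(answer)
```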