Repository: cmooredev/RepoReader
Branch: main
Commit: 9982adac8aa3
Files: 9
Total size: 11.3 KB

Directory structure:
gitextract_x5yhfpf7/
├── .gitignore
├── README.md
├── app.py
├── config.py
├── file_processing.py
├── main.py
├── questions.py
├── requirements.txt
└── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.env
env
__pycache__

================================================
FILE: README.md
================================================
# Code Repository Explorer

Explore and ask questions about a GitHub code repository using OpenAI's GPT-3 language model.

## Prerequisites

- Python 3.6+
- OpenAI API key (set in the environment variable `OPENAI_API_KEY`)

## Usage

1. Set the OpenAI API key as the environment variable `OPENAI_API_KEY`.
2. Run the script: `python app.py`
3. Enter the GitHub URL of the repository to explore.
4. Ask questions about the repository. Type `exit()` to quit.

## Key Features

- Clones and indexes the contents of a GitHub repository.
- Supports various file types, including code, text, and Jupyter Notebook files.
- Generates detailed answers to user queries based on the repository's contents.
- Uses OpenAI's language model for generating responses.
- Supports interactive conversation with the language model.
- Presents the top relevant documents for each question.

================================================
FILE: app.py
================================================
# app.py
from main import main

if __name__ == "__main__":
    main()

================================================
FILE: config.py
================================================
# config.py
WHITE = "\033[37m"
GREEN = "\033[32m"
RESET_COLOR = "\033[0m"
model_name = "gpt-3.5-turbo"

================================================
FILE: file_processing.py
================================================
# file_processing.py
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize


def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False


def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala',
                  'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash',
                  'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]


def search_documents(query, index, documents, n_results=5):
    query_tokens = clean_and_tokenize(query)
    bm25_scores = index.get_scores(query_tokens)

    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute Cosine Similarity scores
    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine BM25 and Cosine Similarity scores
    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5

    # Get the top documents by combined score (argsort indices are already unique,
    # so no deduplication step is needed and the ranking order is preserved)
    top_document_indices = combined_scores.argsort()[::-1][:n_results]

    return [documents[i] for i in top_document_indices]
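Note: the hybrid retrieval in search_documents above blends lexical BM25 scores with TF-IDF cosine similarity at equal weight and returns the highest-scoring chunks. The following is a minimal, self-contained sketch of that scoring scheme, not part of the repository; the toy corpus is invented and a plain whitespace tokenizer stands in for the project's clean_and_tokenize.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = [
    "def clone_github_repo(url, path): runs git clone",            # toy stand-ins
    "BM25 ranks documents by term frequency and document length",  # for indexed chunks
    "TF-IDF cosine similarity compares sparse term-weight vectors",
]
tokenize = str.split  # simple whitespace tokenizer (the project uses clean_and_tokenize)

# Lexical index over the tokenized corpus
bm25 = BM25Okapi([tokenize(doc.lower()) for doc in corpus])

query = "how does bm25 ranking work"
bm25_scores = bm25.get_scores(tokenize(query))

# TF-IDF vectors for the corpus and the query, then cosine similarity
vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(corpus)
query_vec = vectorizer.transform([query])
cosine_scores = cosine_similarity(query_vec, doc_matrix).flatten()

# Equal-weight blend, then take the indices of the highest combined scores
combined = 0.5 * bm25_scores + 0.5 * cosine_scores
top_indices = combined.argsort()[::-1][:2]
print([corpus[i] for i in top_indices])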
================================================
FILE: main.py
================================================
# main.py
import os
import tempfile

from dotenv import load_dotenv
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI

from config import WHITE, GREEN, RESET_COLOR, model_name
from utils import format_user_question
from file_processing import clone_github_repo, load_and_index_files
from questions import ask_question, QuestionContext

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def main():
    github_url = input("Enter the GitHub URL of the repository: ")
    repo_name = github_url.split("/")[-1]
    print("Cloning the repository...")
    with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            print("Repository cloned. Indexing files...")
            index, documents, file_type_counts, filenames = load_and_index_files(local_path)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            llm = OpenAI(api_key=OPENAI_API_KEY, temperature=0.2)

            template = """
            Repo: {repo_name} ({github_url}) | Conv: {conversation_history} | Docs: {numbered_documents} | Q: {question} | FileCount: {file_type_counts} | FileNames: {filenames}

            Instr:
            1. Answer based on context/docs.
            2. Focus on repo/code.
            3. Consider:
                a. Purpose/features - describe.
                b. Functions/code - provide details/samples.
                c. Setup/usage - give instructions.
            4. Unsure? Say "I am not sure".

            Answer:
            """

            prompt = PromptTemplate(
                template=template,
                input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"]
            )

            llm_chain = LLMChain(prompt=prompt, llm=llm)

            conversation_history = ""
            question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames)
            while True:
                try:
                    user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR)
                    if user_question.lower() == "exit()":
                        break
                    print('Thinking...')
                    user_question = format_user_question(user_question)

                    answer = ask_question(user_question, question_context)
                    print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n')
                    conversation_history += f"Question: {user_question}\nAnswer: {answer}\n"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    break
        else:
            print("Failed to clone the repository.")
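Note: main.py packs the repository metadata, conversation history, and retrieved document chunks into a single prompt template. A minimal sketch of how langchain's PromptTemplate fills in those variables before the chain calls the model, not part of the repository; it uses a shortened stand-in template and makes no API call.

from langchain import PromptTemplate

# Shortened stand-in for the template defined in main.py
template = "Repo: {repo_name} ({github_url}) | Q: {question}\nAnswer:"

prompt = PromptTemplate(
    template=template,
    input_variables=["repo_name", "github_url", "question"],
)

# LLMChain performs this formatting step internally before sending the prompt to the LLM
print(prompt.format(
    repo_name="RepoReader",
    github_url="https://github.com/cmooredev/RepoReader",
    question="What does load_and_index_files return?",
))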
Answer: """ prompt = PromptTemplate( template=template, input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"] ) llm_chain = LLMChain(prompt=prompt, llm=llm) conversation_history = "" question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames) while True: try: user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR) if user_question.lower() == "exit()": break print('Thinking...') user_question = format_user_question(user_question) answer = ask_question(user_question, question_context) print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n') conversation_history += f"Question: {user_question}\nAnswer: {answer}\n" except Exception as e: print(f"An error occurred: {e}") break else: print("Failed to clone the repository.") ================================================ FILE: questions.py ================================================ # questions.py from utils import format_documents from file_processing import search_documents class QuestionContext: def __init__(self, index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames): self.index = index self.documents = documents self.llm_chain = llm_chain self.model_name = model_name self.repo_name = repo_name self.github_url = github_url self.conversation_history = conversation_history self.file_type_counts = file_type_counts self.filenames = filenames def ask_question(question, context: QuestionContext): relevant_docs = search_documents(question, context.index, context.documents, n_results=5) numbered_documents = format_documents(relevant_docs) question_context = f"This question is about the GitHub repository '{context.repo_name}' available at {context.github_url}. 
================================================
FILE: requirements.txt
================================================
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
argilla==1.6.0
async-timeout==4.0.2
attrs==22.2.0
backoff==2.2.1
certifi==2022.12.7
charset-normalizer==3.1.0
chromadb==0.3.21
click==8.1.3
clickhouse-connect==0.5.20
commonmark==0.9.1
dataclasses-json==0.5.7
Deprecated==1.2.13
duckdb==0.7.1
et-xmlfile==1.1.0
fastapi==0.95.0
filelock==3.11.0
frozenlist==1.3.3
h11==0.14.0
hnswlib==0.7.0
httpcore==0.16.3
httptools==0.5.0
httpx==0.23.3
huggingface-hub==0.13.4
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.136
lxml==4.9.2
lz4==4.3.2
Markdown==3.4.3
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
openai==0.27.4
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
posthog==2.5.0
pydantic==1.10.7
Pygments==2.15.0
pypandoc==1.11
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-magic==0.4.27
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
rank-bm25==0.2.2
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.47
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
tokenizers==0.13.3
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
unstructured==0.5.11
urllib3==1.26.15
uvicorn==0.21.1
uvloop==0.17.0
watchfiles==0.19.0
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zstandard==0.20.0

================================================
FILE: utils.py
================================================
# utils.py
import re
import nltk
import os

nltk.download("punkt")


def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\b(?:http|ftp)s?://\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return nltk.word_tokenize(text)


def format_documents(documents):
    numbered_docs = "\n".join([f"{i+1}. {os.path.basename(doc.metadata['source'])}: {doc.page_content}" for i, doc in enumerate(documents)])
    return numbered_docs


def format_user_question(question):
    question = re.sub(r'\s+', ' ', question).strip()
    return question
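Note: clean_and_tokenize above strips HTML tags, bracketed and parenthesised spans, URLs, punctuation, and digits before tokenizing with NLTK. A small illustrative example of what survives that pipeline, not part of the repository; the sample string is invented and the import assumes the repository root is on the Python path.

from utils import clean_and_tokenize

text = "Check <b>this</b> out: https://example.com [link] (note) v2.0!"
print(clean_and_tokenize(text))
# -> roughly ['check', 'this', 'out', 'v']: markup, the URL, bracketed text,
#    punctuation, and digits are all removed before nltk.word_tokenize runs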