Repository: cmooredev/RepoReader
Branch: main
Commit: 9982adac8aa3
Files: 9
Total size: 11.3 KB

Directory structure:
gitextract_x5yhfpf7/
├── .gitignore
├── README.md
├── app.py
├── config.py
├── file_processing.py
├── main.py
├── questions.py
├── requirements.txt
└── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.env
env
__pycache__

================================================
FILE: README.md
================================================
# Code Repository Explorer

Explore and ask questions about a GitHub code repository using OpenAI's GPT-3 language model.

## Prerequisites

- Python 3.6+
- OpenAI API key (set in the environment variable `OPENAI_API_KEY`)

## Usage

1. Set the OpenAI API key as the environment variable `OPENAI_API_KEY`.
2. Run the script: `python app.py`
3. Enter the GitHub URL of the repository to explore.
4. Ask questions about the repository. Type `exit()` to quit.

## Key Features

- Clones and indexes the contents of a GitHub repository.
- Supports various file types, including code, text, and Jupyter Notebook files.
- Generates detailed answers to user queries based on the repository's contents.
- Uses OpenAI's language model for generating responses.
- Supports interactive conversation with the language model.
- Presents the top relevant documents for each question.

================================================
FILE: app.py
================================================
# app.py
from main import main

if __name__ == "__main__":
    main()

================================================
FILE: config.py
================================================
# config.py
WHITE = "\033[37m"
GREEN = "\033[32m"
RESET_COLOR = "\033[0m"
model_name = "gpt-3.5-turbo"

================================================
FILE: file_processing.py
================================================
# file_processing.py
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize


def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False


def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala',
                  'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash',
                  'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]


def search_documents(query, index, documents, n_results=5):
    query_tokens = clean_and_tokenize(query)
    bm25_scores = index.get_scores(query_tokens)

    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute Cosine Similarity scores
    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine BM25 and Cosine Similarity scores
    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5

    # Get the top documents by combined score (argsort indices are already unique,
    # so no deduplication step is needed and the ranking order is preserved)
    top_document_indices = combined_scores.argsort()[::-1][:n_results]

    return [documents[i] for i in top_document_indices]
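Note: the hybrid retrieval in search_documents above blends lexical BM25 scores with TF-IDF cosine similarity at equal weight and returns the highest-scoring chunks. The following is a minimal, self-contained sketch of that scoring scheme, not part of the repository; the toy corpus is invented and a plain whitespace tokenizer stands in for the project's clean_and_tokenize.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = [
    "def clone_github_repo(url, path): runs git clone",            # toy stand-ins
    "BM25 ranks documents by term frequency and document length",  # for indexed chunks
    "TF-IDF cosine similarity compares sparse term-weight vectors",
]
tokenize = str.split  # simple whitespace tokenizer (the project uses clean_and_tokenize)

# Lexical index over the tokenized corpus
bm25 = BM25Okapi([tokenize(doc.lower()) for doc in corpus])

query = "how does bm25 ranking work"
bm25_scores = bm25.get_scores(tokenize(query))

# TF-IDF vectors for the corpus and the query, then cosine similarity
vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(corpus)
query_vec = vectorizer.transform([query])
cosine_scores = cosine_similarity(query_vec, doc_matrix).flatten()

# Equal-weight blend, then take the indices of the highest combined scores
combined = 0.5 * bm25_scores + 0.5 * cosine_scores
top_indices = combined.argsort()[::-1][:2]
print([corpus[i] for i in top_indices])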
================================================
FILE: main.py
================================================
# main.py
import os
import tempfile

from dotenv import load_dotenv
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI

from config import WHITE, GREEN, RESET_COLOR, model_name
from utils import format_user_question
from file_processing import clone_github_repo, load_and_index_files
from questions import ask_question, QuestionContext

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


def main():
    github_url = input("Enter the GitHub URL of the repository: ")
    repo_name = github_url.split("/")[-1]
    print("Cloning the repository...")
    with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            print("Repository cloned. Indexing files...")
            index, documents, file_type_counts, filenames = load_and_index_files(local_path)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            llm = OpenAI(api_key=OPENAI_API_KEY, temperature=0.2)

            template = """
            Repo: {repo_name} ({github_url}) | Conv: {conversation_history} | Docs: {numbered_documents} | Q: {question} | FileCount: {file_type_counts} | FileNames: {filenames}

            Instr:
            1. Answer based on context/docs.
            2. Focus on repo/code.
            3. Consider:
                a. Purpose/features - describe.
                b. Functions/code - provide details/samples.
                c. Setup/usage - give instructions.
            4. Unsure? Say "I am not sure".

            Answer:
            """

            prompt = PromptTemplate(
                template=template,
                input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"]
            )

            llm_chain = LLMChain(prompt=prompt, llm=llm)

            conversation_history = ""
            question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames)
            while True:
                try:
                    user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR)
                    if user_question.lower() == "exit()":
                        break
                    print('Thinking...')
                    user_question = format_user_question(user_question)

                    answer = ask_question(user_question, question_context)
                    print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n')
                    conversation_history += f"Question: {user_question}\nAnswer: {answer}\n"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    break
        else:
            print("Failed to clone the repository.")
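Note: main.py packs the repository metadata, conversation history, and retrieved document chunks into a single prompt template. A minimal sketch of how langchain's PromptTemplate fills in those variables before the chain calls the model, not part of the repository; it uses a shortened stand-in template and makes no API call.

from langchain import PromptTemplate

# Shortened stand-in for the template defined in main.py
template = "Repo: {repo_name} ({github_url}) | Q: {question}\nAnswer:"

prompt = PromptTemplate(
    template=template,
    input_variables=["repo_name", "github_url", "question"],
)

# LLMChain performs this formatting step internally before sending the prompt to the LLM
print(prompt.format(
    repo_name="RepoReader",
    github_url="https://github.com/cmooredev/RepoReader",
    question="What does load_and_index_files return?",
))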
Answer: """ prompt = PromptTemplate( template=template, input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"] ) llm_chain = LLMChain(prompt=prompt, llm=llm) conversation_history = "" question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames) while True: try: user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR) if user_question.lower() == "exit()": break print('Thinking...') user_question = format_user_question(user_question) answer = ask_question(user_question, question_context) print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n') conversation_history += f"Question: {user_question}\nAnswer: {answer}\n" except Exception as e: print(f"An error occurred: {e}") break else: print("Failed to clone the repository.") ================================================ FILE: questions.py ================================================ # questions.py from utils import format_documents from file_processing import search_documents class QuestionContext: def __init__(self, index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames): self.index = index self.documents = documents self.llm_chain = llm_chain self.model_name = model_name self.repo_name = repo_name self.github_url = github_url self.conversation_history = conversation_history self.file_type_counts = file_type_counts self.filenames = filenames def ask_question(question, context: QuestionContext): relevant_docs = search_documents(question, context.index, context.documents, n_results=5) numbered_documents = format_documents(relevant_docs) question_context = f"This question is about the GitHub repository '{context.repo_name}' available at {context.github_url}. 
================================================
FILE: requirements.txt
================================================
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
argilla==1.6.0
async-timeout==4.0.2
attrs==22.2.0
backoff==2.2.1
certifi==2022.12.7
charset-normalizer==3.1.0
chromadb==0.3.21
click==8.1.3
clickhouse-connect==0.5.20
commonmark==0.9.1
dataclasses-json==0.5.7
Deprecated==1.2.13
duckdb==0.7.1
et-xmlfile==1.1.0
fastapi==0.95.0
filelock==3.11.0
frozenlist==1.3.3
h11==0.14.0
hnswlib==0.7.0
httpcore==0.16.3
httptools==0.5.0
httpx==0.23.3
huggingface-hub==0.13.4
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.136
lxml==4.9.2
lz4==4.3.2
Markdown==3.4.3
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
openai==0.27.4
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
posthog==2.5.0
pydantic==1.10.7
Pygments==2.15.0
pypandoc==1.11
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-magic==0.4.27
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
rank-bm25==0.2.2
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.47
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
tokenizers==0.13.3
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
unstructured==0.5.11
urllib3==1.26.15
uvicorn==0.21.1
uvloop==0.17.0
watchfiles==0.19.0
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zstandard==0.20.0

================================================
FILE: utils.py
================================================
# utils.py
import re
import nltk
import os

nltk.download("punkt")


def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\b(?:http|ftp)s?://\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return nltk.word_tokenize(text)


def format_documents(documents):
    numbered_docs = "\n".join([f"{i+1}. {os.path.basename(doc.metadata['source'])}: {doc.page_content}" for i, doc in enumerate(documents)])
    return numbered_docs


def format_user_question(question):
    question = re.sub(r'\s+', ' ', question).strip()
    return question
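Note: clean_and_tokenize above strips HTML tags, bracketed and parenthesised spans, URLs, punctuation, and digits before tokenizing with NLTK. A small illustrative example of what survives that pipeline, not part of the repository; the sample string is invented and the import assumes the repository root is on the Python path.

from utils import clean_and_tokenize

text = "Check <b>this</b> out: https://example.com [link] (note) v2.0!"
print(clean_and_tokenize(text))
# -> roughly ['check', 'this', 'out', 'v']: markup, the URL, bracketed text,
#    punctuation, and digits are all removed before nltk.word_tokenize runs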