Repository: cmooredev/RepoReader
Branch: main
Commit: 9982adac8aa3
Files: 9
Total size: 11.3 KB
Directory structure:
gitextract_x5yhfpf7/
├── .gitignore
├── README.md
├── app.py
├── config.py
├── file_processing.py
├── main.py
├── questions.py
├── requirements.txt
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.env
env
__pycache__
================================================
FILE: README.md
================================================
# Code Repository Explorer
Explore and ask questions about a GitHub code repository using an OpenAI language model.
## Prerequisites
- Python 3.6+
- OpenAI API key (set in the environment variable `OPENAI_API_KEY`)
## Usage
1. Set the OpenAI API key as an environment variable `OPENAI_API_KEY`.
2. Run the script: `python app.py` (or call `main()` from your own code, as sketched below).
3. Enter the GitHub URL of the repository to explore.
4. Ask questions about the repository. Type `exit()` to quit.
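
You can also drive the explorer from another Python script or a notebook instead of the command line. A minimal sketch, assuming the dependencies from `requirements.txt` are installed and the script runs from the repository root (the key value shown is a placeholder, not a real key):

```python
# Hypothetical programmatic entry point; equivalent to running `python app.py`.
import os

# main.py reads OPENAI_API_KEY at import time (via load_dotenv/os.getenv),
# so the variable must be set before importing it.
os.environ.setdefault("OPENAI_API_KEY", "sk-your-key-here")  # placeholder

from main import main

main()  # prompts for a GitHub URL, then starts the interactive question loop
```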
## Key Features
- Clones and indexes the contents of a GitHub repository.
- Supports various file types, including code, text, and Jupyter Notebook files.
- Generates detailed answers to user queries based on the repository's contents.
- Uses OpenAI's language model for generating responses.
- Supports interactive conversation with the language model.
- Presents the top relevant documents for each question (see the pipeline sketch below).
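
The indexing and search building blocks can also be reused on their own. The sketch below strings together this repository's `clone_github_repo`, `load_and_index_files`, and `search_documents` functions; the URL and the query are placeholders, and it assumes the project's dependencies are installed:

```python
# Hypothetical standalone use of the retrieval pipeline (no LLM call).
import tempfile

from file_processing import clone_github_repo, load_and_index_files, search_documents

github_url = "https://github.com/cmooredev/RepoReader"  # placeholder URL
query = "How are files indexed?"                        # placeholder question

with tempfile.TemporaryDirectory() as local_path:
    if clone_github_repo(github_url, local_path):
        index, documents, file_type_counts, filenames = load_and_index_files(local_path)
        if index is not None:
            # Combined BM25 + TF-IDF cosine-similarity ranking over the chunked files
            for doc in search_documents(query, index, documents, n_results=5):
                print(doc.metadata["source"])
```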
================================================
FILE: app.py
================================================
# app.py
from main import main
if __name__ == "__main__":
    main()
================================================
FILE: config.py
================================================
#config.py
WHITE = "\033[37m"
GREEN = "\033[32m"
RESET_COLOR = "\033[0m"
model_name = "gpt-3.5-turbo"
================================================
FILE: file_processing.py
================================================
# file_processing.py
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False

def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id
                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']
        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)

    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]

def search_documents(query, index, documents, n_results=5):
    query_tokens = clean_and_tokenize(query)
    bm25_scores = index.get_scores(query_tokens)

    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute Cosine Similarity scores
    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine BM25 and Cosine Similarity scores
    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5

    # argsort indices are already unique, so take the top n directly to preserve the ranking
    top_document_indices = combined_scores.argsort()[::-1][:n_results]

    return [documents[i] for i in top_document_indices]
================================================
FILE: main.py
================================================
#main.py
import os
import tempfile
from dotenv import load_dotenv
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI
from config import WHITE, GREEN, RESET_COLOR, model_name
from utils import format_user_question
from file_processing import clone_github_repo, load_and_index_files
from questions import ask_question, QuestionContext
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
def main():
    github_url = input("Enter the GitHub URL of the repository: ")
    repo_name = github_url.split("/")[-1]
    print("Cloning the repository...")
    with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            index, documents, file_type_counts, filenames = load_and_index_files(local_path)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            print("Repository cloned. Indexing files...")
            llm = OpenAI(api_key=OPENAI_API_KEY, temperature=0.2)

            template = """
            Repo: {repo_name} ({github_url}) | Conv: {conversation_history} | Docs: {numbered_documents} | Q: {question} | FileCount: {file_type_counts} | FileNames: {filenames}
            Instr:
            1. Answer based on context/docs.
            2. Focus on repo/code.
            3. Consider:
               a. Purpose/features - describe.
               b. Functions/code - provide details/samples.
               c. Setup/usage - give instructions.
            4. Unsure? Say "I am not sure".
            Answer:
            """

            prompt = PromptTemplate(
                template=template,
                input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"]
            )

            llm_chain = LLMChain(prompt=prompt, llm=llm)

            conversation_history = ""
            question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames)

            while True:
                try:
                    user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR)
                    if user_question.lower() == "exit()":
                        break
                    print('Thinking...')
                    user_question = format_user_question(user_question)

                    answer = ask_question(user_question, question_context)
                    print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n')
                    conversation_history += f"Question: {user_question}\nAnswer: {answer}\n"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    break
        else:
            print("Failed to clone the repository.")
================================================
FILE: questions.py
================================================
# questions.py
from utils import format_documents
from file_processing import search_documents
class QuestionContext:
    def __init__(self, index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames):
        self.index = index
        self.documents = documents
        self.llm_chain = llm_chain
        self.model_name = model_name
        self.repo_name = repo_name
        self.github_url = github_url
        self.conversation_history = conversation_history
        self.file_type_counts = file_type_counts
        self.filenames = filenames

def ask_question(question, context: QuestionContext):
    relevant_docs = search_documents(question, context.index, context.documents, n_results=5)

    numbered_documents = format_documents(relevant_docs)
    question_context = f"This question is about the GitHub repository '{context.repo_name}' available at {context.github_url}. The most relevant documents are:\n\n{numbered_documents}"

    answer_with_sources = context.llm_chain.run(
        model=context.model_name,
        question=question,
        context=question_context,
        repo_name=context.repo_name,
        github_url=context.github_url,
        conversation_history=context.conversation_history,
        numbered_documents=numbered_documents,
        file_type_counts=context.file_type_counts,
        filenames=context.filenames
    )
    return answer_with_sources
================================================
FILE: requirements.txt
================================================
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
argilla==1.6.0
async-timeout==4.0.2
attrs==22.2.0
backoff==2.2.1
certifi==2022.12.7
charset-normalizer==3.1.0
chromadb==0.3.21
click==8.1.3
clickhouse-connect==0.5.20
commonmark==0.9.1
dataclasses-json==0.5.7
Deprecated==1.2.13
duckdb==0.7.1
et-xmlfile==1.1.0
fastapi==0.95.0
filelock==3.11.0
frozenlist==1.3.3
h11==0.14.0
hnswlib==0.7.0
httpcore==0.16.3
httptools==0.5.0
httpx==0.23.3
huggingface-hub==0.13.4
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.136
lxml==4.9.2
lz4==4.3.2
Markdown==3.4.3
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
openai==0.27.4
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
posthog==2.5.0
pydantic==1.10.7
Pygments==2.15.0
pypandoc==1.11
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-magic==0.4.27
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
rank-bm25==0.2.2
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.47
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
tokenizers==0.13.3
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
unstructured==0.5.11
urllib3==1.26.15
uvicorn==0.21.1
uvloop==0.17.0
watchfiles==0.19.0
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zstandard==0.20.0
================================================
FILE: utils.py
================================================
#utils.py
import re
import nltk
import os
nltk.download("punkt")
def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\b(?:http|ftp)s?://\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return nltk.word_tokenize(text)

def format_documents(documents):
    numbered_docs = "\n".join([f"{i+1}. {os.path.basename(doc.metadata['source'])}: {doc.page_content}" for i, doc in enumerate(documents)])
    return numbered_docs

def format_user_question(question):
    question = re.sub(r'\s+', ' ', question).strip()
    return question
SYMBOL INDEX (10 symbols across 4 files)
FILE: file_processing.py
function clone_github_repo (line 12) | def clone_github_repo(github_url, local_path):
function load_and_index_files (line 20) | def load_and_index_files(repo_path):
function search_documents (line 67) | def search_documents(query, index, documents, n_results=5):
FILE: main.py
function main (line 15) | def main():
FILE: questions.py
class QuestionContext (line 5) | class QuestionContext:
method __init__ (line 6) | def __init__(self, index, documents, llm_chain, model_name, repo_name,...
function ask_question (line 17) | def ask_question(question, context: QuestionContext):
FILE: utils.py
function clean_and_tokenize (line 8) | def clean_and_tokenize(text):
function format_documents (line 19) | def format_documents(documents):
function format_user_question (line 23) | def format_user_question(question):