Repository: cmooredev/RepoReader
Branch: main
Commit: 9982adac8aa3
Files: 9
Total size: 11.3 KB
Directory structure:
gitextract_x5yhfpf7/
├── .gitignore
├── README.md
├── app.py
├── config.py
├── file_processing.py
├── main.py
├── questions.py
├── requirements.txt
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.env
env
__pycache__
================================================
FILE: README.md
================================================
# Code Repository Explorer
Explore and ask questions about a GitHub code repository using an OpenAI language model.
## Prerequisites
- Python 3.6+
- OpenAI API key (set in the environment variable `OPENAI_API_KEY`)
## Usage
1. Set the OpenAI API key as an environment variable `OPENAI_API_KEY`.
2. Run the script: `python app.py` (or call `main()` from your own code, as sketched below).
3. Enter the GitHub URL of the repository to explore.
4. Ask questions about the repository. Type `exit()` to quit.
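
You can also drive the explorer from another Python script or a notebook instead of the command line. A minimal sketch, assuming the dependencies from `requirements.txt` are installed and the script runs from the repository root (the key value shown is a placeholder, not a real key):

```python
# Hypothetical programmatic entry point; equivalent to running `python app.py`.
import os

# main.py reads OPENAI_API_KEY at import time (via load_dotenv/os.getenv),
# so the variable must be set before importing it.
os.environ.setdefault("OPENAI_API_KEY", "sk-your-key-here")  # placeholder

from main import main

main()  # prompts for a GitHub URL, then starts the interactive question loop
```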
## Key Features
- Clones and indexes the contents of a GitHub repository.
- Supports various file types, including code, text, and Jupyter Notebook files.
- Generates detailed answers to user queries based on the repository's contents.
- Uses OpenAI's language model for generating responses.
- Supports interactive conversation with the language model.
- Presents the top relevant documents for each question (see the pipeline sketch below).
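
The indexing and search building blocks can also be reused on their own. The sketch below strings together this repository's `clone_github_repo`, `load_and_index_files`, and `search_documents` functions; the URL and the query are placeholders, and it assumes the project's dependencies are installed:

```python
# Hypothetical standalone use of the retrieval pipeline (no LLM call).
import tempfile

from file_processing import clone_github_repo, load_and_index_files, search_documents

github_url = "https://github.com/cmooredev/RepoReader"  # placeholder URL
query = "How are files indexed?"                        # placeholder question

with tempfile.TemporaryDirectory() as local_path:
    if clone_github_repo(github_url, local_path):
        index, documents, file_type_counts, filenames = load_and_index_files(local_path)
        if index is not None:
            # Combined BM25 + TF-IDF cosine-similarity ranking over the chunked files
            for doc in search_documents(query, index, documents, n_results=5):
                print(doc.metadata["source"])
```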
================================================
FILE: app.py
================================================
# app.py
from main import main
if __name__ == "__main__":
    main()
================================================
FILE: config.py
================================================
#config.py
WHITE = "\033[37m"
GREEN = "\033[32m"
RESET_COLOR = "\033[0m"
model_name = "gpt-3.5-turbo"
================================================
FILE: file_processing.py
================================================
# file_processing.py
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False

def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id
                    documents_dict[file_id] = doc
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']
        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)

    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]

def search_documents(query, index, documents, n_results=5):
    query_tokens = clean_and_tokenize(query)
    bm25_scores = index.get_scores(query_tokens)

    # Compute TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute Cosine Similarity scores
    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine BM25 and Cosine Similarity scores
    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5

    # argsort indices are already unique, so take the top n directly to preserve the ranking
    top_document_indices = combined_scores.argsort()[::-1][:n_results]

    return [documents[i] for i in top_document_indices]
================================================
FILE: main.py
================================================
#main.py
import os
import tempfile
from dotenv import load_dotenv
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI
from config import WHITE, GREEN, RESET_COLOR, model_name
from utils import format_user_question
from file_processing import clone_github_repo, load_and_index_files
from questions import ask_question, QuestionContext
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
def main():
    github_url = input("Enter the GitHub URL of the repository: ")
    repo_name = github_url.split("/")[-1]
    print("Cloning the repository...")
    with tempfile.TemporaryDirectory() as local_path:
        if clone_github_repo(github_url, local_path):
            index, documents, file_type_counts, filenames = load_and_index_files(local_path)
            if index is None:
                print("No documents were found to index. Exiting.")
                exit()

            print("Repository cloned. Indexing files...")
            llm = OpenAI(api_key=OPENAI_API_KEY, temperature=0.2)

            template = """
            Repo: {repo_name} ({github_url}) | Conv: {conversation_history} | Docs: {numbered_documents} | Q: {question} | FileCount: {file_type_counts} | FileNames: {filenames}
            Instr:
            1. Answer based on context/docs.
            2. Focus on repo/code.
            3. Consider:
               a. Purpose/features - describe.
               b. Functions/code - provide details/samples.
               c. Setup/usage - give instructions.
            4. Unsure? Say "I am not sure".
            Answer:
            """

            prompt = PromptTemplate(
                template=template,
                input_variables=["repo_name", "github_url", "conversation_history", "question", "numbered_documents", "file_type_counts", "filenames"]
            )

            llm_chain = LLMChain(prompt=prompt, llm=llm)

            conversation_history = ""
            question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames)

            while True:
                try:
                    user_question = input("\n" + WHITE + "Ask a question about the repository (type 'exit()' to quit): " + RESET_COLOR)
                    if user_question.lower() == "exit()":
                        break
                    print('Thinking...')
                    user_question = format_user_question(user_question)

                    answer = ask_question(user_question, question_context)
                    print(GREEN + '\nANSWER\n' + answer + RESET_COLOR + '\n')
                    conversation_history += f"Question: {user_question}\nAnswer: {answer}\n"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    break
        else:
            print("Failed to clone the repository.")
================================================
FILE: questions.py
================================================
# questions.py
from utils import format_documents
from file_processing import search_documents
class QuestionContext:
    def __init__(self, index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames):
        self.index = index
        self.documents = documents
        self.llm_chain = llm_chain
        self.model_name = model_name
        self.repo_name = repo_name
        self.github_url = github_url
        self.conversation_history = conversation_history
        self.file_type_counts = file_type_counts
        self.filenames = filenames

def ask_question(question, context: QuestionContext):
    relevant_docs = search_documents(question, context.index, context.documents, n_results=5)

    numbered_documents = format_documents(relevant_docs)
    question_context = f"This question is about the GitHub repository '{context.repo_name}' available at {context.github_url}. The most relevant documents are:\n\n{numbered_documents}"

    answer_with_sources = context.llm_chain.run(
        model=context.model_name,
        question=question,
        context=question_context,
        repo_name=context.repo_name,
        github_url=context.github_url,
        conversation_history=context.conversation_history,
        numbered_documents=numbered_documents,
        file_type_counts=context.file_type_counts,
        filenames=context.filenames
    )
    return answer_with_sources
================================================
FILE: requirements.txt
================================================
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
argilla==1.6.0
async-timeout==4.0.2
attrs==22.2.0
backoff==2.2.1
certifi==2022.12.7
charset-normalizer==3.1.0
chromadb==0.3.21
click==8.1.3
clickhouse-connect==0.5.20
commonmark==0.9.1
dataclasses-json==0.5.7
Deprecated==1.2.13
duckdb==0.7.1
et-xmlfile==1.1.0
fastapi==0.95.0
filelock==3.11.0
frozenlist==1.3.3
h11==0.14.0
hnswlib==0.7.0
httpcore==0.16.3
httptools==0.5.0
httpx==0.23.3
huggingface-hub==0.13.4
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.136
lxml==4.9.2
lz4==4.3.2
Markdown==3.4.3
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
openai==0.27.4
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
posthog==2.5.0
pydantic==1.10.7
Pygments==2.15.0
pypandoc==1.11
python-dateutil==2.8.2
python-docx==0.8.11
python-dotenv==1.0.0
python-magic==0.4.27
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
rank-bm25==0.2.2
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.47
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
tokenizers==0.13.3
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
tzdata==2023.3
unstructured==0.5.11
urllib3==1.26.15
uvicorn==0.21.1
uvloop==0.17.0
watchfiles==0.19.0
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zstandard==0.20.0
================================================
FILE: utils.py
================================================
#utils.py
import re
import nltk
import os
nltk.download("punkt")
def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\b(?:http|ftp)s?://\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return nltk.word_tokenize(text)

def format_documents(documents):
    numbered_docs = "\n".join([f"{i+1}. {os.path.basename(doc.metadata['source'])}: {doc.page_content}" for i, doc in enumerate(documents)])
    return numbered_docs

def format_user_question(question):
    question = re.sub(r'\s+', ' ', question).strip()
    return question
SYMBOL INDEX (10 symbols across 4 files)
FILE: file_processing.py
function clone_github_repo (line 12) | def clone_github_repo(github_url, local_path):
function load_and_index_files (line 20) | def load_and_index_files(repo_path):
function search_documents (line 67) | def search_documents(query, index, documents, n_results=5):
FILE: main.py
function main (line 15) | def main():
FILE: questions.py
class QuestionContext (line 5) | class QuestionContext:
method __init__ (line 6) | def __init__(self, index, documents, llm_chain, model_name, repo_name,...
function ask_question (line 17) | def ask_question(question, context: QuestionContext):
FILE: utils.py
function clean_and_tokenize (line 8) | def clean_and_tokenize(text):
function format_documents (line 19) | def format_documents(documents):
function format_user_question (line 23) | def format_user_question(question):