[
  {
    "path": ".gitignore",
    "content": ".env\nenv\n__pycache__"
  },
  {
    "path": "README.md",
    "content": "# Code Repository Explorer\n\nExplore and ask questions about a GitHub code repository using OpenAI's GPT-3 language model.\n\n## Prerequisites\n\n- Python 3.6+\n- OpenAI API key (set in the environment variable `OPENAI_API_KEY`)\n\n## Usage\n1. Set the OpenAI API key as an environment variable `OPENAI_API_KEY`.\n2. Run the script: `reporeader.py`\n3. Enter the GitHub URL of the repository to explore.\n4. Ask questions about the repository. Type `exit()` to quit.\n\n## Key Features\n- Clones and indexes the contents of a GitHub repository.\n- Supports various file types, including code, text, and Jupyter Notebook files.\n- Generates detailed answers to user queries based on the repository's contents.\n- Uses OpenAI's language model for generating responses.\n- Supports interactive conversation with the language model.\n- Presents top relevant documents for each question.\n"
  },
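  {
    "path": ".env.example",
    "content": "# .env.example (illustrative sketch; not part of the original file list).\n# Copy this file to .env and fill in your key; main.py loads it via python-dotenv.\n# The placeholder value below is an assumption, not a real key.\nOPENAI_API_KEY=your-openai-api-key-here\n"
  },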
  {
    "path": "app.py",
    "content": "# app.py\nfrom main import main\n\nif __name__ == \"__main__\":\n    main()"
  },
  {
    "path": "config.py",
    "content": "#config.py\nWHITE = \"\\033[37m\"\nGREEN = \"\\033[32m\"\nRESET_COLOR = \"\\033[0m\"\nmodel_name = \"gpt-3.5-turbo\"\n"
  },
  {
    "path": "file_processing.py",
    "content": "# file_processing.py\nimport os\nimport uuid\nimport subprocess\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom rank_bm25 import BM25Okapi\nfrom langchain.document_loaders import DirectoryLoader, NotebookLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom utils import clean_and_tokenize\n\ndef clone_github_repo(github_url, local_path):\n    try:\n        subprocess.run(['git', 'clone', github_url, local_path], check=True)\n        return True\n    except subprocess.CalledProcessError as e:\n        print(f\"Failed to clone repository: {e}\")\n        return False\n\ndef load_and_index_files(repo_path):\n    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']\n\n    file_type_counts = {}\n    documents_dict = {}\n\n    for ext in extensions:\n        glob_pattern = f'**/*.{ext}'\n        try:\n            loader = None\n            if ext == 'ipynb':\n                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)\n            else:\n                loader = DirectoryLoader(repo_path, glob=glob_pattern)\n\n            loaded_documents = loader.load() if callable(loader.load) else []\n            if loaded_documents:\n                file_type_counts[ext] = len(loaded_documents)\n                for doc in loaded_documents:\n                    file_path = doc.metadata['source']\n                    relative_path = os.path.relpath(file_path, repo_path)\n                    file_id = str(uuid.uuid4())\n                    doc.metadata['source'] = relative_path\n                    doc.metadata['file_id'] = file_id\n\n                    documents_dict[file_id] = doc\n        except Exception as e:\n            print(f\"Error loading files with pattern '{glob_pattern}': {e}\")\n            continue\n\n    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)\n\n    split_documents = []\n    for file_id, original_doc in documents_dict.items():\n        split_docs = text_splitter.split_documents([original_doc])\n        for split_doc in split_docs:\n            split_doc.metadata['file_id'] = original_doc.metadata['file_id']\n            split_doc.metadata['source'] = original_doc.metadata['source']\n\n        split_documents.extend(split_docs)\n\n    index = None\n    if split_documents:\n        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]\n        index = BM25Okapi(tokenized_documents)\n    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]\n\ndef search_documents(query, index, documents, n_results=5):\n    query_tokens = clean_and_tokenize(query)\n    bm25_scores = index.get_scores(query_tokens)\n\n    # Compute TF-IDF scores\n    tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, lowercase=True, stop_words='english', use_idf=True, smooth_idf=True, sublinear_tf=True)\n    tfidf_matrix = tfidf_vectorizer.fit_transform([doc.page_content for doc in documents])\n    query_tfidf = tfidf_vectorizer.transform([query])\n\n    # Compute Cosine Similarity scores\n    cosine_sim_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()\n\n    # Combine BM25 and Cosine 
Similarity scores\n    combined_scores = bm25_scores * 0.5 + cosine_sim_scores * 0.5\n\n    # Get unique top documents\n    unique_top_document_indices = list(set(combined_scores.argsort()[::-1]))[:n_results]\n\n    return [documents[i] for i in unique_top_document_indices]\n"
  },
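  {
    "path": "examples/search_example.py",
    "content": "# examples/search_example.py\n# A minimal sketch (added for illustration; not part of the original tool)\n# showing how the helpers in file_processing.py fit together: clone a repo,\n# build the BM25 index, and retrieve the chunks most relevant to a query.\n# The repository URL and the query string below are placeholder assumptions.\nimport sys\nimport tempfile\n\nfrom file_processing import clone_github_repo, load_and_index_files, search_documents\n\ngithub_url = \"https://github.com/octocat/Hello-World\"  # placeholder repo\nquery = \"What does this project do?\"  # placeholder query\n\nwith tempfile.TemporaryDirectory() as local_path:\n    if not clone_github_repo(github_url, local_path):\n        sys.exit(\"Clone failed.\")\n    index, documents, file_type_counts, filenames = load_and_index_files(local_path)\n    if index is None:\n        sys.exit(\"Nothing to index.\")\n    # Print the sources of the top-ranked chunks for the query.\n    for rank, doc in enumerate(search_documents(query, index, documents, n_results=3), start=1):\n        print(f\"{rank}. {doc.metadata['source']}\")\n"
  },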
  {
    "path": "main.py",
    "content": "#main.py\nimport os\nimport tempfile\nfrom dotenv import load_dotenv\nfrom langchain import PromptTemplate, LLMChain\nfrom langchain.llms import OpenAI\nfrom config import WHITE, GREEN, RESET_COLOR, model_name\nfrom utils import format_user_question\nfrom file_processing import clone_github_repo, load_and_index_files\nfrom questions import ask_question, QuestionContext\n\nload_dotenv()\nOPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n\ndef main():\n    github_url = input(\"Enter the GitHub URL of the repository: \")\n    repo_name = github_url.split(\"/\")[-1]\n    print(\"Cloning the repository...\")\n    with tempfile.TemporaryDirectory() as local_path:\n        if clone_github_repo(github_url, local_path):\n            index, documents, file_type_counts, filenames = load_and_index_files(local_path)\n            if index is None:\n                print(\"No documents were found to index. Exiting.\")\n                exit()\n\n            print(\"Repository cloned. Indexing files...\")\n            llm = OpenAI(api_key=OPENAI_API_KEY, temperature=0.2)\n\n            template = \"\"\"\n            Repo: {repo_name} ({github_url}) | Conv: {conversation_history} | Docs: {numbered_documents} | Q: {question} | FileCount: {file_type_counts} | FileNames: {filenames}\n\n            Instr:\n            1. Answer based on context/docs.\n            2. Focus on repo/code.\n            3. Consider:\n                a. Purpose/features - describe.\n                b. Functions/code - provide details/samples.\n                c. Setup/usage - give instructions.\n            4. Unsure? Say \"I am not sure\".\n\n            Answer:\n            \"\"\"\n\n            prompt = PromptTemplate(\n                template=template,\n                input_variables=[\"repo_name\", \"github_url\", \"conversation_history\", \"question\", \"numbered_documents\", \"file_type_counts\", \"filenames\"]\n            )\n\n            llm_chain = LLMChain(prompt=prompt, llm=llm)\n\n            conversation_history = \"\"\n            question_context = QuestionContext(index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames)\n            while True:\n                try:\n                    user_question = input(\"\\n\" + WHITE + \"Ask a question about the repository (type 'exit()' to quit): \" + RESET_COLOR)\n                    if user_question.lower() == \"exit()\":\n                        break\n                    print('Thinking...')\n                    user_question = format_user_question(user_question)\n\n                    answer = ask_question(user_question, question_context)\n                    print(GREEN + '\\nANSWER\\n' + answer + RESET_COLOR + '\\n')\n                    conversation_history += f\"Question: {user_question}\\nAnswer: {answer}\\n\"\n                except Exception as e:\n                    print(f\"An error occurred: {e}\")\n                    break\n\n        else:\n            print(\"Failed to clone the repository.\")\n"
  },
  {
    "path": "questions.py",
    "content": "# questions.py\nfrom utils import format_documents\nfrom file_processing import search_documents\n\nclass QuestionContext:\n    def __init__(self, index, documents, llm_chain, model_name, repo_name, github_url, conversation_history, file_type_counts, filenames):\n        self.index = index\n        self.documents = documents\n        self.llm_chain = llm_chain\n        self.model_name = model_name\n        self.repo_name = repo_name\n        self.github_url = github_url\n        self.conversation_history = conversation_history\n        self.file_type_counts = file_type_counts\n        self.filenames = filenames\n\ndef ask_question(question, context: QuestionContext):\n    relevant_docs = search_documents(question, context.index, context.documents, n_results=5)\n\n    numbered_documents = format_documents(relevant_docs)\n    question_context = f\"This question is about the GitHub repository '{context.repo_name}' available at {context.github_url}. The most relevant documents are:\\n\\n{numbered_documents}\"\n\n    answer_with_sources = context.llm_chain.run(\n        model=context.model_name,\n        question=question,\n        context=question_context,\n        repo_name=context.repo_name,\n        github_url=context.github_url,\n        conversation_history=context.conversation_history,\n        numbered_documents=numbered_documents,\n        file_type_counts=context.file_type_counts,\n        filenames=context.filenames\n    )\n    return answer_with_sources\n"
  },
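  {
    "path": "examples/tokenize_example.py",
    "content": "# examples/tokenize_example.py\n# A small sketch (added for illustration; not part of the original tool)\n# demonstrating the text cleaning performed by utils.clean_and_tokenize:\n# HTML tags, URLs, digits, and punctuation are stripped before tokenizing.\n# The sample string below is a made-up input.\nfrom utils import clean_and_tokenize\n\nsample = \"Visit <b>https://example.com</b> for 3 great demos!\"\nprint(clean_and_tokenize(sample))  # e.g. ['visit', 'for', 'great', 'demos']\n"
  },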
  {
    "path": "requirements.txt",
    "content": "aiohttp==3.8.4\naiosignal==1.3.1\nanyio==3.6.2\nargilla==1.6.0\nasync-timeout==4.0.2\nattrs==22.2.0\nbackoff==2.2.1\ncertifi==2022.12.7\ncharset-normalizer==3.1.0\nchromadb==0.3.21\nclick==8.1.3\nclickhouse-connect==0.5.20\ncommonmark==0.9.1\ndataclasses-json==0.5.7\nDeprecated==1.2.13\nduckdb==0.7.1\net-xmlfile==1.1.0\nfastapi==0.95.0\nfilelock==3.11.0\nfrozenlist==1.3.3\nh11==0.14.0\nhnswlib==0.7.0\nhttpcore==0.16.3\nhttptools==0.5.0\nhttpx==0.23.3\nhuggingface-hub==0.13.4\nidna==3.4\nJinja2==3.1.2\njoblib==1.2.0\nlangchain==0.0.136\nlxml==4.9.2\nlz4==4.3.2\nMarkdown==3.4.3\nMarkupSafe==2.1.2\nmarshmallow==3.19.0\nmarshmallow-enum==1.5.1\nmonotonic==1.6\nmpmath==1.3.0\nmsg-parser==1.2.0\nmultidict==6.0.4\nmypy-extensions==1.0.0\nnetworkx==3.1\nnltk==3.8.1\nnumpy==1.23.5\nolefile==0.46\nopenai==0.27.4\nopenapi-schema-pydantic==1.2.4\nopenpyxl==3.1.2\npackaging==23.0\npandas==1.5.3\nPillow==9.5.0\nposthog==2.5.0\npydantic==1.10.7\nPygments==2.15.0\npypandoc==1.11\npython-dateutil==2.8.2\npython-docx==0.8.11\npython-dotenv==1.0.0\npython-magic==0.4.27\npython-pptx==0.6.21\npytz==2023.3\nPyYAML==6.0\nrank-bm25==0.2.2\nregex==2023.3.23\nrequests==2.28.2\nrfc3986==1.5.0\nrich==13.0.1\nscikit-learn==1.2.2\nscipy==1.10.1\nsentence-transformers==2.2.2\nsentencepiece==0.1.97\nsix==1.16.0\nsniffio==1.3.0\nSQLAlchemy==1.4.47\nstarlette==0.26.1\nsympy==1.11.1\ntenacity==8.2.2\nthreadpoolctl==3.1.0\ntiktoken==0.3.3\ntokenizers==0.13.3\ntorch==2.0.0\ntorchvision==0.15.1\ntqdm==4.65.0\ntransformers==4.27.4\ntyping-inspect==0.8.0\ntyping_extensions==4.5.0\ntzdata==2023.3\nunstructured==0.5.11\nurllib3==1.26.15\nuvicorn==0.21.1\nuvloop==0.17.0\nwatchfiles==0.19.0\nwebsockets==11.0.1\nwrapt==1.14.1\nXlsxWriter==3.0.9\nyarl==1.8.2\nzstandard==0.20.0\n"
  },
  {
    "path": "utils.py",
    "content": "#utils.py\nimport re\nimport nltk\nimport os\n\nnltk.download(\"punkt\")\n\ndef clean_and_tokenize(text):\n    text = re.sub(r'\\s+', ' ', text)\n    text = re.sub(r'<[^>]*>', '', text)\n    text = re.sub(r'\\[.*?\\]', '', text)\n    text = re.sub(r'\\(.*?\\)', '', text)\n    text = re.sub(r'\\b(?:http|ftp)s?://\\S+', '', text)\n    text = re.sub(r'\\W', ' ', text)\n    text = re.sub(r'\\d+', '', text)\n    text = text.lower()\n    return nltk.word_tokenize(text)\n\ndef format_documents(documents):\n    numbered_docs = \"\\n\".join([f\"{i+1}. {os.path.basename(doc.metadata['source'])}: {doc.page_content}\" for i, doc in enumerate(documents)])\n    return numbered_docs\n\ndef format_user_question(question):\n    question = re.sub(r'\\s+', ' ', question).strip()\n    return question\n"
  }
]