Repository: coderamp-labs/gitingest Branch: main Commit: 4e259a02fe72 Files: 110 Total size: 391.9 KB Directory structure: gitextract_380b_654/ ├── .docker/ │ └── minio/ │ └── setup.sh ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ └── feature_request.yml │ └── workflows/ │ ├── ci.yml │ ├── codeql.yml │ ├── dependency-review.yml │ ├── deploy-pr.yml │ ├── docker-build.ecr.yml │ ├── docker-build.ghcr.yml │ ├── pr-title-check.yml │ ├── publish_to_pypi.yml │ ├── rebase-needed.yml │ ├── release-please.yml │ ├── scorecard.yml │ └── stale.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .release-please-manifest.json ├── .vscode/ │ └── launch.json ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── compose.yml ├── eslint.config.cjs ├── pyproject.toml ├── release-please-config.json ├── renovate.json ├── requirements-dev.txt ├── requirements.txt ├── src/ │ ├── gitingest/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── clone.py │ │ ├── config.py │ │ ├── entrypoint.py │ │ ├── ingestion.py │ │ ├── output_formatter.py │ │ ├── query_parser.py │ │ ├── schemas/ │ │ │ ├── __init__.py │ │ │ ├── cloning.py │ │ │ ├── filesystem.py │ │ │ └── ingestion.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── compat_func.py │ │ ├── compat_typing.py │ │ ├── exceptions.py │ │ ├── file_utils.py │ │ ├── git_utils.py │ │ ├── ignore_patterns.py │ │ ├── ingestion_utils.py │ │ ├── logging_config.py │ │ ├── notebook.py │ │ ├── os_utils.py │ │ ├── pattern_utils.py │ │ ├── query_parser_utils.py │ │ └── timeout_wrapper.py │ ├── server/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── form_types.py │ │ ├── main.py │ │ ├── metrics_server.py │ │ ├── models.py │ │ ├── query_processor.py │ │ ├── routers/ │ │ │ ├── __init__.py │ │ │ ├── dynamic.py │ │ │ ├── index.py │ │ │ └── ingest.py │ │ ├── routers_utils.py │ │ ├── s3_utils.py │ │ ├── server_config.py │ │ ├── server_utils.py │ │ └── templates/ │ │ ├── base.jinja │ │ ├── components/ │ │ │ ├── _macros.jinja │ │ │ ├── footer.jinja │ │ │ ├── git_form.jinja │ │ │ ├── navbar.jinja │ │ │ ├── result.jinja │ │ │ └── tailwind_components.html │ │ ├── git.jinja │ │ ├── index.jinja │ │ └── swagger_ui.jinja │ └── static/ │ ├── js/ │ │ ├── git.js │ │ ├── git_form.js │ │ ├── index.js │ │ ├── navbar.js │ │ ├── posthog.js │ │ └── utils.js │ ├── llms.txt │ └── robots.txt └── tests/ ├── .pylintrc ├── __init__.py ├── conftest.py ├── query_parser/ │ ├── __init__.py │ ├── test_git_host_agnostic.py │ └── test_query_parser.py ├── server/ │ ├── __init__.py │ └── test_flow_integration.py ├── test_cli.py ├── test_clone.py ├── test_git_utils.py ├── test_gitignore_feature.py ├── test_ingestion.py ├── test_notebook_utils.py ├── test_pattern_utils.py └── test_summary.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .docker/minio/setup.sh ================================================ #!/bin/sh # Simple script to set up MinIO bucket and user # Based on example from MinIO issues # Format bucket name to ensure compatibility BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') # Configure MinIO client mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} # Remove bucket if it exists (for clean setup) mc rm -r --force myminio/${BUCKET_NAME} || true # Create bucket mc mb myminio/${BUCKET_NAME} # Set bucket policy to allow downloads mc anonymous set download myminio/${BUCKET_NAME} # Create user with access and secret keys mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists" # Create policy for the bucket echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json # Apply policy mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists" mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY} echo "MinIO setup completed successfully" echo "Bucket: ${BUCKET_NAME}" echo "Access via console: http://localhost:9001" ================================================ FILE: .dockerignore ================================================ # ------------------------------------------------- # Base: reuse patterns from .gitignore # ------------------------------------------------- # Operating-system .DS_Store Thumbs.db # Editor / IDE settings .vscode/ !.vscode/launch.json .idea/ *.swp # Python virtual-envs & tooling .venv*/ .python-version __pycache__/ *.egg-info/ *.egg .ruff_cache/ # Test artifacts & coverage .pytest_cache/ .coverage coverage.xml htmlcov/ # Build, distribution & docs build/ dist/ *.wheel # Logs & runtime output *.log logs/ *.tmp tmp/ # Project-specific files history.txt digest.txt # ------------------------------------------------- # Extra for Docker # ------------------------------------------------- # Git history .git/ .gitignore # Tests tests/ # Docs docs/ *.md LICENSE # Local overrides & secrets .env # Docker files .dockerignore Dockerfile* # ------------------------------------------------- # Files required during build # ------------------------------------------------- !pyproject.toml !src/ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Bug report 🐞 description: Report a bug or internal server error when using Gitingest title: "(bug): " labels: ["bug"] body: - type: markdown attributes: value: | Thanks for taking the time to report a bug! :lady_beetle: Please fill out the following details to help us reproduce and fix the issue. :point_down: - type: dropdown id: interface attributes: label: Which interface did you use? default: 0 options: - "Select one..." - Web UI - CLI - PyPI package validations: required: true - type: input id: repo_url attributes: label: Repository URL (if public) placeholder: e.g., https://github.com///commit_branch_or_tag/blob_or_tree/subdir - type: dropdown id: git_host attributes: label: Git host description: The Git host of the repository. default: 0 options: - "Select one..." - GitHub (github.com) - GitLab (gitlab.com) - Bitbucket (bitbucket.org) - Gitea (gitea.com) - Codeberg (codeberg.org) - Gist (gist.github.com) - Kaggle (kaggle.com) - GitHub Enterprise (github.company.com) - Other (specify below) validations: required: true - type: input id: git_host_other attributes: label: Other Git host placeholder: If you selected "Other", please specify the Git host here. - type: dropdown id: repo_visibility attributes: label: Repository visibility default: 0 options: - "Select one..." - public - private validations: required: true - type: dropdown id: revision attributes: label: Commit, branch, or tag default: 0 options: - "Select one..." - default branch - commit - branch - tag validations: required: true - type: dropdown id: ingest_scope attributes: label: Did you ingest the full repository or a subdirectory? default: 0 options: - "Select one..." - full repository - subdirectory validations: required: true - type: dropdown id: os attributes: label: Operating system default: 0 options: - "Select one..." - Not relevant (Web UI) - macOS - Windows - Linux validations: required: true - type: dropdown id: browser attributes: label: Browser (Web UI only) default: 0 options: - "Select one..." - Not relevant (CLI / PyPI) - Chrome - Firefox - Safari - Edge - Other (specify below) validations: required: true - type: input id: browser_other attributes: label: Other browser placeholder: If you selected "Other", please specify the browser here. - type: input id: gitingest_version attributes: label: Gitingest version placeholder: e.g., v0.1.5 description: Not required if you used the Web UI. - type: input id: python_version attributes: label: Python version placeholder: e.g., 3.11.5 description: Not required if you used the Web UI. - type: textarea id: bug_description attributes: label: Bug description placeholder: Describe the bug here. description: A detailed but concise description of the bug. validations: required: true - type: textarea id: steps_to_reproduce attributes: label: Steps to reproduce placeholder: Include the exact commands or actions that led to the error. description: Include the exact commands or actions that led to the error *(if relevant)*. render: shell - type: textarea id: expected_behavior attributes: label: Expected behavior placeholder: Describe what you expected to happen. description: Describe what you expected to happen *(if relevant)*. - type: textarea id: actual_behavior attributes: label: Actual behavior description: Paste the full error message or stack trace here. - type: textarea id: additional_context attributes: label: Additional context, logs, or screenshots placeholder: Add any other context, links, or screenshots about the issue here. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: Feature request 💡 description: Suggest a new feature or improvement for Gitingest title: "(feat): " labels: ["enhancement"] body: - type: markdown attributes: value: | Thanks for taking the time to help us improve **Gitingest**! :sparkles: Please fill in the sections below to describe your idea. The more detail you provide, the easier it is for us to evaluate and plan the work. :point_down: - type: input id: summary attributes: label: Feature summary placeholder: One-sentence description of the feature. validations: required: true - type: textarea id: problem attributes: label: Problem / motivation description: What problem does this feature solve? How does it affect your workflow? placeholder: Why is this feature important? Describe the pain point or limitation you're facing. validations: required: true - type: textarea id: proposal attributes: label: Proposed solution placeholder: Describe what you would like to see happen. description: Outline the feature as you imagine it. *(optional)* - type: textarea id: alternatives attributes: label: Alternatives considered placeholder: List other approaches you've considered or work-arounds you use today. description: Feel free to mention why those alternatives don't fully solve the problem. - type: dropdown id: interface attributes: label: Which interface would this affect? default: 0 options: - "Select one..." - Web UI - CLI - PyPI package - CLI + PyPI package - All validations: required: true - type: dropdown id: priority attributes: label: How important is this to you? default: 0 options: - "Select one..." - Nice to have - Important - Critical validations: required: true - type: dropdown id: willingness attributes: label: Would you like to work on this feature yourself? default: 0 options: - "Select one..." - Yes, I'd like to implement it - Maybe, if I get some guidance - No, just requesting (absolutely fine!) validations: required: true - type: dropdown id: support_needed attributes: label: Would you need support from the maintainers (if you're implementing it yourself)? default: 0 options: - "Select one..." - No, I can handle it solo - Yes, I'd need some guidance - Not sure yet - This is just a suggestion, I'm not planning to implement it myself (absolutely fine!) - type: textarea id: additional_context attributes: label: Additional context, screenshots, or examples placeholder: Add links, sketches, or any other context that would help us understand and implement the feature. ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [main] pull_request: branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: test: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.8", "3.13"] include: - os: ubuntu-latest python-version: "3.13" coverage: true steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Set up Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install ".[dev,server]" - name: Cache pytest results uses: actions/cache@v4 with: path: .pytest_cache key: ${{ runner.os }}-pytest-${{ matrix.python-version }}-${{ hashFiles('**/pytest.ini') }} restore-keys: | ${{ runner.os }}-pytest-${{ matrix.python-version }}- - name: Run tests if: ${{ matrix.coverage != true }} run: pytest - name: Run tests if: ${{ matrix.coverage == true }} run: pytest - name: Run pre-commit hooks uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 if: ${{ matrix.python-version == '3.13' && matrix.os == 'ubuntu-latest' }} ================================================ FILE: .github/workflows/codeql.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL" on: push: branches: ["main"] pull_request: # The branches below must be a subset of the branches above branches: ["main"] schedule: - cron: "0 0 * * 1" permissions: contents: read jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: ["javascript", "python"] # CodeQL supports [ $supported-codeql-languages ] # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - name: Checkout repository uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@df559355d593797519d70b90fc8edd5db049e7a2 # v3.29.9 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@df559355d593797519d70b90fc8edd5db049e7a2 # v3.29.9 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. # - run: | # echo "Run, Build Application using script" # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@df559355d593797519d70b90fc8edd5db049e7a2 # v3.29.9 with: category: "/language:${{matrix.language}}" ================================================ FILE: .github/workflows/dependency-review.yml ================================================ # Dependency Review Action # # This Action will scan dependency manifest files that change as part of a Pull Request, # surfacing known-vulnerable versions of the packages declared or updated in the PR. # Once installed, if the workflow run is marked as required, # PRs introducing known-vulnerable packages will be blocked from merging. # # Source repository: https://github.com/actions/dependency-review-action name: 'Dependency Review' on: [pull_request] permissions: contents: read jobs: dependency-review: runs-on: ubuntu-latest steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - name: 'Checkout Repository' uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: 'Dependency Review' uses: actions/dependency-review-action@da24556b548a50705dd671f47852072ea4c105d9 # v4.7.1 ================================================ FILE: .github/workflows/deploy-pr.yml ================================================ name: Manage PR Temp Envs 'on': pull_request: types: - labeled - unlabeled - closed permissions: contents: read pull-requests: write env: APP_NAME: gitingest FLUX_OWNER: '${{ github.repository_owner }}' FLUX_REPO: '${{ secrets.CR_FLUX_REPO }}' jobs: deploy-pr-env: if: >- ${{ github.event.action == 'labeled' && github.event.label.name == 'deploy-pr-temp-env' }} runs-on: ubuntu-latest steps: - name: Create GitHub App token uses: actions/create-github-app-token@v2 id: app-token with: app-id: '${{ secrets.CR_APP_CI_APP_ID }}' private-key: '${{ secrets.CR_APP_CI_PRIVATE_KEY }}' owner: '${{ env.FLUX_OWNER }}' repositories: '${{ env.FLUX_REPO }}' - name: Checkout Flux repo uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: '${{ env.FLUX_OWNER }}/${{ env.FLUX_REPO }}' token: '${{ steps.app-token.outputs.token }}' path: flux-repo persist-credentials: false - name: Export PR ID shell: bash run: 'echo "PR_ID=${{ github.event.pull_request.number }}" >> $GITHUB_ENV' - name: Ensure template exists shell: bash run: > T="flux-repo/pr-template/${APP_NAME}" [[ -d "$T" ]] || { echo "Missing $T"; exit 1; } [[ $(find "$T" -type f | wc -l) -gt 0 ]] || { echo "No files in $T"; exit 1; } - name: Render & copy template shell: bash run: | SRC="flux-repo/pr-template/${APP_NAME}" DST="flux-repo/deployments/prs-${APP_NAME}/${PR_ID}" mkdir -p "$DST" cp -r "$SRC/." "$DST/" find "$DST" -type f -print0 \ | xargs -0 -n1 sed -i "s|@PR-ID@|${PR_ID}|g" - name: Sanity‑check rendered output shell: bash run: > E=$(find "flux-repo/pr-template/${APP_NAME}" -type f | wc -l) G=$(find "flux-repo/deployments/prs-${APP_NAME}/${PR_ID}" -type f | wc -l) (( G == E )) || { echo "Expected $E files, got $G"; exit 1; } - name: Commit & push creation shell: bash run: > cd flux-repo git config user.name "${{ steps.app-token.outputs.app-slug }}[bot]" git config user.email "${{ steps.app-token.outputs.app-slug }}[bot]@users.noreply.github.com" git add . git commit -m "chore(prs-${APP_NAME}): create temp env for PR #${{ env.PR_ID }} [skip ci]" || echo "Nothing to commit" git remote set-url origin \ https://x-access-token:${{ steps.app-token.outputs.token }}@github.com/${{ env.FLUX_OWNER }}/${{ env.FLUX_REPO }}.git git push origin HEAD:main - name: Comment preview URL on PR uses: thollander/actions-comment-pull-request@v3 with: github-token: '${{ secrets.GITHUB_TOKEN }}' pr-number: '${{ github.event.pull_request.number }}' comment-tag: 'pr-preview' create-if-not-exists: 'true' message: | 🌐 [Preview environment](https://pr-${{ env.PR_ID }}.${{ env.APP_NAME }}.coderamp.dev/) for PR #${{ env.PR_ID }} 📊 [Log viewer](https://app.datadoghq.eu/logs?query=kube_namespace%3Aprs-gitingest%20version%3Apr-${{ env.PR_ID }}) remove-pr-env: if: >- (github.event.action == 'unlabeled' && github.event.label.name == 'deploy-pr-temp-env') || (github.event.action == 'closed') runs-on: ubuntu-latest steps: - name: Create GitHub App token uses: actions/create-github-app-token@v2 id: app-token with: app-id: '${{ secrets.CR_APP_CI_APP_ID }}' private-key: '${{ secrets.CR_APP_CI_PRIVATE_KEY }}' owner: '${{ env.FLUX_OWNER }}' repositories: '${{ env.FLUX_REPO }}' - name: Checkout Flux repo uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: '${{ env.FLUX_OWNER }}/${{ env.FLUX_REPO }}' token: '${{ steps.app-token.outputs.token }}' path: flux-repo persist-credentials: false - name: Export PR ID shell: bash run: 'echo "PR_ID=${{ github.event.pull_request.number }}" >> $GITHUB_ENV' - name: Remove deployed directory shell: bash run: | DST="flux-repo/deployments/prs-${APP_NAME}/${PR_ID}" if [[ -d "$DST" ]]; then rm -rf "$DST" echo "✅ Deleted $DST" else echo "⏭️ Nothing to delete at $DST" fi - name: Commit & push deletion shell: bash run: > cd flux-repo git config user.name "${{ steps.app-token.outputs.app-slug }}[bot]" git config user.email "${{ steps.app-token.outputs.app-slug }}[bot]@users.noreply.github.com" git add -A git commit -m "chore(prs-${APP_NAME}): remove temp env for PR #${{ env.PR_ID }} [skip ci]" || echo "Nothing to commit" git remote set-url origin \ https://x-access-token:${{ steps.app-token.outputs.token }}@github.com/${{ env.FLUX_OWNER }}/${{ env.FLUX_REPO }}.git git push origin HEAD:main - name: Comment preview URL on PR uses: thollander/actions-comment-pull-request@v3 with: github-token: '${{ secrets.GITHUB_TOKEN }}' pr-number: '${{ github.event.pull_request.number }}' comment-tag: 'pr-preview' create-if-not-exists: 'true' message: | ⚙️ Preview environment was undeployed. ================================================ FILE: .github/workflows/docker-build.ecr.yml ================================================ name: Build & Push Container on: push: branches: - 'main' tags: - '*' merge_group: pull_request: types: [labeled, synchronize, reopened, ready_for_review, opened] env: PUSH_FROM_PR: >- ${{ github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'push-container') || contains(github.event.pull_request.labels.*.name, 'deploy-pr-temp-env') ) }} jobs: terraform: name: "ECR" runs-on: ubuntu-latest if: github.repository == 'coderamp-labs/gitingest' permissions: id-token: write contents: read pull-requests: write steps: - name: Checkout uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - name: configure aws credentials uses: aws-actions/configure-aws-credentials@v4 with: role-to-assume: ${{ secrets.CODERAMP_AWS_ECR_REGISTRY_PUSH_ROLE_ARN }} role-session-name: GitHub_to_AWS_via_FederatedOIDC aws-region: eu-west-1 - name: Set current timestamp id: vars run: | echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT echo "sha_full=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT - name: Determine version and deployment context id: version run: | REPO_URL="https://github.com/${{ github.repository }}" if [[ "${{ github.ref_type }}" == "tag" ]]; then # Tag deployment - display version, link to release echo "version=${{ github.ref_name }}" >> $GITHUB_OUTPUT echo "app_version=${{ github.ref_name }}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/releases/tag/${{ github.ref_name }}" >> $GITHUB_OUTPUT elif [[ "${{ github.event_name }}" == "pull_request" ]]; then # PR deployment - display pr-XXX, link to PR commit PR_NUMBER="${{ github.event.pull_request.number }}" COMMIT_HASH="${{ steps.vars.outputs.sha_full }}" echo "version=${PR_NUMBER}/merge-${COMMIT_HASH}" >> $GITHUB_OUTPUT echo "app_version=pr-${PR_NUMBER}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/pull/${PR_NUMBER}/commits/${COMMIT_HASH}" >> $GITHUB_OUTPUT else # Branch deployment - display branch name, link to commit BRANCH_NAME="${{ github.ref_name }}" COMMIT_HASH="${{ steps.vars.outputs.sha_full }}" echo "app_version=${BRANCH_NAME}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/commit/${COMMIT_HASH}" >> $GITHUB_OUTPUT fi - name: Login to Amazon ECR id: login-ecr uses: aws-actions/amazon-ecr-login@v2 - name: Docker Meta id: meta uses: docker/metadata-action@v5 with: images: | ${{ secrets.ECR_REGISTRY_URL }} flavor: | latest=false tags: | type=ref,event=branch,branch=main,suffix=-${{ steps.vars.outputs.sha_short }}-${{ steps.vars.outputs.timestamp }} type=ref,event=pr,suffix=-${{ steps.vars.outputs.sha_short }}-${{ steps.vars.outputs.timestamp }} type=pep440,pattern={{raw}} - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build and push uses: docker/build-push-action@v6 with: context: . platforms: linux/amd64, linux/arm64 push: ${{ github.event_name != 'pull_request' || env.PUSH_FROM_PR == 'true' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | APP_REPOSITORY=https://github.com/${{ github.repository }} APP_VERSION=${{ steps.version.outputs.app_version }} APP_VERSION_URL=${{ steps.version.outputs.app_version_url }} cache-from: type=gha cache-to: type=gha,mode=max ================================================ FILE: .github/workflows/docker-build.ghcr.yml ================================================ name: Build & Push Container on: push: branches: - 'main' tags: - '*' merge_group: pull_request: types: [labeled, synchronize, reopened, ready_for_review, opened] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} cancel-in-progress: true env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} PUSH_FROM_PR: >- ${{ github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'push-container') || contains(github.event.pull_request.labels.*.name, 'deploy-pr-temp-env') ) }} permissions: contents: read jobs: docker-build: name: "GHCR" runs-on: ubuntu-latest permissions: contents: read packages: write attestations: write id-token: write steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - name: Set current timestamp id: vars run: | echo "timestamp=$(date +%s)" >> $GITHUB_OUTPUT echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT echo "sha_full=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT - name: Determine version and deployment context id: version run: | REPO_URL="https://github.com/${{ github.repository }}" if [[ "${{ github.ref_type }}" == "tag" ]]; then # Tag deployment - display version, link to release echo "version=${{ github.ref_name }}" >> $GITHUB_OUTPUT echo "app_version=${{ github.ref_name }}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/releases/tag/${{ github.ref_name }}" >> $GITHUB_OUTPUT elif [[ "${{ github.event_name }}" == "pull_request" ]]; then # PR deployment - display pr-XXX, link to PR commit PR_NUMBER="${{ github.event.pull_request.number }}" COMMIT_HASH="${{ steps.vars.outputs.sha_full }}" echo "version=${PR_NUMBER}/merge-${COMMIT_HASH}" >> $GITHUB_OUTPUT echo "app_version=pr-${PR_NUMBER}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/pull/${PR_NUMBER}/commits/${COMMIT_HASH}" >> $GITHUB_OUTPUT else # Branch deployment - display branch name, link to commit BRANCH_NAME="${{ github.ref_name }}" COMMIT_HASH="${{ steps.vars.outputs.sha_full }}" echo "app_version=${BRANCH_NAME}" >> $GITHUB_OUTPUT echo "app_version_url=${REPO_URL}/commit/${COMMIT_HASH}" >> $GITHUB_OUTPUT fi - name: Log in to the Container registry uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Docker Meta id: meta uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0 with: images: | ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} flavor: | latest=false tags: | type=ref,event=branch,branch=main type=ref,event=branch,branch=main,suffix=-${{ steps.vars.outputs.sha_short }}-${{ steps.vars.outputs.timestamp }} type=pep440,pattern={{raw}} type=ref,event=pr,suffix=-${{ steps.vars.outputs.sha_short }}-${{ steps.vars.outputs.timestamp }} - name: Set up QEMU uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 - name: Set up Docker Buildx uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 - name: Build and push uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 id: push with: context: . platforms: linux/amd64, linux/arm64 push: ${{ github.event_name != 'pull_request' || env.PUSH_FROM_PR == 'true' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | APP_REPOSITORY=https://github.com/${{ github.repository }} APP_VERSION=${{ steps.version.outputs.app_version }} APP_VERSION_URL=${{ steps.version.outputs.app_version_url }} cache-from: type=gha cache-to: type=gha,mode=max - name: Generate artifact attestation if: github.event_name != 'pull_request' || env.PUSH_FROM_PR == 'true' uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2.4.0 with: subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} subject-digest: ${{ steps.push.outputs.digest }} push-to-registry: true ================================================ FILE: .github/workflows/pr-title-check.yml ================================================ name: PR Conventional Commit Validation on: pull_request: types: [opened, synchronize, reopened, edited] jobs: validate-pr-title: runs-on: ubuntu-latest steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - name: PR Conventional Commit Validation uses: ytanikin/pr-conventional-commits@b72758283dcbee706975950e96bc4bf323a8d8c0 # 1.4.2 with: task_types: '["feat","fix","docs","test","ci","refactor","perf","chore","revert"]' add_label: 'false' ================================================ FILE: .github/workflows/publish_to_pypi.yml ================================================ name: Publish to PyPI on: release: types: [created] # Run when you click "Publish release" workflow_dispatch: # ... or run it manually from the Actions tab permissions: contents: read jobs: release-build: runs-on: ubuntu-latest steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Set up Python 3.13 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.13" cache: pip cache-dependency-path: pyproject.toml - name: Build package run: | python -m pip install --upgrade pip python -m pip install build twine python -m build twine check dist/* - name: Upload dist artefact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: dist path: dist/ # Publish to PyPI (only if "dist/" succeeded) pypi-publish: needs: release-build runs-on: ubuntu-latest environment: pypi permissions: id-token: write # OIDC token for trusted publishing steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: dist path: dist/ - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # release/v1 with: verbose: true ================================================ FILE: .github/workflows/rebase-needed.yml ================================================ name: PR Needs Rebase on: workflow_dispatch: {} schedule: - cron: '0 * * * *' permissions: pull-requests: write jobs: label-rebase-needed: runs-on: ubuntu-latest if: github.repository == 'coderamp-labs/gitingest' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true steps: - name: Check for merge conflicts uses: eps1lon/actions-label-merge-conflict@v3 with: dirtyLabel: 'rebase needed :construction:' repoToken: '${{ secrets.GITHUB_TOKEN }}' commentOnClean: This pull request has resolved merge conflicts and is ready for review. commentOnDirty: This pull request has merge conflicts that must be resolved before it can be merged. retryMax: 30 continueOnMissingPermissions: false ================================================ FILE: .github/workflows/release-please.yml ================================================ name: release-please on: push: branches: - main permissions: contents: write pull-requests: write jobs: release: runs-on: ubuntu-latest steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Create GitHub App token uses: actions/create-github-app-token@v2 id: app-token with: app-id: '${{ secrets.CR_APP_CI_APP_ID }}' private-key: '${{ secrets.CR_APP_CI_PRIVATE_KEY }}' owner: '${{ env.FLUX_OWNER }}' repositories: '${{ env.FLUX_REPO }}' - name: Release Please uses: googleapis/release-please-action@v4 with: token: '${{ steps.app-token.outputs.token }}' ================================================ FILE: .github/workflows/scorecard.yml ================================================ name: OSSF Scorecard on: branch_protection_rule: schedule: - cron: '33 11 * * 2' # Every Tuesday at 11:33 AM UTC push: branches: [ main ] permissions: read-all concurrency: # avoid overlapping runs group: scorecard-${{ github.ref }} cancel-in-progress: true jobs: analysis: name: Scorecard analysis runs-on: ubuntu-latest permissions: security-events: write # upload SARIF to code-scanning id-token: write # publish results for the badge steps: - name: Harden the runner (Audit all outbound calls) uses: step-security/harden-runner@ec9f2d5744a09debf3a187a3f4f675c53b671911 # v2.13.0 with: egress-policy: audit - name: Checkout uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: persist-credentials: false - name: Run Scorecard uses: ossf/scorecard-action@f35c64557cf912815708bb1126d9948f3e459487 with: results_file: results.sarif results_format: sarif publish_results: true # enables the public badge - name: Upload to code-scanning uses: github/codeql-action/upload-sarif@df559355d593797519d70b90fc8edd5db049e7a2 # v3.29.9 with: sarif_file: results.sarif ================================================ FILE: .github/workflows/stale.yml ================================================ name: "Close stale issues and PRs" on: schedule: - cron: "0 6 * * *" workflow_dispatch: {} permissions: issues: write pull-requests: write jobs: stale: runs-on: ubuntu-latest steps: - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-stale: 45 days-before-close: 10 stale-issue-label: stale stale-pr-label: stale stale-issue-message: | Hi there! We haven’t seen activity here for 45 days, so I’m marking this issue as stale. If you’d like to keep it open, please leave a comment within 10 days. Thanks! stale-pr-message: | Hi there! We haven’t seen activity on this pull request for 45 days, so I’m marking it as stale. If you’d like to keep it open, please leave a comment within 10 days. Thanks! close-issue-message: | Hi there! We haven’t heard anything for 10 days, so I’m closing this issue. Feel free to reopen if you’d like to continue the discussion. Thanks! close-pr-message: | Hi there! We haven’t heard anything for 10 days, so I’m closing this pull request. Feel free to reopen if you’d like to continue working on it. Thanks! ================================================ FILE: .gitignore ================================================ # Operating-system .DS_Store Thumbs.db # Editor / IDE settings .vscode/ !.vscode/launch.json .idea/ *.swp # Python virtual-envs & tooling .venv*/ venv/ .python-version __pycache__/ *.egg-info/ *.egg .ruff_cache/ # Test artifacts & coverage .pytest_cache/ .coverage coverage.xml htmlcov/ # Build, distribution & docs build/ dist/ *.wheel # Logs & runtime output *.log logs/ *.tmp tmp/ # Project-specific files history.txt digest.txt # Environment variables .env ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: check-added-large-files description: 'Prevent large files from being committed.' args: ['--maxkb=10000'] - id: check-case-conflict description: 'Check for files that would conflict in case-insensitive filesystems.' - id: fix-byte-order-marker description: 'Remove utf-8 byte order marker.' - id: mixed-line-ending description: 'Replace mixed line ending.' - id: destroyed-symlinks description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' - id: check-ast description: 'Check for parseable syntax.' - id: end-of-file-fixer description: 'Ensure that a file is either empty, or ends with one newline.' - id: trailing-whitespace description: 'Trim trailing whitespace.' exclude: CHANGELOG.md - id: check-docstring-first description: 'Check a common error of defining a docstring after code.' - id: requirements-txt-fixer description: 'Sort entries in requirements.txt.' - repo: https://github.com/MarcoGorelli/absolufy-imports rev: v0.3.1 hooks: - id: absolufy-imports description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' - repo: https://github.com/asottile/pyupgrade rev: v3.20.0 hooks: - id: pyupgrade description: 'Automatically upgrade syntax for newer versions.' args: [--py3-plus, --py36-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: - id: python-check-blanket-noqa description: 'Enforce that `# noqa` annotations always occur with specific codes.' - id: python-check-blanket-type-ignore description: 'Enforce that `# type: ignore` annotations always occur with specific codes.' - id: python-use-type-annotations description: 'Enforce that python3.6+ type annotations are used instead of type comments.' - repo: https://github.com/PyCQA/isort rev: 6.0.1 hooks: - id: isort description: 'Sort imports alphabetically, and automatically separated into sections and by type.' - repo: https://github.com/pre-commit/mirrors-eslint rev: v9.30.1 hooks: - id: eslint description: 'Lint javascript files.' files: \.js$ args: [--max-warnings=0, --fix] additional_dependencies: [ 'eslint@9.30.1', '@eslint/js@9.30.1', 'eslint-plugin-import@2.32.0', 'globals@16.3.0', ] - repo: https://github.com/djlint/djLint rev: v1.36.4 hooks: - id: djlint-reformat-jinja - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.45.0 hooks: - id: markdownlint description: 'Lint markdown files.' args: ['--disable=line-length', '--ignore=CHANGELOG.md'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.2 hooks: - id: ruff-check - id: ruff-format - repo: https://github.com/jsh9/pydoclint rev: 0.6.7 hooks: - id: pydoclint name: pydoclint for source args: [--style=numpy] files: ^src/ - repo: https://github.com/pycqa/pylint rev: v3.3.7 hooks: - id: pylint name: pylint for source files: ^src/ additional_dependencies: [ boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', gitpython>=3.1.0, httpx, loguru>=0.7.0, pathspec>=0.12.1, prometheus-client, pydantic, pytest-asyncio, pytest-mock, python-dotenv, 'sentry-sdk[fastapi]', slowapi, starlette>=0.40.0, strenum; python_version < '3.11', tiktoken>=0.7.0, typing_extensions>= 4.0.0; python_version < '3.10', uvicorn>=0.11.7, ] - id: pylint name: pylint for tests files: ^tests/ args: - --rcfile=tests/.pylintrc additional_dependencies: [ boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', gitpython>=3.1.0, httpx, loguru>=0.7.0, pathspec>=0.12.1, prometheus-client, pydantic, pytest-asyncio, pytest-mock, python-dotenv, 'sentry-sdk[fastapi]', slowapi, starlette>=0.40.0, strenum; python_version < '3.11', tiktoken>=0.7.0, typing_extensions>= 4.0.0; python_version < '3.10', uvicorn>=0.11.7, ] - repo: meta hooks: - id: check-hooks-apply - id: check-useless-excludes - repo: https://github.com/gitleaks/gitleaks rev: v8.16.3 hooks: - id: gitleaks ================================================ FILE: .release-please-manifest.json ================================================ {".":"0.3.1"} ================================================ FILE: .vscode/launch.json ================================================ { "configurations": [ { "name": "Python Debugger: Module", "type": "debugpy", "request": "launch", "module": "server", "args": [], "cwd": "${workspaceFolder}/src" } ] } ================================================ FILE: CHANGELOG.md ================================================ # Changelog ## [0.3.1](https://github.com/coderamp-labs/gitingest/compare/v0.3.0...v0.3.1) (2025-07-31) ### Bug Fixes * make cache aware of subpaths ([#481](https://github.com/coderamp-labs/gitingest/issues/481)) ([8b59bef](https://github.com/coderamp-labs/gitingest/commit/8b59bef541f858ef44eba8fce6ace77df9dea01c)) ## [0.3.0](https://github.com/coderamp-labs/gitingest/compare/v0.2.1...v0.3.0) (2025-07-30) ### Features * **logging:** implement loguru ([#473](https://github.com/coderamp-labs/gitingest/issues/473)) ([d061b48](https://github.com/coderamp-labs/gitingest/commit/d061b4877a253ba3f0480d329f025427c7f70177)) * serve cached digest if available ([#462](https://github.com/coderamp-labs/gitingest/issues/462)) ([efe5a26](https://github.com/coderamp-labs/gitingest/commit/efe5a2686142b5ee4984061ebcec23c3bf3495d5)) ### Bug Fixes * handle network errors gracefully in token count estimation ([#437](https://github.com/coderamp-labs/gitingest/issues/437)) ([5fbb445](https://github.com/coderamp-labs/gitingest/commit/5fbb445cd8725e56972f43ec8b5e12cb299e9e83)) * improved server side cleanup after ingest ([#477](https://github.com/coderamp-labs/gitingest/issues/477)) ([2df0eb4](https://github.com/coderamp-labs/gitingest/commit/2df0eb43989731ae40a9dd82d310ff76a794a46d)) ### Documentation * **contributing:** update PR title guidelines to enforce convention ([#476](https://github.com/coderamp-labs/gitingest/issues/476)) ([d1f8a80](https://github.com/coderamp-labs/gitingest/commit/d1f8a80826ca38ec105a1878742fe351d4939d6e)) ## [0.2.1](https://github.com/coderamp-labs/gitingest/compare/v0.2.0...v0.2.1) (2025-07-27) ### Bug Fixes * remove logarithm conversion from the backend and correctly process max file size in kb ([#464](https://github.com/coderamp-labs/gitingest/issues/464)) ([932bfef](https://github.com/coderamp-labs/gitingest/commit/932bfef85db66704985c83f3f7c427756bd14023)) ## [0.2.0](https://github.com/coderamp-labs/gitingest/compare/v0.1.5...v0.2.0) (2025-07-26) ### Features * `include_submodules` option ([#313](https://github.com/coderamp-labs/gitingest/issues/313)) ([38c2317](https://github.com/coderamp-labs/gitingest/commit/38c23171a14556a2cdd05c0af8219f4dc789defd)) * add Tailwind CSS pipeline, tag-aware cloning & overhaul CI/CD ([#352](https://github.com/coderamp-labs/gitingest/issues/352)) ([b683e59](https://github.com/coderamp-labs/gitingest/commit/b683e59b5b1a31d27cc5c6ce8fb62da9b660613b)) * add Tailwind CSS pipeline, tag-aware cloning & overhaul CI/CD ([#352](https://github.com/coderamp-labs/gitingest/issues/352)) ([016817d](https://github.com/coderamp-labs/gitingest/commit/016817d5590c1412498b7532f6e854d20239c6be)) * **ci:** build Docker Image on PRs ([#382](https://github.com/coderamp-labs/gitingest/issues/382)) ([bc8cdb4](https://github.com/coderamp-labs/gitingest/commit/bc8cdb459482948c27e780b733ac7216d822529a)) * implement prometheus exporter ([#406](https://github.com/coderamp-labs/gitingest/issues/406)) ([1016f6e](https://github.com/coderamp-labs/gitingest/commit/1016f6ecb3b1b066d541d1eba1ddffec49b15f16)) * implement S3 integration for storing and retrieving digest files ([#427](https://github.com/coderamp-labs/gitingest/issues/427)) ([414e851](https://github.com/coderamp-labs/gitingest/commit/414e85189fb9055491530ba8c0665c798474451e)) * integrate Sentry for error tracking and performance monitoring ([#408](https://github.com/coderamp-labs/gitingest/issues/408)) ([590e55a](https://github.com/coderamp-labs/gitingest/commit/590e55a4d28a4f5c0beafbd12c525828fa79e221)) * Refactor backend to a rest api ([#346](https://github.com/coderamp-labs/gitingest/issues/346)) ([2b1f228](https://github.com/coderamp-labs/gitingest/commit/2b1f228ae1f6d1f7ee471794d258b13fcac25a96)) * **ui:** add inline PAT info tooltip inside token field ([#348](https://github.com/coderamp-labs/gitingest/issues/348)) ([2592303](https://github.com/coderamp-labs/gitingest/commit/25923037ea6cd2f8ef33a6cf1f0406c2b4f0c9b6)) ### Bug Fixes * enable metrics if env var is defined instead of being "True" ([#407](https://github.com/coderamp-labs/gitingest/issues/407)) ([fa2e192](https://github.com/coderamp-labs/gitingest/commit/fa2e192c05864c8db90bda877e9efb9b03caf098)) * fix docker container not launching ([#449](https://github.com/coderamp-labs/gitingest/issues/449)) ([998cea1](https://github.com/coderamp-labs/gitingest/commit/998cea15b4f79c5d6f840b5d3d916f83c8be3a07)) * frontend directory tree ([#363](https://github.com/coderamp-labs/gitingest/issues/363)) ([0fcf8a9](https://github.com/coderamp-labs/gitingest/commit/0fcf8a956f7ec8403a025177f998f92ddee96de0)) * gitignore and gitingestignore files are now correctly processed … ([#416](https://github.com/coderamp-labs/gitingest/issues/416)) ([74e503f](https://github.com/coderamp-labs/gitingest/commit/74e503fa1140feb74aa5350a32f0025c43097da1)) * Potential fix for code scanning alert no. 75: Uncontrolled data used in path expression ([#421](https://github.com/coderamp-labs/gitingest/issues/421)) ([9ceaf6c](https://github.com/coderamp-labs/gitingest/commit/9ceaf6cbbb0cdefbc79f78c5285406b9188b2d3d)) * reset pattern form when switching between include/exclude patterns ([#417](https://github.com/coderamp-labs/gitingest/issues/417)) ([7085e13](https://github.com/coderamp-labs/gitingest/commit/7085e138a74099b1df189b3bf9b8a333c8769380)) * temp files cleanup after ingest([#309](https://github.com/coderamp-labs/gitingest/issues/309)) ([e669e44](https://github.com/coderamp-labs/gitingest/commit/e669e444fa1e6130f3f22952dd81f0ca3fe08fa5)) * **ui:** update layout in PAT section to avoid overlaps & overflows ([#331](https://github.com/coderamp-labs/gitingest/issues/331)) ([b39ef54](https://github.com/coderamp-labs/gitingest/commit/b39ef5416c1f8a7993a8249161d2a898b7387595)) * **windows:** warn if Git long path support is disabled, do not fail ([b8e375f](https://github.com/coderamp-labs/gitingest/commit/b8e375f71cae7d980cf431396c4414a6dbd0588c)) ### Documentation * add GitHub Issue Form for bug reports ([#403](https://github.com/coderamp-labs/gitingest/issues/403)) ([4546449](https://github.com/coderamp-labs/gitingest/commit/4546449bbc1e4a7ad0950c4b831b8855a98628fd)) * add GitHub Issue Form for feature requests ([#404](https://github.com/coderamp-labs/gitingest/issues/404)) ([9b1fc58](https://github.com/coderamp-labs/gitingest/commit/9b1fc58900ae18a3416fe3cf9b5e301a65a8e9fd)) * Fix CLI help text accuracy ([#332](https://github.com/coderamp-labs/gitingest/issues/332)) ([fdcbc53](https://github.com/coderamp-labs/gitingest/commit/fdcbc53cadde6a5dc3c3626120df1935b63693b2)) ### Code Refactoring * centralize PAT validation, streamline repo checks & misc cleanup ([#349](https://github.com/coderamp-labs/gitingest/issues/349)) ([cea0edd](https://github.com/coderamp-labs/gitingest/commit/cea0eddce8c6846bc6271cb3a8d15320e103214c)) * centralize PAT validation, streamline repo checks & misc cleanup ([#349](https://github.com/coderamp-labs/gitingest/issues/349)) ([f8d397e](https://github.com/coderamp-labs/gitingest/commit/f8d397e66e3382d12f8a0ed05d291a39db830bda)) ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at . All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at . Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). For answers to common questions about this code of conduct, see the FAQ at . Translations are available at . ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Gitingest Thanks for your interest in contributing to **Gitingest** 🚀 Our goal is to keep the codebase friendly to first-time contributors. If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK9EC). --- ## How to Contribute (non-technical) - **Create an Issue** – found a bug or have a feature idea? [Open an issue](https://github.com/coderamp-labs/gitingest/issues/new). - **Spread the Word** – tweet, blog, or tell a friend. - **Use Gitingest** – real-world usage gives the best feedback. File issues or ping us on [Discord](https://discord.com/invite/zerRaGK9EC) with anything you notice. --- ## How to submit a Pull Request > **Prerequisites**: The project uses **Python 3.9+** and `pre-commit` for development. 1. **Fork** the repository. 2. **Clone** your fork: ```bash git clone https://github.com/coderamp-labs/gitingest.git cd gitingest ``` 3. **Set up the dev environment**: ```bash python -m venv .venv source .venv/bin/activate pip install -e ".[dev,server]" pre-commit install ``` 4. **Create a branch** for your changes: ```bash git checkout -b your-branch ``` 5. **Make your changes** (and add tests when relevant). 6. **Stage** the changes: ```bash git add . ``` 7. **Run the backend test suite**: ```bash pytest ``` 8. *(Optional)* **Run `pre-commit` on all files** to check hooks without committing: ```bash pre-commit run --all-files ``` 9. **Run the local server** to sanity-check: ```bash python -m server ``` Open [http://localhost:8000](http://localhost:8000) to confirm everything works. 10. **Commit** (signed): ```bash git commit -S -m "Your commit message" ``` If *pre-commit* complains, fix the problems and repeat **5 – 9**. 11. **Push** your branch: ```bash git push origin your-branch ``` 12. **Open a pull request** on GitHub with a clear description. > **Important:** Pull request titles **must follow the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) specification**. This helps with changelogs and automated releases. 13. **Iterate** on any review feedback—update your branch and repeat **6 – 11** as needed. *(Optional) Invite a maintainer to your branch for easier collaboration.* ================================================ FILE: Dockerfile ================================================ # Stage 1: Install Python dependencies FROM python:3.13.5-slim@sha256:4c2cf9917bd1cbacc5e9b07320025bdb7cdf2df7b0ceaccb55e9dd7e30987419 AS python-builder WORKDIR /build RUN set -eux; \ apt-get update; \ apt-get install -y --no-install-recommends gcc python3-dev; \ rm -rf /var/lib/apt/lists/* COPY pyproject.toml . COPY src/ ./src/ RUN set -eux; \ pip install --no-cache-dir --upgrade pip; \ pip install --no-cache-dir --timeout 1000 .[server,mcp] # Stage 2: Runtime image FROM python:3.13.5-slim@sha256:4c2cf9917bd1cbacc5e9b07320025bdb7cdf2df7b0ceaccb55e9dd7e30987419 ARG UID=1000 ARG GID=1000 ARG APP_REPOSITORY=https://github.com/coderamp-labs/gitingest ARG APP_VERSION=unknown ARG APP_VERSION_URL=https://github.com/coderamp-labs/gitingest ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ APP_REPOSITORY=${APP_REPOSITORY} \ APP_VERSION=${APP_VERSION} \ APP_VERSION_URL=${APP_VERSION_URL} RUN set -eux; \ apt-get update; \ apt-get install -y --no-install-recommends git curl; \ apt-get clean; \ rm -rf /var/lib/apt/lists/* WORKDIR /app RUN set -eux; \ groupadd -g "$GID" appuser; \ useradd -m -u "$UID" -g "$GID" appuser COPY --from=python-builder --chown=$UID:$GID /usr/local/lib/python3.13/site-packages/ /usr/local/lib/python3.13/site-packages/ COPY --chown=$UID:$GID src/ ./ RUN set -eux; \ chown -R appuser:appuser /app USER appuser EXPOSE 8000 EXPOSE 9090 CMD ["python", "-m", "server"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Romain Courtois Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Gitingest [![Screenshot of Gitingest front page](https://raw.githubusercontent.com/coderamp-labs/gitingest/refs/heads/main/docs/frontpage.png)](https://gitingest.com)

PyPI Python Versions
CI Ruff OpenSSF Scorecard
License Downloads GitHub Stars Discord
Trendshift

Turn any Git repository into a prompt-friendly text ingest for LLMs. You can also replace `hub` with `ingest` in any GitHub URL to access the corresponding digest. [gitingest.com](https://gitingest.com) · [Chrome Extension](https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood) · [Firefox Add-on](https://addons.mozilla.org/firefox/addon/gitingest) [Deutsch](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=de) | [Español](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=es) | [Français](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=fr) | [日本語](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=ja) | [한국어](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=ko) | [Português](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=pt) | [Русский](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=ru) | [中文](https://www.readme-i18n.com/coderamp-labs/gitingest?lang=zh) ## 🚀 Features - **Easy code context**: Get a text digest from a Git repository URL or a directory - **Smart Formatting**: Optimized output format for LLM prompts - **Statistics about**: - File and directory structure - Size of the extract - Token count - **CLI tool**: Run it as a shell command - **Python package**: Import it in your code ## 📚 Requirements - Python 3.8+ - For private repositories: A GitHub Personal Access Token (PAT). [Generate your token **here**!](https://github.com/settings/tokens/new?description=gitingest&scopes=repo) ### 📦 Installation Gitingest is available on [PyPI](https://pypi.org/project/gitingest/). You can install it using `pip`: ```bash pip install gitingest ``` or ```bash pip install gitingest[server] ``` to include server dependencies for self-hosting. However, it might be a good idea to use `pipx` to install it. You can install `pipx` using your preferred package manager. ```bash brew install pipx apt install pipx scoop install pipx ... ``` If you are using pipx for the first time, run: ```bash pipx ensurepath ``` ```bash # install gitingest pipx install gitingest ``` ## 🧩 Browser Extension Usage Available in the Chrome Web Store Get The Add-on for Firefox Get from the Edge Add-ons The extension is open source at [lcandy2/gitingest-extension](https://github.com/lcandy2/gitingest-extension). Issues and feature requests are welcome to the repo. ## 💡 Command line usage The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. ```bash # Basic usage (writes to digest.txt by default) gitingest /path/to/directory # From URL gitingest https://github.com/coderamp-labs/gitingest # or from specific subdirectory gitingest https://github.com/coderamp-labs/gitingest/tree/main/src/gitingest/utils ``` For private repositories, use the `--token/-t` option. ```bash # Get your token from https://github.com/settings/personal-access-tokens gitingest https://github.com/username/private-repo --token github_pat_... # Or set it as an environment variable export GITHUB_TOKEN=github_pat_... gitingest https://github.com/username/private-repo # Include repository submodules gitingest https://github.com/username/repo-with-submodules --include-submodules ``` By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you need those files in the digest. By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways: - Use `--output/-o ` to write to a specific file. - Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). See more options and usage details with: ```bash gitingest --help ``` ## 🐍 Python package usage ```python # Synchronous usage from gitingest import ingest summary, tree, content = ingest("path/to/directory") # or from URL summary, tree, content = ingest("https://github.com/coderamp-labs/gitingest") # or from a specific subdirectory summary, tree, content = ingest("https://github.com/coderamp-labs/gitingest/tree/main/src/gitingest/utils") ``` For private repositories, you can pass a token: ```python # Using token parameter summary, tree, content = ingest("https://github.com/username/private-repo", token="github_pat_...") # Or set it as an environment variable import os os.environ["GITHUB_TOKEN"] = "github_pat_..." summary, tree, content = ingest("https://github.com/username/private-repo") # Include repository submodules summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) ``` By default, this won't write a file but can be enabled with the `output` argument. ```python # Asynchronous usage from gitingest import ingest_async import asyncio result = asyncio.run(ingest_async("path/to/directory")) ``` ### Jupyter notebook usage ```python from gitingest import ingest_async # Use await directly in Jupyter summary, tree, content = await ingest_async("path/to/directory") ``` This is because Jupyter notebooks are asynchronous by default. ## 🐳 Self-host ### Using Docker 1. Build the image: ``` bash docker build -t gitingest . ``` 2. Run the container: ``` bash docker run -d --name gitingest -p 8000:8000 gitingest ``` The application will be available at `http://localhost:8000`. If you are hosting it on a domain, you can specify the allowed hostnames via env variable `ALLOWED_HOSTS`. ```bash # Default: "gitingest.com, *.gitingest.com, localhost, 127.0.0.1". ALLOWED_HOSTS="example.com, localhost, 127.0.0.1" ``` ### Environment Variables The application can be configured using the following environment variables: - **ALLOWED_HOSTS**: Comma-separated list of allowed hostnames (default: "gitingest.com, *.gitingest.com, localhost, 127.0.0.1") - **GITINGEST_METRICS_ENABLED**: Enable Prometheus metrics server (set to any value to enable) - **GITINGEST_METRICS_HOST**: Host for the metrics server (default: "127.0.0.1") - **GITINGEST_METRICS_PORT**: Port for the metrics server (default: "9090") - **GITINGEST_SENTRY_ENABLED**: Enable Sentry error tracking (set to any value to enable) - **GITINGEST_SENTRY_DSN**: Sentry DSN (required if Sentry is enabled) - **GITINGEST_SENTRY_TRACES_SAMPLE_RATE**: Sampling rate for performance data (default: "1.0", range: 0.0-1.0) - **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0) - **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace") - **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true") - **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket") - **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) ### Using Docker Compose The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments. #### Compose File Structure The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services: ```yaml # Common base configuration for all services x-app-base: &app-base build: context: . dockerfile: Dockerfile ports: - "${APP_WEB_BIND:-8000}:8000" # Main application port - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port # ... other common configurations ``` #### Services The file defines three services: 1. **app**: Production service configuration - Uses the `prod` profile - Sets the Sentry environment to "production" - Configured for stable operation with `restart: unless-stopped` 2. **app-dev**: Development service configuration - Uses the `dev` profile - Enables debug mode - Mounts the source code for live development - Uses hot reloading for faster development 3. **minio**: S3-compatible object storage for development - Uses the `dev` profile (only available in development mode) - Provides S3-compatible storage for local development - Accessible via: - API: Port 9000 ([localhost:9000](http://localhost:9000)) - Web Console: Port 9001 ([localhost:9001](http://localhost:9001)) - Default admin credentials: - Username: `minioadmin` - Password: `minioadmin` - Configurable via environment variables: - `MINIO_ROOT_USER`: Custom admin username (default: minioadmin) - `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin) - Includes persistent storage via Docker volume - Auto-creates a bucket and application-specific credentials: - Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`) - Access key: `gitingest` (configurable via `S3_ACCESS_KEY`) - Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`) - These credentials are automatically passed to the app-dev service via environment variables: - `S3_ENDPOINT`: URL of the MinIO server - `S3_ACCESS_KEY`: Access key for the S3 bucket - `S3_SECRET_KEY`: Secret key for the S3 bucket - `S3_BUCKET_NAME`: Name of the S3 bucket - `S3_REGION`: Region for the S3 bucket (default: us-east-1) - `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket") #### Usage Examples To run the application in development mode: ```bash docker compose --profile dev up ``` To run the application in production mode: ```bash docker compose --profile prod up -d ``` To build and run the application: ```bash docker compose --profile prod build docker compose --profile prod up -d ``` ## 🤝 Contributing ### Non-technical ways to contribute - **Create an Issue**: If you find a bug or have an idea for a new feature, please [create an issue](https://github.com/coderamp-labs/gitingest/issues/new) on GitHub. This will help us track and prioritize your request. - **Spread the Word**: If you like Gitingest, please share it with your friends, colleagues, and on social media. This will help us grow the community and make Gitingest even better. - **Use Gitingest**: The best feedback comes from real-world usage! If you encounter any issues or have ideas for improvement, please let us know by [creating an issue](https://github.com/coderamp-labs/gitingest/issues/new) on GitHub or by reaching out to us on [Discord](https://discord.com/invite/zerRaGK9EC). ### Technical ways to contribute Gitingest aims to be friendly for first time contributors, with a simple Python and HTML codebase. If you need any help while working with the code, reach out to us on [Discord](https://discord.com/invite/zerRaGK9EC). For detailed instructions on how to make a pull request, see [CONTRIBUTING.md](./CONTRIBUTING.md). ## 🛠️ Stack - [Tailwind CSS](https://tailwindcss.com) - Frontend - [FastAPI](https://github.com/fastapi/fastapi) - Backend framework - [Jinja2](https://jinja.palletsprojects.com) - HTML templating - [tiktoken](https://github.com/openai/tiktoken) - Token estimation - [posthog](https://github.com/PostHog/posthog) - Amazing analytics - [Sentry](https://sentry.io) - Error tracking and performance monitoring ### Looking for a JavaScript/FileSystemNode package? Check out the NPM alternative 📦 Repomix: ## 🚀 Project Growth [![Star History Chart](https://api.star-history.com/svg?repos=coderamp-labs/gitingest&type=Date)](https://star-history.com/#coderamp-labs/gitingest&Date) ================================================ FILE: SECURITY.md ================================================ # Security Policy ## Reporting a Vulnerability If you have discovered a vulnerability inside the project, report it privately at . This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. ================================================ FILE: compose.yml ================================================ x-base-environment: &base-environment # Python Configuration PYTHONUNBUFFERED: "1" PYTHONDONTWRITEBYTECODE: "1" # Host Configuration ALLOWED_HOSTS: ${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1} # Metrics Configuration GITINGEST_METRICS_ENABLED: ${GITINGEST_METRICS_ENABLED:-true} GITINGEST_METRICS_HOST: ${GITINGEST_METRICS_HOST:-0.0.0.0} GITINGEST_METRICS_PORT: ${GITINGEST_METRICS_PORT:-9090} # Sentry Configuration GITINGEST_SENTRY_ENABLED: ${GITINGEST_SENTRY_ENABLED:-false} GITINGEST_SENTRY_DSN: ${GITINGEST_SENTRY_DSN:-} GITINGEST_SENTRY_TRACES_SAMPLE_RATE: ${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0} GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE: ${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0} GITINGEST_SENTRY_PROFILE_LIFECYCLE: ${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace} GITINGEST_SENTRY_SEND_DEFAULT_PII: ${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true} x-prod-environment: &prod-environment GITINGEST_SENTRY_ENVIRONMENT: ${GITINGEST_SENTRY_ENVIRONMENT:-production} x-dev-environment: &dev-environment DEBUG: "true" LOG_LEVEL: "DEBUG" RELOAD: "true" GITINGEST_SENTRY_ENVIRONMENT: ${GITINGEST_SENTRY_ENVIRONMENT:-development} # S3 Configuration for development S3_ENABLED: "true" S3_ENDPOINT: http://minio:9000 S3_ACCESS_KEY: ${S3_ACCESS_KEY:-gitingest} S3_SECRET_KEY: ${S3_SECRET_KEY:-gitingest123} S3_BUCKET_NAME: ${S3_BUCKET_NAME:-gitingest-bucket} S3_REGION: ${S3_REGION:-us-east-1} S3_DIRECTORY_PREFIX: ${S3_DIRECTORY_PREFIX:-dev} S3_ALIAS_HOST: ${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}} x-app-base: &app-base ports: - "${APP_WEB_BIND:-8000}:8000" # Main application port - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port user: "1000:1000" command: ["python", "-m", "server"] services: # Production service configuration app: <<: *app-base image: ghcr.io/coderamp-labs/gitingest:latest profiles: - prod environment: <<: [*base-environment, *prod-environment] restart: unless-stopped # Development service configuration app-dev: <<: *app-base build: context: . dockerfile: Dockerfile profiles: - dev environment: <<: [*base-environment, *dev-environment] volumes: # Mount source code for live development - ./src:/app:ro # Use --reload flag for hot reloading during development command: ["python", "-m", "server"] depends_on: minio-setup: condition: service_completed_successfully # MinIO S3-compatible object storage for development minio: image: minio/minio:latest profiles: - dev ports: - "9000:9000" # API port - "9001:9001" # Console port environment: &minio-environment MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minioadmin} MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minioadmin} volumes: - minio-data:/data command: server /data --console-address ":9001" restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] interval: 30s timeout: 30s start_period: 30s start_interval: 1s # MinIO setup service to create bucket and user minio-setup: image: minio/mc profiles: - dev depends_on: minio: condition: service_healthy environment: <<: *minio-environment S3_ACCESS_KEY: ${S3_ACCESS_KEY:-gitingest} S3_SECRET_KEY: ${S3_SECRET_KEY:-gitingest123} S3_BUCKET_NAME: ${S3_BUCKET_NAME:-gitingest-bucket} volumes: - ./.docker/minio/setup.sh:/setup.sh:ro entrypoint: sh command: -c /setup.sh volumes: minio-data: driver: local ================================================ FILE: eslint.config.cjs ================================================ const js = require('@eslint/js'); const globals = require('globals'); const importPlugin = require('eslint-plugin-import'); module.exports = [ js.configs.recommended, { files: ['src/static/js/**/*.js'], languageOptions: { parserOptions: { ecmaVersion: 2021, sourceType: 'module' }, globals: { ...globals.browser, changePattern: 'readonly', copyFullDigest: 'readonly', copyText: 'readonly', downloadFullDigest: 'readonly', handleSubmit: 'readonly', posthog: 'readonly', submitExample: 'readonly', toggleAccessSettings: 'readonly', toggleFile: 'readonly', }, }, plugins: { import: importPlugin }, rules: { // Import hygiene (eslint-plugin-import) 'import/no-extraneous-dependencies': 'error', 'import/no-unresolved': 'error', 'import/order': ['warn', { alphabetize: { order: 'asc' } }], // Safety & bug-catchers 'consistent-return': 'error', 'default-case': 'error', 'no-implicit-globals': 'error', 'no-shadow': 'error', // Maintainability / complexity complexity: ['warn', 10], 'max-depth': ['warn', 4], 'max-lines': ['warn', 500], 'max-params': ['warn', 5], // Stylistic consistency (auto-fixable) 'arrow-parens': ['error', 'always'], curly: ['error', 'all'], indent: ['error', 4, { SwitchCase: 2 }], 'newline-per-chained-call': ['warn', { ignoreChainWithDepth: 2 }], 'no-multi-spaces': 'error', 'object-shorthand': ['error', 'always'], 'padding-line-between-statements': [ 'warn', { blankLine: 'always', prev: '*', next: 'return' }, { blankLine: 'always', prev: ['const', 'let', 'var'], next: '*' }, { blankLine: 'any', prev: ['const', 'let', 'var'], next: ['const', 'let', 'var'] }, ], 'quote-props': ['error', 'consistent-as-needed'], quotes: ['error', 'single', { avoidEscape: true }], semi: 'error', // Modern / performance tips 'arrow-body-style': ['warn', 'as-needed'], 'prefer-arrow-callback': 'error', 'prefer-exponentiation-operator': 'error', 'prefer-numeric-literals': 'error', 'prefer-object-has-own': 'warn', 'prefer-object-spread': 'error', 'prefer-template': 'error', }, }, ]; ================================================ FILE: pyproject.toml ================================================ [project] name = "gitingest" version = "0.3.1" description="CLI tool to analyze and create text dumps of codebases for LLMs" readme = {file = "README.md", content-type = "text/markdown" } requires-python = ">= 3.8" dependencies = [ "click>=8.0.0", "gitpython>=3.1.0", "httpx", "loguru>=0.7.0", "pathspec>=0.12.1", "pydantic", "python-dotenv", "starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw) "strenum; python_version < '3.11'", "tiktoken>=0.7.0", # Support for o200k_base encoding "typing_extensions>= 4.0.0; python_version < '3.10'", ] license = {file = "LICENSE"} authors = [ { name = "Romain Courtois", email = "romain@coderamp.io" }, { name = "Filip Christiansen"}, ] classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] [project.optional-dependencies] dev = [ "eval-type-backport", "pre-commit", "pytest", "pytest-asyncio", "pytest-mock", ] server = [ "boto3>=1.28.0", # AWS SDK for S3 support "fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38) "prometheus-client", "sentry-sdk[fastapi]", "slowapi", "uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150) ] [project.scripts] gitingest = "gitingest.__main__:main" [project.urls] homepage = "https://gitingest.com" github = "https://github.com/coderamp-labs/gitingest" [build-system] requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools] packages = {find = {where = ["src"]}} include-package-data = true # Linting configuration [tool.pylint.format] max-line-length = 119 [tool.pylint.'MESSAGES CONTROL'] disable = [ "too-many-arguments", "too-many-positional-arguments", "too-many-locals", "too-few-public-methods", "broad-exception-caught", "duplicate-code", "fixme", ] [tool.ruff] line-length = 119 fix = true [tool.ruff.lint] select = ["ALL"] ignore = [ # https://docs.astral.sh/ruff/rules/... "D107", # undocumented-public-init "FIX002", # line-contains-todo "TD002", # missing-todo-author "PLR0913", # too-many-arguments, # TODO: fix the following issues: "TD003", # missing-todo-link, TODO: add issue links "S108", # hardcoded-temp-file, TODO: replace with tempfile "BLE001", # blind-except, TODO: replace with specific exceptions "FAST003", # fast-api-unused-path-parameter, TODO: fix ] per-file-ignores = { "tests/**/*.py" = ["S101"] } # Skip the "assert used" warning [tool.ruff.lint.pylint] max-returns = 10 [tool.ruff.lint.isort] order-by-type = true case-sensitive = true [tool.pycln] all = true # TODO: Remove this once we figure out how to use ruff-isort [tool.isort] profile = "black" line_length = 119 remove_redundant_aliases = true float_to_top = true # https://github.com/astral-sh/ruff/issues/6514 order_by_type = true filter_files = true # Test configuration [tool.pytest.ini_options] pythonpath = ["src"] testpaths = ["tests/"] python_files = "test_*.py" asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" python_classes = "Test*" python_functions = "test_*" ================================================ FILE: release-please-config.json ================================================ { "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json", "packages": { ".": { "release-type": "python", "bump-minor-pre-major": true } } } ================================================ FILE: renovate.json ================================================ { "$schema": "https://docs.renovatebot.com/renovate-schema.json", "extends": [ "config:recommended" ] } ================================================ FILE: requirements-dev.txt ================================================ -r requirements.txt eval-type-backport pre-commit pytest pytest-asyncio pytest-cov pytest-mock ================================================ FILE: requirements.txt ================================================ boto3>=1.28.0 # AWS SDK for S3 support click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 httpx loguru>=0.7.0 pathspec>=0.12.1 prometheus-client pydantic python-dotenv sentry-sdk[fastapi] slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw tiktoken>=0.7.0 # Support for o200k_base encoding uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 ================================================ FILE: src/gitingest/__init__.py ================================================ """Gitingest: A package for ingesting data from Git repositories.""" from gitingest.entrypoint import ingest, ingest_async __all__ = ["ingest", "ingest_async"] ================================================ FILE: src/gitingest/__main__.py ================================================ """Command-line interface (CLI) for Gitingest.""" # pylint: disable=no-value-for-parameter from __future__ import annotations import asyncio from typing import TypedDict import click from typing_extensions import Unpack from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async # Import logging configuration first to intercept all logging from gitingest.utils.logging_config import get_logger # Initialize logger for this module logger = get_logger(__name__) class _CLIArgs(TypedDict): source: str max_size: int exclude_pattern: tuple[str, ...] include_pattern: tuple[str, ...] branch: str | None include_gitignored: bool include_submodules: bool token: str | None output: str | None @click.command() @click.argument("source", type=str, default=".") @click.option( "--max-size", "-s", default=MAX_FILE_SIZE, show_default=True, help="Maximum file size to process in bytes", ) @click.option("--exclude-pattern", "-e", multiple=True, help="Shell-style patterns to exclude.") @click.option( "--include-pattern", "-i", multiple=True, help="Shell-style patterns to include.", ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") @click.option( "--include-gitignored", is_flag=True, default=False, help="Include files matched by .gitignore and .gitingestignore", ) @click.option( "--include-submodules", is_flag=True, help="Include repository's submodules in the analysis", default=False, ) @click.option( "--token", "-t", envvar="GITHUB_TOKEN", default=None, help=( "GitHub personal access token (PAT) for accessing private repositories. " "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." ), ) @click.option( "--output", "-o", default=None, help="Output file path (default: digest.txt in current directory). Use '-' for stdout.", ) def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: """Run the CLI entry point to analyze a repo / directory and dump its contents. Parameters ---------- **cli_kwargs : Unpack[_CLIArgs] A dictionary of keyword arguments forwarded to ``ingest_async``. Notes ----- See ``ingest_async`` for a detailed description of each argument. Examples -------- Basic usage: $ gitingest $ gitingest /path/to/repo $ gitingest https://github.com/user/repo Output to stdout: $ gitingest -o - $ gitingest https://github.com/user/repo --output - With filtering: $ gitingest -i "*.py" -e "*.log" $ gitingest --include-pattern "*.js" --exclude-pattern "node_modules/*" Private repositories: $ gitingest https://github.com/user/private-repo -t ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo Include submodules: $ gitingest https://github.com/user/repo --include-submodules """ asyncio.run(_async_main(**cli_kwargs)) async def _async_main( source: str, *, max_size: int = MAX_FILE_SIZE, exclude_pattern: tuple[str, ...] | None = None, include_pattern: tuple[str, ...] | None = None, branch: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> None: """Analyze a directory or repository and create a text dump of its contents. This command scans the specified ``source`` (a local directory or Git repo), applies custom include and exclude patterns, and generates a text summary of the analysis. The summary is written to an output file or printed to ``stdout``. Parameters ---------- source : str A directory path or a Git repository URL. max_size : int Maximum file size in bytes to ingest (default: 10 MB). exclude_pattern : tuple[str, ...] | None Glob patterns for pruning the file set. include_pattern : tuple[str, ...] | None Glob patterns for including files in the output. branch : str | None Git branch to ingest. If ``None``, the repository's default branch is used. include_gitignored : bool If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None The path where the output file will be written (default: ``digest.txt`` in current directory). Use ``"-"`` to write to ``stdout``. Raises ------ click.Abort Raised if an error occurs during execution and the command must be aborted. """ try: # Normalise pattern containers (the ingest layer expects sets) exclude_patterns = set(exclude_pattern) if exclude_pattern else set() include_patterns = set(include_pattern) if include_pattern else set() output_target = output if output is not None else OUTPUT_FILE_NAME if output_target == "-": click.echo("Analyzing source, preparing output for stdout...", err=True) else: click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) summary, _, _ = await ingest_async( source, max_file_size=max_size, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, include_gitignored=include_gitignored, include_submodules=include_submodules, token=token, output=output_target, ) except Exception as exc: # Convert any exception into Click.Abort so that exit status is non-zero click.echo(f"Error: {exc}", err=True) raise click.Abort from exc if output_target == "-": # stdout click.echo("\n--- Summary ---", err=True) click.echo(summary, err=True) click.echo("--- End Summary ---", err=True) click.echo("Analysis complete! Output sent to stdout.", err=True) else: # file click.echo(f"Analysis complete! Output written to: {output_target}") click.echo("\nSummary:") click.echo(summary) if __name__ == "__main__": main() ================================================ FILE: src/gitingest/clone.py ================================================ """Module containing functions for cloning a Git repository to a local path.""" from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING import git from gitingest.config import DEFAULT_TIMEOUT from gitingest.utils.git_utils import ( check_repo_exists, checkout_partial_clone, create_git_repo, ensure_git_installed, git_auth_context, is_github_host, resolve_commit, ) from gitingest.utils.logging_config import get_logger from gitingest.utils.os_utils import ensure_directory_exists_or_create from gitingest.utils.timeout_wrapper import async_timeout if TYPE_CHECKING: from gitingest.schemas import CloneConfig # Initialize logger for this module logger = get_logger(__name__) @async_timeout(DEFAULT_TIMEOUT) async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: """Clone a repository to a local path based on the provided configuration. This function handles the process of cloning a Git repository to the local file system. It can clone a specific branch, tag, or commit if provided, and it raises exceptions if any errors occur during the cloning process. Parameters ---------- config : CloneConfig The configuration for cloning the repository. token : str | None GitHub personal access token (PAT) for accessing private repositories. Raises ------ ValueError If the repository is not found, if the provided URL is invalid, or if the token format is invalid. RuntimeError If Git operations fail during the cloning process. """ # Extract and validate query parameters url: str = config.url local_path: str = config.local_path partial_clone: bool = config.subpath != "/" logger.info( "Starting git clone operation", extra={ "url": url, "local_path": local_path, "partial_clone": partial_clone, "subpath": config.subpath, "branch": config.branch, "tag": config.tag, "commit": config.commit, "include_submodules": config.include_submodules, }, ) logger.debug("Ensuring git is installed") await ensure_git_installed() logger.debug("Creating local directory", extra={"parent_path": str(Path(local_path).parent)}) await ensure_directory_exists_or_create(Path(local_path).parent) logger.debug("Checking if repository exists", extra={"url": url}) if not await check_repo_exists(url, token=token): logger.error("Repository not found", extra={"url": url}) msg = "Repository not found. Make sure it is public or that you have provided a valid token." raise ValueError(msg) logger.debug("Resolving commit reference") commit = await resolve_commit(config, token=token) logger.debug("Resolved commit", extra={"commit": commit}) # Clone the repository using GitPython with proper authentication logger.info("Executing git clone operation", extra={"url": "", "local_path": local_path}) try: clone_kwargs = { "single_branch": True, "no_checkout": True, "depth": 1, } with git_auth_context(url, token) as (git_cmd, auth_url): if partial_clone: # For partial clones, use git.Git() with filter and sparse options cmd_args = ["--single-branch", "--no-checkout", "--depth=1"] cmd_args.extend(["--filter=blob:none", "--sparse"]) cmd_args.extend([auth_url, local_path]) git_cmd.clone(*cmd_args) elif token and is_github_host(url): # For authenticated GitHub repos, use git_cmd with auth URL cmd_args = ["--single-branch", "--no-checkout", "--depth=1", auth_url, local_path] git_cmd.clone(*cmd_args) else: # For non-authenticated repos, use the standard GitPython method git.Repo.clone_from(url, local_path, **clone_kwargs) logger.info("Git clone completed successfully") except git.GitCommandError as exc: msg = f"Git clone failed: {exc}" raise RuntimeError(msg) from exc # Checkout the subpath if it is a partial clone if partial_clone: logger.info("Setting up partial clone for subpath", extra={"subpath": config.subpath}) await checkout_partial_clone(config, token=token) logger.debug("Partial clone setup completed") # Perform post-clone operations await _perform_post_clone_operations(config, local_path, url, token, commit) logger.info("Git clone operation completed successfully", extra={"local_path": local_path}) async def _perform_post_clone_operations( config: CloneConfig, local_path: str, url: str, token: str | None, commit: str, ) -> None: """Perform post-clone operations like fetching, checkout, and submodule updates. Parameters ---------- config : CloneConfig The configuration for cloning the repository. local_path : str The local path where the repository was cloned. url : str The repository URL. token : str | None GitHub personal access token (PAT) for accessing private repositories. commit : str The commit SHA to checkout. Raises ------ RuntimeError If any Git operation fails. """ try: repo = create_git_repo(local_path, url, token) # Ensure the commit is locally available logger.debug("Fetching specific commit", extra={"commit": commit}) repo.git.fetch("--depth=1", "origin", commit) # Write the work-tree at that commit logger.info("Checking out commit", extra={"commit": commit}) repo.git.checkout(commit) # Update submodules if config.include_submodules: logger.info("Updating submodules") repo.git.submodule("update", "--init", "--recursive", "--depth=1") logger.debug("Submodules updated successfully") except git.GitCommandError as exc: msg = f"Git operation failed: {exc}" raise RuntimeError(msg) from exc ================================================ FILE: src/gitingest/config.py ================================================ """Configuration file for the project.""" import tempfile from pathlib import Path MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum size of a single file to process (10 MB) MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal MAX_FILES = 10_000 # Maximum number of files to process MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB) DEFAULT_TIMEOUT = 60 # seconds OUTPUT_FILE_NAME = "digest.txt" TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" ================================================ FILE: src/gitingest/entrypoint.py ================================================ """Main entry point for ingesting a source and processing its contents.""" from __future__ import annotations import asyncio import errno import shutil import stat import sys from contextlib import asynccontextmanager from pathlib import Path from typing import TYPE_CHECKING, AsyncGenerator, Callable from urllib.parse import urlparse from gitingest.clone import clone_repo from gitingest.config import MAX_FILE_SIZE from gitingest.ingestion import ingest_query from gitingest.query_parser import parse_local_dir_path, parse_remote_repo from gitingest.utils.auth import resolve_token from gitingest.utils.compat_func import removesuffix from gitingest.utils.ignore_patterns import load_ignore_patterns from gitingest.utils.logging_config import get_logger from gitingest.utils.pattern_utils import process_patterns from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS if TYPE_CHECKING: from types import TracebackType from gitingest.schemas import IngestionQuery # Initialize logger for this module logger = get_logger(__name__) async def ingest_async( source: str, *, max_file_size: int = MAX_FILE_SIZE, include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: """Ingest a source and process its contents. This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a summary, a tree-like structure of the files, and the content of the files. The results can optionally be written to an output file. Parameters ---------- source : str The source to analyze, which can be a URL (for a Git repository) or a local directory path. max_file_size : int Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). include_patterns : str | set[str] | None Pattern or set of patterns specifying which files to include. If ``None``, all files are included. exclude_patterns : str | set[str] | None Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. branch : str | None The branch to clone and ingest (default: the default branch). tag : str | None The tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None File path where the summary and content should be written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. Returns ------- tuple[str, str, str] A tuple containing: - A summary string of the analyzed repository or directory. - A tree-like string representation of the file structure. - The content of the files in the repository or directory. """ logger.info("Starting ingestion process", extra={"source": source}) token = resolve_token(token) source = removesuffix(source.strip(), ".git") # Determine the parsing method based on the source type if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug logger.info("Parsing remote repository", extra={"source": source}) query = await parse_remote_repo(source, token=token) query.include_submodules = include_submodules _override_branch_and_tag(query, branch=branch, tag=tag) else: # Local path scenario logger.info("Processing local directory", extra={"source": source}) query = parse_local_dir_path(source) query.max_file_size = max_file_size query.ignore_patterns, query.include_patterns = process_patterns( exclude_patterns=exclude_patterns, include_patterns=include_patterns, ) if query.url: _override_branch_and_tag(query, branch=branch, tag=tag) query.include_submodules = include_submodules logger.debug( "Configuration completed", extra={ "max_file_size": query.max_file_size, "include_submodules": query.include_submodules, "include_gitignored": include_gitignored, "has_include_patterns": bool(query.include_patterns), "has_exclude_patterns": bool(query.ignore_patterns), }, ) async with _clone_repo_if_remote(query, token=token): if query.url: logger.info("Repository cloned, starting file processing") else: logger.info("Starting local directory processing") if not include_gitignored: logger.debug("Applying gitignore patterns") _apply_gitignores(query) logger.info("Processing files and generating output") summary, tree, content = ingest_query(query) if output: logger.debug("Writing output to file", extra={"output_path": output}) await _write_output(tree, content=content, target=output) logger.info("Ingestion completed successfully") return summary, tree, content def ingest( source: str, *, max_file_size: int = MAX_FILE_SIZE, include_patterns: str | set[str] | None = None, exclude_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: """Provide a synchronous wrapper around ``ingest_async``. This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a summary, a tree-like structure of the files, and the content of the files. The results can optionally be written to an output file. Parameters ---------- source : str The source to analyze, which can be a URL (for a Git repository) or a local directory path. max_file_size : int Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). include_patterns : str | set[str] | None Pattern or set of patterns specifying which files to include. If ``None``, all files are included. exclude_patterns : str | set[str] | None Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. branch : str | None The branch to clone and ingest (default: the default branch). tag : str | None The tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None File path where the summary and content should be written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. Returns ------- tuple[str, str, str] A tuple containing: - A summary string of the analyzed repository or directory. - A tree-like string representation of the file structure. - The content of the files in the repository or directory. See Also -------- ``ingest_async`` : The asynchronous version of this function. """ return asyncio.run( ingest_async( source=source, max_file_size=max_file_size, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, tag=tag, include_gitignored=include_gitignored, include_submodules=include_submodules, token=token, output=output, ), ) def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str | None) -> None: """Compare the caller-supplied ``branch`` and ``tag`` with the ones already in ``query``. If they differ, update ``query`` to the chosen values and issue a warning. If both are specified, the tag wins over the branch. Parameters ---------- query : IngestionQuery The query to update. branch : str | None The branch to use. tag : str | None The tag to use. """ if tag and query.tag and tag != query.tag: msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'." logger.warning(msg) query.tag = tag or query.tag if branch and query.branch and branch != query.branch: msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'." logger.warning(msg) query.branch = branch or query.branch if tag and branch: msg = "Warning: Both tag and branch are specified. The tag will be used." logger.warning(msg) # Tag wins over branch if both supplied if query.tag: query.branch = None def _apply_gitignores(query: IngestionQuery) -> None: """Update ``query.ignore_patterns`` in-place. Parameters ---------- query : IngestionQuery The query to update. """ for fname in (".gitignore", ".gitingestignore"): query.ignore_patterns.update(load_ignore_patterns(query.local_path, filename=fname)) @asynccontextmanager async def _clone_repo_if_remote(query: IngestionQuery, *, token: str | None) -> AsyncGenerator[None]: """Async context-manager that clones ``query.url`` if present. If ``query.url`` is set, the repo is cloned, control is yielded, and the temp directory is removed on exit. If no URL is given, the function simply yields immediately. Parameters ---------- query : IngestionQuery Parsed query describing the source to ingest. token : str | None GitHub personal access token (PAT) for accessing private repositories. """ kwargs = {} if sys.version_info >= (3, 12): kwargs["onexc"] = _handle_remove_readonly else: kwargs["onerror"] = _handle_remove_readonly if query.url: clone_config = query.extract_clone_config() await clone_repo(clone_config, token=token) try: yield finally: shutil.rmtree(query.local_path.parent, **kwargs) else: yield def _handle_remove_readonly( func: Callable, path: str, exc_info: BaseException | tuple[type[BaseException], BaseException, TracebackType], ) -> None: """Handle permission errors raised by ``shutil.rmtree()``. * Makes the target writable (removes the read-only attribute). * Retries the original operation (``func``) once. """ # 'onerror' passes a (type, value, tb) tuple; 'onexc' passes the exception if isinstance(exc_info, tuple): # 'onerror' (Python <3.12) exc: BaseException = exc_info[1] else: # 'onexc' (Python 3.12+) exc = exc_info # Handle only'Permission denied' and 'Operation not permitted' if not isinstance(exc, OSError) or exc.errno not in {errno.EACCES, errno.EPERM}: raise exc # Make the target writable Path(path).chmod(stat.S_IWRITE) func(path) async def _write_output(tree: str, content: str, target: str | None) -> None: """Write combined output to ``target`` (``"-"`` ⇒ stdout). Parameters ---------- tree : str The tree-like string representation of the file structure. content : str The content of the files in the repository or directory. target : str | None The path to the output file. If ``None``, the results are not written to a file. """ data = f"{tree}\n{content}" loop = asyncio.get_running_loop() if target == "-": await loop.run_in_executor(None, sys.stdout.write, data) await loop.run_in_executor(None, sys.stdout.flush) elif target is not None: await loop.run_in_executor(None, Path(target).write_text, data, "utf-8") ================================================ FILE: src/gitingest/ingestion.py ================================================ """Functions to ingest and analyze a codebase directory or single file.""" from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: from gitingest.schemas import IngestionQuery # Initialize logger for this module logger = get_logger(__name__) def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: """Run the ingestion process for a parsed query. This is the main entry point for analyzing a codebase directory or single file. It processes the query parameters, reads the file or directory content, and generates a summary, directory structure, and file content, along with token estimations. Parameters ---------- query : IngestionQuery The parsed query object containing information about the repository and query parameters. Returns ------- tuple[str, str, str] A tuple containing the summary, directory structure, and file contents. Raises ------ ValueError If the path cannot be found, is not a file, or the file has no content. """ logger.info( "Starting file ingestion", extra={ "slug": query.slug, "subpath": query.subpath, "local_path": str(query.local_path), "max_file_size": query.max_file_size, }, ) subpath = Path(query.subpath.strip("/")).as_posix() path = query.local_path / subpath if not path.exists(): logger.error("Path not found", extra={"path": str(path), "slug": query.slug}) msg = f"{query.slug} cannot be found" raise ValueError(msg) if (query.type and query.type == "blob") or query.local_path.is_file(): # TODO: We do this wrong! We should still check the branch and commit! logger.info("Processing single file", extra={"file_path": str(path)}) if not path.is_file(): logger.error("Expected file but found non-file", extra={"path": str(path)}) msg = f"Path {path} is not a file" raise ValueError(msg) relative_path = path.relative_to(query.local_path) file_node = FileSystemNode( name=path.name, type=FileSystemNodeType.FILE, size=path.stat().st_size, file_count=1, path_str=str(relative_path), path=path, ) if not file_node.content: logger.error("File has no content", extra={"file_name": file_node.name}) msg = f"File {file_node.name} has no content" raise ValueError(msg) logger.info( "Single file processing completed", extra={ "file_name": file_node.name, "file_size": file_node.size, }, ) return format_node(file_node, query=query) logger.info("Processing directory", extra={"directory_path": str(path)}) root_node = FileSystemNode( name=path.name, type=FileSystemNodeType.DIRECTORY, path_str=str(path.relative_to(query.local_path)), path=path, ) stats = FileSystemStats() _process_node(node=root_node, query=query, stats=stats) logger.info( "Directory processing completed", extra={ "total_files": root_node.file_count, "total_directories": root_node.dir_count, "total_size_bytes": root_node.size, "stats_total_files": stats.total_files, "stats_total_size": stats.total_size, }, ) return format_node(root_node, query=query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: """Process a file or directory item within a directory. This function handles each file or directory item, checking if it should be included or excluded based on the provided patterns. It handles symlinks, directories, and files accordingly. Parameters ---------- node : FileSystemNode The current directory or file node being processed. query : IngestionQuery The parsed query object containing information about the repository and query parameters. stats : FileSystemStats Statistics tracking object for the total file count and size. """ if limit_exceeded(stats, depth=node.depth): return for sub_path in node.path.iterdir(): if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns): continue if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns): continue if sub_path.is_symlink(): _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_file(): if sub_path.stat().st_size > query.max_file_size: logger.debug( "Skipping file: would exceed max file size limit", extra={ "file_path": str(sub_path), "file_size": sub_path.stat().st_size, "max_file_size": query.max_file_size, }, ) continue _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): child_directory_node = FileSystemNode( name=sub_path.name, type=FileSystemNodeType.DIRECTORY, path_str=str(sub_path.relative_to(query.local_path)), path=sub_path, depth=node.depth + 1, ) _process_node(node=child_directory_node, query=query, stats=stats) if not child_directory_node.children: continue node.children.append(child_directory_node) node.size += child_directory_node.size node.file_count += child_directory_node.file_count node.dir_count += 1 + child_directory_node.dir_count else: logger.warning("Unknown file type, skipping", extra={"file_path": str(sub_path)}) node.sort_children() def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: """Process a symlink in the file system. This function checks the symlink's target. Parameters ---------- path : Path The full path of the symlink. parent_node : FileSystemNode The parent directory node. stats : FileSystemStats Statistics tracking object for the total file count and size. local_path : Path The base path of the repository or directory being processed. """ child = FileSystemNode( name=path.name, type=FileSystemNodeType.SYMLINK, path_str=str(path.relative_to(local_path)), path=path, depth=parent_node.depth + 1, ) stats.total_files += 1 parent_node.children.append(child) parent_node.file_count += 1 def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. If the file size exceeds the maximum allowed, it raises an error. Parameters ---------- path : Path The full path of the file. parent_node : FileSystemNode The dictionary to accumulate the results. stats : FileSystemStats Statistics tracking object for the total file count and size. local_path : Path The base path of the repository or directory being processed. """ if stats.total_files + 1 > MAX_FILES: logger.warning( "Maximum file limit reached", extra={ "current_files": stats.total_files, "max_files": MAX_FILES, "file_path": str(path), }, ) return file_size = path.stat().st_size if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES: logger.warning( "Skipping file: would exceed total size limit", extra={ "file_path": str(path), "file_size": file_size, "current_total_size": stats.total_size, "max_total_size": MAX_TOTAL_SIZE_BYTES, }, ) return stats.total_files += 1 stats.total_size += file_size child = FileSystemNode( name=path.name, type=FileSystemNodeType.FILE, size=file_size, file_count=1, path_str=str(path.relative_to(local_path)), path=path, depth=parent_node.depth + 1, ) parent_node.children.append(child) parent_node.size += file_size parent_node.file_count += 1 def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: maximum directory depth, maximum number of files, or maximum total size in bytes. Parameters ---------- stats : FileSystemStats Statistics tracking object for the total file count and size. depth : int The current depth of directory traversal. Returns ------- bool ``True`` if any limit has been exceeded, ``False`` otherwise. """ if depth > MAX_DIRECTORY_DEPTH: logger.warning( "Maximum directory depth limit reached", extra={ "current_depth": depth, "max_depth": MAX_DIRECTORY_DEPTH, }, ) return True if stats.total_files >= MAX_FILES: logger.warning( "Maximum file limit reached", extra={ "current_files": stats.total_files, "max_files": MAX_FILES, }, ) return True # TODO: end recursion if stats.total_size >= MAX_TOTAL_SIZE_BYTES: logger.warning( "Maximum total size limit reached", extra={ "current_size_mb": stats.total_size / 1024 / 1024, "max_size_mb": MAX_TOTAL_SIZE_BYTES / 1024 / 1024, }, ) return True # TODO: end recursion return False ================================================ FILE: src/gitingest/output_formatter.py ================================================ """Functions to ingest and analyze a codebase directory or single file.""" from __future__ import annotations import ssl from typing import TYPE_CHECKING import requests.exceptions import tiktoken from gitingest.schemas import FileSystemNode, FileSystemNodeType from gitingest.utils.compat_func import readlink from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: from gitingest.schemas import IngestionQuery # Initialize logger for this module logger = get_logger(__name__) _TOKEN_THRESHOLDS: list[tuple[int, str]] = [ (1_000_000, "M"), (1_000, "k"), ] def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]: """Generate a summary, directory structure, and file contents for a given file system node. If the node represents a directory, the function will recursively process its contents. Parameters ---------- node : FileSystemNode The file system node to be summarized. query : IngestionQuery The parsed query object containing information about the repository and query parameters. Returns ------- tuple[str, str, str] A tuple containing the summary, directory structure, and file contents. """ is_single_file = node.type == FileSystemNodeType.FILE summary = _create_summary_prefix(query, single_file=is_single_file) if node.type == FileSystemNodeType.DIRECTORY: summary += f"Files analyzed: {node.file_count}\n" elif node.type == FileSystemNodeType.FILE: summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" tree = "Directory structure:\n" + _create_tree_structure(query, node=node) content = _gather_file_contents(node) token_estimate = _format_token_count(tree + content) if token_estimate: summary += f"\nEstimated tokens: {token_estimate}" return summary, tree, content def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str: """Create a prefix string for summarizing a repository or local directory. Includes repository name (if provided), commit/branch details, and subpath if relevant. Parameters ---------- query : IngestionQuery The parsed query object containing information about the repository and query parameters. single_file : bool A flag indicating whether the summary is for a single file (default: ``False``). Returns ------- str A summary prefix string containing repository, commit, branch, and subpath details. """ parts = [] if query.user_name: parts.append(f"Repository: {query.user_name}/{query.repo_name}") else: # Local scenario parts.append(f"Directory: {query.slug}") if query.tag: parts.append(f"Tag: {query.tag}") elif query.branch and query.branch not in ("main", "master"): parts.append(f"Branch: {query.branch}") if query.commit: parts.append(f"Commit: {query.commit}") if query.subpath != "/" and not single_file: parts.append(f"Subpath: {query.subpath}") return "\n".join(parts) + "\n" def _gather_file_contents(node: FileSystemNode) -> str: """Recursively gather contents of all files under the given node. This function recursively processes a directory node and gathers the contents of all files under that node. It returns the concatenated content of all files as a single string. Parameters ---------- node : FileSystemNode The current directory or file node being processed. Returns ------- str The concatenated content of all files under the given node. """ if node.type != FileSystemNodeType.DIRECTORY: return node.content_string # Recursively gather contents of all files under the current directory return "\n".join(_gather_file_contents(child) for child in node.children) def _create_tree_structure( query: IngestionQuery, *, node: FileSystemNode, prefix: str = "", is_last: bool = True, ) -> str: """Generate a tree-like string representation of the file structure. This function generates a string representation of the directory structure, formatted as a tree with appropriate indentation for nested directories and files. Parameters ---------- query : IngestionQuery The parsed query object containing information about the repository and query parameters. node : FileSystemNode The current directory or file node being processed. prefix : str A string used for indentation and formatting of the tree structure (default: ``""``). is_last : bool A flag indicating whether the current node is the last in its directory (default: ``True``). Returns ------- str A string representing the directory structure formatted as a tree. """ if not node.name: # If no name is present, use the slug as the top-level directory name node.name = query.slug tree_str = "" current_prefix = "└── " if is_last else "├── " # Indicate directories with a trailing slash display_name = node.name if node.type == FileSystemNodeType.DIRECTORY: display_name += "/" elif node.type == FileSystemNodeType.SYMLINK: display_name += " -> " + readlink(node.path).name tree_str += f"{prefix}{current_prefix}{display_name}\n" if node.type == FileSystemNodeType.DIRECTORY and node.children: prefix += " " if is_last else "│ " for i, child in enumerate(node.children): tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) return tree_str def _format_token_count(text: str) -> str | None: """Return a human-readable token-count string (e.g. 1.2k, 1.2 M). Parameters ---------- text : str The text string for which the token count is to be estimated. Returns ------- str | None The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs. """ try: encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini total_tokens = len(encoding.encode(text, disallowed_special=())) except (ValueError, UnicodeEncodeError) as exc: logger.warning("Failed to estimate token size", extra={"error": str(exc)}) return None except (requests.exceptions.RequestException, ssl.SSLError) as exc: # If network errors, skip token count estimation instead of erroring out logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) return None for threshold, suffix in _TOKEN_THRESHOLDS: if total_tokens >= threshold: return f"{total_tokens / threshold:.1f}{suffix}" return str(total_tokens) ================================================ FILE: src/gitingest/query_parser.py ================================================ """Module containing functions to parse and validate input sources and patterns.""" from __future__ import annotations import uuid from pathlib import Path from typing import Literal from gitingest.config import TMP_BASE_PATH from gitingest.schemas import IngestionQuery from gitingest.utils.git_utils import fetch_remote_branches_or_tags, resolve_commit from gitingest.utils.logging_config import get_logger from gitingest.utils.query_parser_utils import ( PathKind, _fallback_to_root, _get_user_and_repo_from_path, _is_valid_git_commit_hash, _normalise_source, ) # Initialize logger for this module logger = get_logger(__name__) async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery: """Parse a repository URL and return an ``IngestionQuery`` object. If source is: - A fully qualified URL ('https://gitlab.com/...'), parse & verify that domain - A URL missing 'https://' ('gitlab.com/...'), add 'https://' and parse - A *slug* ('pandas-dev/pandas'), attempt known domains until we find one that exists. Parameters ---------- source : str The URL or domain-less slug to parse. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- IngestionQuery A dictionary containing the parsed details of the repository. """ parsed_url = await _normalise_source(source, token=token) host = parsed_url.netloc user, repo = _get_user_and_repo_from_path(parsed_url.path) _id = uuid.uuid4() slug = f"{user}-{repo}" local_path = TMP_BASE_PATH / str(_id) / slug url = f"https://{host}/{user}/{repo}" query = IngestionQuery( host=host, user_name=user, repo_name=repo, url=url, local_path=local_path, slug=slug, id=_id, ) path_parts = parsed_url.path.strip("/").split("/")[2:] # main branch if not path_parts: return await _fallback_to_root(query, token=token) kind = PathKind(path_parts.pop(0)) # may raise ValueError query.type = kind # TODO: Handle issues and pull requests if query.type in {PathKind.ISSUES, PathKind.PULL}: msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root." return await _fallback_to_root(query, token=token, warn_msg=msg) # If no extra path parts, just return if not path_parts: msg = f"Warning: No extra path parts: {url}. Returning repository root." return await _fallback_to_root(query, token=token, warn_msg=msg) if query.type not in {PathKind.TREE, PathKind.BLOB}: # TODO: Handle other types msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root." return await _fallback_to_root(query, token=token, warn_msg=msg) # Commit, branch, or tag ref = path_parts[0] if _is_valid_git_commit_hash(ref): # Commit query.commit = ref path_parts.pop(0) # Consume the commit hash else: # Branch or tag # Try to resolve a tag query.tag = await _configure_branch_or_tag( path_parts, url=url, ref_type="tags", token=token, ) # If no tag found, try to resolve a branch if not query.tag: query.branch = await _configure_branch_or_tag( path_parts, url=url, ref_type="branches", token=token, ) # Only configure subpath if we have identified a commit, branch, or tag. if path_parts and (query.commit or query.branch or query.tag): query.subpath += "/".join(path_parts) query.commit = await resolve_commit(query.extract_clone_config(), token=token) return query def parse_local_dir_path(path_str: str) -> IngestionQuery: """Parse the given file path into a structured query dictionary. Parameters ---------- path_str : str The file path to parse. Returns ------- IngestionQuery A dictionary containing the parsed details of the file path. """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." else path_str.strip("/") return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4()) async def _configure_branch_or_tag( path_parts: list[str], *, url: str, ref_type: Literal["branches", "tags"], token: str | None = None, ) -> str | None: """Configure the branch or tag based on the remaining parts of the URL. Parameters ---------- path_parts : list[str] The path parts of the URL. url : str The URL of the repository. ref_type : Literal["branches", "tags"] The type of reference to configure. Can be "branches" or "tags". token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str | None The branch or tag name if found, otherwise ``None``. """ _ref_type = "tags" if ref_type == "tags" else "branches" try: # Fetch the list of branches or tags from the remote repository branches_or_tags: list[str] = await fetch_remote_branches_or_tags(url, ref_type=_ref_type, token=token) except RuntimeError as exc: # If remote discovery fails, we optimistically treat the first path segment as the branch/tag. msg = f"Warning: Failed to fetch {_ref_type}: {exc}" logger.warning(msg) return path_parts.pop(0) if path_parts else None # Iterate over the path components and try to find a matching branch/tag candidate_parts: list[str] = [] for part in path_parts: candidate_parts.append(part) candidate_name = "/".join(candidate_parts) if candidate_name in branches_or_tags: # We found a match — now consume exactly the parts that form the branch/tag del path_parts[: len(candidate_parts)] return candidate_name # No match found; leave path_parts intact return None ================================================ FILE: src/gitingest/schemas/__init__.py ================================================ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.schemas.ingestion import IngestionQuery __all__ = ["CloneConfig", "FileSystemNode", "FileSystemNodeType", "FileSystemStats", "IngestionQuery"] ================================================ FILE: src/gitingest/schemas/cloning.py ================================================ """Schema for the cloning process.""" from __future__ import annotations from pydantic import BaseModel, Field class CloneConfig(BaseModel): # pylint: disable=too-many-instance-attributes """Configuration for cloning a Git repository. This model holds the necessary parameters for cloning a repository to a local path, including the repository's URL, the target local path, and optional parameters for a specific commit, branch, or tag. Attributes ---------- url : str The URL of the Git repository to clone. local_path : str The local directory where the repository will be cloned. commit : str | None The specific commit hash to check out after cloning. branch : str | None The branch to clone. tag : str | None The tag to clone. subpath : str The subpath to clone from the repository (default: ``"/"``). blob : bool Whether the repository is a blob (default: ``False``). include_submodules : bool Whether to clone submodules (default: ``False``). """ url: str local_path: str commit: str | None = None branch: str | None = None tag: str | None = None subpath: str = Field(default="/") blob: bool = Field(default=False) include_submodules: bool = Field(default=False) ================================================ FILE: src/gitingest/schemas/filesystem.py ================================================ """Schema for the filesystem representation.""" from __future__ import annotations import os from dataclasses import dataclass, field from enum import Enum, auto from typing import TYPE_CHECKING from gitingest.utils.compat_func import readlink from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk from gitingest.utils.notebook import process_notebook if TYPE_CHECKING: from pathlib import Path SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 class FileSystemNodeType(Enum): """Enum representing the type of a file system node (directory or file).""" DIRECTORY = auto() FILE = auto() SYMLINK = auto() @dataclass class FileSystemStats: """Class for tracking statistics during file system traversal.""" total_files: int = 0 total_size: int = 0 @dataclass class FileSystemNode: # pylint: disable=too-many-instance-attributes """Class representing a node in the file system (either a file or directory). Tracks properties of files/directories for comprehensive analysis. """ name: str type: FileSystemNodeType path_str: str path: Path size: int = 0 file_count: int = 0 dir_count: int = 0 depth: int = 0 children: list[FileSystemNode] = field(default_factory=list) def sort_children(self) -> None: """Sort the children nodes of a directory according to a specific order. Order of sorting: 2. Regular files (not starting with dot) 3. Hidden files (starting with dot) 4. Regular directories (not starting with dot) 5. Hidden directories (starting with dot) All groups are sorted alphanumerically within themselves. Raises ------ ValueError If the node is not a directory. """ if self.type != FileSystemNodeType.DIRECTORY: msg = "Cannot sort children of a non-directory node" raise ValueError(msg) def _sort_key(child: FileSystemNode) -> tuple[int, str]: # returns the priority order for the sort function, 0 is first # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir name = child.name.lower() if child.type == FileSystemNodeType.FILE: if name == "readme" or name.startswith("readme."): return (0, name) return (1 if not name.startswith(".") else 2, name) return (3 if not name.startswith(".") else 4, name) self.children.sort(key=_sort_key) @property def content_string(self) -> str: """Return the content of the node as a string, including path and content. Returns ------- str A string representation of the node's content. """ parts = [ SEPARATOR, f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" + (f" -> {readlink(self.path).name}" if self.type == FileSystemNodeType.SYMLINK else ""), SEPARATOR, f"{self.content}", ] return "\n".join(parts) + "\n\n" @property def content(self) -> str: # pylint: disable=too-many-return-statements """Return file content (if text / notebook) or an explanatory placeholder. Heuristically decides whether the file is text or binary by decoding a small chunk of the file with multiple encodings and checking for common binary markers. Returns ------- str The content of the file, or an error message if the file could not be read. Raises ------ ValueError If the node is a directory. """ if self.type == FileSystemNodeType.DIRECTORY: msg = "Cannot read content of a directory node" raise ValueError(msg) if self.type == FileSystemNodeType.SYMLINK: return "" # TODO: are we including the empty content of symlinks? if self.path.suffix == ".ipynb": # Notebook try: return process_notebook(self.path) except Exception as exc: return f"Error processing notebook: {exc}" chunk = _read_chunk(self.path) if chunk is None: return "Error reading file" if chunk == b"": return "[Empty file]" if not _decodes(chunk, "utf-8"): return "[Binary file]" # Find the first encoding that decodes the sample good_enc: str | None = next( (enc for enc in _get_preferred_encodings() if _decodes(chunk, encoding=enc)), None, ) if good_enc is None: return "Error: Unable to decode file with available encodings" try: with self.path.open(encoding=good_enc) as fp: return fp.read() except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" ================================================ FILE: src/gitingest/schemas/ingestion.py ================================================ """Module containing the dataclasses for the ingestion process.""" from __future__ import annotations from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) from pydantic import BaseModel, Field from gitingest.config import MAX_FILE_SIZE from gitingest.schemas.cloning import CloneConfig class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes """Pydantic model to store the parsed details of the repository or file path. Attributes ---------- host : str | None The host of the repository. user_name : str | None The username or owner of the repository. repo_name : str | None The name of the repository. local_path : Path The local path to the repository or file. url : str | None The URL of the repository. slug : str The slug of the repository. id : UUID The ID of the repository. subpath : str The subpath to the repository or file (default: ``"/"``). type : str | None The type of the repository or file. branch : str | None The branch of the repository. commit : str | None The commit of the repository. tag : str | None The tag of the repository. max_file_size : int The maximum file size to ingest in bytes (default: 10 MB). ignore_patterns : set[str] The patterns to ignore (default: ``set()``). include_patterns : set[str] | None The patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. (default: ``False``) s3_url : str | None The S3 URL where the digest is stored if S3 is enabled. """ host: str | None = None user_name: str | None = None repo_name: str | None = None local_path: Path url: str | None = None slug: str id: UUID subpath: str = Field(default="/") type: str | None = None branch: str | None = None commit: str | None = None tag: str | None = None max_file_size: int = Field(default=MAX_FILE_SIZE) ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns include_patterns: set[str] | None = None include_submodules: bool = Field(default=False) s3_url: str | None = None def extract_clone_config(self) -> CloneConfig: """Extract the relevant fields for the CloneConfig object. Returns ------- CloneConfig A CloneConfig object containing the relevant fields. Raises ------ ValueError If the ``url`` parameter is not provided. """ if not self.url: msg = "The 'url' parameter is required." raise ValueError(msg) return CloneConfig( url=self.url, local_path=str(self.local_path), commit=self.commit, branch=self.branch, tag=self.tag, subpath=self.subpath, blob=self.type == "blob", include_submodules=self.include_submodules, ) ================================================ FILE: src/gitingest/utils/__init__.py ================================================ """Utility functions for the gitingest package.""" ================================================ FILE: src/gitingest/utils/auth.py ================================================ """Utilities for handling authentication.""" from __future__ import annotations import os from gitingest.utils.git_utils import validate_github_token def resolve_token(token: str | None) -> str | None: """Resolve the token to use for the query. Parameters ---------- token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str | None The resolved token. """ token = token or os.getenv("GITHUB_TOKEN") if token: validate_github_token(token) return token ================================================ FILE: src/gitingest/utils/compat_func.py ================================================ """Compatibility functions for Python 3.8.""" import os from pathlib import Path def readlink(path: Path) -> Path: """Read the target of a symlink. Compatible with Python 3.8. Parameters ---------- path : Path Path to the symlink. Returns ------- Path The target of the symlink. """ return Path(os.readlink(path)) def removesuffix(s: str, suffix: str) -> str: """Remove a suffix from a string. Compatible with Python 3.8. Parameters ---------- s : str String to remove suffix from. suffix : str Suffix to remove. Returns ------- str String with suffix removed. """ return s[: -len(suffix)] if s.endswith(suffix) else s ================================================ FILE: src/gitingest/utils/compat_typing.py ================================================ """Compatibility layer for typing.""" try: from enum import StrEnum # type: ignore[attr-defined] # Py ≥ 3.11 except ImportError: from strenum import StrEnum # type: ignore[import-untyped] # Py ≤ 3.10 try: from typing import ParamSpec, TypeAlias # type: ignore[attr-defined] # Py ≥ 3.10 except ImportError: from typing_extensions import ParamSpec, TypeAlias # type: ignore[attr-defined] # Py ≤ 3.9 try: from typing import Annotated # type: ignore[attr-defined] # Py ≥ 3.9 except ImportError: from typing_extensions import Annotated # type: ignore[attr-defined] # Py ≤ 3.8 __all__ = ["Annotated", "ParamSpec", "StrEnum", "TypeAlias"] ================================================ FILE: src/gitingest/utils/exceptions.py ================================================ """Custom exceptions for the Gitingest package.""" class AsyncTimeoutError(Exception): """Exception raised when an async operation exceeds its timeout limit. This exception is used by the ``async_timeout`` decorator to signal that the wrapped asynchronous function has exceeded the specified time limit for execution. """ class InvalidNotebookError(Exception): """Exception raised when a Jupyter notebook is invalid or cannot be processed.""" def __init__(self, message: str) -> None: super().__init__(message) class InvalidGitHubTokenError(ValueError): """Exception raised when a GitHub Personal Access Token is malformed.""" def __init__(self) -> None: msg = ( "Invalid GitHub token format. To generate a token, go to " "https://github.com/settings/tokens/new?description=gitingest&scopes=repo." ) super().__init__(msg) ================================================ FILE: src/gitingest/utils/file_utils.py ================================================ """Utility functions for working with files and directories.""" from __future__ import annotations import locale import platform from typing import TYPE_CHECKING if TYPE_CHECKING: from pathlib import Path try: locale.setlocale(locale.LC_ALL, "") except locale.Error: locale.setlocale(locale.LC_ALL, "C") _CHUNK_SIZE = 1024 # bytes def _get_preferred_encodings() -> list[str]: """Get list of encodings to try, prioritized for the current platform. Returns ------- list[str] List of encoding names to try in priority order, starting with the platform's default encoding followed by common fallback encodings. """ encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] if platform.system() == "Windows": encodings += ["cp1252", "iso-8859-1"] return list(dict.fromkeys(encodings)) def _read_chunk(path: Path) -> bytes | None: """Attempt to read the first *size* bytes of *path* in binary mode. Parameters ---------- path : Path The path to the file to read. Returns ------- bytes | None The first ``_CHUNK_SIZE`` bytes of ``path``, or ``None`` on any ``OSError``. """ try: with path.open("rb") as fp: return fp.read(_CHUNK_SIZE) except OSError: return None def _decodes(chunk: bytes, encoding: str) -> bool: """Return ``True`` if ``chunk`` decodes cleanly with ``encoding``. Parameters ---------- chunk : bytes The chunk of bytes to decode. encoding : str The encoding to use to decode the chunk. Returns ------- bool ``True`` if the chunk decodes cleanly with the encoding, ``False`` otherwise. """ try: chunk.decode(encoding) except UnicodeDecodeError: return False return True ================================================ FILE: src/gitingest/utils/git_utils.py ================================================ """Utility functions for interacting with Git repositories.""" from __future__ import annotations import asyncio import base64 import re import sys from contextlib import contextmanager from pathlib import Path from typing import TYPE_CHECKING, Final, Generator, Iterable from urllib.parse import urlparse, urlunparse import git from gitingest.utils.compat_func import removesuffix from gitingest.utils.exceptions import InvalidGitHubTokenError from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: from gitingest.schemas import CloneConfig # Initialize logger for this module logger = get_logger(__name__) # GitHub Personal-Access tokens (classic + fine-grained). # - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics # - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics _GITHUB_PAT_PATTERN: Final[str] = r"^(?:gh[pousr]_[A-Za-z0-9]{36}|github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59})$" def is_github_host(url: str) -> bool: """Check if a URL is from a GitHub host (github.com or GitHub Enterprise). Parameters ---------- url : str The URL to check Returns ------- bool True if the URL is from a GitHub host, False otherwise """ hostname = urlparse(url).hostname or "" return hostname.startswith("github.") async def run_command(*args: str) -> tuple[bytes, bytes]: """Execute a shell command asynchronously and return (stdout, stderr) bytes. This function is kept for backward compatibility with non-git commands. Git operations should use GitPython directly. Parameters ---------- *args : str The command and its arguments to execute. Returns ------- tuple[bytes, bytes] A tuple containing the stdout and stderr of the command. Raises ------ RuntimeError If command exits with a non-zero status. """ # Execute the requested command proc = await asyncio.create_subprocess_exec( *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() if proc.returncode != 0: msg = f"Command failed: {' '.join(args)}\nError: {stderr.decode().strip()}" raise RuntimeError(msg) return stdout, stderr async def ensure_git_installed() -> None: """Ensure Git is installed and accessible on the system. On Windows, this also checks whether Git is configured to support long file paths. Raises ------ RuntimeError If Git is not installed or not accessible. """ try: # Use GitPython to check git availability git_cmd = git.Git() git_cmd.version() except git.GitCommandError as exc: msg = "Git is not installed or not accessible. Please install Git first." raise RuntimeError(msg) from exc except Exception as exc: msg = "Git is not installed or not accessible. Please install Git first." raise RuntimeError(msg) from exc if sys.platform == "win32": try: longpaths_value = git_cmd.config("core.longpaths") if longpaths_value.lower() != "true": logger.warning( "Git clone may fail on Windows due to long file paths. " "Consider enabling long path support with: 'git config --global core.longpaths true'. " "Note: This command may require administrator privileges.", extra={"platform": "windows", "longpaths_enabled": False}, ) except git.GitCommandError: # Ignore if checking 'core.longpaths' fails. pass async def check_repo_exists(url: str, token: str | None = None) -> bool: """Check whether a remote Git repository is reachable. Parameters ---------- url : str URL of the Git repository to check. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- bool ``True`` if the repository exists, ``False`` otherwise. """ try: # Try to resolve HEAD - if repo exists, this will work await _resolve_ref_to_sha(url, "HEAD", token=token) except (ValueError, Exception): # Repository doesn't exist, is private without proper auth, or other error return False return True def _parse_github_url(url: str) -> tuple[str, str, str]: """Parse a GitHub URL and return (hostname, owner, repo). Parameters ---------- url : str The URL of the GitHub repository to parse. Returns ------- tuple[str, str, str] A tuple containing the hostname, owner, and repository name. Raises ------ ValueError If the URL is not a valid GitHub repository URL. """ parsed = urlparse(url) if parsed.scheme not in {"http", "https"}: msg = f"URL must start with http:// or https://: {url!r}" raise ValueError(msg) if not parsed.hostname or not parsed.hostname.startswith("github."): msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}" raise ValueError(msg) parts = removesuffix(parsed.path, ".git").strip("/").split("/") expected_path_length = 2 if len(parts) != expected_path_length: msg = f"Path must look like //: {parsed.path!r}" raise ValueError(msg) owner, repo = parts return parsed.hostname, owner, repo async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | None = None) -> list[str]: """Fetch the list of branches or tags from a remote Git repository. Parameters ---------- url : str The URL of the Git repository to fetch branches or tags from. ref_type: str The type of reference to fetch. Can be "branches" or "tags". token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- list[str] A list of branch names available in the remote repository. Raises ------ ValueError If the ``ref_type`` parameter is not "branches" or "tags". RuntimeError If fetching branches or tags from the remote repository fails. """ if ref_type not in ("branches", "tags"): msg = f"Invalid fetch type: {ref_type}" raise ValueError(msg) await ensure_git_installed() # Use GitPython to get remote references try: fetch_tags = ref_type == "tags" to_fetch = "tags" if fetch_tags else "heads" # Build ls-remote command cmd_args = [f"--{to_fetch}"] if fetch_tags: cmd_args.append("--refs") # Filter out peeled tag objects cmd_args.append(url) # Run the command with proper authentication with git_auth_context(url, token) as (git_cmd, auth_url): # Replace the URL in cmd_args with the authenticated URL cmd_args[-1] = auth_url # URL is the last argument output = git_cmd.ls_remote(*cmd_args) # Parse output return [ line.split(f"refs/{to_fetch}/", 1)[1] for line in output.splitlines() if line.strip() and f"refs/{to_fetch}/" in line ] except git.GitCommandError as exc: msg = f"Failed to fetch {ref_type} from {url}: {exc}" raise RuntimeError(msg) from exc def create_git_repo(local_path: str, url: str, token: str | None = None) -> git.Repo: """Create a GitPython Repo object with authentication if needed. Parameters ---------- local_path : str The local path where the git repository is located. url : str The repository URL to check if it's a GitHub repository. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- git.Repo A GitPython Repo object configured with authentication. Raises ------ ValueError If the local path is not a valid git repository. """ try: repo = git.Repo(local_path) # Configure authentication if needed if token and is_github_host(url): auth_header = create_git_auth_header(token, url=url) # Set the auth header in git config for this repo key, value = auth_header.split("=", 1) repo.git.config(key, value) except git.InvalidGitRepositoryError as exc: msg = f"Invalid git repository at {local_path}" raise ValueError(msg) from exc return repo def create_git_auth_header(token: str, url: str = "https://github.com") -> str: """Create a Basic authentication header for GitHub git operations. Parameters ---------- token : str GitHub personal access token (PAT) for accessing private repositories. url : str The GitHub URL to create the authentication header for. Defaults to "https://github.com" if not provided. Returns ------- str The git config command for setting the authentication header. Raises ------ ValueError If the URL is not a valid GitHub repository URL. """ hostname = urlparse(url).hostname if not hostname: msg = f"Invalid GitHub URL: {url!r}" raise ValueError(msg) basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() return f"http.https://{hostname}/.extraheader=Authorization: Basic {basic}" def create_authenticated_url(url: str, token: str | None = None) -> str: """Create an authenticated URL for Git operations. This is the safest approach for multi-user environments - no global state. Parameters ---------- url : str The repository URL. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str The URL with authentication embedded (for GitHub) or original URL. """ if not (token and is_github_host(url)): return url parsed = urlparse(url) # Add token as username in URL (GitHub supports this) netloc = f"x-oauth-basic:{token}@{parsed.hostname}" if parsed.port: netloc += f":{parsed.port}" return urlunparse( ( parsed.scheme, netloc, parsed.path, parsed.params, parsed.query, parsed.fragment, ), ) @contextmanager def git_auth_context(url: str, token: str | None = None) -> Generator[tuple[git.Git, str]]: """Context manager that provides Git command and authenticated URL. Returns both a Git command object and the authenticated URL to use. This avoids any global state contamination between users. Parameters ---------- url : str The repository URL to check if authentication is needed. token : str | None GitHub personal access token (PAT) for accessing private repositories. Yields ------ Generator[tuple[git.Git, str]] Tuple of (Git command object, authenticated URL to use). """ git_cmd = git.Git() auth_url = create_authenticated_url(url, token) yield git_cmd, auth_url def validate_github_token(token: str) -> None: """Validate the format of a GitHub Personal Access Token. Parameters ---------- token : str GitHub personal access token (PAT) for accessing private repositories. Raises ------ InvalidGitHubTokenError If the token format is invalid. """ if not re.fullmatch(_GITHUB_PAT_PATTERN, token): raise InvalidGitHubTokenError async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None: """Configure sparse-checkout for a partially cloned repository. Parameters ---------- config : CloneConfig The configuration for cloning the repository, including subpath and blob flag. token : str | None GitHub personal access token (PAT) for accessing private repositories. Raises ------ RuntimeError If the sparse-checkout configuration fails. """ subpath = config.subpath.lstrip("/") if config.blob: # Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt) subpath = str(Path(subpath).parent.as_posix()) try: repo = create_git_repo(config.local_path, config.url, token) repo.git.sparse_checkout("set", subpath) except git.GitCommandError as exc: msg = f"Failed to configure sparse-checkout: {exc}" raise RuntimeError(msg) from exc async def resolve_commit(config: CloneConfig, token: str | None) -> str: """Resolve the commit to use for the clone. Parameters ---------- config : CloneConfig The configuration for cloning the repository. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str The commit SHA. """ if config.commit: commit = config.commit elif config.tag: commit = await _resolve_ref_to_sha(config.url, pattern=f"refs/tags/{config.tag}*", token=token) elif config.branch: commit = await _resolve_ref_to_sha(config.url, pattern=f"refs/heads/{config.branch}", token=token) else: commit = await _resolve_ref_to_sha(config.url, pattern="HEAD", token=token) return commit async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None) -> str: """Return the commit SHA that / points to in . * Branch → first line from ``git ls-remote``. * Tag → if annotated, prefer the peeled ``^{}`` line (commit). Parameters ---------- url : str The URL of the remote repository. pattern : str The pattern to use to resolve the commit SHA. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str The commit SHA. Raises ------ ValueError If the ref does not exist in the remote repository. """ try: # Execute ls-remote command with proper authentication with git_auth_context(url, token) as (git_cmd, auth_url): output = git_cmd.ls_remote(auth_url, pattern) lines = output.splitlines() sha = _pick_commit_sha(lines) if not sha: msg = f"{pattern!r} not found in {url}" raise ValueError(msg) except git.GitCommandError as exc: msg = f"Failed to resolve {pattern} in {url}:\n{exc}" raise ValueError(msg) from exc return sha def _pick_commit_sha(lines: Iterable[str]) -> str | None: """Return a commit SHA from ``git ls-remote`` output. • Annotated tag → prefer the peeled line ( refs/tags/x^{}) • Branch / lightweight tag → first non-peeled line Parameters ---------- lines : Iterable[str] The lines of a ``git ls-remote`` output. Returns ------- str | None The commit SHA, or ``None`` if no commit SHA is found. """ first_non_peeled: str | None = None for ln in lines: if not ln.strip(): continue sha, ref = ln.split(maxsplit=1) if ref.endswith("^{}"): # peeled commit of annotated tag return sha # ← best match, done if first_non_peeled is None: # remember the first ordinary line first_non_peeled = sha return first_non_peeled # branch or lightweight tag (or None) ================================================ FILE: src/gitingest/utils/ignore_patterns.py ================================================ """Default ignore patterns for Gitingest.""" from __future__ import annotations from pathlib import Path DEFAULT_IGNORE_PATTERNS: set[str] = { # Python "*.pyc", "*.pyo", "*.pyd", "__pycache__", ".pytest_cache", ".coverage", ".tox", ".nox", ".mypy_cache", ".ruff_cache", ".hypothesis", "poetry.lock", "Pipfile.lock", # JavaScript/FileSystemNode "node_modules", "bower_components", "package-lock.json", "yarn.lock", ".npm", ".yarn", ".pnpm-store", "bun.lock", "bun.lockb", # Java "*.class", "*.jar", "*.war", "*.ear", "*.nar", ".gradle/", "build/", ".settings/", ".classpath", "gradle-app.setting", "*.gradle", # IDEs and editors / Java ".project", # C/C++ "*.o", "*.obj", "*.dll", "*.dylib", "*.exe", "*.lib", "*.out", "*.a", "*.pdb", # Binary "*.bin", # Swift/Xcode ".build/", "*.xcodeproj/", "*.xcworkspace/", "*.pbxuser", "*.mode1v3", "*.mode2v3", "*.perspectivev3", "*.xcuserstate", "xcuserdata/", ".swiftpm/", # Ruby "*.gem", ".bundle/", "vendor/bundle", "Gemfile.lock", ".ruby-version", ".ruby-gemset", ".rvmrc", # Rust "Cargo.lock", "**/*.rs.bk", # Java / Rust "target/", # Go "pkg/", # .NET/C# "obj/", "*.suo", "*.user", "*.userosscache", "*.sln.docstates", "*.nupkg", # Go / .NET / C# "bin/", # Version control ".git", ".svn", ".hg", ".gitignore", ".gitattributes", ".gitmodules", # Images and media "*.svg", "*.png", "*.jpg", "*.jpeg", "*.gif", "*.ico", "*.pdf", "*.mov", "*.mp4", "*.mp3", "*.wav", # Virtual environments "venv", ".venv", "env", ".env", "virtualenv", # IDEs and editors ".idea", ".vscode", ".vs", "*.swo", "*.swn", ".settings", "*.sublime-*", # Temporary and cache files "*.log", "*.bak", "*.swp", "*.tmp", "*.temp", ".cache", ".sass-cache", ".eslintcache", ".DS_Store", "Thumbs.db", "desktop.ini", # Build directories and artifacts "build", "dist", "target", "out", "*.egg-info", "*.egg", "*.whl", "*.so", # Documentation "site-packages", ".docusaurus", ".next", ".nuxt", # Database "*.db", "*.sqlite", "*.sqlite3", # Other common patterns ## Minified files "*.min.js", "*.min.css", ## Source maps "*.map", ## Terraform "*.tfstate*", ## Dependencies in various languages "vendor/", # Gitingest "digest.txt", } def load_ignore_patterns(root: Path, filename: str) -> set[str]: """Load ignore patterns from ``filename`` found under ``root``. The loader walks the directory tree, looks for the supplied ``filename``, and returns a unified set of patterns. It implements the same parsing rules we use for ``.gitignore`` and ``.gitingestignore`` (git-wildmatch syntax with support for negation and root-relative paths). Parameters ---------- root : Path Directory to walk. filename : str The filename to look for in each directory. Returns ------- set[str] A set of ignore patterns extracted from the ``filename`` file found under the ``root`` directory. """ patterns: set[str] = set() for ignore_file in root.rglob(filename): if ignore_file.is_file(): patterns.update(_parse_ignore_file(ignore_file, root)) return patterns def _parse_ignore_file(ignore_file: Path, root: Path) -> set[str]: """Parse an ignore file and return a set of ignore patterns. Parameters ---------- ignore_file : Path The path to the ignore file. root : Path The root directory of the repository. Returns ------- set[str] A set of ignore patterns. """ patterns: set[str] = set() # Path of the ignore file relative to the repository root rel_dir = ignore_file.parent.relative_to(root) base_dir = Path() if rel_dir == Path() else rel_dir with ignore_file.open(encoding="utf-8") as fh: for raw in fh: line = raw.strip() if not line or line.startswith("#"): # comments / blank lines continue # Handle negation ("!foobar") negated = line.startswith("!") if negated: line = line[1:] # Handle leading slash ("/foobar") if line.startswith("/"): line = line.lstrip("/") pattern_body = (base_dir / line).as_posix() patterns.add(f"!{pattern_body}" if negated else pattern_body) return patterns ================================================ FILE: src/gitingest/utils/ingestion_utils.py ================================================ """Utility functions for the ingestion process.""" from __future__ import annotations from typing import TYPE_CHECKING from pathspec import PathSpec if TYPE_CHECKING: from pathlib import Path def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> bool: """Return ``True`` if ``path`` matches any of ``include_patterns``. Parameters ---------- path : Path The absolute path of the file or directory to check. base_path : Path The base directory from which the relative path is calculated. include_patterns : set[str] A set of patterns to check against the relative path. Returns ------- bool ``True`` if the path matches any of the include patterns, ``False`` otherwise. """ rel_path = _relative_or_none(path, base_path) if rel_path is None: # outside repo → do *not* include return False if path.is_dir(): # keep directories so children are visited return True spec = PathSpec.from_lines("gitwildmatch", include_patterns) return spec.match_file(str(rel_path)) def _should_exclude(path: Path, base_path: Path, ignore_patterns: set[str]) -> bool: """Return ``True`` if ``path`` matches any of ``ignore_patterns``. Parameters ---------- path : Path The absolute path of the file or directory to check. base_path : Path The base directory from which the relative path is calculated. ignore_patterns : set[str] A set of patterns to check against the relative path. Returns ------- bool ``True`` if the path matches any of the ignore patterns, ``False`` otherwise. """ rel_path = _relative_or_none(path, base_path) if rel_path is None: # outside repo → already "excluded" return True spec = PathSpec.from_lines("gitwildmatch", ignore_patterns) return spec.match_file(str(rel_path)) def _relative_or_none(path: Path, base: Path) -> Path | None: """Return *path* relative to *base* or ``None`` if *path* is outside *base*. Parameters ---------- path : Path The absolute path of the file or directory to check. base : Path The base directory from which the relative path is calculated. Returns ------- Path | None The relative path of ``path`` to ``base``, or ``None`` if ``path`` is outside ``base``. """ try: return path.relative_to(base) except ValueError: # path is not a sub-path of base return None ================================================ FILE: src/gitingest/utils/logging_config.py ================================================ """Logging configuration for gitingest using loguru. This module provides structured JSON logging suitable for Kubernetes deployments while also supporting human-readable logging for development. """ from __future__ import annotations import json import logging import os import sys from typing import Any from loguru import logger def json_sink(message: Any) -> None: # noqa: ANN401 """Create JSON formatted log output. Parameters ---------- message : Any The loguru message record """ record = message.record log_entry = { "timestamp": record["time"].isoformat(), "level": record["level"].name.upper(), "logger": record["name"], "module": record["module"], "function": record["function"], "line": record["line"], "message": record["message"], } # Add exception info if present if record["exception"]: log_entry["exception"] = { "type": record["exception"].type.__name__, "value": str(record["exception"].value), "traceback": record["exception"].traceback, } # Add extra fields if present if record["extra"]: log_entry.update(record["extra"]) sys.stdout.write(json.dumps(log_entry, ensure_ascii=False, separators=(",", ":")) + "\n") def format_extra_fields(record: dict) -> str: """Format extra fields as JSON string. Parameters ---------- record : dict The loguru record dictionary Returns ------- str JSON formatted extra fields or empty string """ if not record.get("extra"): return "" # Filter out loguru's internal extra fields filtered_extra = {k: v for k, v in record["extra"].items() if not k.startswith("_") and k not in ["name"]} # Handle nested extra structure - if there's an 'extra' key, use its contents if "extra" in filtered_extra and isinstance(filtered_extra["extra"], dict): filtered_extra = filtered_extra["extra"] if filtered_extra: extra_json = json.dumps(filtered_extra, ensure_ascii=False, separators=(",", ":")) return f" | {extra_json}" return "" def extra_filter(record: dict) -> dict: """Filter function to add extra fields to the message. Parameters ---------- record : dict The loguru record dictionary Returns ------- dict Modified record with extra fields appended to message """ extra_str = format_extra_fields(record) if extra_str: record["message"] = record["message"] + extra_str return record class InterceptHandler(logging.Handler): """Intercept standard library logging and redirect to loguru.""" def emit(self, record: logging.LogRecord) -> None: """Emit a record to loguru.""" # Get corresponding loguru level try: level = logger.level(record.levelname).name except ValueError: level = record.levelno # Find caller from where originated the logged message frame, depth = logging.currentframe(), 2 while frame.f_code.co_filename == logging.__file__: frame = frame.f_back depth += 1 logger.opt(depth=depth, exception=record.exc_info).log( level, record.getMessage(), ) def configure_logging() -> None: """Configure loguru for the application. Sets up JSON logging for production/Kubernetes environments or human-readable logging for development. Intercepts all standard library logging including uvicorn. """ # Remove default handler logger.remove() # Check if we're in Kubernetes or production environment is_k8s = os.getenv("KUBERNETES_SERVICE_HOST") is not None log_format = os.getenv("LOG_FORMAT", "json" if is_k8s else "human") log_level = os.getenv("LOG_LEVEL", "INFO") if log_format.lower() == "json": # JSON format for structured logging (Kubernetes/production) logger.add( json_sink, level=log_level, enqueue=True, # Async logging for better performance diagnose=False, # Don't include variable values in exceptions (security) backtrace=True, # Include full traceback serialize=True, # Ensure proper serialization ) else: # Human-readable format for development logger_format = ( "{time:YYYY-MM-DD HH:mm:ss.SSS} | " "{level: <8} | " "{name}:{function}:{line} | " "{message}" ) logger.add( sys.stderr, format=logger_format, filter=extra_filter, level=log_level, enqueue=True, diagnose=True, # Include variable values in development backtrace=True, ) # Intercept all standard library logging logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True) # Intercept specific loggers that might bypass basicConfig for name in logging.root.manager.loggerDict: # pylint: disable=no-member logging.getLogger(name).handlers = [] logging.getLogger(name).propagate = True def get_logger(name: str | None = None) -> logger.__class__: """Get a configured logger instance. Parameters ---------- name : str | None, optional Logger name, defaults to the calling module name Returns ------- logger.__class__ Configured logger instance """ if name: return logger.bind(name=name) return logger # Initialize logging when module is imported configure_logging() ================================================ FILE: src/gitingest/utils/notebook.py ================================================ """Utilities for processing Jupyter notebooks.""" from __future__ import annotations import json from itertools import chain from typing import TYPE_CHECKING, Any from gitingest.utils.exceptions import InvalidNotebookError from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: from pathlib import Path # Initialize logger for this module logger = get_logger(__name__) def process_notebook(file: Path, *, include_output: bool = True) -> str: """Process a Jupyter notebook file and return an executable Python script as a string. Parameters ---------- file : Path The path to the Jupyter notebook file. include_output : bool Whether to include cell outputs in the generated script (default: ``True``). Returns ------- str The executable Python script as a string. Raises ------ InvalidNotebookError If the notebook file is invalid or cannot be processed. """ try: with file.open(encoding="utf-8") as f: notebook: dict[str, Any] = json.load(f) except json.JSONDecodeError as exc: msg = f"Invalid JSON in notebook: {file}" raise InvalidNotebookError(msg) from exc # Check if the notebook contains worksheets worksheets = notebook.get("worksheets") if worksheets: logger.warning( "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " "(See: https://github.com/jupyter/nbformat and " "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets " "for more information.)", ) if len(worksheets) > 1: logger.warning( "Multiple worksheets detected. Combining all worksheets into a single script.", ) cells = list(chain.from_iterable(ws["cells"] for ws in worksheets)) else: cells = notebook["cells"] result = ["# Jupyter notebook converted to Python script."] for cell in cells: cell_str = _process_cell(cell, include_output=include_output) if cell_str: result.append(cell_str) return "\n\n".join(result) + "\n" def _process_cell(cell: dict[str, Any], *, include_output: bool) -> str | None: """Process a Jupyter notebook cell and return the cell content as a string. Parameters ---------- cell : dict[str, Any] The cell dictionary from a Jupyter notebook. include_output : bool Whether to include cell outputs in the generated script. Returns ------- str | None The cell content as a string, or ``None`` if the cell is empty. Raises ------ ValueError If an unexpected cell type is encountered. """ cell_type = cell["cell_type"] # Validate cell type and handle unexpected types if cell_type not in ("markdown", "code", "raw"): msg = f"Unknown cell type: {cell_type}" raise ValueError(msg) cell_str = "".join(cell["source"]) # Skip empty cells if not cell_str: return None # Convert Markdown and raw cells to multi-line comments if cell_type in ("markdown", "raw"): return f'"""\n{cell_str}\n"""' # Add cell output as comments outputs = cell.get("outputs") if include_output and outputs: # Include cell outputs as comments raw_lines: list[str] = [] for output in outputs: raw_lines += _extract_output(output) cell_str += "\n# Output:\n# " + "\n# ".join(raw_lines) return cell_str def _extract_output(output: dict[str, Any]) -> list[str]: """Extract the output from a Jupyter notebook cell. Parameters ---------- output : dict[str, Any] The output dictionary from a Jupyter notebook cell. Returns ------- list[str] The output as a list of strings. Raises ------ ValueError If an unknown output type is encountered. """ output_type = output["output_type"] if output_type == "stream": return output["text"] if output_type in ("execute_result", "display_data"): return output["data"]["text/plain"] if output_type == "error": return [f"Error: {output['ename']}: {output['evalue']}"] msg = f"Unknown output type: {output_type}" raise ValueError(msg) ================================================ FILE: src/gitingest/utils/os_utils.py ================================================ """Utility functions for working with the operating system.""" from pathlib import Path async def ensure_directory_exists_or_create(path: Path) -> None: """Ensure the directory exists, creating it if necessary. Parameters ---------- path : Path The path to ensure exists. Raises ------ OSError If the directory cannot be created. """ try: path.mkdir(parents=True, exist_ok=True) except OSError as exc: msg = f"Failed to create directory {path}: {exc}" raise OSError(msg) from exc ================================================ FILE: src/gitingest/utils/pattern_utils.py ================================================ """Pattern utilities for the Gitingest package.""" from __future__ import annotations import re from typing import Iterable from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS _PATTERN_SPLIT_RE = re.compile(r"[,\s]+") def process_patterns( exclude_patterns: str | set[str] | None = None, include_patterns: str | set[str] | None = None, ) -> tuple[set[str], set[str] | None]: """Process include and exclude patterns. Parameters ---------- exclude_patterns : str | set[str] | None Exclude patterns to process. include_patterns : str | set[str] | None Include patterns to process. Returns ------- tuple[set[str], set[str] | None] A tuple containing the processed ignore patterns and include patterns. """ # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() if exclude_patterns: ignore_patterns_set.update(_parse_patterns(exclude_patterns)) # Process include patterns and override ignore patterns accordingly if include_patterns: parsed_include = _parse_patterns(include_patterns) # Override ignore patterns with include patterns ignore_patterns_set = set(ignore_patterns_set) - set(parsed_include) else: parsed_include = None return ignore_patterns_set, parsed_include def _parse_patterns(patterns: str | Iterable[str]) -> set[str]: """Normalize a collection of file or directory patterns. Parameters ---------- patterns : str | Iterable[str] One pattern string or an iterable of pattern strings. Each pattern may contain multiple comma- or whitespace-separated sub-patterns, e.g. "src/*, tests *.md". Returns ------- set[str] Normalized patterns with Windows back-slashes converted to forward-slashes and duplicates removed. """ # Treat a lone string as the iterable [string] if isinstance(patterns, str): patterns = [patterns] # Flatten, split on commas/whitespace, strip empties, normalise slashes return { part.replace("\\", "/") for pat in patterns for part in _PATTERN_SPLIT_RE.split(pat.strip()) if part # discard empty tokens } ================================================ FILE: src/gitingest/utils/query_parser_utils.py ================================================ """Utility functions for parsing and validating query parameters.""" from __future__ import annotations import string from typing import TYPE_CHECKING, cast from urllib.parse import ParseResult, unquote, urlparse from gitingest.utils.compat_typing import StrEnum from gitingest.utils.git_utils import _resolve_ref_to_sha, check_repo_exists from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: from gitingest.schemas import IngestionQuery # Initialize logger for this module logger = get_logger(__name__) HEX_DIGITS: set[str] = set(string.hexdigits) KNOWN_GIT_HOSTS: list[str] = [ "github.com", "gitlab.com", "bitbucket.org", "gitea.com", "codeberg.org", "gist.github.com", ] class PathKind(StrEnum): """Path kind enum.""" TREE = "tree" BLOB = "blob" ISSUES = "issues" PULL = "pull" async def _fallback_to_root(query: IngestionQuery, token: str | None, warn_msg: str | None = None) -> IngestionQuery: """Fallback to the root of the repository if no extra path parts are provided. Parameters ---------- query : IngestionQuery The query to fallback to the root of the repository. token : str | None The token to use to access the repository. warn_msg : str | None The message to warn. Returns ------- IngestionQuery The query with the fallback to the root of the repository. """ url = cast("str", query.url) query.commit = await _resolve_ref_to_sha(url, pattern="HEAD", token=token) if warn_msg: logger.warning(warn_msg) return query async def _normalise_source(raw: str, token: str | None) -> ParseResult: """Return a fully-qualified ParseResult or raise. Parameters ---------- raw : str The raw URL to parse. token : str | None The token to use to access the repository. Returns ------- ParseResult The parsed URL. """ raw = unquote(raw) parsed = urlparse(raw) if parsed.scheme: _validate_url_scheme(parsed.scheme) _validate_host(parsed.netloc) return parsed # no scheme ('host/user/repo' or 'user/repo') host = raw.split("/", 1)[0].lower() if "." in host: _validate_host(host) return urlparse(f"https://{raw}") # "user/repo" slug host = await _try_domains_for_user_and_repo(*_get_user_and_repo_from_path(raw), token=token) return urlparse(f"https://{host}/{raw}") async def _try_domains_for_user_and_repo(user_name: str, repo_name: str, token: str | None = None) -> str: """Attempt to find a valid repository host for the given ``user_name`` and ``repo_name``. Parameters ---------- user_name : str The username or owner of the repository. repo_name : str The name of the repository. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- str The domain of the valid repository host. Raises ------ ValueError If no valid repository host is found for the given ``user_name`` and ``repo_name``. """ for domain in KNOWN_GIT_HOSTS: candidate = f"https://{domain}/{user_name}/{repo_name}" if await check_repo_exists(candidate, token=token if domain.startswith("github.") else None): return domain msg = f"Could not find a valid repository host for '{user_name}/{repo_name}'." raise ValueError(msg) def _is_valid_git_commit_hash(commit: str) -> bool: """Validate if the provided string is a valid Git commit hash. This function checks if the commit hash is a 40-character string consisting only of hexadecimal digits, which is the standard format for Git commit hashes. Parameters ---------- commit : str The string to validate as a Git commit hash. Returns ------- bool ``True`` if the string is a valid 40-character Git commit hash, otherwise ``False``. """ sha_hex_length = 40 return len(commit) == sha_hex_length and all(c in HEX_DIGITS for c in commit) def _validate_host(host: str) -> None: """Validate a hostname. The host is accepted if it is either present in the hard-coded ``KNOWN_GIT_HOSTS`` list or if it satisfies the simple heuristics in ``_looks_like_git_host``, which try to recognise common self-hosted Git services (e.g. GitLab instances on sub-domains such as 'gitlab.example.com' or 'git.example.com'). Parameters ---------- host : str Hostname (case-insensitive). Raises ------ ValueError If the host cannot be recognised as a probable Git hosting domain. """ host = host.lower() if host not in KNOWN_GIT_HOSTS and not _looks_like_git_host(host): msg = f"Unknown domain '{host}' in URL" raise ValueError(msg) def _looks_like_git_host(host: str) -> bool: """Check if the given host looks like a Git host. The current heuristic returns ``True`` when the host starts with ``git.`` (e.g. 'git.example.com'), starts with 'gitlab.' (e.g. 'gitlab.company.com'), or starts with 'github.' (e.g. 'github.company.com' for GitHub Enterprise). Parameters ---------- host : str Hostname (case-insensitive). Returns ------- bool ``True`` if the host looks like a Git host, otherwise ``False``. """ host = host.lower() return host.startswith(("git.", "gitlab.", "github.")) def _validate_url_scheme(scheme: str) -> None: """Validate the given scheme against the known schemes. Parameters ---------- scheme : str The scheme to validate. Raises ------ ValueError If the scheme is not 'http' or 'https'. """ scheme = scheme.lower() if scheme not in ("https", "http"): msg = f"Invalid URL scheme '{scheme}' in URL" raise ValueError(msg) def _get_user_and_repo_from_path(path: str) -> tuple[str, str]: """Extract the user and repository names from a given path. Parameters ---------- path : str The path to extract the user and repository names from. Returns ------- tuple[str, str] A tuple containing the user and repository names. Raises ------ ValueError If the path does not contain at least two parts. """ min_path_parts = 2 path_parts = path.lower().strip("/").split("/") if len(path_parts) < min_path_parts: msg = f"Invalid repository URL '{path}'" raise ValueError(msg) return path_parts[0], path_parts[1] ================================================ FILE: src/gitingest/utils/timeout_wrapper.py ================================================ """Utility functions for the Gitingest package.""" import asyncio import functools from typing import Awaitable, Callable, TypeVar from gitingest.utils.compat_typing import ParamSpec from gitingest.utils.exceptions import AsyncTimeoutError T = TypeVar("T") P = ParamSpec("P") def async_timeout(seconds: int) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: """Async Timeout decorator. This decorator wraps an asynchronous function and ensures it does not run for longer than the specified number of seconds. If the function execution exceeds this limit, it raises an ``AsyncTimeoutError``. Parameters ---------- seconds : int The maximum allowed time (in seconds) for the asynchronous function to complete. Returns ------- Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]] A decorator that, when applied to an async function, ensures the function completes within the specified time limit. If the function takes too long, an ``AsyncTimeoutError`` is raised. """ def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError as exc: msg = f"Operation timed out after {seconds} seconds" raise AsyncTimeoutError(msg) from exc return wrapper return decorator ================================================ FILE: src/server/__init__.py ================================================ """Server module.""" ================================================ FILE: src/server/__main__.py ================================================ """Server module entry point for running with python -m server.""" import os import uvicorn # Import logging configuration first to intercept all logging from gitingest.utils.logging_config import get_logger logger = get_logger(__name__) if __name__ == "__main__": # Get configuration from environment variables host = os.getenv("HOST", "0.0.0.0") # noqa: S104 port = int(os.getenv("PORT", "8000")) reload = os.getenv("RELOAD", "false").lower() == "true" logger.info( "Starting Gitingest server", extra={ "host": host, "port": port, }, ) uvicorn.run( "server.main:app", host=host, port=port, reload=reload, log_config=None, # Disable uvicorn's default logging config ) ================================================ FILE: src/server/form_types.py ================================================ """Reusable form type aliases for FastAPI form parameters.""" from __future__ import annotations from typing import TYPE_CHECKING, Optional from fastapi import Form from gitingest.utils.compat_typing import Annotated if TYPE_CHECKING: from gitingest.utils.compat_typing import TypeAlias StrForm: TypeAlias = Annotated[str, Form(...)] IntForm: TypeAlias = Annotated[int, Form(...)] OptStrForm: TypeAlias = Annotated[Optional[str], Form()] ================================================ FILE: src/server/main.py ================================================ """Main module for the FastAPI application.""" from __future__ import annotations import os import threading from pathlib import Path import sentry_sdk from dotenv import load_dotenv from fastapi import FastAPI, Request from fastapi.responses import FileResponse, HTMLResponse, JSONResponse from fastapi.staticfiles import StaticFiles from slowapi.errors import RateLimitExceeded from starlette.middleware.trustedhost import TrustedHostMiddleware # Import logging configuration first to intercept all logging from gitingest.utils.logging_config import get_logger from server.metrics_server import start_metrics_server from server.routers import dynamic, index, ingest from server.server_config import get_version_info, templates from server.server_utils import limiter, rate_limit_exception_handler # Load environment variables from .env file load_dotenv() # Initialize logger for this module logger = get_logger(__name__) # Initialize Sentry SDK if enabled if os.getenv("GITINGEST_SENTRY_ENABLED") is not None: sentry_dsn = os.getenv("GITINGEST_SENTRY_DSN") # Only initialize Sentry if DSN is provided if sentry_dsn: # Configure Sentry options from environment variables traces_sample_rate = float(os.getenv("GITINGEST_SENTRY_TRACES_SAMPLE_RATE", "1.0")) profile_session_sample_rate = float(os.getenv("GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE", "1.0")) profile_lifecycle_raw = os.getenv("GITINGEST_SENTRY_PROFILE_LIFECYCLE", "trace") profile_lifecycle = profile_lifecycle_raw if profile_lifecycle_raw in ("manual", "trace") else "trace" send_default_pii = os.getenv("GITINGEST_SENTRY_SEND_DEFAULT_PII", "true").lower() == "true" sentry_environment = os.getenv("GITINGEST_SENTRY_ENVIRONMENT", "") sentry_sdk.init( dsn=sentry_dsn, # Add data like request headers and IP for users send_default_pii=send_default_pii, # Set traces_sample_rate to capture transactions for tracing traces_sample_rate=traces_sample_rate, # Set profile_session_sample_rate to profile sessions profile_session_sample_rate=profile_session_sample_rate, # Set profile_lifecycle to automatically run the profiler profile_lifecycle=profile_lifecycle, # Set environment name environment=sentry_environment, ) # Initialize the FastAPI application app = FastAPI(docs_url=None, redoc_url=None) app.state.limiter = limiter # Register the custom exception handler for rate limits app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) # Start metrics server in a separate thread if enabled if os.getenv("GITINGEST_METRICS_ENABLED") is not None: metrics_host = os.getenv("GITINGEST_METRICS_HOST", "127.0.0.1") metrics_port = int(os.getenv("GITINGEST_METRICS_PORT", "9090")) metrics_thread = threading.Thread( target=start_metrics_server, args=(metrics_host, metrics_port), daemon=True, ) metrics_thread.start() # Mount static files dynamically to serve CSS, JS, and other static assets static_dir = Path(__file__).parent.parent / "static" app.mount("/static", StaticFiles(directory=static_dir), name="static") # Fetch allowed hosts from the environment or use the default values allowed_hosts = os.getenv("ALLOWED_HOSTS") if allowed_hosts: allowed_hosts = allowed_hosts.split(",") else: # Define the default allowed hosts for the application default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] allowed_hosts = default_allowed_hosts # Add middleware to enforce allowed hosts app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) @app.get("/health") async def health_check() -> dict[str, str]: """Health check endpoint to verify that the server is running. **Returns** - **dict[str, str]**: A JSON object with a "status" key indicating the server's health status. """ return {"status": "healthy"} @app.head("/", include_in_schema=False) async def head_root() -> HTMLResponse: """Respond to HTTP HEAD requests for the root URL. **This endpoint mirrors the headers and status code of the index page** for HTTP HEAD requests, providing a lightweight way to check if the server is responding without downloading the full page content. **Returns** - **HTMLResponse**: An empty HTML response with appropriate headers """ return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) @app.get("/robots.txt", include_in_schema=False) async def robots() -> FileResponse: """Serve the robots.txt file to guide search engine crawlers. **This endpoint serves the ``robots.txt`` file located in the static directory** to provide instructions to search engine crawlers about which parts of the site they should or should not index. **Returns** - **FileResponse**: The ``robots.txt`` file located in the static directory """ return FileResponse("static/robots.txt") @app.get("/llms.txt") async def llm_txt() -> FileResponse: """Serve the llm.txt file to provide information about the site to LLMs. **This endpoint serves the ``llms.txt`` file located in the static directory** to provide information about the site to Large Language Models (LLMs) and other AI systems that may be crawling the site. **Returns** - **FileResponse**: The ``llms.txt`` file located in the static directory """ return FileResponse("static/llms.txt") @app.get("/docs", response_class=HTMLResponse, include_in_schema=False) async def custom_swagger_ui(request: Request) -> HTMLResponse: """Serve custom Swagger UI documentation. **This endpoint serves a custom Swagger UI interface** for the API documentation, providing an interactive way to explore and test the available endpoints. **Parameters** - **request** (`Request`): The incoming HTTP request **Returns** - **HTMLResponse**: Custom Swagger UI documentation page """ context = {"request": request} context.update(get_version_info()) return templates.TemplateResponse("swagger_ui.jinja", context) @app.get("/api", include_in_schema=True) def openapi_json_get() -> JSONResponse: """Return the OpenAPI schema. **This endpoint returns the OpenAPI schema (openapi.json)** that describes the API structure, endpoints, and data models for documentation and client generation purposes. **Returns** - **JSONResponse**: The OpenAPI schema as JSON """ return JSONResponse(app.openapi()) @app.api_route("/api", methods=["POST", "PUT", "DELETE", "OPTIONS", "HEAD"], include_in_schema=False) @app.api_route("/api/", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD"], include_in_schema=False) def openapi_json() -> JSONResponse: """Return the OpenAPI schema for various HTTP methods. **This endpoint returns the OpenAPI schema (openapi.json)** for multiple HTTP methods, providing API documentation for clients that may use different request methods. **Returns** - **JSONResponse**: The OpenAPI schema as JSON """ return JSONResponse(app.openapi()) # Include routers for modular endpoints app.include_router(index) app.include_router(ingest) app.include_router(dynamic) ================================================ FILE: src/server/metrics_server.py ================================================ """Prometheus metrics server running on a separate port.""" import uvicorn from fastapi import FastAPI from fastapi.responses import HTMLResponse from prometheus_client import REGISTRY, generate_latest from gitingest.utils.logging_config import get_logger # Create a logger for this module logger = get_logger(__name__) # Create a separate FastAPI app for metrics metrics_app = FastAPI( title="Gitingest Metrics", description="Prometheus metrics for Gitingest", docs_url=None, redoc_url=None, ) @metrics_app.get("/metrics") async def metrics() -> HTMLResponse: """Serve Prometheus metrics without authentication. This endpoint is only accessible from the local network. Returns ------- HTMLResponse Prometheus metrics in text format """ return HTMLResponse( content=generate_latest(REGISTRY), status_code=200, media_type="text/plain", ) def start_metrics_server(host: str = "127.0.0.1", port: int = 9090) -> None: """Start the metrics server on a separate port. Parameters ---------- host : str The host to bind to (default: 127.0.0.1 for local network only) port : int The port to bind to (default: 9090) Returns ------- None """ logger.info("Starting metrics server", extra={"host": host, "port": port}) # Configure uvicorn to suppress startup messages to avoid duplicates # since the main server already shows similar messages uvicorn.run( metrics_app, host=host, port=port, log_config=None, # Disable uvicorn's default logging config access_log=False, # Disable access logging for metrics server # Suppress uvicorn's startup messages by setting log level higher log_level="warning", ) ================================================ FILE: src/server/models.py ================================================ """Pydantic models for the query form.""" from __future__ import annotations from enum import Enum from typing import TYPE_CHECKING, Union from pydantic import BaseModel, Field, field_validator from gitingest.utils.compat_func import removesuffix from server.server_config import MAX_FILE_SIZE_KB # needed for type checking (pydantic) if TYPE_CHECKING: from server.form_types import IntForm, OptStrForm, StrForm class PatternType(str, Enum): """Enumeration for pattern types used in file filtering.""" INCLUDE = "include" EXCLUDE = "exclude" class IngestRequest(BaseModel): """Request model for the /api/ingest endpoint. Attributes ---------- input_text : str The Git repository URL or slug to ingest. max_file_size : int Maximum file size slider position (0-500) for filtering files. pattern_type : PatternType Type of pattern to use for file filtering (include or exclude). pattern : str Glob/regex pattern string for file filtering. token : str | None GitHub personal access token (PAT) for accessing private repositories. """ input_text: str = Field(..., description="Git repository URL or slug to ingest") max_file_size: int = Field(..., ge=1, le=MAX_FILE_SIZE_KB, description="File size in KB") pattern_type: PatternType = Field(default=PatternType.EXCLUDE, description="Pattern type for file filtering") pattern: str = Field(default="", description="Glob/regex pattern for file filtering") token: str | None = Field(default=None, description="GitHub PAT for private repositories") @field_validator("input_text") @classmethod def validate_input_text(cls, v: str) -> str: """Validate that ``input_text`` is not empty.""" if not v.strip(): err = "input_text cannot be empty" raise ValueError(err) return removesuffix(v.strip(), ".git") @field_validator("pattern") @classmethod def validate_pattern(cls, v: str) -> str: """Validate ``pattern`` field.""" return v.strip() class IngestSuccessResponse(BaseModel): """Success response model for the /api/ingest endpoint. Attributes ---------- repo_url : str The original repository URL that was processed. short_repo_url : str Short form of repository URL (user/repo). summary : str Summary of the ingestion process including token estimates. digest_url : str URL to download the full digest content (either S3 URL or local download endpoint). tree : str File tree structure of the repository. content : str Processed content from the repository files. default_max_file_size : int The file size slider position used. pattern_type : str The pattern type used for filtering. pattern : str The pattern used for filtering. """ repo_url: str = Field(..., description="Original repository URL") short_repo_url: str = Field(..., description="Short repository URL (user/repo)") summary: str = Field(..., description="Ingestion summary with token estimates") digest_url: str = Field(..., description="URL to download the full digest content") tree: str = Field(..., description="File tree structure") content: str = Field(..., description="Processed file content") default_max_file_size: int = Field(..., description="File size slider position used") pattern_type: str = Field(..., description="Pattern type used") pattern: str = Field(..., description="Pattern used") class IngestErrorResponse(BaseModel): """Error response model for the /api/ingest endpoint. Attributes ---------- error : str Error message describing what went wrong. """ error: str = Field(..., description="Error message") # Union type for API responses IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse] class S3Metadata(BaseModel): """Model for S3 metadata structure. Attributes ---------- summary : str Summary of the ingestion process including token estimates. tree : str File tree structure of the repository. content : str Processed content from the repository files. """ summary: str = Field(..., description="Ingestion summary with token estimates") tree: str = Field(..., description="File tree structure") content: str = Field(..., description="Processed file content") class QueryForm(BaseModel): """Form data for the query. Attributes ---------- input_text : str Text or URL supplied in the form. max_file_size : int The maximum allowed file size for the input, specified by the user. pattern_type : str The type of pattern used for the query (``include`` or ``exclude``). pattern : str Glob/regex pattern string. token : str | None GitHub personal access token (PAT) for accessing private repositories. """ input_text: str max_file_size: int pattern_type: str pattern: str token: str | None = None @classmethod def as_form( cls, input_text: StrForm, max_file_size: IntForm, pattern_type: StrForm, pattern: StrForm, token: OptStrForm, ) -> QueryForm: """Create a QueryForm from FastAPI form parameters. Parameters ---------- input_text : StrForm The input text provided by the user. max_file_size : IntForm The maximum allowed file size for the input. pattern_type : StrForm The type of pattern used for the query (``include`` or ``exclude``). pattern : StrForm Glob/regex pattern string. token : OptStrForm GitHub personal access token (PAT) for accessing private repositories. Returns ------- QueryForm The QueryForm instance. """ return cls( input_text=input_text, max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, token=token, ) ================================================ FILE: src/server/query_processor.py ================================================ """Process a query by parsing input, cloning a repository, and generating a summary.""" from __future__ import annotations import shutil from pathlib import Path from typing import TYPE_CHECKING, cast from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger from gitingest.utils.pattern_utils import process_patterns from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata from server.s3_utils import ( _build_s3_url, check_s3_object_exists, generate_s3_file_path, get_metadata_from_s3, is_s3_enabled, upload_metadata_to_s3, upload_to_s3, ) from server.server_config import MAX_DISPLAY_SIZE # Initialize logger for this module logger = get_logger(__name__) if TYPE_CHECKING: from gitingest.schemas.cloning import CloneConfig from gitingest.schemas.ingestion import IngestionQuery def _cleanup_repository(clone_config: CloneConfig) -> None: """Clean up the cloned repository after processing.""" try: local_path = Path(clone_config.local_path) if local_path.exists(): shutil.rmtree(local_path) logger.info("Successfully cleaned up repository", extra={"local_path": str(local_path)}) except (PermissionError, OSError): logger.exception("Could not delete repository", extra={"local_path": str(clone_config.local_path)}) async def _check_s3_cache( query: IngestionQuery, input_text: str, max_file_size: int, pattern_type: str, pattern: str, token: str | None, ) -> IngestSuccessResponse | None: """Check if digest already exists on S3 and return response if found. Parameters ---------- query : IngestionQuery The parsed query object. input_text : str Original input text. max_file_size : int Maximum file size in KB. pattern_type : str Pattern type (include/exclude). pattern : str Pattern string. token : str | None GitHub token. Returns ------- IngestSuccessResponse | None Response if file exists on S3, None otherwise. """ if not is_s3_enabled(): return None try: # Use git ls-remote to get commit SHA without cloning clone_config = query.extract_clone_config() logger.info("Resolving commit for S3 cache check", extra={"repo_url": query.url}) query.commit = await resolve_commit(clone_config, token=token) logger.info("Commit resolved successfully", extra={"repo_url": query.url, "commit": query.commit}) # Generate S3 file path using the resolved commit s3_file_path = generate_s3_file_path( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), commit=query.commit, subpath=query.subpath, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) # Check if file exists on S3 if check_s3_object_exists(s3_file_path): # File exists on S3, serve it directly without cloning s3_url = _build_s3_url(s3_file_path) query.s3_url = s3_url short_repo_url = f"{query.user_name}/{query.repo_name}" # Try to get cached metadata metadata = get_metadata_from_s3(s3_file_path) if metadata: # Use cached metadata if available summary = metadata.summary tree = metadata.tree content = metadata.content else: # Fallback to placeholder messages if metadata not available summary = "Digest served from cache (S3). Download the full digest to see content details." tree = "Digest served from cache. Download the full digest to see the file tree." content = "Digest served from cache. Download the full digest to see the content." return IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, summary=summary, digest_url=s3_url, tree=tree, content=content, default_max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, ) except Exception as exc: # Log the exception but don't fail the entire request logger.warning("S3 cache check failed, falling back to normal cloning", extra={"error": str(exc)}) logger.info("Digest not found in S3 cache, proceeding with normal cloning", extra={"repo_url": query.url}) return None def _store_digest_content( query: IngestionQuery, clone_config: CloneConfig, digest_content: str, summary: str, tree: str, content: str, ) -> None: """Store digest content either to S3 or locally based on configuration. Parameters ---------- query : IngestionQuery The query object containing repository information. clone_config : CloneConfig The clone configuration object. digest_content : str The complete digest content to store. summary : str The summary content for metadata. tree : str The tree content for metadata. content : str The file content for metadata. """ if is_s3_enabled(): # Upload to S3 instead of storing locally s3_file_path = generate_s3_file_path( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), commit=query.commit, subpath=query.subpath, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) # Also upload metadata JSON for caching metadata = S3Metadata( summary=summary, tree=tree, content=content, ) try: upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id) logger.info("Successfully uploaded metadata to S3") except Exception as metadata_exc: # Log the error but don't fail the entire request logger.warning("Failed to upload metadata to S3", extra={"error": str(metadata_exc)}) # Store S3 URL in query for later use query.s3_url = s3_url else: # Store locally local_txt_file = Path(clone_config.local_path).with_suffix(".txt") with local_txt_file.open("w", encoding="utf-8") as f: f.write(digest_content) def _generate_digest_url(query: IngestionQuery) -> str: """Generate the digest URL based on S3 configuration. Parameters ---------- query : IngestionQuery The query object containing repository information. Returns ------- str The digest URL. Raises ------ RuntimeError If S3 is enabled but no S3 URL was generated. """ if is_s3_enabled(): digest_url = getattr(query, "s3_url", None) if not digest_url: # This should not happen if S3 upload was successful msg = "S3 is enabled but no S3 URL was generated" raise RuntimeError(msg) return digest_url return f"/api/download/file/{query.id}" async def process_query( input_text: str, max_file_size: int, pattern_type: PatternType, pattern: str, token: str | None = None, ) -> IngestResponse: """Process a query by parsing input, cloning a repository, and generating a summary. Handle user input, process Git repository data, and prepare a response for rendering a template with the processed results or an error message. Parameters ---------- input_text : str Input text provided by the user, typically a Git repository URL or slug. max_file_size : int Max file size in KB to be include in the digest. pattern_type : PatternType Type of pattern to use (either "include" or "exclude") pattern : str Pattern to include or exclude in the query, depending on the pattern type. token : str | None GitHub personal access token (PAT) for accessing private repositories. Returns ------- IngestResponse A union type, corresponding to IngestErrorResponse or IngestSuccessResponse Raises ------ RuntimeError If the commit hash is not found (should never happen). """ if token: validate_github_token(token) try: query = await parse_remote_repo(input_text, token=token) except Exception as exc: logger.warning("Failed to parse remote repository", extra={"input_text": input_text, "error": str(exc)}) return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) query.max_file_size = max_file_size * 1024 # Convert to bytes since we currently use KB in higher levels query.ignore_patterns, query.include_patterns = process_patterns( exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None, include_patterns=pattern if pattern_type == PatternType.INCLUDE else None, ) # Check if digest already exists on S3 before cloning s3_response = await _check_s3_cache( query=query, input_text=input_text, max_file_size=max_file_size, pattern_type=pattern_type.value, pattern=pattern, token=token, ) if s3_response: return s3_response clone_config = query.extract_clone_config() await clone_repo(clone_config, token=token) short_repo_url = f"{query.user_name}/{query.repo_name}" # The commit hash should always be available at this point if not query.commit: msg = "Unexpected error: no commit hash found" raise RuntimeError(msg) try: summary, tree, content = ingest_query(query) digest_content = tree + "\n" + content _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) # Clean up repository even if processing failed _cleanup_repository(clone_config) return IngestErrorResponse(error=f"{exc!s}") if len(content) > MAX_DISPLAY_SIZE: content = ( f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] ) _print_success( url=query.url, max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, summary=summary, ) digest_url = _generate_digest_url(query) # Clean up the repository after successful processing _cleanup_repository(clone_config) return IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, summary=summary, digest_url=digest_url, tree=tree, content=content, default_max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, ) def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """Print a formatted summary of the query details for debugging. Parameters ---------- url : str The URL associated with the query. max_file_size : int The maximum file size allowed for the query, in bytes. pattern_type : str Specifies the type of pattern to use, either "include" or "exclude". pattern : str The actual pattern string to include or exclude in the query. """ default_max_file_kb = 50 logger.info( "Processing query", extra={ "url": url, "max_file_size_kb": int(max_file_size / 1024), "pattern_type": pattern_type, "pattern": pattern, "custom_size": int(max_file_size / 1024) != default_max_file_kb, }, ) def _print_error(url: str, exc: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: """Print a formatted error message for debugging. Parameters ---------- url : str The URL associated with the query that caused the error. exc : Exception The exception raised during the query or process. max_file_size : int The maximum file size allowed for the query, in bytes. pattern_type : str Specifies the type of pattern to use, either "include" or "exclude". pattern : str The actual pattern string to include or exclude in the query. """ logger.error( "Query processing failed", extra={ "url": url, "max_file_size_kb": int(max_file_size / 1024), "pattern_type": pattern_type, "pattern": pattern, "error": str(exc), }, ) def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: """Print a formatted success message for debugging. Parameters ---------- url : str The URL associated with the successful query. max_file_size : int The maximum file size allowed for the query, in bytes. pattern_type : str Specifies the type of pattern to use, either "include" or "exclude". pattern : str The actual pattern string to include or exclude in the query. summary : str A summary of the query result, including details like estimated tokens. """ estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] logger.info( "Query processing completed successfully", extra={ "url": url, "max_file_size_kb": int(max_file_size / 1024), "pattern_type": pattern_type, "pattern": pattern, "estimated_tokens": estimated_tokens, }, ) ================================================ FILE: src/server/routers/__init__.py ================================================ """Module containing the routers for the FastAPI application.""" from server.routers.dynamic import router as dynamic from server.routers.index import router as index from server.routers.ingest import router as ingest __all__ = ["dynamic", "index", "ingest"] ================================================ FILE: src/server/routers/dynamic.py ================================================ """The dynamic router module defines handlers for dynamic path requests.""" from fastapi import APIRouter, Request from fastapi.responses import HTMLResponse from server.server_config import get_version_info, templates router = APIRouter() @router.get("/{full_path:path}", include_in_schema=False) async def catch_all(request: Request, full_path: str) -> HTMLResponse: """Render a page with a Git URL based on the provided path. This endpoint catches all GET requests with a dynamic path, constructs a Git URL using the ``full_path`` parameter, and renders the ``git.jinja`` template with that URL. Parameters ---------- request : Request The incoming request object, which provides context for rendering the response. full_path : str The full path extracted from the URL, which is used to build the Git URL. Returns ------- HTMLResponse An HTML response containing the rendered template, with the Git URL and other default parameters such as file size. """ context = { "request": request, "repo_url": full_path, "default_max_file_size": 243, } context.update(get_version_info()) return templates.TemplateResponse("git.jinja", context) ================================================ FILE: src/server/routers/index.py ================================================ """Module defining the FastAPI router for the home page of the application.""" from fastapi import APIRouter, Request from fastapi.responses import HTMLResponse from server.server_config import EXAMPLE_REPOS, get_version_info, templates router = APIRouter() @router.get("/", response_class=HTMLResponse, include_in_schema=False) async def home(request: Request) -> HTMLResponse: """Render the home page with example repositories and default parameters. This endpoint serves the home page of the application, rendering the ``index.jinja`` template and providing it with a list of example repositories and default file size values. Parameters ---------- request : Request The incoming request object, which provides context for rendering the response. Returns ------- HTMLResponse An HTML response containing the rendered home page template, with example repositories and other default parameters such as file size. """ context = { "request": request, "examples": EXAMPLE_REPOS, "default_max_file_size": 243, } context.update(get_version_info()) return templates.TemplateResponse("index.jinja", context) ================================================ FILE: src/server/routers/ingest.py ================================================ """Ingest endpoint for the API.""" from typing import Union from uuid import UUID from fastapi import APIRouter, HTTPException, Request, status from fastapi.responses import FileResponse, JSONResponse, RedirectResponse from prometheus_client import Counter from gitingest.config import TMP_BASE_PATH from server.models import IngestRequest from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion from server.s3_utils import is_s3_enabled from server.server_config import DEFAULT_FILE_SIZE_KB from server.server_utils import limiter ingest_counter = Counter("gitingest_ingest_total", "Number of ingests", ["status", "url"]) router = APIRouter() @router.post("/api/ingest", responses=COMMON_INGEST_RESPONSES) @limiter.limit("10/minute") async def api_ingest( request: Request, # noqa: ARG001 (unused-function-argument) # pylint: disable=unused-argument ingest_request: IngestRequest, ) -> JSONResponse: """Ingest a Git repository and return processed content. **This endpoint processes a Git repository by cloning it, analyzing its structure,** and returning a summary with the repository's content. The response includes file tree structure, processed content, and metadata about the ingestion. **Parameters** - **ingest_request** (`IngestRequest`): Pydantic model containing ingestion parameters **Returns** - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ response = await _perform_ingestion( input_text=ingest_request.input_text, max_file_size=ingest_request.max_file_size, pattern_type=ingest_request.pattern_type.value, pattern=ingest_request.pattern, token=ingest_request.token, ) # limit URL to 255 characters ingest_counter.labels(status=response.status_code, url=ingest_request.input_text[:255]).inc() return response @router.get("/api/{user}/{repository}", responses=COMMON_INGEST_RESPONSES) @limiter.limit("10/minute") async def api_ingest_get( request: Request, # noqa: ARG001 (unused-function-argument) # pylint: disable=unused-argument user: str, repository: str, max_file_size: int = DEFAULT_FILE_SIZE_KB, pattern_type: str = "exclude", pattern: str = "", token: str = "", ) -> JSONResponse: """Ingest a GitHub repository via GET and return processed content. **This endpoint processes a GitHub repository by analyzing its structure and returning a summary** with the repository's content. The response includes file tree structure, processed content, and metadata about the ingestion. All ingestion parameters are optional and can be provided as query parameters. **Path Parameters** - **user** (`str`): GitHub username or organization - **repository** (`str`): GitHub repository name **Query Parameters** - **max_file_size** (`int`, optional): Maximum file size in KB to include in the digest (default: 5120 KB) - **pattern_type** (`str`, optional): Type of pattern to use ("include" or "exclude", default: "exclude") - **pattern** (`str`, optional): Pattern to include or exclude in the query (default: "") - **token** (`str`, optional): GitHub personal access token for private repositories (default: "") **Returns** - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ response = await _perform_ingestion( input_text=f"{user}/{repository}", max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, token=token or None, ) # limit URL to 255 characters ingest_counter.labels(status=response.status_code, url=f"{user}/{repository}"[:255]).inc() return response @router.get("/api/download/file/{ingest_id}", response_model=None) async def download_ingest( ingest_id: UUID, ) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic) """Download the first text file produced for an ingest ID. **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process** and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled and clients should use the S3 URL provided in the ingest response instead. **Parameters** - **ingest_id** (`UUID`): Identifier that the ingest step emitted **Returns** - **FileResponse**: Streamed response with media type ``text/plain`` for local files **Raises** - **HTTPException**: **503** - endpoint is disabled when S3 is enabled - **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file - **HTTPException**: **403** - the process lacks permission to read the directory or file """ # Disable download endpoint when S3 is enabled if is_s3_enabled(): raise HTTPException( status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Download endpoint is disabled when S3 is enabled. " "Use the S3 URL provided in the ingest response instead.", ) # Fall back to local file serving # Normalize and validate the directory path directory = (TMP_BASE_PATH / str(ingest_id)).resolve() if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") if not directory.is_dir(): raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found") try: first_txt_file = next(directory.glob("*.txt")) except StopIteration as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"No .txt file found for digest {ingest_id!r}", ) from exc try: return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name) except PermissionError as exc: raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail=f"Permission denied for {first_txt_file}", ) from exc ================================================ FILE: src/server/routers_utils.py ================================================ """Utility functions for the ingest endpoints.""" from __future__ import annotations from typing import Any from fastapi import status from fastapi.responses import JSONResponse from server.models import IngestErrorResponse, IngestSuccessResponse, PatternType from server.query_processor import process_query COMMON_INGEST_RESPONSES: dict[int | str, dict[str, Any]] = { status.HTTP_200_OK: {"model": IngestSuccessResponse, "description": "Successful ingestion"}, status.HTTP_400_BAD_REQUEST: {"model": IngestErrorResponse, "description": "Bad request or processing error"}, status.HTTP_500_INTERNAL_SERVER_ERROR: {"model": IngestErrorResponse, "description": "Internal server error"}, } async def _perform_ingestion( input_text: str, max_file_size: int, pattern_type: str, pattern: str, token: str | None, ) -> JSONResponse: """Run ``process_query`` and wrap the result in a ``FastAPI`` ``JSONResponse``. Consolidates error handling shared by the ``POST`` and ``GET`` ingest endpoints. """ try: pattern_type = PatternType(pattern_type) result = await process_query( input_text=input_text, max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, token=token, ) if isinstance(result, IngestErrorResponse): # Return structured error response with 400 status code return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=result.model_dump()) # Return structured success response with 200 status code return JSONResponse(status_code=status.HTTP_200_OK, content=result.model_dump()) except ValueError as ve: # Handle validation errors with 400 status code error_response = IngestErrorResponse(error=f"Validation error: {ve!s}") return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=error_response.model_dump()) except Exception as exc: # Handle unexpected errors with 500 status code error_response = IngestErrorResponse(error=f"Internal server error: {exc!s}") return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=error_response.model_dump()) ================================================ FILE: src/server/s3_utils.py ================================================ """S3 utility functions for uploading and managing digest files.""" from __future__ import annotations import hashlib import os from typing import TYPE_CHECKING from urllib.parse import urlparse from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) import boto3 from botocore.exceptions import ClientError from prometheus_client import Counter from gitingest.utils.logging_config import get_logger from server.models import S3Metadata if TYPE_CHECKING: from botocore.client import BaseClient # Initialize logger for this module logger = get_logger(__name__) _s3_ingest_lookup_counter = Counter("gitingest_s3_ingest_lookup", "Number of S3 ingest file lookups") _s3_ingest_hit_counter = Counter("gitingest_s3_ingest_hit", "Number of S3 ingest file cache hits") _s3_ingest_miss_counter = Counter("gitingest_s3_ingest_miss", "Number of S3 ingest file cache misses") class S3UploadError(Exception): """Custom exception for S3 upload failures.""" def is_s3_enabled() -> bool: """Check if S3 is enabled via environment variables.""" return os.getenv("S3_ENABLED", "false").lower() == "true" def get_s3_config() -> dict[str, str | None]: """Get S3 configuration from environment variables.""" config = { "endpoint_url": os.getenv("S3_ENDPOINT"), "aws_access_key_id": os.getenv("S3_ACCESS_KEY"), "aws_secret_access_key": os.getenv("S3_SECRET_KEY"), "region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"), } return {k: v for k, v in config.items() if v is not None} def get_s3_bucket_name() -> str: """Get S3 bucket name from environment variables.""" return os.getenv("S3_BUCKET_NAME", "gitingest-bucket") def get_s3_alias_host() -> str | None: """Get S3 alias host for public URLs.""" return os.getenv("S3_ALIAS_HOST") def generate_s3_file_path( source: str, user_name: str, repo_name: str, commit: str, subpath: str, include_patterns: set[str] | None, ignore_patterns: set[str], ) -> str: """Generate S3 file path with proper naming convention. The file path is formatted as: [/]ingest////// /--.txt If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path. The commit-ID is always included in the URL. If no specific commit is provided, the actual commit hash from the cloned repository is used. Parameters ---------- source : str Git host (e.g., github, gitlab, bitbucket, etc.). user_name : str Repository owner or user. repo_name : str Repository name. commit : str Commit hash. subpath : str Subpath of the repository. include_patterns : set[str] | None Set of patterns specifying which files to include. ignore_patterns : set[str] Set of patterns specifying which files to exclude. Returns ------- str S3 file path string. Raises ------ ValueError If the source URL is invalid. """ hostname = urlparse(source).hostname if hostname is None: msg = "Invalid source URL" logger.error(msg) raise ValueError(msg) # Create hash of exclude/include patterns for uniqueness patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" patterns_str += f"exclude:{sorted(ignore_patterns)}" patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] subpath_hash = hashlib.sha256(subpath.encode()).hexdigest()[:16] file_name = f"{user_name}-{repo_name}-{subpath_hash}.txt" base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{file_name}" # Check for S3_DIRECTORY_PREFIX environment variable s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") if not s3_directory_prefix: return base_path # Remove trailing slash if present and add the prefix s3_directory_prefix = s3_directory_prefix.rstrip("/") return f"{s3_directory_prefix}/{base_path}" def create_s3_client() -> BaseClient: """Create and return an S3 client with configuration from environment.""" config = get_s3_config() # Log S3 client creation (excluding sensitive info) log_config = config.copy() has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None)) logger.debug( "Creating S3 client", extra={ "s3_config": log_config, "has_credentials": has_credentials, }, ) return boto3.client("s3", **config) def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: """Upload content to S3 and return the public URL. This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file. The ingest ID is stored as an S3 object tag. Parameters ---------- content : str The digest content to upload. s3_file_path : str The S3 file path where the content will be stored. ingest_id : UUID The ingest ID to store as an S3 object tag. Returns ------- str Public URL to access the uploaded file. Raises ------ ValueError If S3 is not enabled. S3UploadError If the upload to S3 fails. """ if not is_s3_enabled(): msg = "S3 is not enabled" logger.error(msg) raise ValueError(msg) s3_client = create_s3_client() bucket_name = get_s3_bucket_name() extra_fields = { "bucket_name": bucket_name, "s3_file_path": s3_file_path, "ingest_id": str(ingest_id), "content_size": len(content), } # Log upload attempt logger.info("Starting S3 upload", extra=extra_fields) try: # Upload the content with ingest_id as tag s3_client.put_object( Bucket=bucket_name, Key=s3_file_path, Body=content.encode("utf-8"), ContentType="text/plain", Tagging=f"ingest_id={ingest_id!s}", ) except ClientError as err: # Log upload failure logger.exception( "S3 upload failed", extra={ "bucket_name": bucket_name, "s3_file_path": s3_file_path, "ingest_id": str(ingest_id), "error_code": err.response.get("Error", {}).get("Code"), "error_message": str(err), }, ) msg = f"Failed to upload to S3: {err}" raise S3UploadError(msg) from err # Generate public URL alias_host = get_s3_alias_host() if alias_host: # Use alias host if configured public_url = f"{alias_host.rstrip('/')}/{s3_file_path}" else: # Fallback to direct S3 URL endpoint = get_s3_config().get("endpoint_url") if endpoint: public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}" else: public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" # Log successful upload logger.info( "S3 upload completed successfully", extra={ "bucket_name": bucket_name, "s3_file_path": s3_file_path, "ingest_id": str(ingest_id), "public_url": public_url, }, ) return public_url def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str: """Upload metadata JSON to S3 alongside the digest file. Parameters ---------- metadata : S3Metadata The metadata struct containing summary, tree, and content. s3_file_path : str The S3 file path for the digest (metadata will use .json extension). ingest_id : UUID The ingest ID to store as an S3 object tag. Returns ------- str Public URL to access the uploaded metadata file. Raises ------ ValueError If S3 is not enabled. S3UploadError If the upload to S3 fails. """ if not is_s3_enabled(): msg = "S3 is not enabled" logger.error(msg) raise ValueError(msg) # Generate metadata file path by replacing .txt with .json metadata_file_path = s3_file_path.replace(".txt", ".json") s3_client = create_s3_client() bucket_name = get_s3_bucket_name() extra_fields = { "bucket_name": bucket_name, "metadata_file_path": metadata_file_path, "ingest_id": str(ingest_id), "metadata_size": len(metadata.model_dump_json()), } # Log upload attempt logger.info("Starting S3 metadata upload", extra=extra_fields) try: # Upload the metadata with ingest_id as tag s3_client.put_object( Bucket=bucket_name, Key=metadata_file_path, Body=metadata.model_dump_json(indent=2).encode("utf-8"), ContentType="application/json", Tagging=f"ingest_id={ingest_id!s}", ) except ClientError as err: # Log upload failure logger.exception( "S3 metadata upload failed", extra={ "bucket_name": bucket_name, "metadata_file_path": metadata_file_path, "ingest_id": str(ingest_id), "error_code": err.response.get("Error", {}).get("Code"), "error_message": str(err), }, ) msg = f"Failed to upload metadata to S3: {err}" raise S3UploadError(msg) from err # Generate public URL alias_host = get_s3_alias_host() if alias_host: # Use alias host if configured public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}" else: # Fallback to direct S3 URL endpoint = get_s3_config().get("endpoint_url") if endpoint: public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}" else: public_url = ( f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}" ) # Log successful upload logger.info( "S3 metadata upload completed successfully", extra={ "bucket_name": bucket_name, "metadata_file_path": metadata_file_path, "ingest_id": str(ingest_id), "public_url": public_url, }, ) return public_url def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None: """Retrieve metadata JSON from S3. Parameters ---------- s3_file_path : str The S3 file path for the digest (metadata will use .json extension). Returns ------- S3Metadata | None The metadata struct if found, None otherwise. """ if not is_s3_enabled(): return None # Generate metadata file path by replacing .txt with .json metadata_file_path = s3_file_path.replace(".txt", ".json") try: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() # Get the metadata object response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path) metadata_content = response["Body"].read().decode("utf-8") return S3Metadata.model_validate_json(metadata_content) except ClientError as err: # Object doesn't exist if we get a 404 error error_code = err.response.get("Error", {}).get("Code") if error_code == "404": logger.info("Metadata file not found", extra={"metadata_file_path": metadata_file_path}) return None # Log other errors but don't fail logger.warning("Failed to retrieve metadata from S3", extra={"error": str(err)}) return None except Exception as exc: # For any other exception, log and return None logger.warning("Unexpected error retrieving metadata from S3", extra={"error": str(exc)}) return None def _build_s3_url(key: str) -> str: """Build S3 URL for a given key.""" alias_host = get_s3_alias_host() if alias_host: return f"{alias_host.rstrip('/')}/{key}" bucket_name = get_s3_bucket_name() config = get_s3_config() endpoint = config["endpoint_url"] if endpoint: return f"{endpoint.rstrip('/')}/{bucket_name}/{key}" return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}" def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool: """Check if an S3 object has the matching ingest_id tag.""" try: tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key) tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])} return tags.get("ingest_id") == str(target_ingest_id) except ClientError: return False def check_s3_object_exists(s3_file_path: str) -> bool: """Check if an S3 object exists at the given path. Parameters ---------- s3_file_path : str The S3 file path to check. Returns ------- bool True if the object exists, False otherwise. Raises ------ ClientError If there's an S3 error other than 404 (not found). """ if not is_s3_enabled(): logger.info("S3 not enabled, skipping object existence check", extra={"s3_file_path": s3_file_path}) return False logger.info("Checking S3 object existence", extra={"s3_file_path": s3_file_path}) _s3_ingest_lookup_counter.inc() try: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() # Use head_object to check if the object exists without downloading it s3_client.head_object(Bucket=bucket_name, Key=s3_file_path) except ClientError as err: # Object doesn't exist if we get a 404 error error_code = err.response.get("Error", {}).get("Code") if error_code == "404": logger.info( "S3 object not found", extra={ "s3_file_path": s3_file_path, "bucket_name": get_s3_bucket_name(), "error_code": error_code, }, ) _s3_ingest_miss_counter.inc() return False # Re-raise other errors (permissions, etc.) raise except Exception as exc: # For any other exception, assume object doesn't exist logger.info( "S3 object check failed with exception, assuming not found", extra={ "s3_file_path": s3_file_path, "bucket_name": get_s3_bucket_name(), "exception": str(exc), }, ) _s3_ingest_miss_counter.inc() return False else: logger.info( "S3 object found", extra={ "s3_file_path": s3_file_path, "bucket_name": get_s3_bucket_name(), }, ) _s3_ingest_hit_counter.inc() return True def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: """Get S3 URL for a given ingest ID if it exists. Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found. Used by the download endpoint to redirect to S3 if available. Parameters ---------- ingest_id : UUID The ingest ID to search for in S3 object tags. Returns ------- str | None S3 URL if file exists, None otherwise. """ if not is_s3_enabled(): logger.debug("S3 not enabled, skipping URL lookup", extra={"ingest_id": str(ingest_id)}) return None logger.info("Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) try: s3_client = create_s3_client() bucket_name = get_s3_bucket_name() # List all objects in the ingest/ prefix and check their tags paginator = s3_client.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/") objects_checked = 0 for page in page_iterator: if "Contents" not in page: continue for obj in page["Contents"]: key = obj["Key"] objects_checked += 1 if _check_object_tags( s3_client=s3_client, bucket_name=bucket_name, key=key, target_ingest_id=ingest_id, ): s3_url = _build_s3_url(key) logger.info( "Found S3 object for ingest ID", extra={ "ingest_id": str(ingest_id), "s3_key": key, "s3_url": s3_url, "objects_checked": objects_checked, }, ) return s3_url logger.info( "No S3 object found for ingest ID", extra={ "ingest_id": str(ingest_id), "objects_checked": objects_checked, }, ) except ClientError as err: logger.exception( "Error during S3 URL lookup", extra={ "ingest_id": str(ingest_id), "error_code": err.response.get("Error", {}).get("Code"), "error_message": str(err), }, ) return None ================================================ FILE: src/server/server_config.py ================================================ """Configuration for the server.""" from __future__ import annotations import os from pathlib import Path from fastapi.templating import Jinja2Templates MAX_DISPLAY_SIZE: int = 300_000 # Slider configuration (if updated, update the logSliderToSize function in src/static/js/utils.js) DEFAULT_FILE_SIZE_KB: int = 5 * 1024 # 5 mb MAX_FILE_SIZE_KB: int = 100 * 1024 # 100 mb EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/coderamp-labs/gitingest"}, {"name": "FastAPI", "url": "https://github.com/fastapi/fastapi"}, {"name": "Flask", "url": "https://github.com/pallets/flask"}, {"name": "Excalidraw", "url": "https://github.com/excalidraw/excalidraw"}, {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, ] # Version and repository configuration APP_REPOSITORY = os.getenv("APP_REPOSITORY", "https://github.com/coderamp-labs/gitingest") APP_VERSION = os.getenv("APP_VERSION", "unknown") APP_VERSION_URL = os.getenv("APP_VERSION_URL", "https://github.com/coderamp-labs/gitingest") def get_version_info() -> dict[str, str]: """Get version information including display version and link. Returns ------- dict[str, str] Dictionary containing 'version' and 'version_link' keys. """ # Use pre-computed values from GitHub Actions display_version = APP_VERSION version_link = APP_VERSION_URL # Fallback to repository root if no URL is provided if version_link == APP_REPOSITORY or not version_link: version_link = f"{APP_REPOSITORY.rstrip('/')}/tree/main" return { "version": display_version, "version_link": version_link, } # Use absolute path to templates directory templates_dir = Path(__file__).parent / "templates" templates = Jinja2Templates(directory=templates_dir) ================================================ FILE: src/server/server_utils.py ================================================ """Utility functions for the server.""" from fastapi import Request from fastapi.responses import Response from slowapi import Limiter, _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded from slowapi.util import get_remote_address from gitingest.utils.logging_config import get_logger # Initialize logger for this module logger = get_logger(__name__) # Initialize a rate limiter limiter = Limiter(key_func=get_remote_address) async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: """Handle rate-limiting errors with a custom exception handler. Parameters ---------- request : Request The incoming HTTP request. exc : Exception The exception raised, expected to be RateLimitExceeded. Returns ------- Response A response indicating that the rate limit has been exceeded. Raises ------ exc If the exception is not a RateLimitExceeded error, it is re-raised. """ if isinstance(exc, RateLimitExceeded): # Delegate to the default rate limit handler return _rate_limit_exceeded_handler(request, exc) # Re-raise other exceptions raise exc ## Color printing utility class Colors: """ANSI color codes.""" BLACK = "\033[0;30m" RED = "\033[0;31m" GREEN = "\033[0;32m" BROWN = "\033[0;33m" BLUE = "\033[0;34m" PURPLE = "\033[0;35m" CYAN = "\033[0;36m" LIGHT_GRAY = "\033[0;37m" DARK_GRAY = "\033[1;30m" LIGHT_RED = "\033[1;31m" LIGHT_GREEN = "\033[1;32m" YELLOW = "\033[1;33m" LIGHT_BLUE = "\033[1;34m" LIGHT_PURPLE = "\033[1;35m" LIGHT_CYAN = "\033[1;36m" WHITE = "\033[1;37m" BOLD = "\033[1m" FAINT = "\033[2m" ITALIC = "\033[3m" UNDERLINE = "\033[4m" BLINK = "\033[5m" NEGATIVE = "\033[7m" CROSSED = "\033[9m" END = "\033[0m" ================================================ FILE: src/server/templates/base.jinja ================================================ {# Favicons #} {# Search Engine Meta Tags #} {# Open Graph Meta Tags #} {# Web App Meta #} {# Twitter card #} {# Title #} {% block title %} {% if short_repo_url %} Gitingest - {{ short_repo_url }} {% else %} Gitingest {% endif %} {% endblock %} {% include 'components/tailwind_components.html' %} {% include 'components/navbar.jinja' %} {# Main content wrapper #}
{% block content %}{% endblock %}
{# Footer #} {% include 'components/footer.jinja' %} {# Scripts #} ================================================ FILE: src/server/templates/components/_macros.jinja ================================================ {# Icon link #} {% macro footer_icon_link(href, icon, label) -%} {{ label }} logo {{ label }} {%- endmacro %} ================================================ FILE: src/server/templates/components/footer.jinja ================================================ {% from 'components/_macros.jinja' import footer_icon_link %}
{# Left column — Chrome + PyPI #}
{{ footer_icon_link('https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood', 'icons/chrome.svg', 'Chrome Extension') }} {{ footer_icon_link('https://pypi.org/project/gitingest', 'icons/python.svg', 'Python Package') }}
{# Middle column - Version information #}
Version:  {% if version != "unknown" %} {{ version }} {% else %} {{ version }} {% endif %}
{# Right column - Discord #}
{{ footer_icon_link('https://discord.gg/zerRaGK9EC', 'icons/discord.svg', 'Discord') }}
================================================ FILE: src/server/templates/components/git_form.jinja ================================================
NEW
  • PAT is never stored in the backend
  • Used once for cloning, then discarded from memory
  • No browser caching
  • Cloned repos are deleted after processing
{% if show_examples %}

Try these example repositories:

{% for example in examples %} {% endfor %}
{% endif %}
================================================ FILE: src/server/templates/components/navbar.jinja ================================================
{# Logo #} {# Navigation with updated styling #}
{# Load GitHub stars script #} ================================================ FILE: src/server/templates/components/result.jinja ================================================
================================================ FILE: src/server/templates/components/tailwind_components.html ================================================ ================================================ FILE: src/server/templates/git.jinja ================================================ {% extends "base.jinja" %} {% block content %} {% if error_message %}
{{ error_message }}
{% endif %} {% with show_examples=false %} {% include 'components/git_form.jinja' %} {% endwith %} {% include 'components/result.jinja' %} {% endblock content %} ================================================ FILE: src/server/templates/index.jinja ================================================ {% extends "base.jinja" %} {% block content %}
{# Title & Sparkles #}

Prompt-friendly
codebase 

Turn any Git repository into a simple text digest of its codebase.

This is useful for feeding a codebase into any LLM.

{% if error_message %}
{{ error_message }}
{% endif %} {% with show_examples=true %} {% include 'components/git_form.jinja' %} {% endwith %}

You can also replace 'hub' with 'ingest' in any GitHub URL.

{% include 'components/result.jinja' %} {% endblock %} ================================================ FILE: src/server/templates/swagger_ui.jinja ================================================ {% extends "base.jinja" %} {% block title %}GitIngest API{% endblock %} {% block content %}
{# Title & Sparkles #}

GitIngest
API 

Turn any Git repository into a simple text digest of its codebase.

This is useful for feeding a codebase into any LLM.

{% endblock %} ================================================ FILE: src/static/js/git.js ================================================ function waitForStars() { return new Promise((resolve) => { const check = () => { const stars = document.getElementById('github-stars'); if (stars && stars.textContent !== '0') {resolve();} else {setTimeout(check, 10);} }; check(); }); } document.addEventListener('DOMContentLoaded', () => { const urlInput = document.getElementById('input_text'); const form = document.getElementById('ingestForm'); if (urlInput && urlInput.value.trim() && form) { // Wait for stars to be loaded before submitting waitForStars().then(() => { const submitEvent = new SubmitEvent('submit', { cancelable: true, bubbles: true }); Object.defineProperty(submitEvent, 'target', { value: form, enumerable: true }); handleSubmit(submitEvent, true); }); } }); ================================================ FILE: src/static/js/git_form.js ================================================ // Strike-through / un-strike file lines when the pattern-type menu flips. function changePattern() { const dirPre = document.getElementById('directory-structure-pre'); if (!dirPre) {return;} const treeLineElements = Array.from(dirPre.querySelectorAll('pre[name="tree-line"]')); // Skip the first tree line element treeLineElements.slice(2).forEach((element) => { element.classList.remove('line-through'); element.classList.remove('text-gray-500'); }); // Reset the pattern input field const patternInput = document.getElementById('pattern'); if (patternInput) { patternInput.value = ''; } } // Show/hide the Personal-Access-Token section when the "Private repository" checkbox is toggled. function toggleAccessSettings() { const container = document.getElementById('accessSettingsContainer'); const examples = document.getElementById('exampleRepositories'); const show = document.getElementById('showAccessSettings')?.checked; container?.classList.toggle('hidden', !show); examples?.classList.toggle('lg:mt-0', show); } document.addEventListener('DOMContentLoaded', () => { toggleAccessSettings(); changePattern(); }); // Make them available to existing inline attributes window.changePattern = changePattern; window.toggleAccessSettings = toggleAccessSettings; ================================================ FILE: src/static/js/index.js ================================================ function submitExample(repoName) { const input = document.getElementById('input_text'); if (input) { input.value = repoName; input.focus(); } } // Make it visible to inline onclick handlers window.submitExample = submitExample; ================================================ FILE: src/static/js/navbar.js ================================================ // Fetch GitHub stars function formatStarCount(count) { if (count >= 1000) {return `${ (count / 1000).toFixed(1) }k`;} return count.toString(); } async function fetchGitHubStars() { try { const res = await fetch('https://api.github.com/repos/coderamp-labs/gitingest'); if (!res.ok) {throw new Error(`${res.status} ${res.statusText}`);} const data = await res.json(); document.getElementById('github-stars').textContent = formatStarCount(data.stargazers_count); } catch (err) { console.error('Error fetching GitHub stars:', err); const el = document.getElementById('github-stars').parentElement; if (el) {el.style.display = 'none';} } } // auto-run when script loads fetchGitHubStars(); ================================================ FILE: src/static/js/posthog.js ================================================ /* eslint-disable */ !function (t, e) { let o, n, p, r; if (e.__SV) {return;} // already loaded window.posthog = e; e._i = []; e.init = function (i, s, a) { function g(t, e) { const o = e.split("."); if (o.length === 2) { t = t[o[0]]; e = o[1]; } t[e] = function () { t.push([e].concat(Array.prototype.slice.call(arguments, 0))); }; } p = t.createElement("script"); p.type = "text/javascript"; p.crossOrigin = "anonymous"; p.async = true; p.src = `${ s.api_host.replace(".i.posthog.com", "-assets.i.posthog.com") }/static/array.js`; r = t.getElementsByTagName("script")[0]; r.parentNode.insertBefore(p, r); let u = e; if (a !== undefined) { u = e[a] = []; } else { a = "posthog"; } u.people = u.people || []; u.toString = function (t) { let e = "posthog"; if (a !== "posthog") {e += `.${ a }`;} if (!t) {e += " (stub)";} return e; }; u.people.toString = function () { return `${ u.toString(1) }.people (stub)`; }; o = [ "init", "capture", "register", "register_once", "register_for_session", "unregister", "unregister_for_session", "getFeatureFlag", "getFeatureFlagPayload", "isFeatureEnabled", "reloadFeatureFlags", "updateEarlyAccessFeatureEnrollment", "getEarlyAccessFeatures", "on", "onFeatureFlags", "onSessionId", "getSurveys", "getActiveMatchingSurveys", "renderSurvey", "canRenderSurvey", "getNextSurveyStep", "identify", "setPersonProperties", "group", "resetGroups", "setPersonPropertiesForFlags", "resetPersonPropertiesForFlags", "setGroupPropertiesForFlags", "resetGroupPropertiesForFlags", "reset", "get_distinct_id", "getGroups", "get_session_id", "get_session_replay_url", "alias", "set_config", "startSessionRecording", "stopSessionRecording", "sessionRecordingStarted", "captureException", "loadToolbar", "get_property", "getSessionProperty", "createPersonProfile", "opt_in_capturing", "opt_out_capturing", "has_opted_in_capturing", "has_opted_out_capturing", "clear_opt_in_out_capturing", "debug", "getPageViewId" ]; for (n = 0; n < o.length; n++) {g(u, o[n]);} e._i.push([i, s, a]); }; e.__SV = 1; }(document, window.posthog || []); /* Initialise PostHog */ posthog.init('phc_9aNpiIVH2zfTWeY84vdTWxvrJRCQQhP5kcVDXUvcdou', { api_host: 'https://eu.i.posthog.com', person_profiles: 'always', }); ================================================ FILE: src/static/js/utils.js ================================================ function getFileName(element) { const indentSize = 4; let path = ''; let prevIndentLevel = null; while (element) { const line = element.textContent; const index = line.search(/[a-zA-Z0-9_.-]/); const indentLevel = index / indentSize; // Stop when we reach or go above the top-level directory if (indentLevel <= 1) { break; } // Only include directories that are one level above the previous if (prevIndentLevel === null || indentLevel === prevIndentLevel - 1) { const fileName = line.substring(index).trim(); path = fileName + path; prevIndentLevel = indentLevel; } element = element.previousElementSibling; } return path; } function toggleFile(element) { const patternInput = document.getElementById('pattern'); const patternFiles = patternInput.value ? patternInput.value.split(',').map((item) => item.trim()) : []; const directoryContainer = document.getElementById('directory-structure-container'); const treeLineElements = Array.from(directoryContainer.children).filter((child) => child.tagName === 'PRE'); // Skip the first two tree lines (header and repository name) if (treeLineElements[0] === element || treeLineElements[1] === element) { return; } element.classList.toggle('line-through'); element.classList.toggle('text-gray-500'); const fileName = getFileName(element); const fileIndex = patternFiles.indexOf(fileName); if (fileIndex !== -1) { patternFiles.splice(fileIndex, 1); } else { patternFiles.push(fileName); } patternInput.value = patternFiles.join(', '); } // Copy functionality function copyText(className) { let textToCopy; if (className === 'directory-structure') { // For directory structure, get the hidden input value const hiddenInput = document.getElementById('directory-structure-content'); if (!hiddenInput) {return;} textToCopy = hiddenInput.value; } else { // For other elements, get the textarea value const textarea = document.querySelector(`.${ className }`); if (!textarea) {return;} textToCopy = textarea.value; } const button = document.querySelector(`button[onclick="copyText('${className}')"]`); if (!button) {return;} // Copy text navigator.clipboard.writeText(textToCopy) .then(() => { // Store original content const originalContent = button.innerHTML; // Change button content button.innerHTML = 'Copied!'; // Reset after 1 second setTimeout(() => { button.innerHTML = originalContent; }, 1000); }) .catch((err) => { console.error('Failed to copy text:', err); const originalContent = button.innerHTML; button.innerHTML = 'Failed to copy'; setTimeout(() => { button.innerHTML = originalContent; }, 1000); }); } // Helper functions for toggling result blocks function showLoading() { document.getElementById('results-loading').style.display = 'block'; document.getElementById('results-section').style.display = 'none'; document.getElementById('results-error').style.display = 'none'; } function showResults() { document.getElementById('results-loading').style.display = 'none'; document.getElementById('results-section').style.display = 'block'; document.getElementById('results-error').style.display = 'none'; } function showError(msg) { document.getElementById('results-loading').style.display = 'none'; document.getElementById('results-section').style.display = 'none'; const errorDiv = document.getElementById('results-error'); errorDiv.innerHTML = msg; errorDiv.style.display = 'block'; } // Helper function to collect form data function collectFormData(form) { const json_data = {}; const inputText = form.querySelector('[name="input_text"]'); const token = form.querySelector('[name="token"]'); const hiddenInput = document.getElementById('max_file_size_kb'); const patternType = document.getElementById('pattern_type'); const pattern = document.getElementById('pattern'); if (inputText) {json_data.input_text = inputText.value;} if (token) {json_data.token = token.value;} if (hiddenInput) {json_data.max_file_size = hiddenInput.value;} if (patternType) {json_data.pattern_type = patternType.value;} if (pattern) {json_data.pattern = pattern.value;} return json_data; } // Helper function to manage button loading state function setButtonLoadingState(submitButton, isLoading) { if (!isLoading) { submitButton.disabled = false; submitButton.innerHTML = submitButton.getAttribute('data-original-content') || 'Submit'; submitButton.classList.remove('bg-[#ffb14d]'); return; } // Store original content if not already stored if (!submitButton.getAttribute('data-original-content')) { submitButton.setAttribute('data-original-content', submitButton.innerHTML); } submitButton.disabled = true; submitButton.innerHTML = `
Processing...
`; submitButton.classList.add('bg-[#ffb14d]'); } // Helper function to handle successful response function handleSuccessfulResponse(data) { // Show results section showResults(); // Store the digest_url for download functionality window.currentDigestUrl = data.digest_url; // Set plain text content for summary, tree, and content document.getElementById('result-summary').value = data.summary || ''; document.getElementById('directory-structure-content').value = data.tree || ''; document.getElementById('result-content').value = data.content || ''; // Populate directory structure lines as clickable
 elements
    const dirPre = document.getElementById('directory-structure-pre');

    if (dirPre && data.tree) {
        dirPre.innerHTML = '';
        data.tree.split('\n').forEach((line) => {
            const pre = document.createElement('pre');

            pre.setAttribute('name', 'tree-line');
            pre.className = 'cursor-pointer hover:line-through hover:text-gray-500';
            pre.textContent = line;
            pre.onclick = function () { toggleFile(this); };
            dirPre.appendChild(pre);
        });
    }

    // Scroll to results
    document.getElementById('results-section').scrollIntoView({ behavior: 'smooth', block: 'start' });
}

function handleSubmit(event, showLoadingSpinner = false) {
    event.preventDefault();
    const form = event.target || document.getElementById('ingestForm');

    if (!form) {return;}

    // Ensure hidden input is updated before collecting form data
    const slider = document.getElementById('file_size');
    const hiddenInput = document.getElementById('max_file_size_kb');

    if (slider && hiddenInput) {
        hiddenInput.value = logSliderToSize(slider.value);
    }

    if (showLoadingSpinner) {
        showLoading();
    }

    const submitButton = form.querySelector('button[type="submit"]');

    if (!submitButton) {return;}

    const json_data = collectFormData(form);

    if (showLoadingSpinner) {
        setButtonLoadingState(submitButton, true);
    }

    // Submit the form to /api/ingest as JSON
    fetch('/api/ingest', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(json_data)
    })
        .then(async (response) => {
            let data;

            try {
                data = await response.json();
            } catch {
                data = {};
            }
            setButtonLoadingState(submitButton, false);

            if (!response.ok) {
                // Show all error details if present
                if (Array.isArray(data.detail)) {
                    const details = data.detail.map((d) => `
  • ${d.msg || JSON.stringify(d)}
  • `).join(''); showError(`
    Error(s):
      ${details}
    `); return; } // Other errors showError(`
    ${data.error || JSON.stringify(data) || 'An error occurred.'}
    `); return; } // Handle error in data if (data.error) { showError(`
    ${data.error}
    `); return; } handleSuccessfulResponse(data); }) .catch((error) => { setButtonLoadingState(submitButton, false); showError(`
    ${error}
    `); }); } function copyFullDigest() { const directoryStructure = document.getElementById('directory-structure-content').value; const filesContent = document.querySelector('.result-text').value; const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; const button = document.querySelector('[onclick="copyFullDigest()"]'); const originalText = button.innerHTML; navigator.clipboard.writeText(fullDigest).then(() => { button.innerHTML = ` Copied! `; setTimeout(() => { button.innerHTML = originalText; }, 2000); }) .catch((err) => { console.error('Failed to copy text: ', err); }); } function downloadFullDigest() { // Check if we have a digest_url if (!window.currentDigestUrl) { console.error('No digest_url available for download'); return; } // Show feedback on the button const button = document.querySelector('[onclick="downloadFullDigest()"]'); const originalText = button.innerHTML; button.innerHTML = ` Downloading... `; // Create a download link using the digest_url const a = document.createElement('a'); a.href = window.currentDigestUrl; a.download = 'digest.txt'; document.body.appendChild(a); a.click(); // Clean up document.body.removeChild(a); // Update button to show success button.innerHTML = ` Downloaded! `; setTimeout(() => { button.innerHTML = originalText; }, 2000); } // Add the logSliderToSize helper function function logSliderToSize(position) { const maxPosition = 500; const maxValue = Math.log(102400); // 100 MB const value = Math.exp(maxValue * (position / maxPosition)**1.5); return Math.round(value); } // Move slider initialization to a separate function function initializeSlider() { const slider = document.getElementById('file_size'); const sizeValue = document.getElementById('size_value'); const hiddenInput = document.getElementById('max_file_size_kb'); if (!slider || !sizeValue || !hiddenInput) {return;} function updateSlider() { const value = logSliderToSize(slider.value); sizeValue.textContent = formatSize(value); slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; hiddenInput.value = value; // Set hidden input to KB value } // Update on slider change slider.addEventListener('input', updateSlider); // Initialize slider position updateSlider(); } // Add helper function for formatting size function formatSize(sizeInKB) { if (sizeInKB >= 1024) { return `${ Math.round(sizeInKB / 1024) }MB`; } return `${ Math.round(sizeInKB) }kB`; } // Add this new function function setupGlobalEnterHandler() { document.addEventListener('keydown', (event) => { if (event.key === 'Enter' && !event.target.matches('textarea')) { const form = document.getElementById('ingestForm'); if (form) { handleSubmit(new Event('submit'), true); } } }); } // Add to the DOMContentLoaded event listener document.addEventListener('DOMContentLoaded', () => { initializeSlider(); setupGlobalEnterHandler(); }); // Make sure these are available globally window.handleSubmit = handleSubmit; window.toggleFile = toggleFile; window.copyText = copyText; window.copyFullDigest = copyFullDigest; window.downloadFullDigest = downloadFullDigest; ================================================ FILE: src/static/llms.txt ================================================ # GitIngest – **AI Agent Integration Guide** Turn any Git repository into a prompt-ready text digest. GitIngest fetches, cleans, and formats source code so AI agents and Large Language Models can reason over complete projects programmatically. **🤖 For AI Agents**: Use CLI or Python package for automated integration. Web UI is designed for human interaction only. --- ## 1. Installation ### 1.1 CLI Installation (Recommended for Scripts & Automation) ```bash # Best practice: Use pipx for CLI tools (isolated environment) pipx install gitingest # Alternative: Use pip (may conflict with other packages) pip install gitingest # Verify installation gitingest --help ``` ### 1.2 Python Package Installation (For Code Integration) ```bash # For projects/notebooks: Use pip in virtual environment python -m venv gitingest-env source gitingest-env/bin/activate # On Windows: gitingest-env\Scripts\activate pip install gitingest # Or add to requirements.txt echo "gitingest" >> requirements.txt pip install -r requirements.txt # For self-hosting: Install with server dependencies pip install gitingest[server] # For development: Install with dev dependencies pip install gitingest[dev,server] ``` ### 1.3 Installation Verification ```bash # Test CLI installation gitingest --version # Test Python package python -c "from gitingest import ingest; print('GitIngest installed successfully')" # Quick functionality test gitingest https://github.com/octocat/Hello-World -o test_output.txt ``` --- ## 2. Quick-Start for AI Agents | Method | Best for | One-liner | |--------|----------|-----------| | **CLI** | Scripts, automation, pipelines | `gitingest https://github.com/user/repo -o - \| your-llm` | | **Python** | Code integration, notebooks, async tasks | `from gitingest import ingest; s,t,c = ingest('repo-url'); process(c)` | | **URL Hack** | Quick web scraping (limited) | Replace `github.com` → `gitingest.com` in any GitHub URL | | **Web UI** | **Human use only** | ~~Not recommended for AI agents~~ | --- ## 3. Output Format for AI Processing GitIngest returns **structured plain-text** optimized for LLM consumption with three distinct sections: ### 3.1 Repository Summary ``` Repository: owner/repo-name Files analyzed: 42 Estimated tokens: 15.2k ``` Contains basic metadata: repository name, file count, and token estimation for LLM planning. ### 3.2 Directory Structure ``` Directory structure: └── project-name/ ├── src/ │ ├── main.py │ └── utils.py ├── tests/ │ └── test_main.py └── README.md ``` Hierarchical tree view showing the complete project structure for context and navigation. ### 3.3 File Contents Each file is wrapped with clear delimiters: ``` ================================================ FILE: src/main.py ================================================ def hello_world(): print("Hello, World!") if __name__ == "__main__": hello_world() ================================================ FILE: README.md ================================================ # Project Title This is a sample project... ``` ### 3.4 Usage Example ```python # Python package usage from gitingest import ingest summary, tree, content = ingest("https://github.com/octocat/Hello-World") # Returns exactly: # summary = "Repository: octocat/hello-world\nFiles analyzed: 1\nEstimated tokens: 29" # tree = "Directory structure:\n└── octocat-hello-world/\n └── README" # content = "================================================\nFILE: README\n================================================\nHello World!\n\n\n" # For AI processing, combine all sections: full_context = f"{summary}\n\n{tree}\n\n{content}" ``` ```bash # CLI usage - pipe directly to your AI system gitingest https://github.com/octocat/Hello-World -o - | your_llm_processor # Output streams the complete formatted text: # Repository: octocat/hello-world # Files analyzed: 1 # Estimated tokens: 29 # # Directory structure: # └── octocat-hello-world/ # └── README # # ================================================ # FILE: README # ================================================ # Hello World! ``` --- ## 4. AI Agent Integration Methods ### 4.1 CLI Integration (Recommended for Automation) ```bash # Basic usage - pipe directly to your AI system gitingest https://github.com/user/repo -o - | your_ai_processor # Advanced filtering for focused analysis (long flags) gitingest https://github.com/user/repo \ --include-pattern "*.py" --include-pattern "*.js" --include-pattern "*.md" \ --max-size 102400 \ -o - | python your_analyzer.py # Same command with short flags (more concise) gitingest https://github.com/user/repo \ -i "*.py" -i "*.js" -i "*.md" \ -s 102400 \ -o - | python your_analyzer.py # Exclude unwanted files and directories (long flags) gitingest https://github.com/user/repo \ --exclude-pattern "node_modules/*" --exclude-pattern "*.log" \ --exclude-pattern "dist/*" \ -o - | your_analyzer # Same with short flags gitingest https://github.com/user/repo \ -e "node_modules/*" -e "*.log" -e "dist/*" \ -o - | your_analyzer # Private repositories with token (short flag) export GITHUB_TOKEN="ghp_your_token_here" gitingest https://github.com/user/private-repo -t $GITHUB_TOKEN -o - # Specific branch analysis (short flag) gitingest https://github.com/user/repo -b main -o - # Save to file (default: digest.txt in current directory) gitingest https://github.com/user/repo -o my_analysis.txt # Ultra-concise example for small files only gitingest https://github.com/user/repo -i "*.py" -s 51200 -o - ``` **Key Parameters for AI Agents**: - `-s` / `--max-size`: Maximum file size in bytes to process (default: no limit) - `-i` / `--include-pattern`: Include files matching Unix shell-style wildcards - `-e` / `--exclude-pattern`: Exclude files matching Unix shell-style wildcards - `-b` / `--branch`: Specify branch to analyze (defaults to repository's default branch) - `-t` / `--token`: GitHub personal access token for private repositories - `-o` / `--output`: Stream to STDOUT with `-` (default saves to `digest.txt`) ### 4.2 Python Package (Best for Code Integration) ```python from gitingest import ingest, ingest_async import asyncio # Synchronous processing def analyze_repository(repo_url: str): summary, tree, content = ingest(repo_url) # Process metadata repo_info = parse_summary(summary) # Analyze structure file_structure = parse_tree(tree) # Process code content return analyze_code(content) # Asynchronous processing (recommended for AI services) async def batch_analyze_repos(repo_urls: list): tasks = [ingest_async(url) for url in repo_urls] results = await asyncio.gather(*tasks) return [process_repo_data(*result) for result in results] # Memory-efficient processing for large repos def stream_process_repo(repo_url: str): summary, tree, content = ingest( repo_url, max_file_size=51200, # 50KB max per file include_patterns=["*.py", "*.js"], # Focus on code files ) # Process in chunks to manage memory for file_content in split_content(content): yield analyze_file(file_content) # Filtering with exclude patterns def analyze_without_deps(repo_url: str): summary, tree, content = ingest( repo_url, exclude_patterns=[ "node_modules/*", "*.lock", "dist/*", "build/*", "*.min.js", "*.log" ] ) return analyze_code(content) ``` **Python Integration Patterns**: - **Batch Processing**: Use `ingest_async` for multiple repositories - **Memory Management**: Use `max_file_size` and pattern filtering for large repos - **Error Handling**: Wrap in try-catch for network/auth issues - **Caching**: Store results to avoid repeated API calls - **Pattern Filtering**: Use `include_patterns` and `exclude_patterns` lists ### 4.3 Web UI (❌ Not for AI Agents) The web interface at `https://gitingest.com` is designed for **human interaction only**. **Why AI agents should avoid the web UI**: - Requires manual interaction and browser automation - No programmatic access to results - Rate limiting and CAPTCHA protection - Inefficient for automated workflows **Use CLI or Python package instead** for all AI agent integrations. --- ## 5. AI Agent Best Practices ### 5.1 Repository Analysis Workflows ```python # Pattern 1: Full repository analysis def full_repo_analysis(repo_url: str): summary, tree, content = ingest(repo_url) return { 'metadata': extract_metadata(summary), 'structure': analyze_structure(tree), 'code_analysis': analyze_all_files(content), 'insights': generate_insights(summary, tree, content) } # Pattern 2: Selective file processing def selective_analysis(repo_url: str, file_patterns: list): summary, tree, content = ingest( repo_url, include_patterns=file_patterns ) return focused_analysis(content) # Pattern 3: Streaming for large repos def stream_analysis(repo_url: str): # First pass: get structure and metadata only summary, tree, _ = ingest( repo_url, include_patterns=["*.md", "*.txt"], max_file_size=10240 # 10KB limit for docs ) # Then process code files selectively by language for pattern in ["*.py", "*.js", "*.go", "*.rs"]: _, _, content = ingest( repo_url, include_patterns=[pattern], max_file_size=51200 # 50KB limit for code ) yield process_language_specific(content, pattern) ``` ### 5.2 Error Handling for AI Agents ```python from gitingest import ingest from gitingest.utils.exceptions import GitIngestError import time def robust_ingest(repo_url: str, retries: int = 3): for attempt in range(retries): try: return ingest(repo_url) except GitIngestError as e: if attempt == retries - 1: return None, None, f"Failed to ingest: {e}" time.sleep(2 ** attempt) # Exponential backoff ``` ### 5.3 Private Repository Access ```python import os from gitingest import ingest # Method 1: Environment variable def ingest_private_repo(repo_url: str): token = os.getenv('GITHUB_TOKEN') if not token: raise ValueError("GITHUB_TOKEN environment variable required") return ingest(repo_url, token=token) # Method 2: Secure token management def ingest_with_token_rotation(repo_url: str, token_manager): token = token_manager.get_active_token() try: return ingest(repo_url, token=token) except AuthenticationError: token = token_manager.rotate_token() return ingest(repo_url, token=token) ``` --- ## 6. Integration Scenarios for AI Agents | Use Case | Recommended Method | Example Implementation | |----------|-------------------|----------------------| | **Code Review Bot** | Python async | `await ingest_async(pr_repo)` → analyze changes | | **Documentation Generator** | CLI with filtering | `gitingest repo -i "*.py" -i "*.md" -o -` | | **Vulnerability Scanner** | Python with error handling | Batch process multiple repos | | **Code Search Engine** | CLI → Vector DB | `gitingest repo -o - \| embed \| store` | | **AI Coding Assistant** | Python integration | Load repo context into conversation | | **CI/CD Analysis** | CLI integration | `gitingest repo -o - \| analyze_pipeline` | | **Repository Summarization** | Python with streaming | Process large repos in chunks | | **Dependency Analysis** | CLI exclude patterns | `gitingest repo -e "node_modules/*" -e "*.lock" -o -` | | **Security Audit** | CLI with size limits | `gitingest repo -i "*.py" -i "*.js" -s 204800 -o -` | --- ## 7. Support & Resources for AI Developers * **Web UI official instance**: https://gitingest.com * **GitHub Repository**: https://github.com/coderamp-labs/gitingest * **Python Package**: https://pypi.org/project/gitingest/ * **Community Support**: https://discord.gg/zerRaGK9EC _GitIngest – Purpose-built for AI agents to understand entire codebases programmatically._ ================================================ FILE: src/static/robots.txt ================================================ User-agent: * Allow: / Allow: /api/ Allow: /coderamp-labs/gitingest/ ================================================ FILE: tests/.pylintrc ================================================ [MASTER] init-hook= import sys sys.path.append('./src') [MESSAGES CONTROL] disable=missing-class-docstring,missing-function-docstring,protected-access,fixme [FORMAT] max-line-length=119 ================================================ FILE: tests/__init__.py ================================================ """Tests for the gitingest package.""" ================================================ FILE: tests/conftest.py ================================================ """Fixtures for tests. This file provides shared fixtures for creating sample queries, a temporary directory structure, and a helper function to write ``.ipynb`` notebooks for testing notebook utilities. """ from __future__ import annotations import json import sys import uuid from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict from unittest.mock import AsyncMock, MagicMock import pytest from gitingest.query_parser import IngestionQuery if TYPE_CHECKING: from pytest_mock import MockerFixture WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] DEMO_URL = "https://github.com/user/repo" LOCAL_REPO_PATH = "/tmp/repo" DEMO_COMMIT = "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef" def get_ensure_git_installed_call_count() -> int: """Get the number of calls made by ensure_git_installed based on platform. On Windows, ensure_git_installed makes 2 calls: 1. git --version 2. git config core.longpaths On other platforms, it makes 1 call: 1. git --version Returns ------- int The number of calls made by ensure_git_installed """ return 2 if sys.platform == "win32" else 1 @pytest.fixture def sample_query() -> IngestionQuery: """Provide a default ``IngestionQuery`` object for use in tests. This fixture returns a ``IngestionQuery`` pre-populated with typical fields and some default ignore patterns. Returns ------- IngestionQuery The sample ``IngestionQuery`` object. """ return IngestionQuery( user_name="test_user", repo_name="test_repo", local_path=Path("/tmp/test_repo").resolve(), slug="test_user/test_repo", id=uuid.uuid4(), branch="main", max_file_size=1_000_000, ignore_patterns={"*.pyc", "__pycache__", ".git"}, ) @pytest.fixture def temp_directory(tmp_path: Path) -> Path: """Create a temporary directory structure for testing repository scanning. The structure includes: test_repo/ ├── file1.txt ├── file2.py ├── src/ │ ├── subfile1.txt │ ├── subfile2.py │ └── subdir/ │ ├── file_subdir.txt │ └── file_subdir.py ├── dir1/ │ └── file_dir1.txt └── dir2/ └── file_dir2.txt Parameters ---------- tmp_path : Path The temporary directory path provided by the ``tmp_path`` fixture. Returns ------- Path The path to the created ``test_repo`` directory. """ test_dir = tmp_path / "test_repo" test_dir.mkdir() # Root files (test_dir / "file1.txt").write_text("Hello World") (test_dir / "file2.py").write_text("print('Hello')") # src directory and its files src_dir = test_dir / "src" src_dir.mkdir() (src_dir / "subfile1.txt").write_text("Hello from src") (src_dir / "subfile2.py").write_text("print('Hello from src')") # src/subdir and its files subdir = src_dir / "subdir" subdir.mkdir() (subdir / "file_subdir.txt").write_text("Hello from subdir") (subdir / "file_subdir.py").write_text("print('Hello from subdir')") # dir1 and its file dir1 = test_dir / "dir1" dir1.mkdir() (dir1 / "file_dir1.txt").write_text("Hello from dir1") # dir2 and its file dir2 = test_dir / "dir2" dir2.mkdir() (dir2 / "file_dir2.txt").write_text("Hello from dir2") return test_dir @pytest.fixture def write_notebook(tmp_path: Path) -> WriteNotebookFunc: """Provide a helper function to write a ``.ipynb`` notebook file with the given content. Parameters ---------- tmp_path : Path The temporary directory path provided by the ``tmp_path`` fixture. Returns ------- WriteNotebookFunc A callable that accepts a filename and a dictionary (representing JSON notebook data), writes it to a ``.ipynb`` file, and returns the path to the file. """ def _write_notebook(name: str, content: dict[str, Any]) -> Path: notebook_path = tmp_path / name with notebook_path.open(mode="w", encoding="utf-8") as f: json.dump(content, f) return notebook_path return _write_notebook @pytest.fixture def stub_resolve_sha(mocker: MockerFixture) -> dict[str, AsyncMock]: """Patch *both* async helpers that hit the network. Include this fixture *only* in tests that should stay offline. """ head_mock = mocker.patch( "gitingest.utils.query_parser_utils._resolve_ref_to_sha", new_callable=mocker.AsyncMock, return_value=DEMO_COMMIT, ) ref_mock = mocker.patch( "gitingest.utils.git_utils._resolve_ref_to_sha", new_callable=mocker.AsyncMock, return_value=DEMO_COMMIT, ) # return whichever you want to assert on; here we return the dict return {"head": head_mock, "ref": ref_mock} @pytest.fixture def stub_branches(mocker: MockerFixture) -> Callable[[list[str]], None]: """Return a function that stubs git branch discovery to *branches*.""" def _factory(branches: list[str]) -> None: # Patch the GitPython fetch function mocker.patch( "gitingest.utils.git_utils.fetch_remote_branches_or_tags", new_callable=AsyncMock, return_value=branches, ) # Patch GitPython's ls_remote method to return the mocked output ls_remote_output = "\n".join(f"{DEMO_COMMIT[:12]}{i:02d}\trefs/heads/{b}" for i, b in enumerate(branches)) mock_git_cmd = mocker.patch("git.Git") mock_git_cmd.return_value.ls_remote.return_value = ls_remote_output # Also patch the git module imports in our utils mocker.patch("gitingest.utils.git_utils.git.Git", return_value=mock_git_cmd.return_value) return _factory @pytest.fixture def repo_exists_true(mocker: MockerFixture) -> AsyncMock: """Patch ``gitingest.clone.check_repo_exists`` to always return ``True``.""" return mocker.patch("gitingest.clone.check_repo_exists", return_value=True) @pytest.fixture def run_command_mock(mocker: MockerFixture) -> AsyncMock: """Patch ``gitingest.clone.run_command`` with an ``AsyncMock``. The mocked function returns a dummy process whose ``communicate`` method yields generic ``stdout`` / ``stderr`` bytes. Tests can still access / tweak the mock via the fixture argument. """ mock = AsyncMock(side_effect=_fake_run_command) mocker.patch("gitingest.utils.git_utils.run_command", mock) # Mock GitPython components _setup_gitpython_mocks(mocker) return mock @pytest.fixture def gitpython_mocks(mocker: MockerFixture) -> dict[str, MagicMock]: """Provide comprehensive GitPython mocks for testing.""" return _setup_gitpython_mocks(mocker) def _setup_gitpython_mocks(mocker: MockerFixture) -> dict[str, MagicMock]: """Set up comprehensive GitPython mocks.""" # Mock git.Git class mock_git_cmd = MagicMock() mock_git_cmd.version.return_value = "git version 2.34.1" mock_git_cmd.config.return_value = "true" mock_git_cmd.execute.return_value = f"{DEMO_COMMIT}\trefs/heads/main\n" mock_git_cmd.ls_remote.return_value = f"{DEMO_COMMIT}\trefs/heads/main\n" mock_git_cmd.clone.return_value = "" # Mock git.Repo class mock_repo = MagicMock() mock_repo.git = MagicMock() mock_repo.git.fetch = MagicMock() mock_repo.git.checkout = MagicMock() mock_repo.git.submodule = MagicMock() mock_repo.git.execute = MagicMock() mock_repo.git.config = MagicMock() mock_repo.git.sparse_checkout = MagicMock() # Mock git.Repo.clone_from mock_clone_from = MagicMock(return_value=mock_repo) git_git_mock = mocker.patch("git.Git", return_value=mock_git_cmd) git_repo_mock = mocker.patch("git.Repo", return_value=mock_repo) mocker.patch("git.Repo.clone_from", mock_clone_from) # Patch imports in our modules mocker.patch("gitingest.utils.git_utils.git.Git", return_value=mock_git_cmd) mocker.patch("gitingest.utils.git_utils.git.Repo", return_value=mock_repo) mocker.patch("gitingest.clone.git.Git", return_value=mock_git_cmd) mocker.patch("gitingest.clone.git.Repo", return_value=mock_repo) mocker.patch("gitingest.clone.git.Repo.clone_from", mock_clone_from) return { "git_cmd": mock_git_cmd, "repo": mock_repo, "clone_from": mock_clone_from, "git_git_mock": git_git_mock, "git_repo_mock": git_repo_mock, } async def _fake_run_command(*args: str) -> tuple[bytes, bytes]: if "ls-remote" in args: # single match: refs/heads/main return (f"{DEMO_COMMIT}\trefs/heads/main\n".encode(), b"") return (b"output", b"error") ================================================ FILE: tests/query_parser/__init__.py ================================================ """Tests for the query parser.""" ================================================ FILE: tests/query_parser/test_git_host_agnostic.py ================================================ """Tests to verify that the query parser is Git host agnostic. These tests confirm that ``parse_query`` correctly identifies user/repo pairs and canonical URLs for GitHub, GitLab, Bitbucket, Gitea, and Codeberg, even if the host is omitted. """ from __future__ import annotations import pytest from gitingest.config import MAX_FILE_SIZE from gitingest.query_parser import parse_remote_repo from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS, _is_valid_git_commit_hash # Repository matrix: (host, user, repo) _REPOS: list[tuple[str, str, str]] = [ ("github.com", "fastapi", "fastapi"), ("gitlab.com", "gitlab-org", "gitlab-runner"), ("bitbucket.org", "na-dna", "llm-knowledge-share"), ("gitea.com", "xorm", "xorm"), ("codeberg.org", "forgejo", "forgejo"), ("git.rwth-aachen.de", "medialab", "19squared"), ("gitlab.alpinelinux.org", "alpine", "apk-tools"), ] # Generate cartesian product of repository tuples with URL variants. @pytest.mark.parametrize(("host", "user", "repo"), _REPOS, ids=[f"{h}:{u}/{r}" for h, u, r in _REPOS]) @pytest.mark.parametrize("variant", ["full", "noscheme", "slug"]) @pytest.mark.asyncio async def test_parse_query_without_host( host: str, user: str, repo: str, variant: str, ) -> None: """Verify that ``parse_remote_repo`` handles URLs, host-omitted URLs and raw slugs.""" # Build the input URL based on the selected variant if variant == "full": url = f"https://{host}/{user}/{repo}" elif variant == "noscheme": url = f"{host}/{user}/{repo}" else: # "slug" url = f"{user}/{repo}" expected_url = f"https://{host}/{user}/{repo}" # For slug form with a custom host (not in KNOWN_GIT_HOSTS) we expect a failure, # because the parser cannot guess which domain to use. if variant == "slug" and host not in KNOWN_GIT_HOSTS: with pytest.raises(ValueError, match="Could not find a valid repository host"): await parse_remote_repo(url) return query = await parse_remote_repo(url) # Compare against the canonical dict while ignoring unpredictable fields. actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"}) assert "commit" in actual assert _is_valid_git_commit_hash(actual["commit"]) del actual["commit"] expected = { "host": host, "user_name": user, "repo_name": repo, "url": expected_url, "slug": f"{user}-{repo}", "subpath": "/", "type": None, "branch": None, "tag": None, "max_file_size": MAX_FILE_SIZE, "include_patterns": None, "include_submodules": False, } assert actual == expected ================================================ FILE: tests/query_parser/test_query_parser.py ================================================ """Tests for the ``query_parser`` module. These tests cover URL parsing, pattern parsing, and handling of branches/subpaths for HTTP(S) repositories and local paths. """ # pylint: disable=too-many-arguments, too-many-positional-arguments from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Callable import pytest from gitingest.query_parser import parse_local_dir_path, parse_remote_repo from gitingest.utils.query_parser_utils import _is_valid_git_commit_hash from tests.conftest import DEMO_URL if TYPE_CHECKING: from unittest.mock import AsyncMock from gitingest.schemas import IngestionQuery URLS_HTTPS: list[str] = [ DEMO_URL, "https://gitlab.com/user/repo", "https://bitbucket.org/user/repo", "https://gitea.com/user/repo", "https://codeberg.org/user/repo", "https://gist.github.com/user/repo", "https://git.example.com/user/repo", "https://gitlab.example.com/user/repo", "https://gitlab.example.se/user/repo", ] URLS_HTTP: list[str] = [url.replace("https://", "http://") for url in URLS_HTTPS] @pytest.mark.parametrize("url", URLS_HTTPS, ids=lambda u: u) @pytest.mark.asyncio async def test_parse_url_valid_https(url: str, stub_resolve_sha: dict[str, AsyncMock]) -> None: """Valid HTTPS URLs parse correctly and ``query.url`` equals the input.""" query = await _assert_basic_repo_fields(url, stub_resolve_sha["head"]) assert query.url == url # HTTPS: canonical URL should equal input @pytest.mark.parametrize("url", URLS_HTTP, ids=lambda u: u) @pytest.mark.asyncio async def test_parse_url_valid_http(url: str, stub_resolve_sha: dict[str, AsyncMock]) -> None: """Valid HTTP URLs parse correctly (slug check only).""" await _assert_basic_repo_fields(url, stub_resolve_sha["head"]) @pytest.mark.asyncio async def test_parse_url_invalid(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with an invalid URL. Given an HTTPS URL lacking a repository structure (e.g., "https://github.com"), When ``parse_remote_repo`` is called, Then a ValueError should be raised indicating an invalid repository URL. """ url = "https://github.com" with pytest.raises(ValueError, match="Invalid repository URL"): await parse_remote_repo(url) stub_resolve_sha["head"].assert_not_awaited() @pytest.mark.asyncio @pytest.mark.parametrize("url", [DEMO_URL, "https://gitlab.com/user/repo"]) async def test_parse_query_basic(url: str, stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with a basic valid repository URL. Given an HTTPS URL: When ``parse_remote_repo`` is called, Then user/repo, URL should be parsed correctly. """ query = await parse_remote_repo(url) stub_resolve_sha["head"].assert_awaited_once() assert query.user_name == "user" assert query.repo_name == "repo" assert query.url == url @pytest.mark.asyncio async def test_parse_query_mixed_case(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with mixed-case URLs. Given a URL with mixed-case parts (e.g. "Https://GitHub.COM/UsEr/rEpO"): When ``parse_remote_repo`` is called, Then the user and repo names should be normalized to lowercase. """ url = "Https://GitHub.COM/UsEr/rEpO" query = await parse_remote_repo(url) stub_resolve_sha["head"].assert_awaited_once() assert query.user_name == "user" assert query.repo_name == "repo" @pytest.mark.asyncio async def test_parse_url_with_subpaths( stub_branches: Callable[[list[str]], None], stub_resolve_sha: dict[str, AsyncMock], ) -> None: """Test ``parse_remote_repo`` with a URL containing branch and subpath. Given a URL referencing a branch ("main") and a subdir ("subdir/file"): When ``parse_remote_repo`` is called with remote branch fetching, Then user, repo, branch, and subpath should be identified correctly. """ url = DEMO_URL + "/tree/main/subdir/file" stub_branches(["main", "dev", "feature-branch"]) query = await _assert_basic_repo_fields(url, stub_resolve_sha["ref"]) assert query.user_name == "user" assert query.repo_name == "repo" assert query.branch == "main" assert query.subpath == "/subdir/file" @pytest.mark.asyncio async def test_parse_url_invalid_repo_structure(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with a URL missing a repository name. Given a URL like "https://github.com/user": When ``parse_remote_repo`` is called, Then a ValueError should be raised indicating an invalid repository URL. """ url = "https://github.com/user" with pytest.raises(ValueError, match="Invalid repository URL"): await parse_remote_repo(url) stub_resolve_sha["head"].assert_not_awaited() async def test_parse_local_dir_path_local_path() -> None: """Test ``parse_local_dir_path``. Given "/home/user/project": When ``parse_local_dir_path`` is called, Then the local path should be set, id generated, and slug formed accordingly. """ path = "/home/user/project" query = parse_local_dir_path(path) tail = Path("home/user/project") assert query.local_path.parts[-len(tail.parts) :] == tail.parts assert query.id is not None assert query.slug == "home/user/project" async def test_parse_local_dir_path_relative_path() -> None: """Test ``parse_local_dir_path`` with a relative path. Given "./project": When ``parse_local_dir_path`` is called, Then ``local_path`` resolves relatively, and ``slug`` ends with "project". """ path = "./project" query = parse_local_dir_path(path) tail = Path("project") assert query.local_path.parts[-len(tail.parts) :] == tail.parts assert query.slug.endswith("project") @pytest.mark.asyncio async def test_parse_remote_repo_empty_source(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with an empty string. Given an empty source string: When ``parse_remote_repo`` is called, Then a ValueError should be raised indicating an invalid repository URL. """ url = "" with pytest.raises(ValueError, match="Invalid repository URL"): await parse_remote_repo(url) stub_resolve_sha["head"].assert_not_awaited() @pytest.mark.asyncio @pytest.mark.parametrize( ("path", "expected_branch", "mock_name"), [ ("/tree/main", "main", "ref"), ("/tree/abcd1234abcd1234abcd1234abcd1234abcd1234", None, "ref"), ], ) async def test_parse_url_branch_and_commit_distinction( path: str, expected_branch: str, stub_branches: Callable[[list[str]], None], stub_resolve_sha: dict[str, AsyncMock], mock_name: str, ) -> None: """Test ``parse_remote_repo`` distinguishing branch vs. commit hash. Given either a branch URL (e.g., ".../tree/main") or a 40-character commit URL: When ``parse_remote_repo`` is called with branch fetching, Then the function should correctly set ``branch`` or ``commit`` based on the URL content. """ stub_branches(["main", "dev", "feature-branch"]) url = DEMO_URL + path query = await _assert_basic_repo_fields(url, stub_resolve_sha[mock_name]) assert query.branch == expected_branch assert query.commit is not None assert _is_valid_git_commit_hash(query.commit) async def test_parse_local_dir_path_uuid_uniqueness() -> None: """Test ``parse_local_dir_path`` for unique UUID generation. Given the same path twice: When ``parse_local_dir_path`` is called repeatedly, Then each call should produce a different query id. """ path = "/home/user/project" query_1 = parse_local_dir_path(path) query_2 = parse_local_dir_path(path) assert query_1.id != query_2.id @pytest.mark.asyncio async def test_parse_url_with_query_and_fragment(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with query parameters and a fragment. Given a URL like "https://github.com/user/repo?arg=value#fragment": When ``parse_remote_repo`` is called, Then those parts should be stripped, leaving a clean user/repo URL. """ url = DEMO_URL + "?arg=value#fragment" query = await parse_remote_repo(url) stub_resolve_sha["head"].assert_awaited_once() assert query.user_name == "user" assert query.repo_name == "repo" assert query.url == DEMO_URL # URL should be cleaned @pytest.mark.asyncio async def test_parse_url_unsupported_host(stub_resolve_sha: dict[str, AsyncMock]) -> None: """Test ``parse_remote_repo`` with an unsupported host. Given "https://only-domain.com": When ``parse_remote_repo`` is called, Then a ValueError should be raised for the unknown domain. """ url = "https://only-domain.com" with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"): await parse_remote_repo(url) stub_resolve_sha["head"].assert_not_awaited() @pytest.mark.asyncio async def test_parse_query_with_branch() -> None: """Test ``parse_remote_repo`` when a branch is specified in a blob path. Given "https://github.com/pandas-dev/pandas/blob/2.2.x/...": When ``parse_remote_repo`` is called, Then the branch should be identified, subpath set, and commit remain None. """ url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" query = await parse_remote_repo(url) assert query.user_name == "pandas-dev" assert query.repo_name == "pandas" assert query.url == "https://github.com/pandas-dev/pandas" assert query.slug == "pandas-dev-pandas" assert query.id is not None assert query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" assert query.branch == "2.2.x" assert query.commit is not None assert _is_valid_git_commit_hash(query.commit) assert query.type == "blob" @pytest.mark.asyncio @pytest.mark.parametrize( ("path", "expected_branch", "expected_subpath", "mock_name"), [ ("/tree/feature/fix1/src", "feature/fix1", "/src", "ref"), ("/tree/main/src", "main", "/src", "ref"), ("", None, "/", "head"), ("/tree/nonexistent-branch/src", None, "/", "ref"), ("/tree/fix", "fix", "/", "ref"), ("/blob/fix/page.html", "fix", "/page.html", "ref"), ], ) async def test_parse_repo_source_with_various_url_patterns( path: str, expected_branch: str | None, expected_subpath: str, stub_branches: Callable[[list[str]], None], stub_resolve_sha: dict[str, AsyncMock], mock_name: str, ) -> None: """Test ``parse_remote_repo`` with various GitHub-style URL permutations. Given various GitHub-style URL permutations: When ``parse_remote_repo`` is called, Then it should detect (or reject) a branch and resolve the sub-path. Branch discovery is stubbed so that only names passed to ``stub_branches`` are considered "remote". """ stub_branches(["feature/fix1", "main", "feature-branch", "fix"]) url = DEMO_URL + path query = await _assert_basic_repo_fields(url, stub_resolve_sha[mock_name]) assert query.branch == expected_branch assert query.subpath == expected_subpath @pytest.mark.asyncio async def _assert_basic_repo_fields(url: str, sha_mock: AsyncMock) -> IngestionQuery: """Run ``parse_remote_repo`` and assert user, repo and slug are parsed.""" query = await parse_remote_repo(url) assert query.commit is not None assert _is_valid_git_commit_hash(query.commit) if query.commit in url: sha_mock.assert_not_awaited() else: sha_mock.assert_awaited_once() assert query.user_name == "user" assert query.repo_name == "repo" assert query.slug == "user-repo" return query ================================================ FILE: tests/server/__init__.py ================================================ """Tests for the server.""" ================================================ FILE: tests/server/test_flow_integration.py ================================================ """Integration tests covering core functionalities, edge cases, and concurrency handling.""" import shutil import sys from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Generator import pytest from fastapi import status from fastapi.testclient import TestClient from pytest_mock import MockerFixture from src.server.main import app BASE_DIR = Path(__file__).resolve().parent.parent TEMPLATE_DIR = BASE_DIR / "src" / "templates" @pytest.fixture(scope="module") def test_client() -> Generator[TestClient, None, None]: """Create a test client fixture.""" with TestClient(app) as client_instance: client_instance.headers.update({"Host": "localhost"}) yield client_instance @pytest.fixture(autouse=True) def mock_static_files(mocker: MockerFixture) -> None: """Mock the static file mount to avoid directory errors.""" mock_static = mocker.patch("src.server.main.StaticFiles", autospec=True) mock_static.return_value = None return mock_static @pytest.fixture(scope="module", autouse=True) def cleanup_tmp_dir() -> Generator[None, None, None]: """Remove ``/tmp/gitingest`` after this test-module is done.""" yield # run tests temp_dir = Path("/tmp/gitingest") if temp_dir.exists(): try: shutil.rmtree(temp_dir) except PermissionError as exc: sys.stderr.write(f"Error cleaning up {temp_dir}: {exc}\n") @pytest.mark.asyncio async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> None: """Test the complete flow of analyzing a remote repository.""" client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", } response = client.post("/api/ingest", json=form_data) assert response.status_code == status.HTTP_200_OK, f"Form submission failed: {response.text}" # Check that response is JSON response_data = response.json() assert "content" in response_data assert response_data["content"] assert "repo_url" in response_data assert "summary" in response_data assert "tree" in response_data assert "content" in response_data @pytest.mark.asyncio async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: """Test handling of an invalid repository URL.""" client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/nonexistent/repo", "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", } response = client.post("/api/ingest", json=form_data) # Should return 400 for invalid repository assert response.status_code == status.HTTP_400_BAD_REQUEST, f"Request failed: {response.text}" # Check that response is JSON error response_data = response.json() assert "error" in response_data @pytest.mark.asyncio async def test_large_repository(request: pytest.FixtureRequest) -> None: """Simulate analysis of a large repository with nested folders.""" client = request.getfixturevalue("test_client") # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository) form_data = { "input_text": "https://github.com/octocat/hello-world", "max_file_size": 10, "pattern_type": "exclude", "pattern": "", "token": "", } response = client.post("/api/ingest", json=form_data) assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data assert response_data["content"] else: assert "error" in response_data @pytest.mark.asyncio async def test_concurrent_requests(request: pytest.FixtureRequest) -> None: """Test handling of multiple concurrent requests.""" client = request.getfixturevalue("test_client") def make_request() -> None: form_data = { "input_text": "https://github.com/octocat/hello-world", "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", } response = client.post("/api/ingest", json=form_data) assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data assert response_data["content"] else: assert "error" in response_data with ThreadPoolExecutor(max_workers=5) as executor: futures = [executor.submit(make_request) for _ in range(5)] for future in futures: future.result() @pytest.mark.asyncio async def test_large_file_handling(request: pytest.FixtureRequest) -> None: """Test handling of repositories with large files.""" client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", "max_file_size": 1, "pattern_type": "exclude", "pattern": "", "token": "", } response = client.post("/api/ingest", json=form_data) assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data assert response_data["content"] else: assert "error" in response_data @pytest.mark.asyncio async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: """Test repository analysis with include/exclude patterns.""" client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", "max_file_size": 243, "pattern_type": "include", "pattern": "*.md", "token": "", } response = client.post("/api/ingest", json=form_data) assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data assert "pattern_type" in response_data assert response_data["pattern_type"] == "include" assert "pattern" in response_data assert response_data["pattern"] == "*.md" else: assert "error" in response_data ================================================ FILE: tests/test_cli.py ================================================ """Tests for the Gitingest CLI.""" from __future__ import annotations from inspect import signature from pathlib import Path import pytest from click.testing import CliRunner, Result from gitingest.__main__ import main from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME @pytest.mark.parametrize( ("cli_args", "expect_file"), [ pytest.param(["./"], True, id="default-options"), pytest.param( [ "./", "--output", str(OUTPUT_FILE_NAME), "--max-size", str(MAX_FILE_SIZE), "--exclude-pattern", "tests/", "--include-pattern", "src/", "--include-submodules", ], True, id="custom-options", ), ], ) def test_cli_writes_file( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, *, cli_args: list[str], expect_file: bool, ) -> None: """Run the CLI and verify that the SARIF file is created (or not).""" expectes_exit_code = 0 # Work inside an isolated temp directory monkeypatch.chdir(tmp_path) result = _invoke_isolated_cli_runner(cli_args) assert result.exit_code == expectes_exit_code, result.stderr # Summary line should be on STDOUT stdout_lines = result.stdout.splitlines() assert f"Analysis complete! Output written to: {OUTPUT_FILE_NAME}" in stdout_lines # File side-effect sarif_file = tmp_path / OUTPUT_FILE_NAME assert sarif_file.exists() is expect_file, f"{OUTPUT_FILE_NAME} existence did not match expectation" def test_cli_with_stdout_output() -> None: """Test CLI invocation with output directed to STDOUT.""" output_file = Path(OUTPUT_FILE_NAME) # Clean up any existing digest.txt file before test if output_file.exists(): output_file.unlink() try: result = _invoke_isolated_cli_runner(["./", "--output", "-", "--exclude-pattern", "tests/"]) # ─── core expectations (stdout) ────────────────────────────────────- assert result.exit_code == 0, f"CLI exited with code {result.exit_code}, stderr: {result.stderr}" assert "---" in result.stdout, "Expected file separator '---' not found in STDOUT" assert "src/gitingest/__main__.py" in result.stdout, ( "Expected content (e.g., src/gitingest/__main__.py) not found in STDOUT" ) assert not output_file.exists(), f"Output file {output_file} was unexpectedly created." # ─── the summary must *not* pollute STDOUT, must appear on STDERR ─── summary = "Analysis complete! Output sent to stdout." stdout_lines = result.stdout.splitlines() stderr_lines = result.stderr.splitlines() assert summary not in stdout_lines, "Unexpected summary message found in STDOUT" assert summary in stderr_lines, "Expected summary message not found in STDERR" assert f"Output written to: {output_file.name}" not in stderr_lines finally: # Clean up any digest.txt file that might have been created during test if output_file.exists(): output_file.unlink() def _invoke_isolated_cli_runner(args: list[str]) -> Result: """Return a ``CliRunner`` that keeps ``stderr`` separate on Click 8.0-8.1.""" kwargs = {} if "mix_stderr" in signature(CliRunner.__init__).parameters: kwargs["mix_stderr"] = False # Click 8.0-8.1 runner = CliRunner(**kwargs) return runner.invoke(main, args) ================================================ FILE: tests/test_clone.py ================================================ """Tests for the ``clone`` module. These tests cover various scenarios for cloning repositories, verifying that the appropriate Git commands are invoked and handling edge cases such as nonexistent URLs, timeouts, redirects, and specific commits or branches. """ from __future__ import annotations import sys from typing import TYPE_CHECKING import pytest from gitingest.clone import clone_repo from gitingest.schemas import CloneConfig from gitingest.utils.git_utils import check_repo_exists from tests.conftest import DEMO_URL, LOCAL_REPO_PATH if TYPE_CHECKING: from pathlib import Path from unittest.mock import AsyncMock from pytest_mock import MockerFixture # All cloning-related tests assume (unless explicitly overridden) that the repository exists. # Apply the check-repo patch automatically so individual tests don't need to repeat it. pytestmark = pytest.mark.usefixtures("repo_exists_true") GIT_INSTALLED_CALLS = 2 if sys.platform == "win32" else 1 @pytest.mark.asyncio async def test_clone_with_commit(repo_exists_true: AsyncMock, gitpython_mocks: dict) -> None: """Test cloning a repository with a specific commit hash. Given a valid URL and a commit hash: When ``clone_repo`` is called, Then the repository should be cloned and checked out at that commit. """ commit_hash = "a" * 40 # Simulating a valid commit hash clone_config = CloneConfig( url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit=commit_hash, branch="main", ) await clone_repo(clone_config) repo_exists_true.assert_any_call(clone_config.url, token=None) # Verify GitPython calls were made mock_git_cmd = gitpython_mocks["git_cmd"] mock_repo = gitpython_mocks["repo"] mock_clone_from = gitpython_mocks["clone_from"] # Should have called version (for ensure_git_installed) mock_git_cmd.version.assert_called() # Should have called clone_from (since partial_clone=False) mock_clone_from.assert_called_once() # Should have called fetch and checkout on the repo mock_repo.git.fetch.assert_called() mock_repo.git.checkout.assert_called_with(commit_hash) @pytest.mark.asyncio async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None: """Test cloning a nonexistent repository URL. Given an invalid or nonexistent URL: When ``clone_repo`` is called, Then a ValueError should be raised with an appropriate error message. """ clone_config = CloneConfig( url="https://github.com/user/nonexistent-repo", local_path=LOCAL_REPO_PATH, commit=None, branch="main", ) # Override the default fixture behaviour for this test repo_exists_true.return_value = False with pytest.raises(ValueError, match="Repository not found"): await clone_repo(clone_config) repo_exists_true.assert_any_call(clone_config.url, token=None) @pytest.mark.asyncio @pytest.mark.parametrize( ("git_command_succeeds", "expected"), [ (True, True), # git ls-remote succeeds -> repo exists (False, False), # git ls-remote fails -> repo doesn't exist or no access ], ) async def test_check_repo_exists( git_command_succeeds: bool, # noqa: FBT001 *, expected: bool, mocker: MockerFixture, ) -> None: """Verify that ``check_repo_exists`` works by using _resolve_ref_to_sha.""" mock_resolve = mocker.patch("gitingest.utils.git_utils._resolve_ref_to_sha") if git_command_succeeds: mock_resolve.return_value = "abc123def456" # Mock SHA else: mock_resolve.side_effect = ValueError("Repository not found") result = await check_repo_exists(DEMO_URL) assert result is expected mock_resolve.assert_called_once_with(DEMO_URL, "HEAD", token=None) @pytest.mark.asyncio async def test_clone_without_commit(repo_exists_true: AsyncMock, gitpython_mocks: dict) -> None: """Test cloning a repository when no commit hash is provided. Given a valid URL and no commit hash: When ``clone_repo`` is called, Then the repository should be cloned and checked out at the resolved commit. """ clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit=None, branch="main") await clone_repo(clone_config) repo_exists_true.assert_any_call(clone_config.url, token=None) # Verify GitPython calls were made mock_git_cmd = gitpython_mocks["git_cmd"] mock_repo = gitpython_mocks["repo"] mock_clone_from = gitpython_mocks["clone_from"] # Should have resolved the commit via ls_remote mock_git_cmd.ls_remote.assert_called() # Should have cloned the repo mock_clone_from.assert_called_once() # Should have fetched and checked out mock_repo.git.fetch.assert_called() mock_repo.git.checkout.assert_called() @pytest.mark.asyncio async def test_clone_creates_parent_directory(tmp_path: Path, gitpython_mocks: dict) -> None: """Test that ``clone_repo`` creates parent directories if they don't exist. Given a local path with non-existent parent directories: When ``clone_repo`` is called, Then it should create the parent directories before attempting to clone. """ nested_path = tmp_path / "deep" / "nested" / "path" / "repo" clone_config = CloneConfig(url=DEMO_URL, local_path=str(nested_path)) await clone_repo(clone_config) # Verify parent directories were created assert nested_path.parent.exists() # Verify clone operation happened mock_clone_from = gitpython_mocks["clone_from"] mock_clone_from.assert_called_once() @pytest.mark.asyncio async def test_clone_with_specific_subpath(gitpython_mocks: dict) -> None: """Test cloning a repository with a specific subpath. Given a valid repository URL and a specific subpath: When ``clone_repo`` is called, Then the repository should be cloned with sparse checkout enabled. """ subpath = "src/docs" clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, subpath=subpath) await clone_repo(clone_config) # Verify partial clone (using git.clone instead of Repo.clone_from) mock_git_cmd = gitpython_mocks["git_cmd"] mock_git_cmd.clone.assert_called() # Verify sparse checkout was configured mock_repo = gitpython_mocks["repo"] mock_repo.git.sparse_checkout.assert_called() @pytest.mark.asyncio async def test_clone_with_include_submodules(gitpython_mocks: dict) -> None: """Test cloning a repository with submodules included. Given a valid URL and ``include_submodules=True``: When ``clone_repo`` is called, Then the repository should update submodules after cloning. """ clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="main", include_submodules=True) await clone_repo(clone_config) # Verify submodule update was called mock_repo = gitpython_mocks["repo"] mock_repo.git.submodule.assert_called_with("update", "--init", "--recursive", "--depth=1") @pytest.mark.asyncio async def test_check_repo_exists_with_auth_token(mocker: MockerFixture) -> None: """Test ``check_repo_exists`` with authentication token. Given a GitHub URL and a token: When ``check_repo_exists`` is called, Then it should pass the token to _resolve_ref_to_sha. """ mock_resolve = mocker.patch("gitingest.utils.git_utils._resolve_ref_to_sha") mock_resolve.return_value = "abc123def456" # Mock SHA test_token = "token123" # noqa: S105 result = await check_repo_exists("https://github.com/test/repo", token=test_token) assert result is True mock_resolve.assert_called_once_with("https://github.com/test/repo", "HEAD", token=test_token) ================================================ FILE: tests/test_git_utils.py ================================================ """Tests for the ``git_utils`` module. These tests validate the ``validate_github_token`` function, which ensures that GitHub personal access tokens (PATs) are properly formatted. """ from __future__ import annotations import base64 from typing import TYPE_CHECKING import pytest from gitingest.utils.exceptions import InvalidGitHubTokenError from gitingest.utils.git_utils import create_git_auth_header, create_git_repo, is_github_host, validate_github_token if TYPE_CHECKING: from pathlib import Path from pytest_mock import MockerFixture @pytest.mark.parametrize( "token", [ # Valid tokens: correct prefixes and at least 36 allowed characters afterwards "github_pat_" + "a" * 22 + "_" + "b" * 59, "ghp_" + "A" * 36, "ghu_" + "B" * 36, "ghs_" + "C" * 36, "ghr_" + "D" * 36, "gho_" + "E" * 36, ], ) def test_validate_github_token_valid(token: str) -> None: """validate_github_token should accept properly-formatted tokens.""" # Should not raise any exception validate_github_token(token) @pytest.mark.parametrize( "token", [ "github_pat_short", # Too short after prefix "ghp_" + "b" * 35, # one character short "invalidprefix_" + "c" * 36, # Wrong prefix "github_pat_" + "!" * 36, # Disallowed characters "github_pat_" + "a" * 36, # Too short after 'github_pat_' prefix "", # Empty string ], ) def test_validate_github_token_invalid(token: str) -> None: """Test that ``validate_github_token`` raises ``InvalidGitHubTokenError`` on malformed tokens.""" with pytest.raises(InvalidGitHubTokenError): validate_github_token(token) @pytest.mark.parametrize( ("local_path", "url", "token", "should_configure_auth"), [ ( "/some/path", "https://github.com/owner/repo.git", None, False, # No auth configuration expected when token is None ), ( "/some/path", "https://github.com/owner/repo.git", "ghp_" + "d" * 36, True, # Auth configuration expected for GitHub URL + token ), ( "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "e" * 36, False, # No auth configuration for non-GitHub URL even if token provided ), ], ) def test_create_git_repo( local_path: str, url: str, token: str | None, should_configure_auth: bool, # noqa: FBT001 mocker: MockerFixture, ) -> None: """Test that ``create_git_repo`` creates a proper Git repo object.""" # Mock git.Repo to avoid actual filesystem operations mock_repo = mocker.MagicMock() mock_repo_class = mocker.patch("git.Repo", return_value=mock_repo) repo = create_git_repo(local_path, url, token) # Should create repo with correct path mock_repo_class.assert_called_once_with(local_path) assert repo == mock_repo # Check auth configuration if should_configure_auth: mock_repo.git.config.assert_called_once() else: mock_repo.git.config.assert_not_called() @pytest.mark.parametrize( "token", [ "ghp_abcdefghijklmnopqrstuvwxyz012345", # typical ghp_ token "github_pat_1234567890abcdef1234567890abcdef1234", ], ) def test_create_git_auth_header(token: str) -> None: """Test that ``create_git_auth_header`` produces correct base64-encoded header.""" header = create_git_auth_header(token) expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() expected = f"http.https://github.com/.extraheader=Authorization: Basic {expected_basic}" assert header == expected @pytest.mark.parametrize( ("url", "token", "should_call"), [ ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), ("https://github.com/foo/bar.git", None, False), ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), ], ) def test_create_git_repo_helper_calls( mocker: MockerFixture, tmp_path: Path, *, url: str, token: str | None, should_call: bool, ) -> None: """Test that ``create_git_auth_header`` is invoked only when appropriate.""" work_dir = tmp_path / "repo" header_mock = mocker.patch("gitingest.utils.git_utils.create_git_auth_header", return_value="key=value") mock_repo = mocker.MagicMock() mocker.patch("git.Repo", return_value=mock_repo) create_git_repo(str(work_dir), url, token) if should_call: header_mock.assert_called_once_with(token, url=url) mock_repo.git.config.assert_called_once_with("key", "value") else: header_mock.assert_not_called() mock_repo.git.config.assert_not_called() @pytest.mark.parametrize( ("url", "expected"), [ # GitHub.com URLs ("https://github.com/owner/repo.git", True), ("http://github.com/owner/repo.git", True), ("https://github.com/owner/repo", True), # GitHub Enterprise URLs ("https://github.company.com/owner/repo.git", True), ("https://github.enterprise.org/owner/repo.git", True), ("http://github.internal/owner/repo.git", True), ("https://github.example.co.uk/owner/repo.git", True), # Non-GitHub URLs ("https://gitlab.com/owner/repo.git", False), ("https://bitbucket.org/owner/repo.git", False), ("https://git.example.com/owner/repo.git", False), ("https://mygithub.com/owner/repo.git", False), # doesn't start with "github." ("https://subgithub.com/owner/repo.git", False), ("https://example.com/github/repo.git", False), # Edge cases ("", False), ("not-a-url", False), ("ftp://github.com/owner/repo.git", True), # Different protocol but still github.com ], ) def test_is_github_host(url: str, *, expected: bool) -> None: """Test that ``is_github_host`` correctly identifies GitHub and GitHub Enterprise URLs.""" assert is_github_host(url) == expected @pytest.mark.parametrize( ("token", "url", "expected_hostname"), [ # GitHub.com URLs (default) ("ghp_" + "a" * 36, "https://github.com", "github.com"), ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"), # GitHub Enterprise URLs ("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"), ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"), ("ghp_" + "d" * 36, "http://github.internal", "github.internal"), ], ) def test_create_git_auth_header_with_ghe_url(token: str, url: str, expected_hostname: str) -> None: """Test that ``create_git_auth_header`` handles GitHub Enterprise URLs correctly.""" header = create_git_auth_header(token, url=url) expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() expected = f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {expected_basic}" assert header == expected @pytest.mark.parametrize( ("local_path", "url", "token", "expected_auth_hostname"), [ # GitHub.com URLs - should use default hostname ( "/some/path", "https://github.com/owner/repo.git", "ghp_" + "a" * 36, "github.com", ), # GitHub Enterprise URLs - should use custom hostname ( "/some/path", "https://github.company.com/owner/repo.git", "ghp_" + "b" * 36, "github.company.com", ), ( "/some/path", "https://github.enterprise.org/owner/repo.git", "ghp_" + "c" * 36, "github.enterprise.org", ), ( "/some/path", "http://github.internal/owner/repo.git", "ghp_" + "d" * 36, "github.internal", ), ], ) def test_create_git_repo_with_ghe_urls( local_path: str, url: str, token: str, expected_auth_hostname: str, mocker: MockerFixture, ) -> None: """Test that ``create_git_repo`` handles GitHub Enterprise URLs correctly.""" mock_repo = mocker.MagicMock() mocker.patch("git.Repo", return_value=mock_repo) create_git_repo(local_path, url, token) # Should configure auth with the correct hostname mock_repo.git.config.assert_called_once() auth_config_call = mock_repo.git.config.call_args[0] # The first argument should contain the hostname assert expected_auth_hostname in auth_config_call[0] @pytest.mark.parametrize( ("local_path", "url", "token"), [ # Should NOT configure auth for non-GitHub URLs ("/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36), ("/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36), ("/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36), ], ) def test_create_git_repo_ignores_non_github_urls( local_path: str, url: str, token: str, mocker: MockerFixture, ) -> None: """Test that ``create_git_repo`` does not configure auth for non-GitHub URLs.""" mock_repo = mocker.MagicMock() mocker.patch("git.Repo", return_value=mock_repo) create_git_repo(local_path, url, token) # Should not configure auth for non-GitHub URLs mock_repo.git.config.assert_not_called() ================================================ FILE: tests/test_gitignore_feature.py ================================================ """Tests for the gitignore functionality in Gitingest.""" from pathlib import Path import pytest from gitingest.entrypoint import ingest_async from gitingest.utils.ignore_patterns import load_ignore_patterns @pytest.fixture(name="repo_path") def repo_fixture(tmp_path: Path) -> Path: """Create a temporary repository structure. The repository structure includes: - A ``.gitignore`` that excludes ``exclude.txt`` - ``include.txt`` (should be processed) - ``exclude.txt`` (should be skipped when gitignore rules are respected) """ # Create a .gitignore file that excludes 'exclude.txt' gitignore_file = tmp_path / ".gitignore" gitignore_file.write_text("exclude.txt\n") # Create a file that should be included include_file = tmp_path / "include.txt" include_file.write_text("This file should be included.") # Create a file that should be excluded exclude_file = tmp_path / "exclude.txt" exclude_file.write_text("This file should be excluded.") return tmp_path def test_load_gitignore_patterns(tmp_path: Path) -> None: """Test that ``load_ignore_patterns()`` correctly loads patterns from a ``.gitignore`` file.""" gitignore = tmp_path / ".gitignore" # Write some sample patterns with a comment line included gitignore.write_text("exclude.txt\n*.log\n# a comment\n") patterns = load_ignore_patterns(tmp_path, filename=".gitignore") # Check that the expected patterns are loaded assert "exclude.txt" in patterns assert "*.log" in patterns # Ensure that comment lines are not added for pattern in patterns: assert not pattern.startswith("#") @pytest.mark.asyncio async def test_ingest_with_gitignore(repo_path: Path) -> None: """Integration test for ``ingest_async()`` respecting ``.gitignore`` rules. When ``include_gitignored`` is ``False`` (default), the content of ``exclude.txt`` should be omitted. When ``include_gitignored`` is ``True``, both files should be present. """ # Run ingestion with the gitignore functionality enabled. _, _, content_with_ignore = await ingest_async(source=str(repo_path)) # 'exclude.txt' should be skipped. assert "This file should be excluded." not in content_with_ignore # 'include.txt' should be processed. assert "This file should be included." in content_with_ignore # Run ingestion with the gitignore functionality disabled. _, _, content_without_ignore = await ingest_async(source=str(repo_path), include_gitignored=True) # Now both files should be present. assert "This file should be excluded." in content_without_ignore assert "This file should be included." in content_without_ignore ================================================ FILE: tests/test_ingestion.py ================================================ """Tests for the ``query_ingestion`` module. These tests validate directory scanning, file content extraction, notebook handling, and the overall ingestion logic, including filtering patterns and subpaths. """ from __future__ import annotations import re from typing import TYPE_CHECKING, TypedDict import pytest from gitingest.ingestion import ingest_query if TYPE_CHECKING: from pathlib import Path from gitingest.query_parser import IngestionQuery def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> None: """Test ``ingest_query`` to ensure it processes the directory and returns expected results. Given a directory with ``.txt`` and ``.py`` files: When ``ingest_query`` is invoked, Then it should produce a summary string listing the files analyzed and a combined content string. """ sample_query.local_path = temp_directory sample_query.subpath = "/" sample_query.type = None summary, _, content = ingest_query(sample_query) assert "Repository: test_user/test_repo" in summary assert "Files analyzed: 8" in summary # Check presence of key files in the content assert "src/subfile1.txt" in content assert "src/subfile2.py" in content assert "src/subdir/file_subdir.txt" in content assert "src/subdir/file_subdir.py" in content assert "file1.txt" in content assert "file2.py" in content assert "dir1/file_dir1.txt" in content assert "dir2/file_dir2.txt" in content # TODO: Additional tests: # - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. # - Edge cases with weird file names or deep subdirectory structures. # TODO : def test_include_nonexistent_extension class PatternScenario(TypedDict): """A scenario for testing the ingestion of a set of patterns.""" include_patterns: set[str] ignore_patterns: set[str] expected_num_files: int expected_content: set[str] expected_structure: set[str] expected_not_structure: set[str] @pytest.mark.parametrize( "pattern_scenario", [ pytest.param( PatternScenario( { "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, "ignore_patterns": {*()}, "expected_num_files": 2, "expected_content": {"file2.py", "dir2/file_dir2.txt"}, "expected_structure": {"test_repo/", "dir2/"}, "expected_not_structure": {"src/", "subdir/", "dir1/"}, }, ), id="include-explicit-files", ), pytest.param( PatternScenario( { "include_patterns": { "file1.txt", "file2.py", "file_dir1.txt", "*/file_dir2.txt", }, "ignore_patterns": {*()}, "expected_num_files": 4, "expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"}, "expected_structure": {"test_repo/", "dir1/", "dir2/"}, "expected_not_structure": {"src/", "subdir/"}, }, ), id="include-wildcard-directory", ), pytest.param( PatternScenario( { "include_patterns": {"*.py"}, "ignore_patterns": {*()}, "expected_num_files": 3, "expected_content": { "file2.py", "src/subfile2.py", "src/subdir/file_subdir.py", }, "expected_structure": {"test_repo/", "src/", "subdir/"}, "expected_not_structure": {"dir1/", "dir2/"}, }, ), id="include-wildcard-files", ), pytest.param( PatternScenario( { "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, "ignore_patterns": {*()}, "expected_num_files": 3, "expected_content": { "dir2/file_dir2.txt", "src/subfile2.py", "src/subdir/file_subdir.py", }, "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, "expected_not_structure": {"dir1/"}, }, ), id="include-recursive-wildcard", ), pytest.param( PatternScenario( { "include_patterns": {*()}, "ignore_patterns": {"file2.py", "dir2/file_dir2.txt"}, "expected_num_files": 6, "expected_content": { "file1.txt", "src/subfile1.txt", "src/subfile2.py", "src/subdir/file_subdir.txt", "src/subdir/file_subdir.py", "dir1/file_dir1.txt", }, "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, "expected_not_structure": {"dir2/"}, }, ), id="exclude-explicit-files", ), pytest.param( PatternScenario( { "include_patterns": {*()}, "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, "expected_num_files": 5, "expected_content": { "src/subfile1.txt", "src/subfile2.py", "src/subdir/file_subdir.txt", "src/subdir/file_subdir.py", "dir2/file_dir2.txt", }, "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, "expected_not_structure": {"dir1/"}, }, ), id="exclude-wildcard-directory", ), pytest.param( PatternScenario( { "include_patterns": {*()}, "ignore_patterns": {"src/**/*.py"}, "expected_num_files": 6, "expected_content": { "file1.txt", "file2.py", "src/subfile1.txt", "src/subdir/file_subdir.txt", "dir1/file_dir1.txt", "dir2/file_dir2.txt", }, "expected_structure": { "test_repo/", "dir1/", "dir2/", "src/", "subdir/", }, "expected_not_structure": {*()}, }, ), id="exclude-recursive-wildcard", ), ], ) def test_include_ignore_patterns( temp_directory: Path, sample_query: IngestionQuery, pattern_scenario: PatternScenario, ) -> None: """Test ``ingest_query`` to ensure included and ignored paths are included and ignored respectively. Given a directory with ``.txt`` and ``.py`` files, and a set of include patterns or a set of ignore patterns: When ``ingest_query`` is invoked, Then it should produce a summary string listing the files analyzed and a combined content string. """ sample_query.local_path = temp_directory sample_query.subpath = "/" sample_query.type = None sample_query.include_patterns = pattern_scenario["include_patterns"] sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] summary, structure, content = ingest_query(sample_query) assert "Repository: test_user/test_repo" in summary num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE) assert (num_files_match := num_files_regex.search(summary)) is not None assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] # Check presence of key files in the content for expected_content_item in pattern_scenario["expected_content"]: assert expected_content_item in content # check presence of included directories in structure for expected_structure_item in pattern_scenario["expected_structure"]: assert expected_structure_item in structure # check non-presence of non-included directories in structure for expected_not_structure_item in pattern_scenario["expected_not_structure"]: assert expected_not_structure_item not in structure ================================================ FILE: tests/test_notebook_utils.py ================================================ """Tests for the ``notebook`` utils module. These tests validate how notebooks are processed into Python-like output, ensuring that markdown/raw cells are converted to triple-quoted blocks, code cells remain executable code, and various edge cases (multiple worksheets, empty cells, outputs, etc.) are handled appropriately. """ import pytest from gitingest.utils.notebook import process_notebook from tests.conftest import WriteNotebookFunc def test_process_notebook_all_cells(write_notebook: WriteNotebookFunc) -> None: """Test processing a notebook containing markdown, code, and raw cells. Given a notebook with: - One markdown cell - One code cell - One raw cell When ``process_notebook`` is invoked, Then markdown and raw cells should appear in triple-quoted blocks, and code cells remain as normal code. """ expected_count = 4 notebook_content = { "cells": [ {"cell_type": "markdown", "source": ["# Markdown cell"]}, {"cell_type": "code", "source": ['print("Hello Code")']}, {"cell_type": "raw", "source": [""]}, ], } nb_path = write_notebook("all_cells.ipynb", notebook_content) result = process_notebook(nb_path) assert result.count('"""') == expected_count, ( "Two non-code cells => 2 triple-quoted blocks => 4 total triple quotes." ) # Ensure markdown and raw cells are in triple quotes assert "# Markdown cell" in result assert "" in result # Ensure code cell is not in triple quotes assert 'print("Hello Code")' in result assert '"""\nprint("Hello Code")\n"""' not in result def test_process_notebook_with_worksheets(write_notebook: WriteNotebookFunc) -> None: """Test a notebook containing the (as of IPEP-17 deprecated) ``worksheets`` key. Given a notebook that uses the ``worksheets`` key with a single worksheet, When ``process_notebook`` is called, Then a ``DeprecationWarning`` should be raised, and the content should match an equivalent notebook that has top-level ``cells``. """ with_worksheets = { "worksheets": [ { "cells": [ {"cell_type": "markdown", "source": ["# Markdown cell"]}, {"cell_type": "code", "source": ['print("Hello Code")']}, {"cell_type": "raw", "source": [""]}, ], }, ], } without_worksheets = with_worksheets["worksheets"][0] # same, but no 'worksheets' key nb_with = write_notebook("with_worksheets.ipynb", with_worksheets) nb_without = write_notebook("without_worksheets.ipynb", without_worksheets) result_with = process_notebook(nb_with) # Should not raise a warning result_without = process_notebook(nb_without) assert result_with == result_without, "Content from the single worksheet should match the top-level equivalent." def test_process_notebook_multiple_worksheets(write_notebook: WriteNotebookFunc) -> None: """Test a notebook containing multiple ``worksheets``. Given a notebook with two worksheets: - First with a markdown cell - Second with a code cell When ``process_notebook`` is called, Then a warning about multiple worksheets should be raised, and the second worksheet's content should appear in the final output. """ multi_worksheets = { "worksheets": [ {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]}, {"cells": [{"cell_type": "code", "source": ["# Second Worksheet"]}]}, ], } single_worksheet = { "worksheets": [ {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]}, ], } nb_multi = write_notebook("multiple_worksheets.ipynb", multi_worksheets) nb_single = write_notebook("single_worksheet.ipynb", single_worksheet) result_multi = process_notebook(nb_multi) result_single = process_notebook(nb_single) assert result_multi != result_single, "Two worksheets should produce more content than one." assert len(result_multi) > len(result_single), "The multi-worksheet notebook should have extra code content." assert "# First Worksheet" in result_single assert "# Second Worksheet" not in result_single assert "# First Worksheet" in result_multi assert "# Second Worksheet" in result_multi def test_process_notebook_code_only(write_notebook: WriteNotebookFunc) -> None: """Test a notebook containing only code cells. Given a notebook with code cells only: When ``process_notebook`` is called, Then no triple quotes should appear in the output. """ notebook_content = { "cells": [ {"cell_type": "code", "source": ["print('Code Cell 1')"]}, {"cell_type": "code", "source": ["x = 42"]}, ], } nb_path = write_notebook("code_only.ipynb", notebook_content) result = process_notebook(nb_path) assert '"""' not in result, "No triple quotes expected when there are only code cells." assert "print('Code Cell 1')" in result assert "x = 42" in result def test_process_notebook_markdown_only(write_notebook: WriteNotebookFunc) -> None: """Test a notebook with only markdown cells. Given a notebook with two markdown cells: When ``process_notebook`` is called, Then each markdown cell should become a triple-quoted block (2 blocks => 4 triple quotes total). """ expected_count = 4 notebook_content = { "cells": [ {"cell_type": "markdown", "source": ["# Markdown Header"]}, {"cell_type": "markdown", "source": ["Some more markdown."]}, ], } nb_path = write_notebook("markdown_only.ipynb", notebook_content) result = process_notebook(nb_path) assert result.count('"""') == expected_count, "Two markdown cells => 2 blocks => 4 triple quotes total." assert "# Markdown Header" in result assert "Some more markdown." in result def test_process_notebook_raw_only(write_notebook: WriteNotebookFunc) -> None: """Test a notebook with only raw cells. Given two raw cells: When ``process_notebook`` is called, Then each raw cell should become a triple-quoted block (2 blocks => 4 triple quotes total). """ expected_count = 4 notebook_content = { "cells": [ {"cell_type": "raw", "source": ["Raw content line 1"]}, {"cell_type": "raw", "source": ["Raw content line 2"]}, ], } nb_path = write_notebook("raw_only.ipynb", notebook_content) result = process_notebook(nb_path) assert result.count('"""') == expected_count, "Two raw cells => 2 blocks => 4 triple quotes." assert "Raw content line 1" in result assert "Raw content line 2" in result def test_process_notebook_empty_cells(write_notebook: WriteNotebookFunc) -> None: """Test that cells with an empty ``source`` are skipped. Given a notebook with 4 cells, 3 of which have empty ``source``: When ``process_notebook`` is called, Then only the non-empty cell should appear in the output (1 block => 2 triple quotes). """ expected_count = 2 notebook_content = { "cells": [ {"cell_type": "markdown", "source": []}, {"cell_type": "code", "source": []}, {"cell_type": "raw", "source": []}, {"cell_type": "markdown", "source": ["# Non-empty markdown"]}, ], } nb_path = write_notebook("empty_cells.ipynb", notebook_content) result = process_notebook(nb_path) assert result.count('"""') == expected_count, "Only one non-empty cell => 1 block => 2 triple quotes" assert "# Non-empty markdown" in result def test_process_notebook_invalid_cell_type(write_notebook: WriteNotebookFunc) -> None: """Test a notebook with an unknown cell type. Given a notebook cell whose ``cell_type`` is unrecognized: When ``process_notebook`` is called, Then a ValueError should be raised. """ notebook_content = { "cells": [ {"cell_type": "markdown", "source": ["# Valid markdown"]}, {"cell_type": "unknown", "source": ["Unrecognized cell type"]}, ], } nb_path = write_notebook("invalid_cell_type.ipynb", notebook_content) with pytest.raises(ValueError, match="Unknown cell type: unknown"): process_notebook(nb_path) def test_process_notebook_with_output(write_notebook: WriteNotebookFunc) -> None: """Test a notebook that has code cells with outputs. Given a code cell and multiple output objects: When ``process_notebook`` is called with ``include_output=True``, Then the outputs should be appended as commented lines under the code. """ notebook_content = { "cells": [ { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "print('my_data')\n", "my_data = [1, 2, 3, 4, 5]\n", "plt.plot(my_data)\n", "my_data", ], "outputs": [ {"output_type": "stream", "text": ["my_data"]}, {"output_type": "execute_result", "data": {"text/plain": ["[1, 2, 3, 4, 5]"]}}, {"output_type": "display_data", "data": {"text/plain": ["
    "]}}, ], }, ], } nb_path = write_notebook("with_output.ipynb", notebook_content) with_output = process_notebook(nb_path, include_output=True) without_output = process_notebook(nb_path, include_output=False) expected_source = ( "# Jupyter notebook converted to Python script.\n\n" "import matplotlib.pyplot as plt\n" "print('my_data')\n" "my_data = [1, 2, 3, 4, 5]\n" "plt.plot(my_data)\n" "my_data\n" ) expected_output = "# Output:\n# my_data\n# [1, 2, 3, 4, 5]\n#
    \n" expected_combined = expected_source + expected_output assert with_output == expected_combined, "Should include source code and comment-ified output." assert without_output == expected_source, "Should include only the source code without output." ================================================ FILE: tests/test_pattern_utils.py ================================================ """Test pattern utilities.""" from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.utils.pattern_utils import _parse_patterns, process_patterns def test_process_patterns_empty_patterns() -> None: """Test ``process_patterns`` with empty patterns. Given empty ``include_patterns`` and ``exclude_patterns``: When ``process_patterns`` is called, Then ``include_patterns`` becomes ``None`` and ``DEFAULT_IGNORE_PATTERNS`` apply. """ exclude_patterns, include_patterns = process_patterns(exclude_patterns="", include_patterns="") assert include_patterns is None assert exclude_patterns == DEFAULT_IGNORE_PATTERNS def test_parse_patterns_valid() -> None: """Test ``_parse_patterns`` with valid comma-separated patterns. Given patterns like "*.py, *.md, docs/*": When ``_parse_patterns`` is called, Then it should return a set of parsed strings. """ patterns = "*.py, *.md, docs/*" parsed_patterns = _parse_patterns(patterns) assert parsed_patterns == {"*.py", "*.md", "docs/*"} def test_process_patterns_include_and_ignore_overlap() -> None: """Test ``process_patterns`` with overlapping patterns. Given include="*.py" and ignore={"*.py", "*.txt"}: When ``process_patterns`` is called, Then "*.py" should be removed from ignore patterns. """ exclude_patterns, include_patterns = process_patterns(exclude_patterns={"*.py", "*.txt"}, include_patterns="*.py") assert include_patterns == {"*.py"} assert exclude_patterns is not None assert "*.py" not in exclude_patterns assert "*.txt" in exclude_patterns ================================================ FILE: tests/test_summary.py ================================================ """Test that ``gitingest.ingest()`` emits a concise, 5-or-6-line summary.""" import re from pathlib import Path import pytest from gitingest import ingest REPO = "pallets/flask" PATH_CASES = [ ("tree", "/examples/celery"), ("blob", "/examples/celery/make_celery.py"), ("blob", "/.gitignore"), ] REF_CASES = [ ("Branch", "main"), ("Branch", "stable"), ("Tag", "3.0.3"), ("Commit", "e9741288637e0d9abe95311247b4842a017f7d5c"), ] @pytest.mark.parametrize(("path_type", "path"), PATH_CASES) @pytest.mark.parametrize(("ref_type", "ref"), REF_CASES) def test_ingest_summary(path_type: str, path: str, ref_type: str, ref: str) -> None: """Assert that ``gitingest.ingest()`` emits a concise, 5-or-6-line summary. - Non-'main” refs → 5 key/value pairs + blank line (6 total). - 'main” branch → ref line omitted (5 total). - Required keys: - Repository - ``ref_type`` (absent on 'main”) - File│Subpath (chosen by ``path_type``) - Lines│Files analyzed (chosen by ``path_type``) - Estimated tokens (positive integer) Any missing key, wrong value, or incorrect line count should fail. Parameters ---------- path_type : {"tree", "blob"} GitHub object type under test. path : str The repository sub-path or file path to feed into the URL. ref_type : {"Branch", "Tag", "Commit"} Label expected on line 2 of the summary (absent if `ref` is "main"). ref : str Actual branch name, tag, or commit hash. """ is_main_branch = ref == "main" is_blob = path_type == "blob" expected_lines = _calculate_expected_lines(ref_type, is_main_branch=is_main_branch) expected_non_empty_lines = expected_lines - 1 summary, _, _ = ingest(f"https://github.com/{REPO}/{path_type}/{ref}{path}") lines = summary.splitlines() parsed_lines = dict(line.split(": ", 1) for line in lines if ": " in line) assert parsed_lines["Repository"] == REPO if is_main_branch: # We omit the 'Branch' line for 'main' branches. assert ref_type not in parsed_lines else: assert parsed_lines[ref_type] == ref if is_blob: assert parsed_lines["File"] == Path(path).name assert "Lines" in parsed_lines else: # 'tree' assert parsed_lines["Subpath"] == path assert "Files analyzed" in parsed_lines token_match = re.search(r"\d+", parsed_lines["Estimated tokens"]) assert token_match, "'Estimated tokens' should contain a number" assert int(token_match.group()) > 0 assert len(lines) == expected_lines assert len(parsed_lines) == expected_non_empty_lines def _calculate_expected_lines(ref_type: str, *, is_main_branch: bool) -> int: """Calculate the expected number of lines in the summary. The total number of lines depends on the following: - Commit type does not include the 'Branch'/'Tag' line, reducing the count by 1. - The "main" branch omits the 'Branch' line, reducing the count by 1. Parameters ---------- ref_type : str The type of reference, e.g., "Branch", "Tag", or "Commit". is_main_branch : bool True if the reference is the "main" branch, False otherwise. Returns ------- int The expected number of lines in the summary. """ base_lines = 7 if is_main_branch: base_lines -= 1 if ref_type == "Commit": base_lines -= 1 return base_lines