Repository: sammcj/ingest Branch: main Commit: 80679e8030f7 Files: 39 Total size: 219.1 KB Directory structure: gitextract_e4c47gl8/ ├── .github/ │ ├── FUNDING.yml │ ├── copilot-instructions.md │ └── workflows/ │ └── build-release-publish.yml ├── .gitignore ├── .golangci.yml ├── LICENSE ├── Makefile ├── README.md ├── config/ │ └── config.go ├── filesystem/ │ ├── defaultExcludes.go │ └── filesystem.go ├── git/ │ └── git.go ├── go.mod ├── go.sum ├── internal/ │ └── compressor/ │ ├── compressor.go │ ├── compressor_test.go │ ├── genericCompressor.go │ └── testdata/ │ ├── example.c │ ├── example.css │ ├── example.go │ ├── example.html │ ├── example.java │ ├── example.js │ ├── example.py │ ├── example.rs │ ├── example.sh │ ├── example.swift │ └── example_anon_func.js ├── main.go ├── pdf/ │ └── pdf.go ├── scripts/ │ └── install.sh ├── template/ │ └── template.go ├── token/ │ ├── anthropic.go │ └── token.go ├── tree-sitter-dev-plan.md ├── utils/ │ ├── output_manager.go │ └── utils.go └── web/ ├── crawler.go └── integration.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: sammcj ================================================ FILE: .github/copilot-instructions.md ================================================ # GitHub Copilot Instructions ## Contribution Guidelines ### Before Committing 1. **Run linters:** `make lint` (must pass without warnings or errors) 2. **Run tests:** `make test` (must pass all tests) 3. 
**Build successfully:** `make build` (must compile without warnings or errors) ### Code Standards - Follow Go best practices and idiomatic patterns - Use Australian English spelling throughout code (unless it's a function or parameter to an upstream library) and documentation - No marketing terms like "comprehensive" or "production-grade" - Focus on clear, concise, actionable technical guidance - Keep responses token-efficient (avoid returning unnecessary data) verbosity ## Code Quality Checks ### General Code Quality - Verify proper module imports and dependencies - Check for hardcoded credentials or sensitive data - Ensure proper resource cleanup (defer statements) - Validate input parameters thoroughly - Use appropriate data types and structures - Follow consistent error message formatting ## Configuration & Environment - Environment variables should have sensible defaults - Configuration should be documented in README - Support both development and production modes - Handle missing optional dependencies gracefully ## General Guidelines - Do not use marketing terms such as 'comprehensive' or 'production-grade' in documentation or code comments. - Focus on clear, concise actionable technical guidance. ## Review Checklist for Every PR Before approving any pull request, verify: - [ ] Code follows the latest Golang best practices - [ ] No security issues or vulnerabilities introduced - [ ] All linting and tests pass successfully - [ ] Documentation updated if required - [ ] Australian English spelling used throughout, No American English spelling used (unless it's a function or parameter to an upstream library) - [ ] Context cancellation handled properly if applicable - [ ] Resource cleanup with defer statements if applicable If you are re-reviewing a PR you've reviewed in the past and your previous comments / suggestions have been addressed or are no longer valid please resolve those previous review comments to keep the review history clean and easy to follow. 
================================================ FILE: .github/workflows/build-release-publish.yml ================================================ name: Build, Release, and Publish on: push: branches: [ main ] paths-ignore: - 'README.md' pull_request: branches: [ main ] paths-ignore: - 'README.md' env: GO_VERSION: '1.25.4' # Updated to match toolchain BINARY_NAME: 'ingest' permissions: contents: write packages: write jobs: build: if: ${{ ! contains(github.event.head_commit.message, '[skip ci]') && ! contains(github.event.pull_request.title, '[skip ci]')}} name: Build strategy: matrix: target: - os: darwin arch: arm64 runner: macos-14 c_compiler_package: "" - os: linux arch: amd64 runner: ubuntu-latest c_compiler_package: "build-essential" # - os: linux # arch: arm64 # runner: ubuntu-latest-arm64 # Use native ARM64 runner # c_compiler_package: "build-essential" # Native compiler runs-on: ${{ matrix.target.runner }} outputs: version: ${{ steps.set_version.outputs.new_tag }} changelog: ${{ steps.set_version.outputs.changelog }} steps: - name: Checkout code uses: actions/checkout@v6 with: fetch-depth: 0 - name: Set up Go and cache dependencies uses: actions/setup-go@v6 with: go-version-file: "go.mod" - name: Get version id: set_version uses: mathieudutour/github-tag-action@a22cf08638b34d5badda920f9daf6e72c477b07b # v6.2 with: github_token: ${{ secrets.GITHUB_TOKEN }} dry_run: true - name: Get dependencies run: go mod download - name: Set up C compiler if: startsWith(matrix.target.runner, 'ubuntu') && matrix.target.c_compiler_package != '' run: | sudo apt-get update sudo apt-get install -y ${{ matrix.target.c_compiler_package }} - name: golangci-lint uses: golangci/golangci-lint-action@v9 with: version: v2.6 - name: Run tests run: make test - name: Build env: CGO_ENABLED: "1" # Explicitly enable CGo GOOS: ${{ matrix.target.os }} GOARCH: ${{ matrix.target.arch }} VERSION: ${{ steps.set_version.outputs.new_tag }} run: | go build -v -ldflags "-w -s -X 
main.Version=$VERSION" -o build/${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }} . ls -ltarh build/ - name: Upload artifact uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 if: github.event_name == 'push' && github.ref == 'refs/heads/main' with: name: ${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }} path: build/${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }} retention-days: 90 release: name: Release needs: build if: ${{github.event_name == 'push' && github.ref == 'refs/heads/main' && !contains(github.event.head_commit.message, '[skip ci]') && ! contains(github.event.pull_request.title, '[skip ci]')}} runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 with: fetch-depth: 0 - name: Download artifact uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4 with: path: build/ - name: Create a GitHub release uses: ncipollo/release-action@2c591bcc8ecdcd2db72b97d6147f871fcd833ba5 # v1 if: ${{ startsWith(github.ref, 'refs/heads/main') && !contains(github.event.head_commit.message, '[skip ci]') && ! contains(github.event.pull_request.title, '[skip ci]') }} with: tag: ${{ needs.build.outputs.version }} name: ${{ needs.build.outputs.version }} body: ${{ needs.build.outputs.changelog }} skipIfReleaseExists: true generateReleaseNotes: true allowUpdates: true makeLatest: ${{ startsWith(github.ref, 'refs/heads/main') && !contains(github.event.head_commit.message, '[skip ci]') && ! 
contains(github.event.pull_request.title, '[skip ci]') }} prerelease: ${{ !startsWith(github.ref, 'refs/heads/main') }} artifactErrorsFailBuild: true artifacts: | build/${{ env.BINARY_NAME }}*/${{ env.BINARY_NAME }}* env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ # If you prefer the allow list template instead of the deny list, see community template: # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore # # Binaries for programs and plugins *.exe *.exe~ *.dll *.so *.dylib # Test binary, built with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out # Dependency directories (remove the comment below to include it) # vendor/ # Go workspace file go.work go.work.sum # build files ingest build/ **/.vscode **/.idea **/*.tmp **/*.swp **/*.log **/ingest.out.md vendor/ ================================================ FILE: .golangci.yml ================================================ version: "2" linters: enable: - unparam settings: unparam: check-exported: false exclusions: generated: lax presets: - comments - common-false-positives - legacy - std-error-handling paths: - third_party$ - builtin$ - examples$ formatters: exclusions: generated: lax paths: - third_party$ - builtin$ - examples$ - screenshots$ - .github$ - .claude$ - bin$ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Sam McLeod Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to 
the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Makefile ================================================ # Makefile for ingest project # Go parameters GOCMD=go GOBUILD=$(GOCMD) build GOCLEAN=$(GOCMD) clean GOTEST=$(GOCMD) test GOGET=$(GOCMD) get # Binary name BINARY_NAME=ingest # Version information VERSION := $(shell git describe --tags --always) BUILD_TIME := $(shell date -u '+%Y-%m-%d_%I:%M:%S%p') LDFLAGS := -ldflags "-w -s -X main.Version=$(VERSION) -X main.BuildTime=$(BUILD_TIME)" # Main package path MAIN_PACKAGE=. .PHONY: all build clean test deps all: clean build build: $(GOBUILD) $(LDFLAGS) -o $(BINARY_NAME) $(MAIN_PACKAGE) clean: $(GOCLEAN) rm -f $(BINARY_NAME) lint: gofmt -w -s . golangci-lint run go run golang.org/x/tools/gopls/internal/analysis/modernize/cmd/modernize@latest -fix -test ./... test: $(GOTEST) -v ./... deps: $(GOGET) ./... 
# Run the application run: build ./$(BINARY_NAME) # Build for multiple platforms build-all: GOOS=linux GOARCH=amd64 $(GOBUILD) $(LDFLAGS) -o $(BINARY_NAME)-linux-amd64 $(MAIN_PACKAGE) GOOS=darwin GOARCH=amd64 $(GOBUILD) $(LDFLAGS) -o $(BINARY_NAME)-darwin-amd64 $(MAIN_PACKAGE) GOOS=windows GOARCH=amd64 $(GOBUILD) $(LDFLAGS) -o $(BINARY_NAME)-windows-amd64.exe $(MAIN_PACKAGE) # Install the binary install: build mv $(BINARY_NAME) $(GOPATH)/bin/$(BINARY_NAME) # Uninstall the binary uninstall: rm -f $(GOPATH)/bin/$(BINARY_NAME) # output the version information version: @echo $(VERSION) ================================================ FILE: README.md ================================================ # Ingest ![](./ingest-logo-400.png) Ingest parses directories of plain text files, such as source code, into a single markdown file suitable for ingestion by AI/LLMs. --- ![ingest](screenshot2.png) Ingest can also pass the prompt directly to an LLM such as Ollama for processing. ![ingest with --llm](screenshot.png) And ingest web URLs. 
![ingest with --web](screenshot3.png) ## Features - Traverse directory structures and generate a tree view - Include/exclude files based on glob patterns - Compress code using Tree-sitter to extract key structural information while omitting implementation details - Estimate vRAM requirements and check model compatibility using another package I've created called [quantest](https://github.com/sammcj/quantest) - Parse output directly to LLMs such as Ollama or any OpenAI compatible API for processing - Generate and include git diffs and logs - Count tokens using offline tokeniser (default) or optionally use Anthropic API (API key required, but no charge for counting) - Customisable output templates - Copy output to clipboard (when available) - Export to file or print to console - Optional JSON output - Optionally save output to a file in ~/ingest - Shell completions for Bash, Zsh, and Fish - Web crawling to ingest web pages as Markdown - PDF to markdown conversion and ingestion Ingest Intro ("Podcast" Episode): ## Installation ### go install (recommended) Make sure you have Go installed on your system, then run: ```shell go install github.com/sammcj/ingest@HEAD ``` ### curl I don't recommend this method as it's not as easy to update, but you can use the following command: ```shell curl -sL https://raw.githubusercontent.com/sammcj/ingest/refs/heads/main/scripts/install.sh | bash ``` ### Manual install 1. Download the latest release from the [releases page](https://github.com/sammcj/ingest/releases) 2. Move the binary to a directory in your PATH, e.g. `mv ingest* /usr/local/bin/ingest` ## Usage Basic usage: ```shell ingest [flags] ``` ingest will default to the current working directory if no path is provided, e.g: ```shell $ ingest ⠋ Traversing directory and building tree... [0s] [ℹ️] Tokens (Approximate): 15,945 [✅] Copied to clipboard successfully. 
``` The first time ingest runs, it will download a small [tokeniser](https://github.com/pkoukk/tiktoken-go-loader/blob/main/assets/cl100k_base.tiktoken) called 'cl100k_base.tiktoken'; this is used for tokenisation. Generate a prompt from a directory, including only Python files: ```shell ingest -i "**/*.py" /path/to/project ``` Generate a prompt with git diff and copy to clipboard: ```shell ingest -d /path/to/project ``` Generate a prompt for multiple files/directories: ```shell ingest /path/to/project /path/to/other/project ``` Generate a prompt and save to a file: ```shell ingest -o output.md /path/to/project ``` You can also provide individual files or multiple paths: ```shell ingest /path/to/file /path/to/directory ``` Save output to ~/ingest/<filename>.md: ```shell ingest --save /path/to/project ``` ### VRAM Estimation and Model Compatibility Ingest includes a feature to estimate VRAM requirements and check model compatibility using the [Gollama](https://github.com/sammcj/gollama)'s vramestimator package. This helps you determine if your generated content will fit within the specified model, VRAM, and quantisation constraints. To use this feature, add the following flags to your ingest command: ```shell ingest --vram --model <model_id> [--memory <GB>] [--quant <quantisation>] [--context <context_length>] [--kvcache <fp16|q8_0|q4_0>] [--quanttype <gguf|exl2>] [other flags] ``` Examples: Estimate VRAM usage for a specific context: ```shell ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --quant q4_k_m --context 2048 --kvcache q4_0 . # Estimated VRAM usage: 5.35 GB ``` Calculate maximum context for a given memory constraint: ```shell ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --quant q4_k_m --memory 6 --kvcache q8_0 . # Maximum context for 6.00 GB of memory: 5069 ``` Find the best BPW (Bits Per Weight): ```shell ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --memory 6 --quanttype gguf . 
# Best BPW for 6.00 GB of memory: IQ3_S ``` The tool also works for exl2 (ExllamaV2) models: ```shell ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --quant 5.0 --context 2048 --kvcache q4_0 . # For exl2 models ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --quant 5.0 --memory 6 --kvcache q8_0 . # For exl2 models ``` When using the VRAM estimation feature along with content generation, ingest will provide information about the generated content's compatibility with the specified constraints: ```shell ingest --vram --model NousResearch/Hermes-2-Theta-Llama-3-8B --memory 8 --quant q4_0 . ⠋ Traversing directory and building tree... [0s] [ℹ️] 14,702 Tokens (Approximate) [ℹ️] Maximum context for 8.00 GB of memory: 10240 [✅] Generated content (14,702 tokens) fits within maximum context. Top 15 largest files (by estimated token count): 1. /Users/samm/git/sammcj/ingest/main.go (4,682 tokens) 2. /Users/samm/git/sammcj/ingest/filesystem/filesystem.go (2,694 tokens) 3. /Users/samm/git/sammcj/ingest/README.md (1,895 tokens) 4. /Users/samm/git/sammcj/ingest/utils/utils.go (948 tokens) 5. /Users/samm/git/sammcj/ingest/config/config.go (884 tokens) [✅] Copied to clipboard successfully. ``` Available flags for VRAM estimation: - `--vram`: Enable VRAM estimation and model compatibility check - `--model`: Specify the model ID to check against (required for estimation) - `--memory`: Specify the available memory in GB for context calculation (optional) - `--quant`: Specify the quantisation type (e.g., q4_k_m) or bits per weight (e.g., 5.0) - `--context`: Specify the context length for VRAM estimation (optional) - `--kvcache`: Specify the KV cache quantisation (fp16, q8_0, or q4_0) - `--quanttype`: Specify the quantisation type (gguf or exl2) Ingest will provide appropriate output based on the combination of flags used, such as estimating VRAM usage, calculating maximum context, or finding the best BPW. 
If the generated content fits within the specified constraints, you'll see a success message. Otherwise, you'll receive a warning that the content may not fit. ## LLM Integration Ingest can pass the generated prompt to LLMs that have an OpenAI compatible API such as [Ollama](https://ollama.com) for processing. ```shell ingest --llm /path/to/project ``` By default this will use any prompt suffix from your configuration file: ```shell ./ingest utils.go --llm ⠋ Traversing directory and building tree... [0s] This is Go code for a file named `utils.go`. It contains various utility functions for handling terminal output, clipboard operations, and configuration directories. ... ``` You can provide a prompt suffix to append to the generated prompt: ```shell ingest --llm -p "explain this code" /path/to/project ``` ## Token Counting Ingest provides token counting using either an offline tokeniser (default) or the Anthropic API for more accurate counts. ### Offline Token Counting (Default) By default, ingest uses an offline tokeniser with a correction factor for improved accuracy: ```shell ingest /path/to/project # [ℹ️] Tokens (Approximate): 15,945 ``` The offline tokeniser applies a 1.18x multiplier based on empirical analysis comparing it with Anthropic's API. This correction reduces average estimation error from ~17% to ~2%, providing slightly more accurate token counts without requiring an API key. To disable the correction factor and use raw token counts, use the `--no-correction` flag: ```shell ingest --no-correction /path/to/project # Uses raw offline tokeniser without correction multiplier ``` The first time ingest runs, it downloads a small tokeniser file for offline use. 
### Anthropic API Token Counting For accurate token counts using Anthropic's counting API, use the `-a` or `--anthropic` flag: ```shell export ANTHROPIC_API_KEY="your-api-key" ingest -a /path/to/project # ✓ Using Anthropic API (claude-sonnet-4-5) for token counting # [ℹ️] Tokens (Approximate): 15,942 ``` The API accepts keys from these environment variables (checked in order): - `ANTHROPIC_API_KEY` - `ANTHROPIC_TOKEN` - `ANTHROPIC_TOKEN_COUNT_KEY` **Performance optimisation**: When counting tokens for multiple files (e.g. in the "Top 15 largest files" report), ingest processes API requests in parallel batches of 4, significantly reducing the time needed for token counting. If the API call fails, ingest automatically falls back to the offline tokeniser. ## Code Compression with Tree-sitter **Experimental** Ingest can compress source code files by extracting key structural information while omitting implementation details. This is useful for reducing token usage while preserving the important parts of the code structure. ```shell ingest --compress /path/to/project ``` The compression extracts: - Package/module declarations - Import statements - Function/method signatures (without bodies) - Class definitions (without method bodies) - Type definitions - Comments Currently supported languages: - Go - Python - JavaScript (including arrow functions and ES6 module syntax) - Bash - C - CSS Example of compressed JavaScript: ``` // This is a JavaScript comment import { something } from 'module'; export class MyJSClass { ... } // Body removed constructor(name) { ... } // Body removed greet(message) { ... } // Body removed export function myJSFunction(x, y) { ... } // Body removed const myArrowFunc = (a, b) => { ... 
} // Body removed ``` ## Web Crawling & Ingestion Crawl with explicit web mode ```shell ingest --web https://example.com ``` Auto-detect URL and crawl ```shell ingest https://example.com ``` Crawl with domain restriction ```shell ingest --web --web-domains example.com https://example.com ``` Crawl deeper with more concurrency ```shell ingest --web --web-depth 3 --web-concurrent 10 https://example.com ``` Exclude a path from the crawl ```shell ingest --web https://example.com -e '/posts/**' ``` ## Shell Completions Ingest includes shell completions for Bash, Zsh, and Fish. To load completions for the current session: **Bash:** ```shell source <(ingest completion bash) ``` **Zsh:** ```shell source <(ingest completion zsh) ``` **Fish:** ```shell ingest completion fish | source ``` For persistent completions (loaded automatically in each new shell session), see `ingest completion --help` for installation instructions specific to your system. ## Configuration Ingest uses a configuration file located at `~/.config/ingest/ingest.json`. You can make Ollama processing run without prompting setting `"llm_auto_run": true` in the config file. The config file also contains: - `llm_model`: The model to use for processing the prompt, e.g. "llama3.1:8b-q5_k_m". - `llm_prompt_prefix`: An optional prefix to prepend to the prompt, e.g. "This is my application." - `llm_prompt_suffix`: An optional suffix to append to the prompt, e.g. "explain this code" Ingest uses the following directories for user-specific configuration: - `~/.config/ingest/patterns/exclude`: Add .glob files here to exclude additional patterns. - `~/.config/ingest/patterns/templates`: Add custom .tmpl files here for different output formats. These directories will be created automatically on first run, along with README files explaining their purpose. 
### Flags - `-a, --anthropic`: Use Anthropic API for token counting (requires API key in environment) - `--compress`: Enable code compression using Tree-sitter to extract key structural information while omitting implementation details - `--config`: Opens the config file in the default editor - `--no-correction`: Disable offline tokeniser correction factor (use raw token count) - `--context`: Specify the context length for VRAM estimation - `--exclude-from-tree`: Exclude files/folders from the source tree based on exclude patterns - `--git-diff-branch`: Generate git diff between two branches - `--git-log-branch`: Retrieve git log between two branches - `--include-priority`: Include files in case of conflict between include and exclude patterns - `--json`: Print output as JSON - `--kvcache`: Specify the KV cache quantisation - `--llm`: Send the generated prompt to an OpenAI compatible LLM server (such as Ollama) for processing - `--memory`: Specify the available memory in GB for context calculation - `--model`: Specify the model ID for VRAM estimation - `--no-codeblock`: Disable wrapping code inside markdown code blocks - `--no-default-excludes`: Disable default exclude patterns - `--pattern-exclude`: Path to a specific .glob file for exclude patterns - `--print-default-excludes`: Print the default exclude patterns - `--print-default-template`: Print the default template - `--quant`: Specify the quantisation type or bits per weight - `--quanttype`: Specify the quantisation type (gguf or exl2) - `--relative-paths`: Use relative paths instead of absolute paths - `--report`: Print the largest parsed files - `--save`: Save output to ~/ingest/.md - `--tokens`: Display the token count of the generated prompt - `--verbose`: Print verbose output - `--vram`: Estimate VRAM usage and check model compatibility - `--web-concurrent`: Maximum concurrent requests for web crawling - `--web-depth`: Maximum depth for web crawling - `--web-domains`: Comma-separated list of domains to 
restrict web crawling - `--web`: Crawl a web page - `-c, --encoding`: Optional tokeniser to use for token count - `-d, --diff`: Include git diff - `-e, --exclude`: Patterns to exclude (can be used multiple times) - `-i, --include`: Patterns to include (can be used multiple times) - `-l, --line-number`: Add line numbers to the source code - `-n, --no-clipboard`: Disable copying to clipboard - `-o, --output`: Optional output file path - `-p, --prompt`: Optional prompt suffix to append to the generated prompt - `-t, --template`: Path to a custom Handlebars template - `-V, --version`: Print the version number (WIP - still trying to get this to work nicely) ### Excludes You can get a list of the default excludes by parsing `--print-default-excludes` to ingest. These are defined in [defaultExcludes.go](https://github.com/sammcj/ingest/blob/main/filesystem/defaultExcludes.go). To override the default excludes, create a `default.glob` file in `~/.config/ingest/patterns/exclude` with the patterns you want to exclude. ### Templates Templates are written in standard [go templating syntax](https://pkg.go.dev/text/template). You can get a list of the default templates by parsing `--print-default-template` to ingest. These are defined in [template.go](https://github.com/sammcj/ingest/blob/main/template/template.go). To override the default templates, create a `default.tmpl` file in `~/.config/ingest/patterns/templates` with the template you want to use by default. ## Contributing Contributions are welcome, Please feel free to submit a Pull Request. You can help sponsor the project by trading the $INGEST SOL Token: https://bags.fm/Dm98Qa1Xw2n35bq73R2t1bFgXPApUKu2YwzU8TjWBAGS ## License - Copyright 2024 Sam McLeod - This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
================================================ FILE: config/config.go ================================================
package config

import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"

	"github.com/mitchellh/go-homedir"
)

// OllamaConfig holds per-model settings for the Ollama integration.
type OllamaConfig struct {
	Model        string `json:"llm_model"`         // model name, e.g. "llama3.1:8b-instruct-q6_K"
	PromptPrefix string `json:"llm_prompt_prefix"` // optional text prepended to the generated prompt
	PromptSuffix string `json:"llm_prompt_suffix"` // optional text appended to the generated prompt
	AutoRun      bool   `json:"llm_auto_run"`      // when true, send to the LLM without prompting the user
}

// Config is the top-level structure of ~/.config/ingest/ingest.json.
type Config struct {
	Ollama   []OllamaConfig `json:"ollama"`
	LLM      LLMConfig      `json:"llm"`
	AutoSave bool           `json:"auto_save"` // when true, output is also saved to ~/ingest
}

// LLMConfig configures the OpenAI-compatible LLM endpoint used by --llm.
// Pointer fields are optional and omitted from JSON when unset.
type LLMConfig struct {
	AuthToken        string   `json:"llm_auth_token"`
	BaseURL          string   `json:"llm_base_url"`
	Model            string   `json:"llm_model"`
	MaxTokens        int      `json:"llm_max_tokens"`
	Temperature      *float32 `json:"llm_temperature,omitempty"`
	TopP             *float32 `json:"llm_top_p,omitempty"`
	PresencePenalty  *float32 `json:"llm_presence_penalty,omitempty"`
	FrequencyPenalty *float32 `json:"llm_frequency_penalty,omitempty"`
	APIType          string   `json:"llm_api_type"`
}

// LoadConfig loads ~/.config/ingest/ingest.json, creating a default config
// file if one does not exist. After parsing, it fills in defaults for any
// unset LLM fields: auth token from OPENAI_API_KEY, base URL, model,
// max tokens, and API type.
func LoadConfig() (*Config, error) {
	home, err := homedir.Dir()
	if err != nil {
		return nil, fmt.Errorf("failed to get home directory: %w", err)
	}
	configPath := filepath.Join(home, ".config", "ingest", "ingest.json")
	// First run: write and return the default configuration.
	if _, err := os.Stat(configPath); os.IsNotExist(err) {
		return createDefaultConfig(configPath)
	}
	file, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %w", err)
	}
	var config Config
	if err := json.Unmarshal(file, &config); err != nil {
		return nil, fmt.Errorf("failed to parse config file: %w", err)
	}
	// Set default values for LLM config
	if config.LLM.AuthToken == "" {
		config.LLM.AuthToken = os.Getenv("OPENAI_API_KEY")
	}
	if config.LLM.BaseURL == "" {
		config.LLM.BaseURL = getDefaultBaseURL()
	}
	if config.LLM.Model == "" {
		config.LLM.Model = "llama3.1:8b-instruct-q6_K"
	}
	if config.LLM.MaxTokens == 0 {
		config.LLM.MaxTokens = 2048
	}
	if config.LLM.APIType == "" {
		config.LLM.APIType = "OPEN_AI"
	}
	return &config, nil
}
func createDefaultConfig(configPath string) (*Config, error) { defaultConfig := Config{ Ollama: []OllamaConfig{ { Model: "llama3.1:8b-instruct-q6_K", PromptPrefix: "Code: ", PromptSuffix: "", AutoRun: false, }, }, LLM: LLMConfig{ BaseURL: getDefaultBaseURL(), Model: "llama3.1:8b-instruct-q6_K", MaxTokens: 2048, }, AutoSave: false, } err := os.MkdirAll(filepath.Dir(configPath), 0750) if err != nil { return nil, fmt.Errorf("failed to create config directory: %w", err) } file, err := json.MarshalIndent(defaultConfig, "", " ") if err != nil { return nil, fmt.Errorf("failed to marshal default config: %w", err) } if err := os.WriteFile(configPath, file, 0644); err != nil { return nil, fmt.Errorf("failed to write default config file: %w", err) } return &defaultConfig, nil } // returns the default base URL for the LLM API func getDefaultBaseURL() string { if url := os.Getenv("OPENAI_API_BASE"); url != "" { return url } if url := os.Getenv("llm_HOST"); url != "" { return url + "/v1" } return "http://localhost:11434/v1" } // opens the config file in the default editor func OpenConfig() error { home, err := homedir.Dir() if err != nil { return fmt.Errorf("failed to get home directory: %w", err) } configPath := filepath.Join(home, ".config", "ingest", "ingest.json") if _, err := os.Stat(configPath); os.IsNotExist(err) { return fmt.Errorf("config file does not exist") } editor := os.Getenv("EDITOR") if editor == "" { editor = "vim" } return runCommand(editor, configPath) } // runs a command in the shell func runCommand(command string, args ...string) error { cmd := exec.Command(command, args...) 
cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr return cmd.Run() } ================================================ FILE: filesystem/defaultExcludes.go ================================================ package filesystem import ( "bufio" "fmt" "strings" ) // defaultGlobContent contains the content of default.glob const defaultGlobContent = ` # Directories **/.cargo/** **/.devcontainer/** **/.git/** **/.github/** **/.next/** **/.venv/** **/.vim/** **/.vscode-insiders/** **/.vscode-oss/** **/.vscode-print-resource-cache/** **/.vscode-react-native/** **/.vscode/** **/.wasmedge/** **/.widsurf/** **/.widsurf/** **/.wine/** **/.wine32/** **/.yarn/** **/.zcompcache/** **/.zfunc/** **/.zgen/** **/.zsh_sessions/** **/.zsh.d/** **/backups/** **/build/** **/conda/** **/coverage/** **/dist/** **/mamba/** **/node_modules/** **/out/** **/pyenv/** **/__pycache__/** **/screenshots/** **/target/** **/temp/** **/tmp/** **/venv/** **/virtualenv/** **/wineprefix/** # File patterns **/__tests__/* **/_data/* **/.aider* **/.aider/* **/.bash_history **/.boto **/.claude.json **/.cline/* **/.condarc **/.cursor/* **/.dream_history **/.fzf.bash **/.fzf.zsh **/.git-credentials **/.llamafile_history **/.lscolors **/.netrc **/.psql_history **/.python_history **/.terraform/* **/.webpack/* **/.Xauthority **/*.7z **/*.apk **/*.app **/*.avi **/*.bak **/*.baseline **/*.bin **/*.blend **/*.bmp **/*.bz2 **/*.cert **/*.crt **/*.csv **/*.dat **/*.deb **/*.db **/*.diff **/*.dll **/*.dmg **/*.doc **/*.docx **/*.DS_Store **/*.eot **/*.excalidrawlib **/*.exe **/*.fbx **/*.fig **/*.flac **/*.gif **/*.gguf **/*.ggml **/*.exl2 **/*.exl3 **/*.gz **/*.heic **/*.hiec **/*.icns **/*.ico **/*.iso **/*.jar **/*.jpeg **/*.jpg **/*.key **/*.lock **/*.log* **/*.mp3 **/*.mp4 **/*.msi **/*.mvnw* **/*.obj **/*.odf **/*.otf **/*.pdf **/*.partial **/*.pem **/*.png **/*.ppt **/*.pptx **/*.ps1 **/*.pub **/*.pyc **/*.pyo **/*.pysave **/*.rpm **/*.sqlite **/*.sqlite3 **/*.svg **/*.swp* **/*.tar* 
**/*.terraform.tfstate.lock.info **/*.tfgraph **/*.tmp **/*.ttf* **/*.war **/*.wav **/*.webm **/*.webp **/*.woff **/*.woff2 **/*.xd **/*.xls **/*.xlsx **/*.zip **/terraform.tfstate.* **/test/* **/tests/* **/vendor/* # Specific files **/.aiderrules **/.aider.* **/.clinerules **/.cursorrules **/.DS_Store **/.editorconfig **/.env* **/.eslintignore **/.eslintrc* **/.gitattributes **/.gitconfig **/.gitconfig-no_push **/.gitignore **/.gitignoreglobal **/.gitlab-ci.yml **/.gitmodules **/.gitpod.yml **/.npmrc **/.nvmrc **/.pre-commit-config.yaml **/.pre-commit-config.yml **/.prettierignore **/.prettierrc* **/.saml2aws **/.saml2aws-auto.yml **/.stylelintrc* **/.terraform.lock.hcl **/.terraform.lock.hcl.lock **/.vimrc **/.whitesource **/.zcompdump* **/.claude/*.json **/.mcp.json **/bat-config **/changelog.md **/CHANGELOG* **/CLA.md **/CODE_OF_CONDUCT.md **/CODEOWNERS **/CONTRIBUTORS.md **/commitlint.config.js **/contributing.md **/CONTRIBUTING* **/dircolors **/esbuild.config.mjs **/go.mod **/go.sum **/LICENSE* **/LICENCE* **/manifest.json **/package-lock.json **/plan **/plan.out **/pnpm-lock.yaml **/poetry.lock **/pre-commit-config.yaml **/renovate.json **/SECURITY* **/SUPPORT.md **/terraform.rc **/terraform.tfplan **/terraform.tfplan.json **/terraform.tfstate **/terraform.tfstate.backup **/TODO.md **/TROUBLESHOOTING.md **/tsconfig.json **/version-bump.mjs **/versions.json **/yarn.lock ` // GetDefaultExcludes returns a list of default exclude patterns func GetDefaultExcludes() ([]string, error) { var defaultExcludes []string scanner := bufio.NewScanner(strings.NewReader(defaultGlobContent)) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line != "" && !strings.HasPrefix(line, "#") { defaultExcludes = append(defaultExcludes, line) } } if err := scanner.Err(); err != nil { return nil, fmt.Errorf("error scanning default glob content: %w", err) } return defaultExcludes, nil } ================================================ FILE: filesystem/filesystem.go 
================================================ package filesystem import ( "bufio" "fmt" "io" "io/fs" "net/http" "os" "path/filepath" "sort" "strings" "sync" "github.com/bmatcuk/doublestar/v4" "github.com/fatih/color" "github.com/mitchellh/go-homedir" ignore "github.com/sabhiram/go-gitignore" "github.com/sammcj/ingest/internal/compressor" "github.com/sammcj/ingest/pdf" "github.com/sammcj/ingest/utils" ) type FileInfo struct { Path string `json:"path"` Extension string `json:"extension"` Code string `json:"code"` } // New type to track excluded files and directories type ExcludedInfo struct { Directories map[string]int // Directory path -> count of excluded files Extensions map[string]int // File extension -> count of excluded files TotalFiles int // Total number of excluded files Files []string // List of excluded files (if total ≤ 20) } type treeNode struct { name string children []*treeNode isDir bool excluded bool } func ReadExcludePatterns(patternExclude string, noDefaultExcludes bool) ([]string, error) { var patterns []string // If a specific pattern exclude file is provided, use it if patternExclude != "" { return readGlobFile(patternExclude) } if !noDefaultExcludes { // Get the default excludes defaultPatterns, err := GetDefaultExcludes() if err != nil { return nil, fmt.Errorf("failed to read default exclude patterns: %w", err) } patterns = defaultPatterns } // Check for user-specific patterns home, err := homedir.Dir() if err == nil { userPatternsDir := filepath.Join(home, ".config", "ingest", "patterns", "exclude") userDefaultGlob := filepath.Join(userPatternsDir, "default.glob") // If user has a default.glob, it overrides the default patterns if _, err := os.Stat(userDefaultGlob); err == nil { return readGlobFile(userDefaultGlob) } // Read other user-defined patterns userPatterns, _ := readGlobFilesFromDir(userPatternsDir) // Combine user patterns with default patterns (if not disabled) patterns = append(patterns, userPatterns...) 
} return patterns, nil } // Helper functions to track exclusions func trackExcludedFile(excluded *ExcludedInfo, path string, mu *sync.Mutex) { mu.Lock() defer mu.Unlock() excluded.TotalFiles++ // Track the directory dir := filepath.Dir(path) excluded.Directories[dir]++ // Track the extension ext := filepath.Ext(path) if ext != "" { excluded.Extensions[ext]++ } // Only store individual files if we haven't exceeded 20 if excluded.TotalFiles <= 20 { excluded.Files = append(excluded.Files, path) } } func readGlobFile(filename string) ([]string, error) { file, err := os.Open(filename) if err != nil { return nil, err } defer file.Close() var patterns []string scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line != "" && !strings.HasPrefix(line, "#") { patterns = append(patterns, line) } } if err := scanner.Err(); err != nil { return nil, err } return patterns, nil } func readGlobFilesFromDir(dir string) ([]string, error) { var patterns []string err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if err != nil { return err } if !info.IsDir() && strings.HasSuffix(info.Name(), ".glob") { filePatterns, err := readGlobFile(path) if err != nil { return err } patterns = append(patterns, filePatterns...) 
} return nil }) return patterns, err } func trackExcludedDirectory(excluded *ExcludedInfo, path string, mu *sync.Mutex) { mu.Lock() defer mu.Unlock() excluded.Directories[path] = 0 // Initialize directory count } func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, patternExclude string, includePriority, lineNumber, relativePaths, excludeFromTree, noCodeblock, noDefaultExcludes, followSymlinks bool, comp *compressor.GenericCompressor) (string, []FileInfo, *ExcludedInfo, error) { var files []FileInfo var mu sync.Mutex var wg sync.WaitGroup excluded := &ExcludedInfo{ Directories: make(map[string]int), Extensions: make(map[string]int), Files: make([]string, 0), } // Read exclude patterns defaultExcludes, err := ReadExcludePatterns(patternExclude, noDefaultExcludes) if err != nil { return "", nil, nil, fmt.Errorf("failed to read exclude patterns: %w", err) } // Combine user-provided exclude patterns with default excludes (if not disabled) allExcludePatterns := append(excludePatterns, defaultExcludes...) 
// Always exclude .git directories allExcludePatterns = append(allExcludePatterns, "**/.git/**") // Read .gitignore if it exists gitignore, err := readGitignore(rootPath) if err != nil { return "", nil, nil, fmt.Errorf("failed to read .gitignore: %w", err) } // Check if rootPath is a file or directory fileInfo, err := os.Stat(rootPath) if err != nil { return "", nil, nil, fmt.Errorf("failed to get file info: %w", err) } // Check if rootPath is a single PDF file if !fileInfo.IsDir() { isPDF, err := pdf.IsPDF(rootPath) if err != nil { return "", nil, nil, fmt.Errorf("failed to check if file is PDF: %w", err) } if isPDF { // Process single PDF file directly content, err := pdf.ConvertPDFToMarkdown(rootPath, false) if err != nil { return "", nil, nil, fmt.Errorf("failed to convert PDF: %w", err) } return fmt.Sprintf("File: %s", rootPath), []FileInfo{{ Path: rootPath, Extension: ".md", Code: content, }}, excluded, nil } } var treeString string if !fileInfo.IsDir() { // Check if the single file is a symlink if !followSymlinks { linkInfo, err := os.Lstat(rootPath) if err != nil { return "", nil, nil, fmt.Errorf("failed to get symlink info: %w", err) } if linkInfo.Mode()&os.ModeSymlink != 0 { utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Skipping symlinked file: %s", rootPath), color.FgCyan) return fmt.Sprintf("File: %s (symlink, skipped)", rootPath), []FileInfo{}, excluded, nil } } // Handle single file relPath := filepath.Base(rootPath) if shouldIncludeFile(relPath, includePatterns, allExcludePatterns, gitignore, includePriority) { wg.Go(func() { processFile(rootPath, relPath, filepath.Dir(rootPath), lineNumber, relativePaths, noCodeblock, &mu, &files, comp) }) } else { trackExcludedFile(excluded, rootPath, &mu) } treeString = fmt.Sprintf("File: %s", rootPath) } else { // Generate the tree representation for directory treeString, err = generateTreeString(rootPath, allExcludePatterns) if err != nil { return "", nil, nil, fmt.Errorf("failed to generate directory tree: %w", 
err) } // Process files in directory err = filepath.Walk(rootPath, func(path string, info os.FileInfo, err error) error { if err != nil { return err } relPath, err := filepath.Rel(rootPath, path) if err != nil { return err } // Check if the path is a symlink if !followSymlinks { linkInfo, err := os.Lstat(path) if err != nil { return err } if linkInfo.Mode()&os.ModeSymlink != 0 { if linkInfo.IsDir() || (info != nil && info.IsDir()) { utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Skipping symlinked directory: %s", path), color.FgCyan) return filepath.SkipDir } utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Skipping symlinked file: %s", path), color.FgCyan) return nil } } // Check if the current path (file or directory) should be excluded if shouldExcludePath(relPath, allExcludePatterns, gitignore) { if info.IsDir() { trackExcludedDirectory(excluded, path, &mu) return filepath.SkipDir } trackExcludedFile(excluded, path, &mu) return nil } if !info.IsDir() && !shouldIncludeFile(relPath, includePatterns, allExcludePatterns, gitignore, includePriority) { trackExcludedFile(excluded, path, &mu) return nil } if !info.IsDir() && shouldIncludeFile(relPath, includePatterns, allExcludePatterns, gitignore, includePriority) { wg.Add(1) go func(path, relPath string) { defer wg.Done() processFile(path, relPath, rootPath, lineNumber, relativePaths, noCodeblock, &mu, &files, comp) }(path, relPath) } return nil }) } wg.Wait() if err != nil { return "", nil, excluded, err } return treeString, files, excluded, nil } // New helper function to check if a path should be excluded func shouldExcludePath(path string, excludePatterns []string, gitignore *ignore.GitIgnore) bool { for _, pattern := range excludePatterns { if match, _ := doublestar.Match(pattern, path); match { return true } } return gitignore != nil && gitignore.MatchesPath(path) } func shouldIncludeFile(path string, includePatterns, excludePatterns []string, gitignore *ignore.GitIgnore, includePriority bool) bool { // Check if 
the file is explicitly included included := len(includePatterns) == 0 || matchesAny(path, includePatterns) // Check if the file is explicitly excluded excluded := isExcluded(path, excludePatterns) || (gitignore != nil && gitignore.MatchesPath(path)) if included && excluded { return includePriority } return included && !excluded } func matchesAny(path string, patterns []string) bool { for _, pattern := range patterns { if match, _ := doublestar.Match(pattern, path); match { return true } } return false } func readGitignore(rootPath string) (*ignore.GitIgnore, error) { gitignorePath := filepath.Join(rootPath, ".gitignore") if _, err := os.Stat(gitignorePath); os.IsNotExist(err) { return nil, nil } return ignore.CompileIgnoreFile(gitignorePath) } func addLineNumbers(code string) string { lines := strings.Split(code, "\n") for i := range lines { lines[i] = fmt.Sprintf("%4d | %s", i+1, lines[i]) } return strings.Join(lines, "\n") } func wrapCodeBlock(code, extension string) string { if extension == "" { return fmt.Sprintf("```\n%s\n```", code) } return fmt.Sprintf("```%s\n%s\n```", extension[1:], code) } func isBinaryFile(filePath string) (bool, error) { // First check if it's a PDF isPDF, err := pdf.IsPDF(filePath) if err != nil { return false, err } if isPDF { return false, nil // Don't treat PDFs as binary files } file, err := os.Open(filePath) if err != nil { return false, err } defer file.Close() // Read the first 512 bytes of the file buffer := make([]byte, 512) n, err := file.Read(buffer) if err != nil && err != io.EOF { return false, err } // Use http.DetectContentType to determine the content type contentType := http.DetectContentType(buffer[:n]) // Allow PDFs and text files return !strings.HasPrefix(contentType, "text/") && contentType != "application/pdf", nil } func PrintDefaultExcludes() { excludes, err := GetDefaultExcludes() if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to get default excludes: %v", err), color.FgRed) os.Exit(1) } 
fmt.Println(strings.Join(excludes, "\n")) } func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo, comp *compressor.GenericCompressor) { // Check if it's the root path being processed (explicitly provided file) isExplicitFile := path == rootPath // Check if file is a PDF isPDF, err := pdf.IsPDF(path) if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to check if file is PDF %s: %v", path, err), color.FgRed) return } if isPDF { if !isExplicitFile { // Skip PDFs during directory traversal return } utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Converting PDF to markdown: %s", path), color.FgBlue) content, err := pdf.ConvertPDFToMarkdown(path, false) if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to convert PDF %s: %v", path, err), color.FgRed) return } filePath := path if relativePaths { filePath = filepath.Join(filepath.Base(rootPath), relPath) } mu.Lock() *files = append(*files, FileInfo{ Path: filePath, Extension: ".md", Code: content, }) mu.Unlock() return } // Check if the file is binary isBinary, err := isBinaryFile(path) if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to check if file is binary %s: %v", path, err), color.FgRed) return } if isBinary { return // Skip binary files } content, err := os.ReadFile(path) if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to read file %s: %v", path, err), color.FgRed) return } code := string(content) // Attempt compression if compressor is provided and it's not a PDF if comp != nil && !isPDF { langID, err := compressor.IdentifyLanguage(path) if err == nil { // Language identified compressedCode, err := comp.Compress(content, langID) if err == nil { code = compressedCode // If compressed, we might not want to add line numbers or wrap in a generic code block // as the compressor might handle formatting. 
For now, let's assume compressed output // is final for this file's content. // We'll skip line numbering and code block wrapping for compressed content. goto skipFormatting } else { utils.PrintColouredMessage("⚠️", fmt.Sprintf("Compression failed for %s: %v. Using original content.", path, err), color.FgYellow) } } else { // Language not identified for compression, use original content utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Language not identified for compression for %s. Using original content.", path), color.FgBlue) } } if lineNumber { code = addLineNumbers(code) } if !noCodeblock { code = wrapCodeBlock(code, filepath.Ext(path)) } skipFormatting: filePath := path if relativePaths { filePath = filepath.Join(filepath.Base(rootPath), relPath) } mu.Lock() *files = append(*files, FileInfo{ Path: filePath, Extension: filepath.Ext(path), Code: code, }) mu.Unlock() } func generateTreeString(rootPath string, excludePatterns []string) (string, error) { root := &treeNode{name: filepath.Base(rootPath), isDir: true} hasExclusions := false err := filepath.Walk(rootPath, func(path string, info fs.FileInfo, err error) error { if err != nil { return err } relPath, err := filepath.Rel(rootPath, path) if err != nil { return err } // Skip the root directory if relPath == "." 
{ return nil } // Check if the path should be excluded excluded := isExcluded(relPath, excludePatterns) if excluded { hasExclusions = true if info.IsDir() { // Add the excluded directory to the tree with an X marker parts := strings.Split(relPath, string(os.PathSeparator)) current := root for i, part := range parts { found := false for _, child := range current.children { if child.name == part { current = child found = true break } } if !found { newNode := &treeNode{ name: part, isDir: true, excluded: true, } current.children = append(current.children, newNode) current = newNode } if i == len(parts)-1 { current.isDir = true current.excluded = true } } return filepath.SkipDir } // Add excluded files to the tree with an X marker parts := strings.Split(relPath, string(os.PathSeparator)) current := root for i, part := range parts { found := false for _, child := range current.children { if child.name == part { current = child found = true break } } if !found { newNode := &treeNode{ name: part, isDir: i < len(parts)-1, excluded: true, } current.children = append(current.children, newNode) current = newNode } } return nil } parts := strings.Split(relPath, string(os.PathSeparator)) current := root for i, part := range parts { found := false for _, child := range current.children { if child.name == part { current = child found = true break } } if !found { newNode := &treeNode{name: part, isDir: info.IsDir()} current.children = append(current.children, newNode) current = newNode } if i == len(parts)-1 && !info.IsDir() { current.isDir = false } } return nil }) if err != nil { return "", err } var output strings.Builder if hasExclusions { output.WriteString("(Files/directories marked with ❌ are excluded or not included here)\n\n") } output.WriteString(root.name + "/\n") for i, child := range root.children { printTree(child, "", i == len(root.children)-1, &output) } return strings.TrimSuffix(output.String(), "\n"), nil } func printTree(node *treeNode, prefix string, isLast 
bool, output *strings.Builder) { output.WriteString(prefix) if isLast { output.WriteString("└── ") prefix += " " } else { output.WriteString("├── ") prefix += "│ " } output.WriteString(node.name) if node.isDir { output.WriteString("/") } if node.excluded { output.WriteString(" ❌") } output.WriteString("\n") sort.Slice(node.children, func(i, j int) bool { if node.children[i].isDir != node.children[j].isDir { return node.children[i].isDir } return node.children[i].name < node.children[j].name }) for i, child := range node.children { printTree(child, prefix, i == len(node.children)-1, output) } } func isExcluded(path string, patterns []string) bool { for _, pattern := range patterns { if match, _ := doublestar.Match(pattern, path); match { if strings.HasSuffix(path, ".pdf") { utils.PrintColouredMessage("ℹ️", fmt.Sprintf("PDF file detected: %s. PDF to markdown conversion is supported, but the file was excluded", path), color.FgYellow) } return true } } return false } func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock, followSymlinks bool, comp *compressor.GenericCompressor) (FileInfo, error) { // Check if the file is a symlink if !followSymlinks { linkInfo, err := os.Lstat(path) if err != nil { return FileInfo{}, fmt.Errorf("failed to get symlink info: %w", err) } if linkInfo.Mode()&os.ModeSymlink != 0 { utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Skipping symlinked file: %s", path), color.FgCyan) return FileInfo{}, fmt.Errorf("file is a symlink and --follow-symlinks is not set") } } // Check if it's a PDF first isPDF, err := pdf.IsPDF(path) if err != nil { return FileInfo{}, fmt.Errorf("failed to check if file is PDF: %w", err) } if isPDF { content, err := pdf.ConvertPDFToMarkdown(path, false) if err != nil { return FileInfo{}, fmt.Errorf("failed to convert PDF: %w", err) } return FileInfo{ Path: path, Extension: ".md", Code: content, }, nil } // Handle non-PDF files content, err := os.ReadFile(path) if err != nil { return FileInfo{}, 
fmt.Errorf("failed to read file: %w", err) } code := string(content) // Attempt compression if compressor is provided and it's not a PDF if comp != nil && !isPDF { langID, err := compressor.IdentifyLanguage(path) if err == nil { // Language identified compressedCode, err := comp.Compress(content, langID) if err == nil { code = compressedCode // Skip standard formatting for compressed content goto skipSingleFileFormatting } else { utils.PrintColouredMessage("⚠️", fmt.Sprintf("Compression failed for %s: %v. Using original content.", path, err), color.FgYellow) } } else { utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Language not identified for compression for %s. Using original content.", path), color.FgBlue) } } if lineNumber { code = addLineNumbers(code) } if !noCodeblock { code = wrapCodeBlock(code, filepath.Ext(path)) } skipSingleFileFormatting: filePath := path if relativePaths { filePath = filepath.Base(path) } return FileInfo{ Path: filePath, Extension: filepath.Ext(path), Code: code, }, nil } ================================================ FILE: git/git.go ================================================ package git import ( "fmt" "os/exec" ) func GetGitDiff(repoPath string) (string, error) { cmd := exec.Command("git", "-C", repoPath, "diff") output, err := cmd.Output() if err != nil { return "", fmt.Errorf("failed to get git diff: %w", err) } return string(output), nil } func GetGitDiffBetweenBranches(repoPath, branch1, branch2 string) (string, error) { cmd := exec.Command("git", "-C", repoPath, "diff", branch1+".."+branch2) output, err := cmd.Output() if err != nil { return "", fmt.Errorf("failed to get git diff between branches: %w", err) } return string(output), nil } func GetGitLog(repoPath, branch1, branch2 string) (string, error) { cmd := exec.Command("git", "-C", repoPath, "log", "--oneline", branch1+".."+branch2) output, err := cmd.Output() if err != nil { return "", fmt.Errorf("failed to get git log: %w", err) } return string(output), nil } func 
BranchExists(repoPath, branchName string) (bool, error) { cmd := exec.Command("git", "-C", repoPath, "rev-parse", "--verify", branchName) err := cmd.Run() if err != nil { if exitError, ok := err.(*exec.ExitError); ok { // Branch doesn't exist if exitError.ExitCode() == 1 { return false, nil } } return false, fmt.Errorf("failed to check if branch exists: %w", err) } return true, nil } ================================================ FILE: go.mod ================================================ module github.com/sammcj/ingest go 1.25.6 require ( github.com/JohannesKaufmann/html-to-markdown v1.6.0 github.com/PuerkitoBio/goquery v1.11.0 github.com/atotto/clipboard v0.1.4 github.com/bmatcuk/doublestar/v4 v4.9.2 github.com/charmbracelet/glamour v0.10.0 github.com/fatih/color v1.18.0 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 github.com/mitchellh/go-homedir v1.1.0 github.com/pkoukk/tiktoken-go v0.1.8 github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 github.com/sashabaranov/go-openai v1.41.2 github.com/schollz/progressbar/v3 v3.19.0 github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 github.com/spf13/cobra v1.10.2 ) require ( github.com/andybalholm/cascadia v1.3.3 // indirect github.com/charmbracelet/colorprofile v0.4.1 // indirect github.com/charmbracelet/x/cellbuf v0.0.14 // indirect github.com/charmbracelet/x/exp/slice v0.0.0-20260119114936-fd556377ea59 // indirect github.com/charmbracelet/x/term v0.2.2 // indirect github.com/clipperhouse/displaywidth v0.7.0 // indirect github.com/clipperhouse/stringish v0.1.1 // indirect github.com/clipperhouse/uax29/v2 v2.3.1 // indirect github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 // indirect github.com/olekukonko/errors v1.2.0 // indirect github.com/olekukonko/ll v0.1.4-0.20260115111900-9e59c2286df0 // indirect github.com/sammcj/gollama v1.37.5 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect golang.org/x/text v0.33.0 // indirect 
gopkg.in/yaml.v2 v2.4.0 // indirect ) require ( github.com/alecthomas/chroma/v2 v2.23.0 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/aymerick/douceur v0.2.0 // indirect github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 // indirect github.com/charmbracelet/x/ansi v0.11.4 // indirect github.com/dlclark/regexp2 v1.11.5 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/css v1.0.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/lucasb-eyer/go-colorful v1.3.0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-runewidth v0.0.19 // indirect github.com/microcosm-cc/bluemonday v1.0.27 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/olekukonko/tablewriter v1.1.3 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rs/zerolog v1.34.0 // indirect github.com/sammcj/quantest v0.0.13 github.com/shirou/gopsutil v3.21.11+incompatible // indirect github.com/spf13/pflag v1.0.10 // indirect github.com/yuin/goldmark v1.7.16 // indirect github.com/yuin/goldmark-emoji v1.0.6 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect golang.org/x/net v0.49.0 // indirect golang.org/x/sys v0.40.0 // indirect golang.org/x/term v0.39.0 // indirect ) ================================================ FILE: go.sum ================================================ github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0= github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/JohannesKaufmann/html-to-markdown v1.6.0 h1:04VXMiE50YYfCfLboJCLcgqF5x+rHJnb1ssNmqpLH/k= 
github.com/JohannesKaufmann/html-to-markdown v1.6.0/go.mod h1:NUI78lGg/a7vpEJTz/0uOcYMaibytE4BUOQS8k78yPQ= github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8vS6K3D0= github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= github.com/alecthomas/chroma/v2 v2.20.0 h1:sfIHpxPyR07/Oylvmcai3X/exDlE8+FA820NTz+9sGw= github.com/alecthomas/chroma/v2 v2.20.0/go.mod h1:e7tViK0xh/Nf4BYHl00ycY6rV7b8iXBksI9E359yNmA= github.com/alecthomas/chroma/v2 v2.23.0 h1:u/Orux1J0eLuZDeQ44froV8smumheieI0EofhbyKhhk= github.com/alecthomas/chroma/v2 v2.23.0/go.mod h1:NqVhfBR0lte5Ouh3DcthuUCTUpDC9cxBOfyMbMQPs3o= github.com/alecthomas/repr v0.5.1 h1:E3G4t2QbHTSNpPKBgMTln5KLkZHLOcU7r37J4pXBuIg= github.com/alecthomas/repr v0.5.1/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= github.com/aymerick/douceur v0.2.0 
h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= github.com/bmatcuk/doublestar/v4 v4.9.1 h1:X8jg9rRZmJd4yRy7ZeNDRnM+T3ZfHv15JiBJ/avrEXE= github.com/bmatcuk/doublestar/v4 v4.9.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= github.com/bmatcuk/doublestar/v4 v4.9.2 h1:b0mc6WyRSYLjzofB2v/0cuDUZ+MqoGyH3r0dVij35GI= github.com/bmatcuk/doublestar/v4 v4.9.2/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= github.com/charmbracelet/colorprofile v0.3.3 h1:DjJzJtLP6/NZ8p7Cgjno0CKGr7wwRJGxWUwh2IyhfAI= github.com/charmbracelet/colorprofile v0.3.3/go.mod h1:nB1FugsAbzq284eJcjfah2nhdSLppN2NqvfotkfRYP4= github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk= github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk= github.com/charmbracelet/glamour v0.10.0 h1:MtZvfwsYCx8jEPFJm3rIBFIMZUfUJ765oX8V6kXldcY= github.com/charmbracelet/glamour v0.10.0/go.mod h1:f+uf+I/ChNmqo087elLnVdCiVgjSKWuXa/l6NU2ndYk= github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 h1:ZR7e0ro+SZZiIZD7msJyA+NjkCNNavuiPBLgerbOziE= github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834/go.mod h1:aKC/t2arECF6rNOnaKaVU6y4t4ZeHQzqfxedE/VkVhA= github.com/charmbracelet/x/ansi v0.11.2 h1:XAG3FSjiVtFvgEgGrNBkCNNYrsucAt8c6bfxHyROLLs= github.com/charmbracelet/x/ansi v0.11.2/go.mod h1:9tY2bzX5SiJCU0iWyskjBeI2BRQfvPqI+J760Mjf+Rg= github.com/charmbracelet/x/ansi v0.11.4 h1:6G65PLu6HjmE858CnTUQY1LXT3ZUWwfvqEROLF8vqHI= github.com/charmbracelet/x/ansi v0.11.4/go.mod h1:/5AZ+UfWExW3int5H5ugnsG/PWjNcSQcwYsHBlPFQN4= github.com/charmbracelet/x/cellbuf v0.0.14 h1:iUEMryGyFTelKW3THW4+FfPgi4fkmKnnaLOXuc+/Kj4= github.com/charmbracelet/x/cellbuf v0.0.14/go.mod h1:P447lJl49ywBbil/KjCk2HexGh4tEY9LH0/1QrZZ9rA= github.com/charmbracelet/x/exp/golden v0.0.0-20240806155701-69247e0abc2a h1:G99klV19u0QnhiizODirwVksQB91TJKV/UaTnACcG30= 
github.com/charmbracelet/x/exp/golden v0.0.0-20240806155701-69247e0abc2a/go.mod h1:wDlXFlCrmJ8J+swcL/MnGUuYnqgQdW9rhSD61oNMb6U= github.com/charmbracelet/x/exp/slice v0.0.0-20251126160633-0b68cdcd21da h1:6HDQl5MSww6jOImEZ6qu4OPUOcOauUgexOrAOkWGFs8= github.com/charmbracelet/x/exp/slice v0.0.0-20251126160633-0b68cdcd21da/go.mod h1:vqEfX6xzqW1pKKZUUiFOKg0OQ7bCh54Q2vR/tserrRA= github.com/charmbracelet/x/exp/slice v0.0.0-20260119114936-fd556377ea59 h1:QtkqQl+yAR6RwQnNjdWHRX093ajX8FZ/WAz3Dvw+xWg= github.com/charmbracelet/x/exp/slice v0.0.0-20260119114936-fd556377ea59/go.mod h1:vqEfX6xzqW1pKKZUUiFOKg0OQ7bCh54Q2vR/tserrRA= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= github.com/clipperhouse/displaywidth v0.6.0 h1:k32vueaksef9WIKCNcoqRNyKbyvkvkysNYnAWz2fN4s= github.com/clipperhouse/displaywidth v0.6.0/go.mod h1:R+kHuzaYWFkTm7xoMmK1lFydbci4X2CicfbGstSGg0o= github.com/clipperhouse/displaywidth v0.7.0 h1:QNv1GYsnLX9QBrcWUtMlogpTXuM5FVnBwKWp1O5NwmE= github.com/clipperhouse/displaywidth v0.7.0/go.mod h1:R+kHuzaYWFkTm7xoMmK1lFydbci4X2CicfbGstSGg0o= github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs= github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA= github.com/clipperhouse/uax29/v2 v2.3.0 h1:SNdx9DVUqMoBuBoW3iLOj4FQv3dN5mDtuqwuhIGpJy4= github.com/clipperhouse/uax29/v2 v2.3.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g= github.com/clipperhouse/uax29/v2 v2.3.1 h1:RjM8gnVbFbgI67SBekIC7ihFpyXwRPYWXn9BZActHbw= github.com/clipperhouse/uax29/v2 v2.3.1/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g= github.com/coreos/go-systemd/v22 v22.5.0/go.mod 
h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ= github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod 
h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8= github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4= github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk= github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod 
h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/natefinch/lumberjack v2.0.0+incompatible h1:4QJd3OLAMgj7ph+yZTuX13Ld4UpgHp07nNdFX7mqFfM= github.com/natefinch/lumberjack v2.0.0+incompatible/go.mod h1:Wi9p2TTF5DG5oU+6YfsmYQpsTIOm0B1VNzQg9Mw6nPk= github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 h1:zrbMGy9YXpIeTnGj4EljqMiZsIcE09mmF8XsD5AYOJc= github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6/go.mod h1:rEKTHC9roVVicUIfZK7DYrdIoM0EOr8mK1Hj5s3JjH0= github.com/olekukonko/errors v1.1.0 h1:RNuGIh15QdDenh+hNvKrJkmxxjV4hcS50Db478Ou5sM= github.com/olekukonko/errors v1.1.0/go.mod h1:ppzxA5jBKcO1vIpCXQ9ZqgDh8iwODz6OXIGKU8r5m4Y= github.com/olekukonko/errors v1.2.0 h1:10Zcn4GeV59t/EGqJc8fUjtFT/FuUh5bTMzZ1XwmCRo= github.com/olekukonko/errors v1.2.0/go.mod h1:ppzxA5jBKcO1vIpCXQ9ZqgDh8iwODz6OXIGKU8r5m4Y= github.com/olekukonko/ll v0.1.2 h1:lkg/k/9mlsy0SxO5aC+WEpbdT5K83ddnNhAepz7TQc0= github.com/olekukonko/ll v0.1.2/go.mod h1:b52bVQRRPObe+yyBl0TxNfhesL0nedD4Cht0/zx55Ew= github.com/olekukonko/ll v0.1.4-0.20260115111900-9e59c2286df0 h1:jrYnow5+hy3WRDCBypUFvVKNSPPCdqgSXIE9eJDD8LM= github.com/olekukonko/ll v0.1.4-0.20260115111900-9e59c2286df0/go.mod h1:b52bVQRRPObe+yyBl0TxNfhesL0nedD4Cht0/zx55Ew= github.com/olekukonko/tablewriter v1.1.0 h1:N0LHrshF4T39KvI96fn6GT8HEjXRXYNDrDjKFDB7RIY= github.com/olekukonko/tablewriter v1.1.0/go.mod h1:5c+EBPeSqvXnLLgkm9isDdzR3wjfBkHR9Nhfp3NWrzo= github.com/olekukonko/tablewriter v1.1.3 
h1:VSHhghXxrP0JHl+0NnKid7WoEmd9/urKRJLysb70nnA= github.com/olekukonko/tablewriter v1.1.3/go.mod h1:9VU0knjhmMkXjnMKrZ3+L2JhhtsQ/L38BbL3CRNE8tM= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkoukk/tiktoken-go v0.1.8 h1:85ENo+3FpWgAACBaEUVp+lctuTcYUO7BtmfhlN/QTRo= github.com/pkoukk/tiktoken-go v0.1.8/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 h1:OkMGxebDjyw0ULyrTYWeN0UNCCkmCWfjPnIA2W6oviI= github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06/go.mod h1:+ePHsJ1keEjQtpvf9HHw0f4ZeJ0TLRsxhunSI2hYJSs= github.com/sammcj/gollama v1.37.3 h1:aOkGsyCObF8Y2TQ5fzj2PVnvZMugexMAwux+WH+wjtA= github.com/sammcj/gollama v1.37.3/go.mod h1:NOkydPF8yjWOsSypVQkMJH1yphHq80L52Z0AMzfnPfY= github.com/sammcj/gollama v1.37.5 h1:SrKJjdwDtrTkiOoySrTLiF7INm4Ik1GcqVYnm4InaBU= github.com/sammcj/gollama v1.37.5/go.mod h1:0fICL7D5ZUsUIIXSTrSh5OC/BxmLQ521HgtBWTLY+A8= github.com/sammcj/quantest v0.0.13 
h1:d/f+Pp1aXFL0P2DRyZNOlDf4AAlqhmB1HQM+egHzDyo= github.com/sammcj/quantest v0.0.13/go.mod h1:nfmfRnybimGRLXd/Yq3xwIyetGyJDzJPi0a3X5RKCWk= github.com/sashabaranov/go-openai v1.41.2 h1:vfPRBZNMpnqu8ELsclWcAvF19lDNgh1t6TVfFFOPiSM= github.com/sashabaranov/go-openai v1.41.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA= github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc= github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 h1:6C8qej6f1bStuePVkLSFxoU22XBS165D3klxlzRg8F4= github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82/go.mod h1:xe4pgH49k4SsmkQq5OT8abwhWmnzkhpgnXeekbx2efw= github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 
github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yuin/goldmark v1.7.1/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E= github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA= github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= github.com/yuin/goldmark v1.7.16 h1:n+CJdUxaFMiDUNnWC3dMWCIQJSkxH4uz3ZwQBkAlVNE= github.com/yuin/goldmark v1.7.16/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= github.com/yuin/goldmark-emoji v1.0.6 h1:QWfF2FYaXwL74tfGOW5izeiZepUDroDJfWubQI9HTHs= github.com/yuin/goldmark-emoji v1.0.6/go.mod h1:ukxJDKFpdFb5x0a5HqbdlcKtebh086iJpI31LTKmWuA= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto 
v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw= golang.org/x/exp v0.0.0-20250305212735-054e65f0b394/go.mod h1:sIifuuw/Yco/y6yb6+bDNfyeQ/MdPUy/hKEMYQV17cM= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/net v0.25.0/go.mod 
h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= golang.org/x/term v0.37.0/go.mod 
h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: internal/compressor/compressor.go ================================================ package compressor import ( "context" "fmt" "path/filepath" "strings" sitter "github.com/smacker/go-tree-sitter" "github.com/smacker/go-tree-sitter/bash" "github.com/smacker/go-tree-sitter/c" "github.com/smacker/go-tree-sitter/css" "github.com/smacker/go-tree-sitter/golang" "github.com/smacker/go-tree-sitter/javascript" "github.com/smacker/go-tree-sitter/python" ) // LanguageMap maps language identifiers to their Tree-sitter Language. 
var LanguageMap = map[string]*sitter.Language{ "go": golang.GetLanguage(), "python": python.GetLanguage(), "javascript": javascript.GetLanguage(), "bash": bash.GetLanguage(), "c": c.GetLanguage(), "css": css.GetLanguage(), "html": javascript.GetLanguage(), // Use JavaScript parser for HTML "rust": javascript.GetLanguage(), // Use JavaScript parser for Rust "java": javascript.GetLanguage(), // Use JavaScript parser for Java "swift": javascript.GetLanguage(), // Use JavaScript parser for Swift } // IdentifyLanguage identifies the programming language of a file based on its extension. func IdentifyLanguage(filePath string) (string, error) { ext := strings.ToLower(filepath.Ext(filePath)) switch ext { case ".go": return "go", nil case ".py": return "python", nil case ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx": return "javascript", nil case ".sh", ".bash": return "bash", nil case ".c", ".h": return "c", nil case ".css": return "css", nil case ".html", ".htm": return "html", nil case ".rs": return "rust", nil case ".java": return "java", nil case ".swift": return "swift", nil // Add more extensions and languages here default: return "", fmt.Errorf("unsupported file extension: %s", ext) } } // GetLanguage returns the Tree-sitter language for a given language identifier. func GetLanguage(langIdentifier string) (*sitter.Language, error) { lang, ok := LanguageMap[strings.ToLower(langIdentifier)] if !ok { return nil, fmt.Errorf("unsupported language: %s", langIdentifier) } return lang, nil } // ParseSource parses the source code content using the appropriate Tree-sitter language. func ParseSource(content []byte, lang *sitter.Language) (*sitter.Tree, error) { parser := sitter.NewParser() parser.SetLanguage(lang) tree, err := parser.ParseCtx(context.Background(), nil, content) if err != nil { return nil, fmt.Errorf("failed to parse source: %w", err) } return tree, nil } // QueryMap holds Tree-sitter queries for different languages. 
var QueryMap = map[string]string{
	// Go: capture top-level structure and strip declared bodies downstream.
	"go": `
(package_clause) @package
(import_declaration) @import
(type_declaration) @definition.type
(function_declaration name: (identifier) @definition.function) @definition.function.full
(method_declaration name: (field_identifier) @definition.method) @definition.method.full
(comment) @comment
`,
	// Python: imports plus named function/class definitions.
	"python": `
(import_statement) @import
(import_from_statement) @import
(function_definition name: (identifier) @definition.function) @definition.function.full
(class_definition name: (identifier) @definition.class) @definition.class.full
(comment) @comment
`,
	// Bash: function definitions and top-level commands.
	"bash": `
(function_definition name: (word) @definition.function) @definition.function.full
(command name: (command_name) @command_name) @command
(comment) @comment
`,
	// C: includes, function definitions, and type-introducing declarations.
	"c": `
(preproc_include) @import
(function_definition declarator: (function_declarator declarator: (identifier) @definition.function)) @definition.function.full
(struct_specifier name: (type_identifier) @definition.struct) @definition.struct.full
(enum_specifier name: (type_identifier) @definition.enum) @definition.enum.full
(union_specifier name: (type_identifier) @definition.union) @definition.union.full
(type_definition) @definition.typedef
(comment) @comment
`,
	// CSS: whole rule sets and at-rules are captured as units.
	"css": `
(import_statement) @import
(rule_set) @rule_set
(media_statement) @media
(keyframes_statement) @keyframes
(declaration) @declaration
(comment) @comment
`,
	// HTML has no dedicated grammar linked; these patterns run against the
	// JavaScript parse of the content (see LanguageMap), so only generic
	// token-level nodes are available.
	"html": `
; HTML elements in JavaScript parser
((comment) @comment)
((regex) @regex)
((string) @string)
((template_string) @template_string)
((identifier) @identifier)
((property_identifier) @property)
`,
	// Rust is also parsed with the JavaScript grammar — same caveat as HTML.
	"rust": `
; Rust elements using JavaScript parser
((comment) @comment)
((string) @string)
((regex) @regex)
((template_string) @template_string)
((identifier) @identifier)
((property_identifier) @property)
`,
	// Java is also parsed with the JavaScript grammar — same caveat as HTML.
	"java": `
; Java elements using JavaScript parser
((comment) @comment)
((string) @string)
((regex) @regex)
((template_string) @template_string)
((identifier) @identifier)
((property_identifier) @property)
`,
	// Swift is also parsed with the JavaScript grammar — same caveat as HTML.
	"swift": `
; Swift elements using JavaScript parser
((comment) @comment)
((string) @string)
((regex) @regex)
((template_string) @template_string)
((identifier) @identifier)
((property_identifier) @property)
`,
	// JavaScript: the native grammar, with explicit patterns for every export
	// form so that definition bodies can be stripped while other exports are
	// kept whole.
	"javascript": `
(import_statement) @import
(comment) @comment
(method_definition name: (_) @_.name) @definition.method.full

; === Definitions whose bodies should be stripped ===

; Case 1a: Exported NAMED function/class/generator (e.g., export function foo() {})
; Captures the export_statement node. 'declaration' is the field name for the exported item.
(export_statement
  declaration: (function_declaration name: (identifier))
) @definition.function.full
(export_statement
  declaration: (generator_function_declaration name: (identifier))
) @definition.function.full
(export_statement
  declaration: (class_declaration name: (identifier))
) @definition.class.full

; Case 1b: Export DEFAULT NAMED function/class/generator (e.g., export default function foo() {})
; Captures the export_statement node. The declaration is a direct child after the 'default' keyword.
(export_statement
  "default" ; Matches the anonymous 'default' keyword child node by its string content
  (function_declaration name: (identifier))
) @definition.function.full
(export_statement
  "default"
  (generator_function_declaration name: (identifier))
) @definition.function.full
(export_statement
  "default"
  (class_declaration name: (identifier))
) @definition.class.full

; Case 2: Standalone (non-exported by above rules) NAMED function/class/generator
; Captures the function_declaration or class_declaration node itself.
(function_declaration name: (identifier)) @definition.function.full
(generator_function_declaration name: (identifier)) @definition.function.full
(class_declaration name: (identifier)) @definition.class.full

; Case 3: Arrow functions assigned to variables (const/let/var)
; The whole lexical_declaration or variable_declaration is captured if it contains an arrow function with a block body.
(lexical_declaration
  (variable_declarator
    value: (arrow_function body: (statement_block))
  )
) @definition.function.full
(variable_declaration
  (variable_declarator
    value: (arrow_function body: (statement_block))
  )
) @definition.function.full

; Case 4: Arrow functions with expression bodies assigned to variables
; These are arrow functions that don't have a statement_block body (e.g., const myArrow = () => expression)
(lexical_declaration
  (variable_declarator
    name: (identifier) @arrow.name
    value: (arrow_function) @arrow.function
  )
) @definition.function.full
(variable_declaration
  (variable_declarator
    name: (identifier) @arrow.name
    value: (arrow_function) @arrow.function
  )
) @definition.function.full

; Case 5: Anonymous default exported functions and classes
; These are functions and classes without names that are exported as default
(export_statement
  "default"
  (function_declaration) @anon.function
) @definition.function.full
(export_statement
  "default"
  (class_declaration) @anon.class
) @definition.class.full

; === Other export forms to be kept whole ===
; Capture name is @export.other

; export const x = 1;, export let y = 2; (declaration is lexical_declaration)
(export_statement declaration: (lexical_declaration)) @export.other
; export var z = 3; (declaration is variable_declaration)
(export_statement declaration: (variable_declaration)) @export.other
; export { foo, bar };, export { foo as bar } from 'module';, export * from 'module';
(export_statement (export_clause)) @export.other
; export default foo; (where foo is an identifier/expression - 'value' is the field name for these)
(export_statement value: (identifier)) @export.other
(export_statement value: (string)) @export.other
(export_statement value: (number)) @export.other
(export_statement value: (object)) @export.other
(export_statement value: (array)) @export.other
(export_statement value: (arrow_function)) @export.other
`,
}

// GetQuery retrieves a Tree-sitter query for a given language identifier.
// The identifier is matched case-insensitively against QueryMap.
func GetQuery(languageIdentifier string) (string, error) {
	query, ok := QueryMap[strings.ToLower(languageIdentifier)]
	if !ok {
		return "", fmt.Errorf("no query found for language: %s", languageIdentifier)
	}
	return query, nil
}

// CompileQuery compiles a query string for a given language.
// A compile error here usually means the query references node types the
// grammar does not define.
func CompileQuery(queryStr string, lang *sitter.Language) (*sitter.Query, error) {
	query, err := sitter.NewQuery([]byte(queryStr), lang)
	if err != nil {
		return nil, fmt.Errorf("failed to compile query: %w", err)
	}
	return query, nil
}

// ExecuteQuery executes a compiled query against a syntax tree and returns the
// distinct matches. The error result is always nil in the current
// implementation; it is kept for interface stability.
// NOTE(review): the source parameter is not used here — confirm whether it can
// be dropped from callers or is kept for symmetry with LogCaptures.
func ExecuteQuery(tree *sitter.Tree, query *sitter.Query, source []byte) ([]*sitter.QueryMatch, error) {
	qc := sitter.NewQueryCursor()
	qc.Exec(query, tree.RootNode())
	var matches []*sitter.QueryMatch
	for {
		match, ok := qc.NextMatch()
		if !ok {
			// Cursor exhausted: no more matches in the tree.
			break
		}
		// Deduplicate: a match is considered a duplicate only when the ID,
		// pattern index, and every capture (index plus byte span) agree.
		// NOTE(review): match IDs appear to be unique per cursor result, which
		// would make this O(n^2) scan a no-op — confirm against the
		// go-tree-sitter binding before simplifying.
		isDuplicate := false
		for _, existingMatch := range matches {
			if existingMatch.ID == match.ID && existingMatch.PatternIndex == match.PatternIndex {
				if len(existingMatch.Captures) == len(match.Captures) {
					allCapturesSame := true
					for i := range match.Captures {
						if existingMatch.Captures[i].Index != match.Captures[i].Index ||
							existingMatch.Captures[i].Node.StartByte() != match.Captures[i].Node.StartByte() ||
							existingMatch.Captures[i].Node.EndByte() != match.Captures[i].Node.EndByte() {
							allCapturesSame = false
							break
						}
					}
					if allCapturesSame {
						isDuplicate = true
						break
					}
				}
			}
		}
		if !isDuplicate {
			matches = append(matches, match)
		}
	}
	return matches, nil
}

// CodeChunk represents a captured piece of code.
type CodeChunk struct { Content string StartByte uint32 EndByte uint32 OriginalLine int } func isNamedDeclarationType(n *sitter.Node) bool { if n == nil { return false } switch n.Type() { case "function_declaration", "generator_function_declaration", "class_declaration": nameNode := n.ChildByFieldName("name") return nameNode != nil && nameNode.Type() == "identifier" default: return false } } func LogCaptures(matches []*sitter.QueryMatch, query *sitter.Query, source []byte) { for _, match := range matches { for _, capture := range match.Captures { captureName := query.CaptureNameForId(capture.Index) nodeContent := capture.Node.Content(source) fmt.Printf("Capture: @%s, Node Type: %s, Content: %s\n", captureName, capture.Node.Type(), strings.TrimSpace(nodeContent)) } } } ================================================ FILE: internal/compressor/compressor_test.go ================================================ package compressor import ( "os" "strings" "testing" ) func TestGenericCompressor_Compress_Go(t *testing.T) { compressor := NewGenericCompressor() goCode := ` package main import "fmt" // This is a comment type MyStruct struct { FieldA int FieldB string } func (s *MyStruct) MyMethod(val int) string { // Method body if val > 0 { return fmt.Sprintf("Positive: %d", val) } return "Zero or Negative" } func main() { // Main function body instance := MyStruct{FieldA: 1, FieldB: "test"} fmt.Println(instance.MyMethod(5)) fmt.Println("Hello, world!") } ` expectedCompressedParts := []string{ "package main", "import \"fmt\"", "// This is a comment", "type MyStruct struct", "func (s *MyStruct) MyMethod(val int) string { ... }", "func main() { ... 
}", } compressed, err := compressor.Compress([]byte(goCode), "go") if err != nil { t.Fatalf("Compress failed for Go: %v", err) } t.Logf("Compressed Go code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed output does not contain expected part: %s", part) } } if strings.Contains(compressed, `return fmt.Sprintf("Positive: %d", val)`) { t.Errorf("Compressed output unexpectedly contains function body content") } if strings.Contains(compressed, `fmt.Println("Hello, world!")`) { t.Errorf("Compressed output unexpectedly contains main function body content") } } func TestGenericCompressor_Compress_Python(t *testing.T) { compressor := NewGenericCompressor() pythonCode := ` # This is a Python comment import os from sys import argv class MyClass: """ A simple class """ def __init__(self, name): self.name = name def greet(self, message): """Greets the person.""" # Method body print(f"{message}, {self.name}!") if len(self.name) > 3: print("Long name") return True def my_function(x, y): # Function body result = x + y print(f"Result is {result}") return result if __name__ == "__main__": c = MyClass("Test") c.greet("Hello") my_function(1, 2) ` expectedCompressedParts := []string{ "# This is a Python comment", "import os", "from sys import argv", "class MyClass: { ... } # Body removed", "def __init__(self, name): { ... } # Body removed", "def greet(self, message): { ... } # Body removed", "def my_function(x, y): { ... 
} # Body removed", } compressed, err := compressor.Compress([]byte(pythonCode), "python") if err != nil { t.Fatalf("Compress failed for Python: %v", err) } t.Logf("Compressed Python code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed Python output does not contain expected part: '%s'", part) } } if strings.Contains(compressed, `self.name = name`) { t.Errorf("Compressed Python output unexpectedly contains __init__ body content") } if strings.Contains(compressed, `print(f"{message}, {self.name}!")`) { t.Errorf("Compressed Python output unexpectedly contains greet method body content") } if strings.Contains(compressed, `result = x + y`) { t.Errorf("Compressed Python output unexpectedly contains my_function body content") } if strings.Contains(compressed, `c = MyClass("Test")`) { t.Errorf("Compressed Python output unexpectedly contains __main__ block content") } } func TestGenericCompressor_Compress_JavaScript(t *testing.T) { compressor := NewGenericCompressor() // Read test file content jsCode, err := os.ReadFile("testdata/example.js") if err != nil { t.Fatalf("Failed to read test file: %v", err) } expectedPlaceholder := " { ... 
} // Body removed" // Generic placeholder for JS // Update expected parts for JS with its specific placeholder expectedCompressedParts := []string{ "// This is a JavaScript comment", "import { something } from 'module';", "export class MyJSClass" + expectedPlaceholder, // constructor is a method_definition, its signature includes 'constructor' "constructor(name)" + expectedPlaceholder, "greet(message)" + expectedPlaceholder, "export function myJSFunction(x, y)" + expectedPlaceholder, "const myArrowFunc = (a, b) =>" + expectedPlaceholder, // Arrow function with expression body should be compressed "const myExpressionArrow = (x) =>" + expectedPlaceholder, "function* myGenerator()" + expectedPlaceholder, "export const myVar = 42;", // Should be kept by @export.other "constructor()" + expectedPlaceholder, // Constructor of anonymous default class } compressed, err := compressor.Compress(jsCode, "javascript") if err != nil { t.Fatalf("Compress failed for JavaScript: %v", err) } t.Logf("Compressed JavaScript code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed JS output does not contain expected part: '%s'", part) } } if strings.Contains(compressed, `this.name = name;`) { t.Errorf("Compressed JS output unexpectedly contains constructor body content") } if strings.Contains(compressed, `console.log(message + ", " + this.name + "!");`) { t.Errorf("Compressed JS output unexpectedly contains greet method body content") } if strings.Contains(compressed, `const result = x + y;`) { t.Errorf("Compressed JS output unexpectedly contains myJSFunction body content") } if strings.Contains(compressed, `return a * b;`) { t.Errorf("Compressed JS output unexpectedly contains myArrowFunc body content") } if strings.Contains(compressed, `yield 1;`) { t.Errorf("Compressed JS output unexpectedly contains generator function body content") } if strings.Contains(compressed, `this.x = 1;`) { t.Errorf("Compressed JS 
output unexpectedly contains anonymous default class body content") } if strings.Contains(compressed, `x * x`) { t.Errorf("Compressed JS output unexpectedly contains expression arrow function body content") } } func TestGenericCompressor_Compress_JavaScript_AnonDefaultFunction(t *testing.T) { compressor := NewGenericCompressor() // Read test file content jsCode, err := os.ReadFile("testdata/example_anon_func.js") if err != nil { t.Fatalf("Failed to read test file: %v", err) } expectedPlaceholder := " { ... } // Body removed" // Generic placeholder for JS // Update expected parts for JS with its specific placeholder expectedCompressedParts := []string{ "// This is a JavaScript comment", "import { something } from 'module';", // Arrow function with expression body should be compressed "const myExpressionArrow = (x) =>" + expectedPlaceholder, } compressed, err := compressor.Compress(jsCode, "javascript") if err != nil { t.Fatalf("Compress failed for JavaScript: %v", err) } t.Logf("Compressed JavaScript code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed JS output does not contain expected part: '%s'", part) } } if strings.Contains(compressed, `console.log("Anon default func body");`) { t.Errorf("Compressed JS output unexpectedly contains anonymous default function body content") } if strings.Contains(compressed, `x * x`) { t.Errorf("Compressed JS output unexpectedly contains expression arrow function body content") } } func TestGenericCompressor_Compress_Bash(t *testing.T) { compressor := NewGenericCompressor() // Read test file content bashCode, err := os.ReadFile("testdata/example.sh") if err != nil { t.Fatalf("Failed to read test file: %v", err) } expectedPlaceholder := " { ... 
} # Body removed" // Generic placeholder for Bash // Update expected parts for Bash expectedCompressedParts := []string{ "# This is a bash comment", "function greet()" + expectedPlaceholder, "process_file()" + expectedPlaceholder, "echo \"Starting script\"", } compressed, err := compressor.Compress(bashCode, "bash") if err != nil { t.Fatalf("Compress failed for Bash: %v", err) } t.Logf("Compressed Bash code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed Bash output does not contain expected part: '%s'", part) } } if strings.Contains(compressed, `local name=$1`) { t.Errorf("Compressed Bash output unexpectedly contains function body content") } } func TestGenericCompressor_Compress_C(t *testing.T) { compressor := NewGenericCompressor() // Read test file content cCode, err := os.ReadFile("testdata/example.c") if err != nil { t.Fatalf("Failed to read test file: %v", err) } expectedPlaceholder := " { ... } // Body removed" // Generic placeholder for C // Update expected parts for C expectedCompressedParts := []string{ "#include ", "#include ", "#include ", "struct Person {", "enum Color {", "union Data {", "typedef unsigned long int UINT32;", "int main(int argc, char *argv[])" + expectedPlaceholder, "void greet(const char* name)" + expectedPlaceholder, "int calculate_sum(int a, int b)" + expectedPlaceholder, } compressed, err := compressor.Compress(cCode, "c") if err != nil { t.Fatalf("Compress failed for C: %v", err) } t.Logf("Compressed C code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed C output does not contain expected part: '%s'", part) } } if strings.Contains(compressed, `strcpy(person.name, "John");`) { t.Errorf("Compressed C output unexpectedly contains function body content") } } func TestGenericCompressor_Compress_CSS(t *testing.T) { compressor := NewGenericCompressor() // Read test file content 
cssCode, err := os.ReadFile("testdata/example.css") if err != nil { t.Fatalf("Failed to read test file: %v", err) } // Update expected parts for CSS expectedCompressedParts := []string{ "/* This is a CSS comment */", "@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap')", ":root {", "body {", "header {", "nav {", ".card {", ".button {", "@media (max-width: 768px) {", "@keyframes fadeIn {", } compressed, err := compressor.Compress(cssCode, "css") if err != nil { t.Fatalf("Compress failed for CSS: %v", err) } t.Logf("Compressed CSS code:\n%s", compressed) for _, part := range expectedCompressedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed CSS output does not contain expected part: '%s'", part) } } } func TestGenericCompressor_Compress_HTML(t *testing.T) { compressor := NewGenericCompressor() // Read test file content htmlCode, err := os.ReadFile("testdata/example.html") if err != nil { t.Fatalf("Failed to read test file: %v", err) } // For HTML, we're just testing that the compression doesn't fail // Since we're using JavaScript parser for HTML, the results may vary compressed, err := compressor.Compress(htmlCode, "html") if err != nil { t.Fatalf("Compress failed for HTML: %v", err) } t.Logf("Compressed HTML code:\n%s", compressed) // Check that script content is properly handled if strings.Contains(compressed, "document.addEventListener('DOMContentLoaded', function()") { t.Errorf("Compressed HTML output unexpectedly contains script content that should be compressed") } // Test passes as long as compression doesn't fail t.Log("HTML compression completed without errors") } func TestGenericCompressor_Compress_Rust(t *testing.T) { compressor := NewGenericCompressor() // Read test file content rustCode, err := os.ReadFile("testdata/example.rs") if err != nil { t.Fatalf("Failed to read test file: %v", err) } // For Rust, we're using JavaScript parser, so we're just testing that the compression doesn't fail 
compressed, err := compressor.Compress(rustCode, "rust") if err != nil { t.Fatalf("Compress failed for Rust: %v", err) } t.Logf("Compressed Rust code:\n%s", compressed) // Check for expected parts expectedParts := []string{ "// This is a Rust comment", "use std::collections::HashMap;", "use std::io::{self, Read, Write};", "pub struct Person {", "pub trait Printable {", "pub enum Status {", } for _, part := range expectedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed Rust output does not contain expected part: '%s'", part) } } // Test passes as long as compression doesn't fail t.Log("Rust compression completed without errors") } func TestGenericCompressor_Compress_Java(t *testing.T) { compressor := NewGenericCompressor() // Read test file content javaCode, err := os.ReadFile("testdata/example.java") if err != nil { t.Fatalf("Failed to read test file: %v", err) } // For Java, we're using JavaScript parser, so we're just testing that the compression doesn't fail compressed, err := compressor.Compress(javaCode, "java") if err != nil { t.Fatalf("Compress failed for Java: %v", err) } t.Logf("Compressed Java code:\n%s", compressed) // Check for expected parts expectedParts := []string{ "// This is a Java comment", "package com.example.demo;", "import java.util.ArrayList;", "public class Person {", "interface Printable {", "enum Status {", } for _, part := range expectedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed Java output does not contain expected part: '%s'", part) } } // Test passes as long as compression doesn't fail t.Log("Java compression completed without errors") } func TestGenericCompressor_Compress_Swift(t *testing.T) { compressor := NewGenericCompressor() // Read test file content swiftCode, err := os.ReadFile("testdata/example.swift") if err != nil { t.Fatalf("Failed to read test file: %v", err) } // For Swift, we're using JavaScript parser, so we're just testing that the compression doesn't fail compressed, 
err := compressor.Compress(swiftCode, "swift") if err != nil { t.Fatalf("Compress failed for Swift: %v", err) } t.Logf("Compressed Swift code:\n%s", compressed) // Check for expected parts expectedParts := []string{ "// This is a Swift comment", "import Foundation", "struct Address {", "class Person {", "protocol Printable {", "enum Status {", } for _, part := range expectedParts { if !strings.Contains(compressed, part) { t.Errorf("Compressed Swift output does not contain expected part: '%s'", part) } } // Test passes as long as compression doesn't fail t.Log("Swift compression completed without errors") } func TestIdentifyLanguage(t *testing.T) { tests := []struct { filePath string expectedLang string expectedError bool }{ {"main.go", "go", false}, {"script.py", "python", false}, {"app.js", "javascript", false}, {"style.css", "css", false}, {"script.sh", "bash", false}, {"header.h", "c", false}, {"program.c", "c", false}, {"index.html", "html", false}, {"page.htm", "html", false}, {"main.rs", "rust", false}, {"App.java", "java", false}, {"Main.swift", "swift", false}, {"README.md", "", true}, } for _, tt := range tests { lang, err := IdentifyLanguage(tt.filePath) if tt.expectedError { if err == nil { t.Errorf("IdentifyLanguage(%s): expected error, got nil", tt.filePath) } } else { if err != nil { t.Errorf("IdentifyLanguage(%s): unexpected error: %v", tt.filePath, err) } if lang != tt.expectedLang { t.Errorf("IdentifyLanguage(%s): expected lang %s, got %s", tt.filePath, tt.expectedLang, lang) } } } } func TestGetLanguage(t *testing.T) { _, err := GetLanguage("go") if err != nil { t.Errorf("GetLanguage(go) failed: %v", err) } _, err = GetLanguage("nonexistent") if err == nil { t.Errorf("GetLanguage(nonexistent) expected error, got nil") } } func TestParseSource_ValidGo(t *testing.T) { content := []byte("package main\nfunc main() {}") lang, _ := GetLanguage("go") tree, err := ParseSource(content, lang) if err != nil { t.Fatalf("ParseSource failed for valid Go: %v", 
err) } if tree == nil { t.Fatalf("ParseSource returned nil tree for valid Go") } defer tree.Close() if tree.RootNode() == nil { t.Errorf("ParseSource returned tree with nil root node") } } func TestParseSource_InvalidGo(t *testing.T) { content := []byte("package main\nfunc main() {") lang, _ := GetLanguage("go") tree, err := ParseSource(content, lang) if err != nil { t.Fatalf("ParseSource returned an unexpected error for malformed Go: %v", err) } if tree == nil { t.Fatalf("ParseSource returned nil tree for malformed Go") } defer tree.Close() if tree.RootNode().HasError() { t.Logf("Malformed Go code parsed with errors, as expected.") } else { t.Logf("Malformed Go code parsed without explicit error nodes (might be recovered by parser).") } } func TestGetQuery_Go(t *testing.T) { queryStr, err := GetQuery("go") if err != nil { t.Fatalf("GetQuery(go) failed: %v", err) } if queryStr == "" { t.Errorf("GetQuery(go) returned empty query string") } } func TestCompileQuery_ValidGoQuery(t *testing.T) { lang, _ := GetLanguage("go") queryStr, _ := GetQuery("go") query, err := CompileQuery(queryStr, lang) if err != nil { t.Fatalf("CompileQuery failed for valid Go query: %v", err) } if query == nil { t.Fatalf("CompileQuery returned nil query for valid Go query") } defer query.Close() } func TestExecuteQuery_Go(t *testing.T) { content := []byte("package main\nimport \"fmt\"\nfunc main() { fmt.Println(\"hi\") }") lang, _ := GetLanguage("go") tree, _ := ParseSource(content, lang) defer tree.Close() queryStr, _ := GetQuery("go") query, _ := CompileQuery(queryStr, lang) defer query.Close() matches, err := ExecuteQuery(tree, query, content) if err != nil { t.Fatalf("ExecuteQuery failed for Go: %v", err) } if len(matches) == 0 { t.Errorf("ExecuteQuery returned no matches for basic Go code") } foundPackage := false foundImport := false foundFunc := false for _, match := range matches { for _, capture := range match.Captures { captureName := query.CaptureNameForId(capture.Index) if 
captureName == "package" { foundPackage = true } if captureName == "import" { foundImport = true } if captureName == "definition.function" { foundFunc = true } } } if !foundPackage { t.Errorf("Expected @package capture, not found") } if !foundImport { t.Errorf("Expected @import capture, not found") } if !foundFunc { t.Errorf("Expected @definition.function capture, not found") } } ================================================ FILE: internal/compressor/genericCompressor.go ================================================ package compressor import ( "fmt" "regexp" "sort" "strings" sitter "github.com/smacker/go-tree-sitter" ) // GenericCompressor handles the compression of source code. type GenericCompressor struct{} // NewGenericCompressor creates a new GenericCompressor. func NewGenericCompressor() *GenericCompressor { return &GenericCompressor{} } // Compress takes source code content and a language identifier, // and returns the compressed code as a string. func (gc *GenericCompressor) Compress(content []byte, languageIdentifier string) (string, error) { // Special cases for languages that need custom handling switch languageIdentifier { case "html": return gc.compressHTML(content), nil case "rust": return gc.compressRust(content), nil case "java": return gc.compressJava(content), nil case "swift": return gc.compressSwift(content), nil } lang, err := GetLanguage(languageIdentifier) if err != nil { return "", fmt.Errorf("could not get language for '%s': %w", languageIdentifier, err) } tree, err := ParseSource(content, lang) if err != nil { return "", fmt.Errorf("could not parse source for '%s': %w", languageIdentifier, err) } defer tree.Close() queryStr, err := GetQuery(languageIdentifier) if err != nil { return "", fmt.Errorf("could not get query for '%s': %w", languageIdentifier, err) } query, err := CompileQuery(queryStr, lang) if err != nil { return "", fmt.Errorf("could not compile query for '%s': %w", languageIdentifier, err) } defer query.Close() matches, 
err := ExecuteQuery(tree, query, content) if err != nil { return "", fmt.Errorf("could not execute query for '%s': %w", languageIdentifier, err) } processedChunks := gc.processCaptures(matches, query, content, languageIdentifier) sort.SliceStable(processedChunks, func(i, j int) bool { return processedChunks[i].StartByte < processedChunks[j].StartByte }) var result strings.Builder processedNodes := make(map[uint32]struct{}) for _, chunk := range processedChunks { if _, exists := processedNodes[chunk.StartByte]; !exists { result.WriteString(chunk.Content) result.WriteString("\n// -----\n") processedNodes[chunk.StartByte] = struct{}{} } } if result.Len() == 0 { return "// No relevant code found after compression.\n", nil } return result.String(), nil } // compressHTML is a specialized function to compress HTML content // since Tree-sitter doesn't handle HTML well with the JavaScript parser func (gc *GenericCompressor) compressHTML(content []byte) string { // Convert content to string for easier processing htmlStr := string(content) // Extract HTML comments commentRegex := regexp.MustCompile(``) comments := commentRegex.FindAllString(htmlStr, -1) // Extract important tags (doctype, html, head, body, script, style) doctypeRegex := regexp.MustCompile(`]*>`) doctypes := doctypeRegex.FindAllString(htmlStr, -1) htmlTagRegex := regexp.MustCompile(`]*>|`) htmlTags := htmlTagRegex.FindAllString(htmlStr, -1) headTagRegex := regexp.MustCompile(`]*>|`) headTags := headTagRegex.FindAllString(htmlStr, -1) bodyTagRegex := regexp.MustCompile(`]*>|`) bodyTags := bodyTagRegex.FindAllString(htmlStr, -1) // Combine all extracted elements var chunks []string chunks = append(chunks, doctypes...) chunks = append(chunks, htmlTags...) chunks = append(chunks, headTags...) chunks = append(chunks, comments...) chunks = append(chunks, bodyTags...) 
if len(chunks) == 0 { return "// No relevant HTML elements found after compression.\n" } var result strings.Builder for _, chunk := range chunks { result.WriteString(strings.TrimSpace(chunk)) result.WriteString("\n// -----\n") } return result.String() } func (gc *GenericCompressor) processCaptures(matches []*sitter.QueryMatch, query *sitter.Query, source []byte, languageIdentifier string) []CodeChunk { var chunks []CodeChunk seenNodes := make(map[uint32]struct{}) // Tracks nodes already processed to avoid duplicates for _, match := range matches { for _, capture := range match.Captures { node := capture.Node if node == nil { continue } if _, ok := seenNodes[node.StartByte()]; ok { continue } captureName := query.CaptureNameForId(capture.Index) nodeContent := node.Content(source) chunkContent := "" switch { case strings.HasPrefix(captureName, "definition.function"), strings.HasPrefix(captureName, "definition.method"), strings.HasPrefix(captureName, "definition.class"): jsNode := node // The node captured by the query pattern. actualDeclarationNode := jsNode // Default for non-JS or simple cases. This is the node that has the .body child. 
var bodyNode *sitter.Node if languageIdentifier == "javascript" || languageIdentifier == "typescript" { // Check for arrow function with expression body (not statement block) arrowName := "" for _, c := range match.Captures { if query.CaptureNameForId(c.Index) == "arrow.name" { arrowName = c.Node.Content(source) break } } // Handle arrow functions with expression bodies if arrowName != "" { var arrowFunc *sitter.Node for _, c := range match.Captures { if query.CaptureNameForId(c.Index) == "arrow.function" { arrowFunc = c.Node break } } if arrowFunc != nil { // For arrow functions with expression bodies, we need to construct the signature differently params := arrowFunc.ChildByFieldName("parameters") body := arrowFunc.ChildByFieldName("body") if params != nil && body != nil && body.Type() != "statement_block" { // This is an arrow function with expression body signature := fmt.Sprintf("const %s = %s =>", strings.TrimSpace(arrowName), strings.TrimSpace(params.Content(source))) chunkContent = signature + " { ... } // Body removed" // Add to chunks and continue to next capture if chunkContent != "" { chunks = append(chunks, CodeChunk{ Content: chunkContent, StartByte: node.StartByte(), EndByte: node.EndByte(), }) seenNodes[node.StartByte()] = struct{}{} } continue } } } // Check for anonymous default exported function/class var anonNode *sitter.Node for _, c := range match.Captures { if query.CaptureNameForId(c.Index) == "anon.function" || query.CaptureNameForId(c.Index) == "anon.class" { anonNode = c.Node break } } if anonNode != nil { bodyNode = anonNode.ChildByFieldName("body") if bodyNode != nil { // For anonymous default exports, we need to construct the signature differently signature := "" if anonNode.Type() == "function_declaration" { signature = "export default function()" } else if anonNode.Type() == "class_declaration" { signature = "export default class" } chunkContent = signature + " { ... 
} // Body removed" // Add to chunks and continue to next capture if chunkContent != "" { chunks = append(chunks, CodeChunk{ Content: chunkContent, StartByte: node.StartByte(), EndByte: node.EndByte(), }) seenNodes[node.StartByte()] = struct{}{} } continue } } // Standard processing for other JavaScript constructs switch jsNode.Type() { case "export_statement": foundDecl := jsNode.ChildByFieldName("declaration") if foundDecl != nil && (isNamedDeclarationType(foundDecl) || foundDecl.Type() == "arrow_function" || foundDecl.Type() == "function_expression") { actualDeclarationNode = foundDecl } else { // Check for `export default function_declaration` etc. (direct named child) var directChildDecl *sitter.Node for i := 0; i < int(jsNode.NamedChildCount()); i++ { child := jsNode.NamedChild(i) // Broaden condition to find anonymous function/class declarations as well switch child.Type() { case "function_declaration", "class_declaration", "generator_function_declaration", "arrow_function", "function_expression": // These are types that can be default exported and might have bodies to strip. directChildDecl = child default: // Not a type we are looking for as a direct declaration in `export default ...` } } if directChildDecl != nil { actualDeclarationNode = directChildDecl } else { actualDeclarationNode = nil // No suitable declaration found in export statement } } case "lexical_declaration", "variable_declaration": // For `const foo = () => {}` or `var bar = function() {}` // jsNode is lexical_declaration/variable_declaration. // We need to find the specific variable_declarator with an arrow_function/function_expression. 
var targetArrowFunc *sitter.Node for i := 0; i < int(jsNode.NamedChildCount()); i++ { declarator := jsNode.NamedChild(i) if declarator != nil && declarator.Type() == "variable_declarator" { valueNode := declarator.ChildByFieldName("value") if valueNode != nil && (valueNode.Type() == "arrow_function" || valueNode.Type() == "function_expression") { // Check if this arrow_function/function_expression has a statement_block body, // as per the query (`body: (statement_block)`). bodyCheck := valueNode.ChildByFieldName("body") if bodyCheck != nil { targetArrowFunc = valueNode break } } } } if targetArrowFunc != nil { actualDeclarationNode = targetArrowFunc } else { actualDeclarationNode = nil // No suitable arrow function found } case "function_declaration", "generator_function_declaration", "class_declaration", "method_definition": // jsNode is already the one with the body. actualDeclarationNode = jsNode default: actualDeclarationNode = nil // Other types not meant for body stripping by this rule. } } else if languageIdentifier == "go" || languageIdentifier == "python" || languageIdentifier == "bash" || languageIdentifier == "c" { actualDeclarationNode = jsNode // For Go/Python/Bash/C, the captured node is the declaration itself } else { actualDeclarationNode = nil // Should not happen if language is supported } if actualDeclarationNode != nil { bodyNode = actualDeclarationNode.ChildByFieldName("body") } if bodyNode != nil { var signatureEndPos uint32 if languageIdentifier == "python" { var colonNode *sitter.Node for i := 0; i < int(actualDeclarationNode.ChildCount()); i++ { childNode := actualDeclarationNode.Child(i) if childNode != nil && childNode.Type() == ":" { colonNode = childNode break } } if colonNode != nil { signatureEndPos = colonNode.EndByte() } else { signatureEndPos = bodyNode.StartByte() // Fallback } } else { // For Go and JS (functions, methods, classes, or arrow funcs with statement_block bodies) // Signature ends where the body node begins. 
signatureEndPos = bodyNode.StartByte() } // Ensure signaturePortion starts from the beginning of the originally captured node (`jsNode`). if signatureEndPos > jsNode.StartByte() && signatureEndPos <= jsNode.EndByte() { signaturePortion := string(source[jsNode.StartByte():signatureEndPos]) trimmedSignature := strings.TrimSpace(signaturePortion) placeholder := " { ... }" switch languageIdentifier { case "python": placeholder = " { ... } # Body removed" case "javascript", "typescript", "c", "html": placeholder = " { ... } // Body removed" case "bash": placeholder = " { ... } # Body removed" } chunkContent = trimmedSignature + placeholder } else { // Fallback: if positions are unusual, keep the original content of the captured node. chunkContent = strings.TrimSpace(jsNode.Content(source)) } } else { // No bodyNode found, or actualDeclarationNode was nil (e.g. not a strippable type, or error in logic). // Keep the original content of the captured node. chunkContent = strings.TrimSpace(jsNode.Content(source)) } case strings.HasPrefix(captureName, "import"), strings.HasPrefix(captureName, "package"), strings.HasPrefix(captureName, "definition.type"), strings.HasPrefix(captureName, "definition.struct"), strings.HasPrefix(captureName, "definition.enum"), strings.HasPrefix(captureName, "definition.union"), strings.HasPrefix(captureName, "definition.typedef"), strings.HasPrefix(captureName, "export.other"), strings.HasPrefix(captureName, "command"), strings.HasPrefix(captureName, "rule_set"), strings.HasPrefix(captureName, "media"), strings.HasPrefix(captureName, "keyframes"), strings.HasPrefix(captureName, "declaration"), strings.HasPrefix(captureName, "comment"), strings.HasPrefix(captureName, "doctype"), strings.HasPrefix(captureName, "tag"), strings.HasPrefix(captureName, "script"), strings.HasPrefix(captureName, "style"): chunkContent = strings.TrimSpace(nodeContent) default: continue } if chunkContent != "" { chunks = append(chunks, CodeChunk{ Content: chunkContent, 
StartByte: node.StartByte(),
				EndByte:   node.EndByte(),
			})
			seenNodes[node.StartByte()] = struct{}{}
		}
		}
	}

	return chunks
}

// regexChunks runs each pattern against src in order and collects every
// match. All matches for patterns[0] come first, then patterns[1], and so
// on, preserving the output ordering the per-language compressors produced
// before this helper was extracted.
func regexChunks(src string, patterns []*regexp.Regexp) []string {
	var chunks []string
	for _, re := range patterns {
		chunks = append(chunks, re.FindAllString(src, -1)...)
	}
	return chunks
}

// renderChunks trims each extracted chunk and joins them with a "// -----"
// separator line. When nothing was extracted it returns emptyMessage so the
// caller still emits something meaningful for the file.
func renderChunks(chunks []string, emptyMessage string) string {
	if len(chunks) == 0 {
		return emptyMessage
	}
	var result strings.Builder
	for _, chunk := range chunks {
		result.WriteString(strings.TrimSpace(chunk))
		result.WriteString("\n// -----\n")
	}
	return result.String()
}

// compressSwift is a specialised function to compress Swift content.
// It keeps comments, imports and struct/class/protocol/enum/func/extension
// signatures while dropping implementation bodies.
func (gc *GenericCompressor) compressSwift(content []byte) string {
	swiftStr := string(content)

	// Optional Swift access-level modifiers that may precede a declaration.
	const access = `(?:public\s+|private\s+|fileprivate\s+|internal\s+)?`

	patterns := []*regexp.Regexp{
		regexp.MustCompile(`(?m)//.*$`),      // single-line comments
		regexp.MustCompile(`/\*[\s\S]*?\*/`), // multi-line comments
		regexp.MustCompile(`import\s+\w+`),   // imports
		regexp.MustCompile(access + `struct\s+\w+(?::\s*[^{]+)?\s*\{`),
		regexp.MustCompile(access + `(?:final\s+)?class\s+\w+(?::\s*[^{]+)?\s*\{`),
		regexp.MustCompile(access + `protocol\s+\w+(?::\s*[^{]+)?\s*\{`),
		regexp.MustCompile(access + `enum\s+\w+(?::\s*[^{]+)?\s*\{`),
		regexp.MustCompile(access + `(?:static\s+|class\s+)?func\s+\w+\s*\([^)]*\)(?:\s*->\s*[^{]+)?\s*\{`),
		regexp.MustCompile(`extension\s+\w+(?::\s*[^{]+)?\s*\{`),
	}

	return renderChunks(regexChunks(swiftStr, patterns), "// No relevant Swift elements found after compression.\n")
}

// compressJava is a specialised function to compress Java content.
// It keeps comments, the package declaration, imports and
// class/interface/enum/method signatures while dropping bodies.
func (gc *GenericCompressor) compressJava(content []byte) string {
	javaStr := string(content)

	// Optional Java visibility modifiers that may precede a declaration.
	const visibility = `(?:public\s+|private\s+|protected\s+)?`

	patterns := []*regexp.Regexp{
		regexp.MustCompile(`(?m)//.*$`),        // single-line comments
		regexp.MustCompile(`/\*[\s\S]*?\*/`),   // multi-line comments
		regexp.MustCompile(`package\s+[^;]+;`), // package declaration
		regexp.MustCompile(`import\s+[^;]+;`),  // imports
		regexp.MustCompile(visibility + `(?:abstract\s+|final\s+)?class\s+\w+(?:\s+extends\s+\w+)?(?:\s+implements\s+[^{]+)?\s*\{`),
		regexp.MustCompile(visibility + `interface\s+\w+(?:\s+extends\s+[^{]+)?\s*\{`),
		regexp.MustCompile(visibility + `enum\s+\w+\s*\{`),
		regexp.MustCompile(visibility + `(?:static\s+|final\s+|abstract\s+)*(?:<[^>]+>\s+)?(?:\w+(?:\[\])?\s+)?\w+\s*\([^)]*\)(?:\s+throws\s+[^{]+)?\s*\{`),
	}

	return renderChunks(regexChunks(javaStr, patterns), "// No relevant Java elements found after compression.\n")
}

// compressRust is a specialised function to compress Rust content.
// It keeps comments, use statements, struct/trait/enum definitions and
// fn/impl signatures while dropping function bodies.
func (gc *GenericCompressor) compressRust(content []byte) string {
	rustStr := string(content)

	patterns := []*regexp.Regexp{
		regexp.MustCompile(`(?m)//.*$`),      // single-line comments
		regexp.MustCompile(`/\*[\s\S]*?\*/`), // multi-line comments
		regexp.MustCompile(`use\s+[^;]+;`),   // imports
		// NOTE: `[^}]*` stops at the first closing brace, so definitions
		// containing nested braces are truncated — acceptable for
		// signature-level extraction.
		regexp.MustCompile(`(?:pub\s+)?struct\s+\w+\s*\{[^}]*\}`),
		regexp.MustCompile(`(?:pub\s+)?trait\s+\w+\s*\{[^}]*\}`),
		regexp.MustCompile(`(?:pub\s+)?enum\s+\w+\s*\{[^}]*\}`),
		regexp.MustCompile(`(?:pub\s+)?fn\s+\w+\s*\([^)]*\)(?:\s*->\s*[^{]+)?\s*\{`),
		// Fixed: the previous pattern (`impl(?:\s+\w+)?\s+for\s+\w+\s*\{`)
		// required a ` for ` clause and therefore missed inherent impl
		// blocks such as `impl Person {`. The trait clause is now optional
		// so both `impl Type {` and `impl Trait for Type {` are captured.
		regexp.MustCompile(`impl\s+\w+(?:\s+for\s+\w+)?\s*\{`),
	}

	return renderChunks(regexChunks(rustStr, patterns), "// No relevant Rust elements found after compression.\n")
}
================================================ FILE: internal/compressor/testdata/example.c ================================================ #include #include #include // Define a struct struct Person { char name[50]; int age; float height; }; // Define an enum enum Color { RED, GREEN, BLUE, YELLOW }; // Define a union union Data { int i; float f; char str[20]; }; // Function prototype void greet(const char* name); int calculate_sum(int a, int b); // Typedef example typedef unsigned long int UINT32; /** * Main function */ int main(int argc, char *argv[]) { // Variable declarations struct Person person; enum Color favorite_color = BLUE; union Data data; UINT32 big_number = 123456789UL; // Initialize struct strcpy(person.name, "John"); person.age = 30; person.height = 1.75; // Print information printf("Name: %s\n", person.name); printf("Age: %d\n", person.age); printf("Height: %.2f\n", person.height); // Call functions greet(person.name); printf("Sum: %d\n", calculate_sum(5, 7)); // Use union data.i = 10; printf("data.i: %d\n", data.i); data.f = 220.5; printf("data.f: %.2f\n", data.f); strcpy(data.str, "C 
Programming"); printf("data.str: %s\n", data.str); return 0; } /** * Greet function implementation */ void greet(const char* name) { printf("Hello, %s!\n", name); if (strlen(name) > 5) { printf("You have a long name!\n"); } else { printf("You have a short name!\n"); } } /** * Calculate sum function implementation */ int calculate_sum(int a, int b) { int result = a + b; return result; } ================================================ FILE: internal/compressor/testdata/example.css ================================================ /* This is a CSS comment */ /* Import statement */ @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap'); /* Root variables */ :root { --primary-color: #3498db; --secondary-color: #2ecc71; --text-color: #333; --background-color: #f9f9f9; --spacing-unit: 8px; } /* Body styles */ body { font-family: 'Roboto', sans-serif; line-height: 1.6; color: var(--text-color); background-color: var(--background-color); margin: 0; padding: 0; } /* Header styles */ header { background-color: var(--primary-color); color: white; padding: calc(var(--spacing-unit) * 2); box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); } /* Navigation styles */ nav { display: flex; justify-content: space-between; align-items: center; } nav ul { display: flex; list-style: none; margin: 0; padding: 0; } nav li { margin-left: var(--spacing-unit); } nav a { color: white; text-decoration: none; padding: var(--spacing-unit); border-radius: 4px; transition: background-color 0.3s ease; } nav a:hover { background-color: rgba(255, 255, 255, 0.2); } /* Main content */ main { max-width: 1200px; margin: 0 auto; padding: calc(var(--spacing-unit) * 3); } /* Card component */ .card { background-color: white; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); padding: calc(var(--spacing-unit) * 2); margin-bottom: calc(var(--spacing-unit) * 2); } .card-title { color: var(--primary-color); margin-top: 0; } /* Button styles */ .button { display: inline-block; 
background-color: var(--primary-color); color: white; padding: var(--spacing-unit) calc(var(--spacing-unit) * 2); border: none; border-radius: 4px; cursor: pointer; text-decoration: none; transition: background-color 0.3s ease; } .button:hover { background-color: #2980b9; } .button.secondary { background-color: var(--secondary-color); } .button.secondary:hover { background-color: #27ae60; } /* Media queries */ @media (max-width: 768px) { nav { flex-direction: column; } nav ul { margin-top: var(--spacing-unit); } .card { padding: var(--spacing-unit); } } /* Animation */ @keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } } .fade-in { animation: fadeIn 0.5s ease-in; } ================================================ FILE: internal/compressor/testdata/example.go ================================================ package main import "fmt" // This is a comment type MyStruct struct { FieldA int FieldB string } func (s *MyStruct) MyMethod(val int) string { // Method body if val > 0 { return fmt.Sprintf("Positive: %d", val) } return "Zero or Negative" } func main() { // Main function body instance := MyStruct{FieldA: 1, FieldB: "test"} fmt.Println(instance.MyMethod(5)) fmt.Println("Hello, world!") } ================================================ FILE: internal/compressor/testdata/example.html ================================================ Example HTML Document

Example HTML Document

Welcome to our website

This is an example HTML document that demonstrates various HTML elements and structure.

This is a container with some content.

Learn More

About Us

We are a company that specializes in creating example HTML documents.

  • Founded in 2023
  • Based in Example City
  • Serving clients worldwide

Our Services

Web Design

We create beautiful and functional websites.

Web Development

We build robust web applications.

SEO Optimization

We help your website rank higher in search results.

Contact Us

© 2023 Example Company. All rights reserved.

================================================ FILE: internal/compressor/testdata/example.java ================================================ // This is a Java comment /* This is a multi-line Java comment */ // Package declaration package com.example.demo; // Import statements import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; /** * This is a Javadoc comment for the Person class * @author Example Author */ public class Person { // Instance variables private String name; private int age; private String address; // Static variable private static int count = 0; // Constants public static final int MAX_AGE = 120; // Constructor public Person(String name, int age) { this.name = name; this.age = age; this.address = null; count++; } // Getter methods public String getName() { return name; } public int getAge() { return age; } public String getAddress() { return address; } // Setter methods public void setName(String name) { this.name = name; } public void setAge(int age) { if (age > MAX_AGE) { throw new IllegalArgumentException("Age cannot be greater than " + MAX_AGE); } this.age = age; } public void setAddress(String address) { this.address = address; } // Static method public static int getCount() { return count; } // Method with return value public String getInfo() { if (address != null) { return name + ", age " + age + ", lives at " + address; } else { return name + ", age " + age; } } // Method with parameters public boolean isOlderThan(Person other) { return this.age > other.age; } // Override toString method @Override public String toString() { return getInfo(); } // Inner class public class Address { private String street; private String city; private String zipCode; public Address(String street, String city, String zipCode) { this.street = street; this.city = city; this.zipCode = zipCode; } public String getFullAddress() { return street + ", " + city 
+ " " + zipCode; } } } // Interface definition interface Printable { void print(); // Default method default void printWithPrefix(String prefix) { System.out.println(prefix + ": " + toString()); } } // Enum definition enum Status { ACTIVE("Active"), INACTIVE("Inactive"), PENDING("Pending"); private final String label; Status(String label) { this.label = label; } public String getLabel() { return label; } } // Main class public class Main { public static void main(String[] args) { // Create objects Person person1 = new Person("John Doe", 30); person1.setAddress("123 Main St"); Person person2 = new Person("Jane Smith", 25); person2.setAddress("456 Oak Ave"); // Use methods System.out.println(person1.getInfo()); System.out.println("Total persons: " + Person.getCount()); // Conditional statement if (person1.isOlderThan(person2)) { System.out.println(person1.getName() + " is older than " + person2.getName()); } else { System.out.println(person2.getName() + " is older than " + person1.getName()); } // Collections List people = new ArrayList<>(); people.add(person1); people.add(person2); // Stream API double averageAge = people.stream() .mapToInt(Person::getAge) .average() .orElse(0); System.out.println("Average age: " + averageAge); // Lambda expression people.forEach(p -> System.out.println(p.getName())); // Try-catch block try { Person invalidPerson = new Person("Invalid", 150); } catch (IllegalArgumentException e) { System.out.println("Error: " + e.getMessage()); } finally { System.out.println("Finished processing"); } } } ================================================ FILE: internal/compressor/testdata/example.js ================================================ // This is a JavaScript comment import { something } from 'module'; export class MyJSClass { constructor(name) { this.name = name; } greet(message) { // Method body console.log(message + ", " + this.name + "!"); if (this.name.length > 3) { console.log("Long name in JS"); } return true; } } export function 
myJSFunction(x, y) { // Function body const result = x + y; console.log("JS Result is " + result); return result; } // Arrow function with block body const myArrowFunc = (a, b) => { // Arrow function body return a * b; }; // Arrow function with expression body const myExpressionArrow = (x) => x * x; function* myGenerator() { yield 1; yield 2; } // Another export type export const myVar = 42; // Anonymous default export function // Commented out for testing separately // export default function() { // console.log("Anon default func body"); // return "done"; // } // Anonymous default export class export default class { constructor() { this.x = 1; } } // Create a second test file for the anonymous function default export ================================================ FILE: internal/compressor/testdata/example.py ================================================ # This is a Python comment import os from sys import argv class MyClass: """ A simple class """ def __init__(self, name): self.name = name def greet(self, message): """Greets the person.""" # Method body print(f"{message}, {self.name}!") if len(self.name) > 3: print("Long name") return True def my_function(x, y): # Function body result = x + y print(f"Result is {result}") return result if __name__ == "__main__": c = MyClass("Test") c.greet("Hello") my_function(1, 2) ================================================ FILE: internal/compressor/testdata/example.rs ================================================ // This is a Rust comment /* This is a multi-line Rust comment */ // Import statements use std::collections::HashMap; use std::io::{self, Read, Write}; use std::sync::{Arc, Mutex}; // Constants and statics const MAX_SIZE: usize = 100; static GLOBAL_COUNTER: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0); // Struct definition pub struct Person { name: String, age: u32, address: Option, } // Implementation block impl Person { // Constructor pub fn new(name: &str, age: u32) -> Self { 
Person { name: name.to_string(), age, address: None, } } // Method with parameters pub fn set_address(&mut self, address: String) { self.address = Some(address); } // Method with return value pub fn get_info(&self) -> String { match &self.address { Some(addr) => format!("{}, age {}, lives at {}", self.name, self.age, addr), None => format!("{}, age {}", self.name, self.age), } } } // Trait definition pub trait Printable { fn print(&self); // Default implementation fn print_debug(&self) { println!("Debug print"); } } // Trait implementation impl Printable for Person { fn print(&self) { println!("{}", self.get_info()); } } // Enum definition pub enum Status { Active, Inactive, Pending(String), Error { code: u32, message: String }, } // Function with generic type pub fn process(item: &T) { item.print(); } // Main function fn main() { // Variable declaration let mut person = Person::new("John Doe", 30); person.set_address("123 Main St".to_string()); // Function call process(&person); // Pattern matching let status = Status::Pending("Awaiting approval".to_string()); match status { Status::Active => println!("Active"), Status::Inactive => println!("Inactive"), Status::Pending(reason) => println!("Pending: {}", reason), Status::Error { code, message } => println!("Error {}: {}", code, message), } // Closure let add = |a: i32, b: i32| a + b; println!("5 + 3 = {}", add(5, 3)); // Error handling let result = std::fs::read_to_string("nonexistent.txt"); match result { Ok(content) => println!("File content: {}", content), Err(error) => println!("Error reading file: {}", error), } } ================================================ FILE: internal/compressor/testdata/example.sh ================================================ #!/bin/bash # This is a bash comment # Define a function function greet() { local name=$1 echo "Hello, $name!" if [[ ${#name} -gt 3 ]]; then echo "That's a long name!" 
fi return 0 } # Define another function process_file() { local file=$1 if [[ -f "$file" ]]; then echo "Processing $file..." cat "$file" | grep "pattern" else echo "File not found: $file" return 1 fi } # Main script execution echo "Starting script" greet "World" process_file "/tmp/example.txt" # Conditional logic if [[ $? -eq 0 ]]; then echo "Success!" else echo "Failed!" fi # Loop example for i in {1..5}; do echo "Iteration $i" done exit 0 ================================================ FILE: internal/compressor/testdata/example.swift ================================================ // This is a Swift comment /* This is a multi-line Swift comment */ // Import statements import Foundation // Constants and variables let maxAge = 120 var count = 0 // Struct definition struct Address { let street: String let city: String let zipCode: String // Computed property var fullAddress: String { return "\(street), \(city) \(zipCode)" } } // Class definition class Person { // Properties var name: String var age: Int var address: Address? 
// Static property static var count = 0 // Initializer init(name: String, age: Int) { self.name = name self.age = age Person.count += 1 } // Convenience initializer convenience init(name: String) { self.init(name: name, age: 0) } // Deinitializer deinit { Person.count -= 1 } // Method with parameters func setAddress(street: String, city: String, zipCode: String) { self.address = Address(street: street, city: city, zipCode: zipCode) } // Method with return value func getInfo() -> String { if let address = address { return "\(name), age \(age), lives at \(address.fullAddress)" } else { return "\(name), age \(age)" } } // Method with parameters and return value func isOlderThan(_ other: Person) -> Bool { return self.age > other.age } // Static method static func getCount() -> Int { return count } } // Extension extension Person { // Additional method in extension func celebrateBirthday() { age += 1 print("\(name) is now \(age) years old!") } } // Protocol definition protocol Printable { func print() } // Protocol extension extension Printable { // Default implementation func printWithPrefix(_ prefix: String) { Swift.print("\(prefix): \(self)") } } // Protocol conformance extension Person: Printable { func print() { Swift.print(getInfo()) } } // Enum definition enum Status { case active case inactive case pending(String) case error(code: Int, message: String) // Method in enum func description() -> String { switch self { case .active: return "Active" case .inactive: return "Inactive" case .pending(let reason): return "Pending: \(reason)" case .error(let code, let message): return "Error \(code): \(message)" } } } // Generic function func process(_ item: T) { item.print() } // Closure let greet = { (name: String) -> String in return "Hello, \(name)!" 
} // Main function equivalent func main() { // Create objects let person1 = Person(name: "John Doe", age: 30) person1.setAddress(street: "123 Main St", city: "Anytown", zipCode: "12345") let person2 = Person(name: "Jane Smith", age: 25) person2.setAddress(street: "456 Oak Ave", city: "Somewhere", zipCode: "67890") // Use methods print(person1.getInfo()) print("Total persons: \(Person.getCount())") // Conditional statement if person1.isOlderThan(person2) { print("\(person1.name) is older than \(person2.name)") } else { print("\(person2.name) is older than \(person1.name)") } // Collections var people = [person1, person2] // Higher-order functions let averageAge = people.reduce(0) { $0 + $1.age } / people.count print("Average age: \(averageAge)") // Closure usage people.forEach { person in print(person.name) } // Error handling do { let data = try Data(contentsOf: URL(fileURLWithPath: "/nonexistent.txt")) print("File size: \(data.count)") } catch { print("Error reading file: \(error)") } // Pattern matching let status = Status.pending("Awaiting approval") switch status { case .active: print("Active") case .inactive: print("Inactive") case .pending(let reason): print("Pending: \(reason)") case .error(let code, let message): print("Error \(code): \(message)") } } // Call main function main() ================================================ FILE: internal/compressor/testdata/example_anon_func.js ================================================ // This is a JavaScript comment import { something } from 'module'; // Arrow function with expression body const myExpressionArrow = (x) => x * x; // Anonymous default export function export default function() { console.log("Anon default func body"); return "done"; } ================================================ FILE: main.go ================================================ package main import ( "context" "encoding/json" "errors" "fmt" "io" "net/url" "os" "path/filepath" "regexp" "sort" "strings" 
"github.com/charmbracelet/glamour" "github.com/fatih/color" "github.com/sammcj/ingest/config" "github.com/sammcj/ingest/filesystem" "github.com/sammcj/ingest/git" "github.com/sammcj/ingest/internal/compressor" // Added compressor import "github.com/sammcj/ingest/template" "github.com/sammcj/ingest/token" "github.com/sammcj/ingest/utils" "github.com/sammcj/ingest/web" "github.com/sammcj/quantest" openai "github.com/sashabaranov/go-openai" "github.com/spf13/cobra" ) var ( includePriority bool excludeFromTree bool tokens bool encoding string output string diff bool gitDiffBranch string gitLogBranch string lineNumber bool noCodeblock bool relativePaths bool noClipboard bool templatePath string jsonOutput bool patternExclude string printDefaultExcludes bool printDefaultTemplate bool promptPrefix string promptSuffix string report bool vramFlag bool modelIDFlag string quantFlag string contextFlag int kvCacheFlag string memoryFlag float64 fitsFlag float64 quantTypeFlag string verbose bool noDefaultExcludes bool followSymlinks bool Version string // This will be set by the linker at build time rootCmd *cobra.Command webCrawl bool webMaxDepth int webAllowedDomains []string webTimeout int webConcurrentJobs int compressFlag bool // Added compress flag anthropicFlag bool noCorrectionFlag bool ) type GitData struct { Path string GitDiff string GitDiffBranch string GitLogBranch string } func init() { rootCmd = &cobra.Command{ Use: "ingest [flags] [path ...]", Short: "Generate a markdown LLM prompt from files and directories", Long: `ingest is a command-line tool to generate an LLM prompt from files and directories.`, RunE: run, Args: cobra.ArbitraryArgs, } // Define flags rootCmd.Flags().Bool("llm", false, "Send output to any OpenAI compatible API for inference") rootCmd.Flags().BoolP("version", "V", false, "Print the version number") rootCmd.Flags().BoolVar(&excludeFromTree, "exclude-from-tree", false, "Exclude files/folders from the source tree based on exclude patterns") 
rootCmd.Flags().BoolVar(&includePriority, "include-priority", false, "Include files in case of conflict between include and exclude patterns") rootCmd.Flags().BoolVar(&jsonOutput, "json", false, "Print output as JSON") rootCmd.Flags().BoolVar(&noCodeblock, "no-codeblock", false, "Disable wrapping code inside markdown code blocks") rootCmd.Flags().BoolVar(&printDefaultExcludes, "print-default-excludes", false, "Print the default exclude patterns") rootCmd.Flags().BoolVar(&printDefaultTemplate, "print-default-template", false, "Print the default template") rootCmd.Flags().BoolVar(&relativePaths, "relative-paths", false, "Use relative paths instead of absolute paths, including the parent directory") rootCmd.Flags().BoolVar(&report, "report", true, "Report the top 10 largest files included in the output") rootCmd.Flags().BoolVar(&tokens, "tokens", true, "Display the token count of the generated prompt") rootCmd.Flags().BoolVarP(&diff, "diff", "d", false, "Include git diff") rootCmd.Flags().BoolVarP(&lineNumber, "line-number", "l", false, "Add line numbers to the source code") rootCmd.Flags().BoolVarP(&noClipboard, "no-clipboard", "n", false, "Disable copying to clipboard") rootCmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output") rootCmd.Flags().StringSliceP("exclude", "e", nil, "Patterns to exclude") rootCmd.Flags().StringSliceP("include", "i", nil, "Patterns to include") rootCmd.Flags().StringVar(&gitDiffBranch, "git-diff-branch", "", "Generate git diff between two branches") rootCmd.Flags().StringVar(&gitLogBranch, "git-log-branch", "", "Retrieve git log between two branches") rootCmd.Flags().StringVar(&patternExclude, "pattern-exclude", "", "Path to a specific .glob file for exclude patterns") rootCmd.Flags().StringVarP(&encoding, "encoding", "c", "o200k", "Tokeniser to use for token count (o200k, cl100k, p50k, r50k)") rootCmd.Flags().StringVarP(&output, "output", "o", "", "Optional output file path") rootCmd.Flags().StringArrayP("prompt", 
"p", nil, "Prompt suffix to append to the generated content") rootCmd.Flags().StringVarP(&templatePath, "template", "t", "", "Optional Path to a custom Handlebars template") rootCmd.Flags().BoolP("save", "s", false, "Automatically save the generated markdown to ~/ingest/.md") rootCmd.Flags().Bool("config", false, "Open the config file in the default editor") rootCmd.Flags().BoolVar(&noDefaultExcludes, "no-default-excludes", false, "Disable default exclude patterns") rootCmd.Flags().BoolVar(&followSymlinks, "follow-symlinks", false, "Follow symlinked files and directories") rootCmd.Flags().BoolVar(&compressFlag, "compress", false, "Enable code compression using Tree-sitter") // Added compress flag rootCmd.Flags().BoolVarP(&anthropicFlag, "anthropic", "a", false, "Use Anthropic API for token counting (requires ANTHROPIC_API_KEY, ANTHROPIC_TOKEN, or ANTHROPIC_TOKEN_COUNT_KEY)") rootCmd.Flags().BoolVar(&noCorrectionFlag, "no-correction", false, "Disable offline tokeniser correction factor (use raw token count)") // Web Crawler flags rootCmd.Flags().BoolVar(&webCrawl, "web", false, "Enable web crawling mode") rootCmd.Flags().IntVar(&webMaxDepth, "web-depth", 1, "Maximum crawling depth for web pages") rootCmd.Flags().StringSliceVar(&webAllowedDomains, "web-domains", nil, "Allowed domains for web crawling") rootCmd.Flags().IntVar(&webTimeout, "web-timeout", 120, "Timeout in seconds for web requests") rootCmd.Flags().IntVar(&webConcurrentJobs, "web-concurrent", 6, "Number of concurrent crawling jobs") // VRAM estimation flags rootCmd.Flags().BoolVar(&vramFlag, "vram", false, "Estimate vRAM usage") rootCmd.Flags().StringVarP(&modelIDFlag, "model", "m", "", "vRAM Estimation - Model ID") rootCmd.Flags().StringVarP(&quantFlag, "quant", "q", "", "vRAM Estimation - Quantization type (e.g., q4_k_m) or bits per weight (e.g., 5.0)") rootCmd.Flags().IntVar(&contextFlag, "context", 0, "vRAM Estimation - Context length for vRAM estimation") rootCmd.Flags().StringVar(&kvCacheFlag, 
"kvcache", "fp16", "vRAM Estimation - KV cache quantization: fp16, q8_0, or q4_0") rootCmd.Flags().StringVar(&quantTypeFlag, "quanttype", "gguf", "vRAM Estimation - Quantization type: gguf or exl2") rootCmd.Flags().Float64Var(&memoryFlag, "memory", 0, "vRAM Estimation - Available memory in GB for context calculation") // if --fits is set, set memory to it's value rootCmd.Flags().Float64VarP(&fitsFlag, "fits", "f", 0, "(alias for --memory)") if err := rootCmd.Flags().SetAnnotation("fits", cobra.BashCompOneRequiredFlag, []string{"--memory"}); err != nil { fmt.Printf("Error setting annotation for fits flag: %v\n", err) } // Add completion command rootCmd.AddCommand(&cobra.Command{ Use: "completion [bash|zsh|fish]", Short: "Generate completion script", Long: `Generate shell completion script for the specified shell. To load completions: Bash: $ source <(ingest completion bash) To load completions for each session, execute once: Linux: $ ingest completion bash > /etc/bash_completion.d/ingest macOS: $ ingest completion bash > $(brew --prefix)/etc/bash_completion.d/ingest Zsh: $ source <(ingest completion zsh) To load completions for each session, execute once: $ ingest completion zsh > "${fpath[1]}/_ingest" Fish: $ ingest completion fish | source To load completions for each session, execute once: $ ingest completion fish > ~/.config/fish/completions/ingest.fish `, DisableFlagsInUseLine: true, ValidArgs: []string{"bash", "zsh", "fish"}, Args: cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs), Run: runCompletion, }) } func main() { if err := rootCmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) } } func run(cmd *cobra.Command, args []string) error { // If no arguments are provided, use the current directory if len(args) == 0 { currentDir, err := os.Getwd() if err != nil { return fmt.Errorf("failed to get current directory: %w", err) } args = []string{currentDir} } if version, _ := cmd.Flags().GetBool("version"); version { fmt.Printf("ingest version %s\n", 
Version) return nil } if configFlag, _ := cmd.Flags().GetBool("config"); configFlag { if err := config.OpenConfig(); err != nil { return fmt.Errorf("failed to open config: %w", err) } os.Exit(0) } cfg, err := config.LoadConfig() if err != nil { return fmt.Errorf("failed to load config: %w", err) } if err := utils.EnsureConfigDirectories(); err != nil { return fmt.Errorf("failed to ensure config directories: %w", err) } // If no arguments are provided, use the current directory if len(args) == 0 { currentDir, err := os.Getwd() if err != nil { return fmt.Errorf("failed to get current directory: %w", err) } args = []string{currentDir} } // Handle the prompt flag promptArray, _ := cmd.Flags().GetStringArray("prompt") promptSuffix = strings.Join(promptArray, " ") if printDefaultExcludes { filesystem.PrintDefaultExcludes() return nil } if printDefaultTemplate { template.PrintDefaultTemplate() return nil } includePatterns, _ := cmd.Flags().GetStringSlice("include") excludePatterns, _ := cmd.Flags().GetStringSlice("exclude") // Setup template tmpl, err := template.SetupTemplate(templatePath) if err != nil { return fmt.Errorf("failed to set up template: %w", err) } // Setup progress spinner spinner := utils.SetupSpinner("Traversing directory and building tree..") defer func() { if err := spinner.Finish(); err != nil { fmt.Printf("Error finishing spinner: %v\n", err) } }() // If verbose, print active excludes if verbose { activeExcludes, err := filesystem.ReadExcludePatterns(patternExclude, false) if err != nil { return fmt.Errorf("failed to read exclude patterns: %w", err) } printExcludePatterns(activeExcludes) } // Process all provided paths var allFiles []filesystem.FileInfo var allTrees []string var gitData []GitData var allExcluded []*filesystem.ExcludedInfo remainingArgs := make([]string, len(args)) copy(remainingArgs, args) for i := range remainingArgs { arg := remainingArgs[i] // Check if this is a URL (either with --web flag or auto-detected) if webCrawl || 
isURL(arg) { if !isURL(arg) { return fmt.Errorf("web crawling is enabled but the argument '%s' is not a URL", arg) } utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Processing URL: %s", arg), color.FgBlue) // Process as web URL - now passing excludePatterns result, err := processWebInput(arg, excludePatterns) if err != nil { return fmt.Errorf("failed to process web URL %s: %w", arg, err) } allFiles = append(allFiles, result.Files...) allTrees = append(allTrees, result.TreeString) continue } // Process as local file/directory absPath, err := filepath.Abs(arg) if err != nil { return fmt.Errorf("failed to get absolute path for %s: %w", arg, err) } fileInfo, err := os.Stat(absPath) if err != nil { return fmt.Errorf("failed to get file info for %s: %w", arg, err) } var files []filesystem.FileInfo var tree string var excluded *filesystem.ExcludedInfo // Initialize the compressor if the flag is set var comp *compressor.GenericCompressor if compressFlag { comp = compressor.NewGenericCompressor() } if fileInfo.IsDir() { // Existing directory processing logic tree, files, excluded, err = filesystem.WalkDirectory(absPath, includePatterns, excludePatterns, patternExclude, includePriority, lineNumber, relativePaths, excludeFromTree, noCodeblock, noDefaultExcludes, followSymlinks, comp) // Pass compressor if err != nil { return fmt.Errorf("failed to process directory %s: %w", arg, err) } // Use relative path for tree header when relativePaths flag is set treePath := absPath if relativePaths { treePath = filepath.Base(absPath) } tree = fmt.Sprintf("%s:\n%s", treePath, tree) } else { // New file processing logic file, err := filesystem.ProcessSingleFile(absPath, lineNumber, relativePaths, noCodeblock, followSymlinks, comp) // Pass compressor if err != nil { return fmt.Errorf("failed to process file %s: %w", arg, err) } files = []filesystem.FileInfo{file} // Use relative path for file header when relativePaths flag is set filePath := absPath if relativePaths { filePath = 
filepath.Base(absPath) } tree = fmt.Sprintf("File: %s", filePath) } allFiles = append(allFiles, files...) allTrees = append(allTrees, tree) if excluded != nil { allExcluded = append(allExcluded, excluded) } // Handle git operations for each path gitDiffContent := "" gitDiffBranchContent := "" gitLogBranchContent := "" if diff { gitDiffContent, err = git.GetGitDiff(absPath) if err != nil { // Log the error but continue processing fmt.Printf("Warning: failed to get git diff for %s: %v\n", absPath, err) } } if gitDiffBranch != "" { branches := strings.Split(gitDiffBranch, ",") if len(branches) == 2 { gitDiffBranchContent, err = git.GetGitDiffBetweenBranches(absPath, branches[0], branches[1]) if err != nil { fmt.Printf("Warning: failed to get git diff between branches for %s: %v\n", absPath, err) } } } if gitLogBranch != "" { branches := strings.Split(gitLogBranch, ",") if len(branches) == 2 { gitLogBranchContent, err = git.GetGitLog(absPath, branches[0], branches[1]) if err != nil { fmt.Printf("Warning: failed to get git log for %s: %v\n", absPath, err) } } } gitData = append(gitData, GitData{ Path: absPath, GitDiff: gitDiffContent, GitDiffBranch: gitDiffBranchContent, GitLogBranch: gitLogBranchContent, }) } // Prepare data for template var excludedInfo any if len(allExcluded) > 0 { excludedInfo = allExcluded[0] // Use the first excluded info if available } data := map[string]any{ "source_trees": strings.Join(allTrees, "\n\n"), "files": allFiles, "git_data": gitData, "excluded": excludedInfo, } if err := spinner.Finish(); err != nil { return fmt.Errorf("failed to finish spinner: %w", err) } // Render template rendered, err := template.RenderTemplate(tmpl, data) if err != nil { return fmt.Errorf("failed to render template: %w", err) } // Check if save is set in config or flag autoSave, _ := cmd.Flags().GetBool("save") if cfg.AutoSave || autoSave { // Pass the output flag value to autoSaveOutput if err := autoSaveOutput(rendered, output, args[0]); err != nil { // 
Assuming args[0] is a representative source path utils.PrintColouredMessage("❌", fmt.Sprintf("Error auto-saving file: %v", err), color.FgRed) } } // VRAM estimation if vramFlag { fmt.Println() if err := performVRAMEstimation(rendered); err != nil { utils.PrintColouredMessage("❌", fmt.Sprintf("VRAM estimation error: %v", err), color.FgRed) } } useLLM, _ := cmd.Flags().GetBool("llm") // Handle output if useLLM { if err := handleLLMOutput(rendered, cfg.LLM, tokens, encoding); err != nil { utils.PrintColouredMessage("❌", fmt.Sprintf("LLM output error: %v", err), color.FgRed) } } else { // If both --save and --output are used, we don't want handleOutput to write the file // as autoSaveOutput will handle it outputForHandleOutput := output if (cfg.AutoSave || autoSave) && output != "" { outputForHandleOutput = "" } if err := handleOutput(rendered, tokens, encoding, noClipboard, outputForHandleOutput, jsonOutput, report || verbose, allFiles); err != nil { return fmt.Errorf("failed to handle output: %w", err) } } // Print all collected messages at the end utils.PrintMessages() return nil } func reportLargestFiles(files []filesystem.FileInfo) { sort.Slice(files, func(i, j int) bool { return len(files[i].Code) > len(files[j].Code) }) utils.PrintColouredMessage("ℹ️", "Top 15 largest files (by estimated token count):", color.FgCyan) colourRange := []*color.Color{ color.New(color.FgRed), color.New(color.FgRed), color.New(color.FgRed), color.New(color.FgRed), color.New(color.FgRed), color.New(color.FgYellow), color.New(color.FgYellow), color.New(color.FgYellow), color.New(color.FgYellow), color.New(color.FgYellow), color.New(color.FgGreen), color.New(color.FgGreen), color.New(color.FgGreen), color.New(color.FgGreen), color.New(color.FgGreen), } // Limit to top 15 files displayCount := min(len(files), 15) // Collect file contents for batch processing fileContents := make([]string, displayCount) for i := range displayCount { fileContents[i] = files[i].Code } // Count tokens in 
batch (uses parallel API calls if Anthropic API is enabled) tokenCounts := token.CountTokensBatch(fileContents, encoding, anthropicFlag, noCorrectionFlag) // Print the files with their token counts for i := range displayCount { colour := colourRange[i] fmt.Printf("- %d. %s (%s tokens)\n", i+1, files[i].Path, colour.Sprint(utils.FormatNumber(tokenCounts[i]))) } fmt.Println() } func handleOutput(rendered string, countTokens bool, encoding string, noClipboard bool, output string, jsonOutput bool, report bool, files []filesystem.FileInfo) error { if countTokens { tokenCount := token.CountTokens(rendered, encoding, anthropicFlag, noCorrectionFlag) println() utils.AddMessage("ℹ️", fmt.Sprintf("Tokens (Approximate): %v", utils.FormatNumber(tokenCount)), color.FgYellow, 1) } if report { reportLargestFiles(files) } if jsonOutput { jsonData := map[string]any{ "prompt": rendered, "token_count": token.CountTokens(rendered, encoding, anthropicFlag, noCorrectionFlag), "model_info": token.GetModelInfo(encoding), } jsonBytes, err := json.MarshalIndent(jsonData, "", " ") if err != nil { return fmt.Errorf("failed to marshal JSON: %w", err) } fmt.Println(string(jsonBytes)) } else { outputWritten := false if output != "" { err := utils.WriteToFile(output, rendered) if err != nil { // Report the error but continue to potentially copy to clipboard or print utils.PrintColouredMessage("❌", fmt.Sprintf("Failed to write to file %s: %v", output, err), color.FgRed) } else { utils.AddMessage("✅", fmt.Sprintf("Written to file: %s", output), color.FgGreen, 20) outputWritten = true } } clipboardCopied := false if !noClipboard { err := utils.CopyToClipboard(rendered) if err == nil { utils.AddMessage("✅", "Copied to clipboard successfully.", color.FgGreen, 5) clipboardCopied = true } else { // Only show clipboard error if we didn't write to a file if !outputWritten { utils.PrintColouredMessage("⚠️", fmt.Sprintf("Failed to copy to clipboard: %v.", err), color.FgYellow) } } } // If neither output 
file was written nor clipboard was copied, print to console if !outputWritten && !clipboardCopied { fmt.Print(rendered) } } return nil } func printExcludePatterns(patterns []string) { utils.PrintColouredMessage("i", "Active exclude patterns:", color.FgCyan) // Define colours for syntax highlighting starColour := color.New(color.FgHiGreen).SprintFunc() slashColour := color.New(color.FgGreen).SprintFunc() dotColour := color.New(color.FgBlue).SprintFunc() // Calculate the maximum width of patterns for alignment maxWidth := 0 for _, pattern := range patterns { if len(pattern) > maxWidth { maxWidth = len(pattern) } } // Print patterns in a horizontal list lineWidth := 0 // get the width of the terminal w := utils.GetTerminalWidth() for i, pattern := range patterns { highlighted := pattern highlighted = strings.ReplaceAll(highlighted, "*", starColour("*")) highlighted = strings.ReplaceAll(highlighted, "/", slashColour("/")) highlighted = strings.ReplaceAll(highlighted, ".", dotColour(".")) // Add padding to align patterns padding := strings.Repeat(" ", maxWidth-len(pattern)+2) if lineWidth+len(pattern)+2 > w && i > 0 { fmt.Println() lineWidth = 0 } if lineWidth == 0 { fmt.Print(" ") } fmt.Print(highlighted + padding) lineWidth += len(pattern) + len(padding) if i < len(patterns)-1 { fmt.Print("| ") lineWidth += 2 } } } func handleLLMOutput(rendered string, llmConfig config.LLMConfig, countTokens bool, encoding string) error { if countTokens { tokenCount := token.CountTokens(rendered, encoding, anthropicFlag, noCorrectionFlag) utils.AddMessage("ℹ️", fmt.Sprintf("Tokens (Approximate): %v", utils.FormatNumber(tokenCount)), color.FgYellow, 40) } if promptPrefix != "" { rendered = promptPrefix + "\n" + rendered } if promptSuffix != "" { rendered += "\n" + promptSuffix } if llmConfig.AuthToken == "" { return fmt.Errorf("LLM auth token is empty") } clientConfig := openai.DefaultConfig(llmConfig.AuthToken) clientConfig.BaseURL = llmConfig.BaseURL clientConfig.APIType = 
openai.APIType(llmConfig.APIType) c := openai.NewClientWithConfig(clientConfig) ctx := context.Background() req := openai.CompletionRequest{ Model: llmConfig.Model, MaxTokens: llmConfig.MaxTokens, Prompt: rendered, Stream: true, } if llmConfig.Temperature != nil { req.Temperature = *llmConfig.Temperature } if llmConfig.TopP != nil { req.TopP = *llmConfig.TopP } if llmConfig.PresencePenalty != nil { req.PresencePenalty = *llmConfig.PresencePenalty } if llmConfig.FrequencyPenalty != nil { req.FrequencyPenalty = *llmConfig.FrequencyPenalty } stream, err := c.CreateCompletionStream(ctx, req) if err != nil { return fmt.Errorf("LLM CompletionStream error: %w", err) } defer stream.Close() termWidth := min( // if the term width is over 160, set it to 160 utils.GetTerminalWidth(), 160) r, err := glamour.NewTermRenderer( glamour.WithStandardStyle("dracula"), glamour.WithWordWrap(termWidth-10), ) if err != nil { return fmt.Errorf("failed to create renderer: %w", err) } var buffer strings.Builder var output strings.Builder for { response, err := stream.Recv() if errors.Is(err, io.EOF) { break } if err != nil { return fmt.Errorf("stream error: %w", err) } buffer.WriteString(response.Choices[0].Text) // Process complete lines for { line, rest, found := strings.Cut(buffer.String(), "\n") if !found { break } output.WriteString(line + "\n") buffer.Reset() buffer.WriteString(rest) // Render and print if we have a complete code block or enough non-code content // if there are any headers, render the markdown chuck before each header isHeading := regexp.MustCompile(`^#+`).MatchString(output.String()) if isHeading && output.Len() > 0 && !strings.Contains(output.String(), "```") && (strings.HasSuffix(output.String(), "```\n") || output.Len() > 200) { contentToRender := strings.TrimSpace(output.String()) + "\n" renderedContent, err := r.Render(contentToRender) if err != nil { return fmt.Errorf("rendering error: %w", err) } fmt.Print(renderedContent) output.Reset() } else { if 
!strings.Contains(output.String(), "```") && (strings.HasSuffix(output.String(), "```\n") || output.Len() > 200) { // Trim excess newlines before rendering contentToRender := strings.TrimSpace(output.String()) + "\n" renderedContent, err := r.Render(contentToRender) if err != nil { return fmt.Errorf("rendering error: %w", err) } fmt.Print(renderedContent) output.Reset() } } } } // Render and print any remaining content if buffer.Len() > 0 { output.WriteString(buffer.String()) } if output.Len() > 0 { renderedContent, err := r.Render(output.String()) if err != nil { return fmt.Errorf("rendering error: %w", err) } fmt.Print(renderedContent) } return nil } func performVRAMEstimation(content string) error { if modelIDFlag == "" { return fmt.Errorf("model ID is required for vRAM estimation") } tokenCount := token.CountTokens(content, encoding, anthropicFlag, noCorrectionFlag) // TODO: fix this: // quant, err := quantest.GetOllamaQuantLevel(modelIDFlag) // if err != nil { // return fmt.Errorf("error getting quantisation level: %w", err) // } quant := "q4_k_m" estimation, err := quantest.EstimateVRAMForModel(modelIDFlag, memoryFlag, tokenCount, quant, kvCacheFlag) if err != nil { return fmt.Errorf("error estimating vRAM: %w", err) } utils.AddMessage("ℹ️", fmt.Sprintf("Model: %s", estimation.ModelName), color.FgCyan, 10) utils.AddMessage("ℹ️", fmt.Sprintf("Estimated vRAM Required: %.2f GB", estimation.EstimatedVRAM), color.FgCyan, 3) // print the vram available utils.AddMessage("ℹ️", fmt.Sprintf("Available vRAM: %.2f GB", memoryFlag), color.FgCyan, 10) if estimation.FitsAvailable { utils.AddMessage("✅", "Fits Available vRAM", color.FgGreen, 2) } else { utils.AddMessage("❌", "Does Not Fit Available vRAM", color.FgYellow, 2) } utils.AddMessage("ℹ️", fmt.Sprintf("Max Context Size: %d", estimation.MaxContextSize), color.FgCyan, 8) // utils.AddMessage("ℹ️", fmt.Sprintf("Maximum Quantisation: %s", estimation.MaximumQuant), color.FgCyan, 10) // TODO: - this isn't that useful, come 
	// ...up with something smarter
	// Generate and print the quant table
	table, err := quantest.GenerateQuantTable(estimation.ModelConfig, memoryFlag)
	if err != nil {
		return fmt.Errorf("error generating quant table: %w", err)
	}
	fmt.Println(quantest.PrintFormattedTable(table))
	// Check if the content fits within the specified constraints
	if memoryFlag > 0 {
		if tokenCount > estimation.MaxContextSize {
			utils.AddMessage("❗️", fmt.Sprintf("Generated content (%d tokens) exceeds maximum context (%d tokens).", tokenCount, estimation.MaxContextSize), color.FgYellow, 2)
		} else {
			utils.AddMessage("✅", fmt.Sprintf("Generated content (%d tokens) fits within maximum context (%d tokens).", tokenCount, estimation.MaxContextSize), color.FgGreen, 2)
		}
	}
	return nil
}

// autoSaveOutput saves the content based on the combination of --save and --output flags:
// - If only --save is used, save to ~/ingest/<source-file-name>.md
// - If --save and --output ./somefile.md, only save to ./somefile.md
// - If --save and --output somefile.md, save to ~/ingest/somefile.md
func autoSaveOutput(content string, outputPath string, sourcePath string) error {
	var finalPath string
	if outputPath != "" {
		if strings.HasPrefix(outputPath, "./") || strings.HasPrefix(outputPath, "../") || filepath.IsAbs(outputPath) {
			// Case: --output starts with ./ or ../ or is absolute path
			// Save only to the specified output path
			absOutputPath, err := filepath.Abs(outputPath)
			if err != nil {
				return fmt.Errorf("failed to get absolute path for output file %s: %w", outputPath, err)
			}
			finalPath = absOutputPath
		} else {
			// Case: --output is just a filename
			// Save to ~/ingest/ with the specified filename
			homeDir, err := os.UserHomeDir()
			if err != nil {
				return fmt.Errorf("failed to get user home directory: %w", err)
			}
			ingestDir := filepath.Join(homeDir, "ingest")
			finalPath = filepath.Join(ingestDir, outputPath)
		}
	} else {
		// Default --save behavior: ~/ingest/<basename-of-source>.md
		homeDir, err := os.UserHomeDir()
		if err != nil {
			return fmt.Errorf("failed to get user home directory: %w", err)
		}
		ingestDir := filepath.Join(homeDir, "ingest")
		fileName := filepath.Base(sourcePath) + ".md"
		finalPath = filepath.Join(ingestDir, fileName)
	}
	// Ensure the directory for the final path exists
	finalDir := filepath.Dir(finalPath)
	if err := os.MkdirAll(finalDir, 0700); err != nil {
		return fmt.Errorf("failed to create directory %s for auto-save file: %w", finalDir, err)
	}
	// Write the file using os.WriteFile
	if err := os.WriteFile(finalPath, []byte(content), 0600); err != nil {
		return fmt.Errorf("failed to write auto-save file to %s: %w", finalPath, err)
	}
	utils.AddMessage("💾", fmt.Sprintf("Auto-saved to: %s", finalPath), color.FgMagenta, 15) // Changed icon and message slightly
	return nil
}

// runCompletion generates a shell completion script for the requested shell.
// args[0] is validated by cobra (ValidArgs: bash, zsh, fish).
func runCompletion(cmd *cobra.Command, args []string) {
	switch args[0] {
	case "bash":
		if err := cmd.Root().GenBashCompletion(os.Stdout); err != nil {
			fmt.Printf("Error generating bash completion: %v\n", err)
		}
	case "zsh":
		if err := cmd.Root().GenZshCompletion(os.Stdout); err != nil {
			fmt.Printf("Error generating zsh completion: %v\n", err)
		}
	case "fish":
		if err := cmd.Root().GenFishCompletion(os.Stdout, true); err != nil {
			fmt.Printf("Error generating fish completion: %v\n", err)
		}
	}
}

// processWebInput crawls the given URL using the package-level web-* flag
// values and returns the crawl result.
func processWebInput(urlStr string, excludePatterns []string) (*web.CrawlResult, error) {
	options := web.CrawlOptions{
		MaxDepth:       webMaxDepth,
		AllowedDomains: webAllowedDomains,
		Timeout:        webTimeout,
		ConcurrentJobs: webConcurrentJobs,
	}
	return web.ProcessWebURL(urlStr, options, excludePatterns)
}

// isURL reports whether str parses as an absolute URL (scheme and host both present).
func isURL(str string) bool {
	u, err := url.Parse(str)
	return err == nil && u.Scheme != "" && u.Host != ""
}

================================================ FILE: pdf/pdf.go ================================================
// pdf/pdf.go
package pdf

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"

	"github.com/ledongthuc/pdf"
)

// ConvertPDFToMarkdown converts a PDF file to markdown format
func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
	var reader io.ReadCloser
	var err
error if isURL { reader, err = downloadPDF(path) if err != nil { return "", fmt.Errorf("failed to download PDF: %w", err) } defer reader.Close() tempFile, err := os.CreateTemp("", "ingest-*.pdf") if err != nil { return "", fmt.Errorf("failed to create temp file: %w", err) } defer os.Remove(tempFile.Name()) defer tempFile.Close() if _, err := io.Copy(tempFile, reader); err != nil { return "", fmt.Errorf("failed to save PDF: %w", err) } path = tempFile.Name() } // Open and read the PDF f, r, err := pdf.Open(path) if err != nil { return "", fmt.Errorf("failed to open PDF: %w", err) } defer f.Close() var buf strings.Builder buf.WriteString(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path))) // Extract text from each page totalPages := r.NumPage() for pageNum := 1; pageNum <= totalPages; pageNum++ { page := r.Page(pageNum) if page.V.IsNull() { continue } text, err := page.GetPlainText(nil) if err != nil { return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err) } // Clean and process the text cleanedText := cleanText(text) if cleanedText != "" { buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum)) buf.WriteString(cleanedText) buf.WriteString("\n\n") } } result := buf.String() if strings.TrimSpace(result) == strings.TrimSpace(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path))) { return "", fmt.Errorf("no text content could be extracted from PDF") } return result, nil } // IsPDF checks if a file is a PDF based on its content type or extension func IsPDF(path string) (bool, error) { // Check if it's a URL if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") { resp, err := http.Head(path) if err != nil { return false, fmt.Errorf("failed to check URL for PDF: %w", err) } defer resp.Body.Close() return resp.Header.Get("Content-Type") == "application/pdf", nil } // Check local file file, err := os.Open(path) if err != nil { return false, fmt.Errorf("failed to open file: %w", err) } defer file.Close() // Read first 
512 bytes to determine file type buffer := make([]byte, 512) n, err := file.Read(buffer) if err != nil && err != io.EOF { return false, fmt.Errorf("failed to read file header: %w", err) } // Check file signature contentType := http.DetectContentType(buffer[:n]) if contentType == "application/pdf" { return true, nil } // Also check file extension return strings.ToLower(filepath.Ext(path)) == ".pdf", nil } func downloadPDF(url string) (io.ReadCloser, error) { resp, err := http.Get(url) if err != nil { return nil, err } if resp.StatusCode != http.StatusOK { resp.Body.Close() return nil, fmt.Errorf("failed to download PDF: status code %d", resp.StatusCode) } if resp.Header.Get("Content-Type") != "application/pdf" { resp.Body.Close() return nil, fmt.Errorf("URL does not point to a PDF file") } return resp.Body, nil } func cleanText(text string) string { if strings.Contains(text, "%PDF-") || strings.Contains(text, "endobj") { // This appears to be raw PDF data rather than extracted text return "" } // Remove control characters except newlines and tabs text = strings.Map(func(r rune) rune { if r < 32 && r != '\n' && r != '\t' { return -1 } return r }, text) // Split into lines and clean each line lines := strings.Split(text, "\n") var cleanLines []string for _, line := range lines { line = strings.TrimSpace(line) // Skip empty lines and lines that look like PDF syntax if line == "" || strings.HasPrefix(line, "%") || strings.HasPrefix(line, "/") || strings.Contains(line, "obj") || strings.Contains(line, "endobj") || strings.Contains(line, "stream") { continue } cleanLines = append(cleanLines, line) } return strings.Join(cleanLines, "\n\n") } ================================================ FILE: scripts/install.sh ================================================ #!/usr/bin/env bash # This is a simple installer that gets the latest version of ingest from Github and installs it to /usr/local/bin INSTALL_DIR="/usr/local/bin" INSTALL_PATH="${INSTALL_PATH:-$INSTALL_DIR/ingest}" 
ARCH=$(uname -m | tr '[:upper:]' '[:lower:]') OS=$(uname -s | tr '[:upper:]' '[:lower:]') # Ensure the user is not root if [ "$EUID" -eq 0 ]; then echo "Please do not run as root" exit 1 fi # Get the latest release from Github VER=$(curl --silent -qI https://github.com/sammcj/ingest/releases/latest | awk -F '/' '/^location/ {print substr($NF, 1, length($NF)-1)}') echo "Downloading ingest ${VER} for ${OS}-${ARCH}..." wget -q --show-progress -O ingest "https://github.com/sammcj/ingest/releases/download/$VER/ingest-${OS}-${ARCH}" # # Move the binary to the install directory mv ingest "${INSTALL_PATH}" # # Make the binary executable chmod +x "${INSTALL_PATH}" echo "ingest has been installed to ${INSTALL_PATH}" ================================================ FILE: template/template.go ================================================ package template import ( "fmt" "os" "path/filepath" "strings" "text/template" "github.com/fatih/color" "github.com/mitchellh/go-homedir" "github.com/sammcj/ingest/utils" ) func SetupTemplate(templatePath string) (*template.Template, error) { var templateContent string var err error if templatePath != "" { templateContent, err = readTemplateFile(templatePath) } else { templateContent, err = getDefaultTemplate() } if err != nil { return nil, fmt.Errorf("failed to read template: %w", err) } tmpl, err := template.New("default").Parse(templateContent) if err != nil { return nil, fmt.Errorf("failed to parse template: %w", err) } return tmpl, nil } func readTemplateFile(path string) (string, error) { content, err := os.ReadFile(path) if err != nil { return "", err } return string(content), nil } func getDefaultTemplate() (string, error) { // Check for user-specific template home, err := homedir.Dir() if err == nil { userTemplateDir := filepath.Join(home, ".config", "ingest", "patterns", "templates") userDefaultTemplate := filepath.Join(userTemplateDir, "default.tmpl") if _, err := os.Stat(userDefaultTemplate); err == nil { return 
readTemplateFile(userDefaultTemplate) } } // If no user-specific template, use built-in template return readEmbeddedTemplate() } func readEmbeddedTemplate() (string, error) { return ` Source Trees: {{.source_trees}} {{if .excluded}} Excluded Content: {{if le .excluded.TotalFiles 20}} Files: {{range .excluded.Files}} - {{.}} {{end}} {{else}} Directories with excluded files: {{range $dir, $count := .excluded.Directories}} {{if gt $count 0}}- {{$dir}}: {{$count}} files{{end}} {{end}} File extensions excluded: {{range $ext, $count := .excluded.Extensions}} - {{$ext}}: {{$count}} files {{end}} {{end}} {{end}} {{range .files}} {{if .Code}} ` + "`{{.Path}}:`" + ` {{.Code}} {{end}} {{end}} {{range .git_data}} {{if or .GitDiff .GitDiffBranch .GitLogBranch}} Git Information for {{.Path}}: {{if .GitDiff}} Git Diff: {{.GitDiff}} {{end}} {{if .GitDiffBranch}} Git Diff Between Branches: {{.GitDiffBranch}} {{end}} {{if .GitLogBranch}} Git Log Between Branches: {{.GitLogBranch}} {{end}} {{end}} {{end}} `, nil } func RenderTemplate(tmpl *template.Template, data map[string]any) (string, error) { var output strings.Builder err := tmpl.Execute(&output, data) if err != nil { return "", fmt.Errorf("failed to render template: %w", err) } return output.String(), nil } func PrintDefaultTemplate() { defaultTemplate, err := readEmbeddedTemplate() if err != nil { utils.PrintColouredMessage("!", fmt.Sprintf("Failed to get default template: %v", err), color.FgRed) os.Exit(1) } fmt.Println(defaultTemplate) } ================================================ FILE: token/anthropic.go ================================================ package token import ( "bytes" "encoding/json" "fmt" "io" "net/http" "os" "sync" "time" ) type AnthropicTokenCountRequest struct { Model string `json:"model"` Messages []Message `json:"messages"` } type Message struct { Role string `json:"role"` Content string `json:"content"` } type AnthropicTokenCountResponse struct { InputTokens int `json:"input_tokens"` } // 
AnthropicModel is the Claude model used for token counting
const AnthropicModel = "claude-sonnet-4-5"

// httpClient is a reusable HTTP client with connection pooling and timeout
var httpClient = &http.Client{
	Timeout: 30 * time.Second,
}

// getAnthropicAPIKey returns the Anthropic API key from the first of
// ANTHROPIC_API_KEY, ANTHROPIC_TOKEN or ANTHROPIC_TOKEN_COUNT_KEY that is
// set, or an error naming every variable that was checked.
func getAnthropicAPIKey() (string, error) {
	// Check environment variables in order of preference
	if key := os.Getenv("ANTHROPIC_API_KEY"); key != "" {
		return key, nil
	}
	if key := os.Getenv("ANTHROPIC_TOKEN"); key != "" {
		return key, nil
	}
	if key := os.Getenv("ANTHROPIC_TOKEN_COUNT_KEY"); key != "" {
		return key, nil
	}
	return "", fmt.Errorf("no Anthropic API key found in environment variables (checked ANTHROPIC_API_KEY, ANTHROPIC_TOKEN, ANTHROPIC_TOKEN_COUNT_KEY)")
}

// CountTokensAPI sends content to Anthropic's count_tokens endpoint as a
// single user message for AnthropicModel and returns the reported
// input_tokens value. It returns an error when no API key is configured,
// the request fails, or the API responds with a non-200 status.
func CountTokensAPI(content string) (int, error) {
	apiKey, err := getAnthropicAPIKey()
	if err != nil {
		return 0, err
	}

	requestBody := AnthropicTokenCountRequest{
		Model: AnthropicModel,
		Messages: []Message{
			{
				Role:    "user",
				Content: content,
			},
		},
	}

	jsonData, err := json.Marshal(requestBody)
	if err != nil {
		return 0, fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequest("POST", "https://api.anthropic.com/v1/messages/count_tokens", bytes.NewBuffer(jsonData))
	if err != nil {
		return 0, fmt.Errorf("failed to create request: %w", err)
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-api-key", apiKey)
	req.Header.Set("anthropic-version", "2023-06-01")

	resp, err := httpClient.Do(req)
	if err != nil {
		return 0, fmt.Errorf("failed to make request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Include the response body so API-side validation errors are visible.
		body, _ := io.ReadAll(resp.Body)
		return 0, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
	}

	var response AnthropicTokenCountResponse
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return 0, fmt.Errorf("failed to decode response: %w", err)
	}

	return response.InputTokens, nil
}

// CountTokensBatchAPI counts tokens for multiple content strings in parallel batches.
// Processes up to batchSize items concurrently to avoid overwhelming the API. func CountTokensBatchAPI(contents []string, batchSize int) ([]int, error) { if len(contents) == 0 { return []int{}, nil } results := make([]int, len(contents)) errors := make([]error, len(contents)) var wg sync.WaitGroup // Process in batches for i := 0; i < len(contents); i += batchSize { end := min(i+batchSize, len(contents)) // Process this batch concurrently for j := i; j < end; j++ { wg.Add(1) go func(index int, content string) { defer wg.Done() count, err := CountTokensAPI(content) if err != nil { errors[index] = err results[index] = 0 } else { results[index] = count } }(j, contents[j]) } // Wait for this batch to complete before starting the next wg.Wait() } // Check if any errors occurred var firstError error for _, err := range errors { if err != nil { firstError = err break } } return results, firstError } ================================================ FILE: token/token.go ================================================ package token import ( "fmt" "sync" "github.com/fatih/color" "github.com/pkoukk/tiktoken-go" "github.com/sammcj/ingest/utils" ) var ( apiUsedOnce sync.Once apiWarningOnce sync.Once ) // CorrectionMultiplier is applied to offline token counts for better accuracy. // Based on empirical analysis comparing offline tokeniser with Anthropic API, // the offline tokeniser underestimates by approximately 15.25%. // A 1.18x multiplier reduces average error from ~17% to ~2%. 
const CorrectionMultiplier = 1.18 func GetTokenizer(encoding string) *tiktoken.Tiktoken { var err error var tk *tiktoken.Tiktoken switch encoding { case "o200k", "gpt-4o", "gpt-4.1", "gpt-4.5": tk, err = tiktoken.GetEncoding("o200k_base") case "cl100k", "llama3", "llama-3", "gpt-4", "gpt-3.5-turbo", "text-embedding-3-large", "text-embedding-3-small", "text-embedding-ada-002", "text-ada-002": tk, err = tiktoken.GetEncoding("cl100k_base") case "p50k": tk, err = tiktoken.GetEncoding("p50k_base") case "r50k", "gpt2", "text-ada-001", "text-curie-001", "text-babbage-001": tk, err = tiktoken.GetEncoding("r50k_base") default: // Default to o200k_base for modern Anthropic Claude and OpenAI models tk, err = tiktoken.GetEncoding("o200k_base") } if err != nil { fmt.Printf("Failed to get tokenizer: %v\n", err) return nil } return tk } func GetModelInfo(encoding string) string { switch encoding { case "o200k", "gpt-4o", "gpt-4.1", "gpt-4.5": return "OpenAI gpt-4+, Anthropic Claude Haiku/Sonnet/Opus 3+ models" case "cl100k": return "Llama3, OpenAI <4o models, text-embedding-ada-002, gpt-4 etc..." case "p50k": return "OpenAI code models, text-davinci-002, text-davinci-003 etc..." case "r50k", "gpt2", "llama2", "llama-2": return "Legacy models like llama2, GPT-3, davinci etc..." 
	default:
		return "OpenAI gpt-4+, Anthropic Claude Haiku/Sonnet/Opus 3+ models"
	}
}

// CountTokens returns the token count for rendered. When useAnthropicAPI is
// true the Anthropic API is tried first (a one-time notice is printed on
// success); on API error a one-time warning is printed and counting falls
// back to the offline tokeniser. Offline counts are scaled by
// CorrectionMultiplier unless noCorrection is set, and the result is
// truncated to an int. Returns 0 if the tokeniser cannot be created.
func CountTokens(rendered string, encoding string, useAnthropicAPI bool, noCorrection bool) int {
	if useAnthropicAPI {
		count, err := CountTokensAPI(rendered)
		if err != nil {
			// Warn only once even when called for many files.
			apiWarningOnce.Do(func() {
				fmt.Printf("Warning: Failed to count tokens using Anthropic API: %v\nFalling back to offline tokeniser\n", err)
			})
			// Fall back to offline tokenizer
		} else {
			apiUsedOnce.Do(func() {
				fmt.Println() // Add blank line for visibility
				utils.PrintColouredMessage("✓", fmt.Sprintf("Using Anthropic API (%s) for token counting", AnthropicModel), color.FgYellow)
			})
			return count
		}
	}

	tk := GetTokenizer(encoding)
	if tk == nil {
		return 0
	}
	tokens := tk.Encode(rendered, nil, nil)
	rawCount := len(tokens)

	// Apply correction multiplier for better accuracy unless disabled
	if noCorrection {
		return rawCount
	}
	correctedCount := float64(rawCount) * CorrectionMultiplier
	return int(correctedCount)
}

// CountTokensBatch counts tokens for multiple strings, using parallel API calls if Anthropic API is enabled.
// Processes up to 4 items concurrently when using the API.
func CountTokensBatch(contents []string, encoding string, useAnthropicAPI bool, noCorrection bool) []int { if len(contents) == 0 { return []int{} } if useAnthropicAPI { counts, err := CountTokensBatchAPI(contents, 4) if err != nil { apiWarningOnce.Do(func() { fmt.Printf("Warning: Failed to count tokens using Anthropic API: %v\nFalling back to offline tokeniser\n", err) }) // Fall back to offline tokeniser for all items } else { apiUsedOnce.Do(func() { fmt.Println() // Add blank line for visibility utils.PrintColouredMessage("✓", fmt.Sprintf("Using Anthropic API (%s) for token counting", AnthropicModel), color.FgYellow) }) return counts } } // Use offline tokeniser for all items results := make([]int, len(contents)) tk := GetTokenizer(encoding) if tk == nil { return results } for i, content := range contents { tokens := tk.Encode(content, nil, nil) rawCount := len(tokens) if noCorrection { results[i] = rawCount } else { correctedCount := float64(rawCount) * CorrectionMultiplier results[i] = int(correctedCount) } } return results } ================================================ FILE: tree-sitter-dev-plan.md ================================================ # Tree-sitter Based Code Compressor - Development Plan ## Overview This document outlines the development plan for a Tree-sitter based code compressor. The goal is to extract key structural information from source code, such as imports, package definitions, function/method/class signatures, and comments, while omitting detailed implementation bodies. ## Phases ### Phase 1: Basic Language Identification & Parsing - [x] Identify language from file extension. - [x] Parse source code into an AST using Tree-sitter. - [x] Implement basic Tree-sitter query for Go (imports, functions, methods, types). **Status:** Completed. ### Phase 2: Code Chunk Extraction & Compression Logic - [x] Define `CodeChunk` struct (Note: `OriginalLine` field is a placeholder, actual line mapping not yet implemented - see Phase 6). 
- [x] Implement `GenericCompressor` with `Compress` method. - [x] Execute queries and process captures from matches. - [x] Implement logic to strip function/method bodies and retain signatures. - [x] Go - [x] Python - [x] JavaScript (for named functions, classes, methods, including exported and default exported named variants) - [x] Handle different types of captures (imports, packages, type definitions, comments). - [x] Sort and combine extracted code chunks. **Status:** Largely completed. Core compression logic is functional for Go, Python, and key JavaScript constructs. All `internal/compressor` tests are passing. ### Phase 3: Language-Specific Queries & Refinements - **Go:** - [x] Query for package, imports, type definitions, function declarations, method declarations, comments. - **Python:** - [x] Query for imports, function definitions, class definitions, comments. - **JavaScript:** - [x] Query for imports, comments, method definitions. - [x] Query and processing logic for: - [x] Exported named functions/classes/generators (e.g., `export function foo() {}`). - [x] Default exported named functions/classes/generators (e.g., `export default function foo() {}`). - [x] Standalone named functions/classes/generators. - [ ] **Outstanding/Needs Refinement for JavaScript:** - [ ] **Arrow Functions:** Implement body stripping for arrow functions assigned to variables (e.g., `const myArrow = () => { /* body */ };`). The test file `example.js` includes `const myArrowFunc = (a, b) => { /* Arrow function body */ ... }` which is currently not processed for body stripping. - [ ] **Anonymous Default Exports:** Clarify and potentially implement body stripping for anonymous default exported functions and classes (e.g., `export default function() { /* body */ }`). Currently, these are captured by `@export.other` and kept whole. 
The test file `example.js` includes `export default function() { console.log("Anon default func"); }` and `export default class { constructor() { this.x = 1;} }`, implying these might need stripping. - [ ] Review other common JS constructs (e.g., object methods not part of class syntax, IIFEs) if they need specific handling. - **Other Languages:** - [ ] Plan for and implement queries for other languages as needed (e.g., TypeScript, Java, C++). **Status:** Go and Python are well-covered by current tests. JavaScript has significantly improved and handles many common cases, with tests passing for implemented features. Specific outstanding items for JavaScript are noted above. ### Phase 4: Testing & Edge Cases - [x] Write initial unit tests for Go, Python, JavaScript compressor logic. (Current tests for `internal/compressor` are passing). - [ ] Expand test coverage with more complex real-world examples and edge cases for all supported languages, particularly for JavaScript features listed as outstanding in Phase 3. - [ ] Test interactions between different JavaScript export/import syntaxes. **Status:** Basic unit tests are in place and passing. Further testing is required, especially for JavaScript refinements. ### Phase 5: Integration & CLI - [ ] Integrate the `GenericCompressor` into `main.go`. - [ ] Add CLI flags for language selection, input file/directory, output options. - [ ] Handle file system traversal for multiple files. **Status:** Not started. ### Phase 6: Advanced Features (Future Considerations) - [ ] **Line Number Mapping:** Fully implement `OriginalLine` in `CodeChunk` to map compressed chunks back to their original line numbers. - [ ] Contextual Compression: Explore options for keeping more context if needed (e.g., call sites of a function, specific variable assignments). - [ ] Configuration: Allow users to customize what to extract/omit via configuration files or advanced CLI options. 
- [ ] Performance Optimization: Profile and optimize parsing and query execution for large codebases. **Status:** Not started. Languages supported by `smacker/go-tree-sitter`: ``` bash c cpp csharp css cue dockerfile elixir elm golang groovy hcl html java javascript kotlin lua markdown ocaml php protobuf python ruby rust scala sql svelte swift toml typescript markdown yaml ``` The most important to support initially (if we even have to do anything to enable them?) are: - Go - Python - JavaScript - TypeScript - bash - rust - swift - toml - yaml - css - c - html - sql ================================================ FILE: utils/output_manager.go ================================================ package utils import ( "sort" "sync" "github.com/fatih/color" ) type OutputMessage struct { Symbol string Message string Color color.Attribute Priority int } var ( messages []OutputMessage mutex sync.Mutex ) // AddMessage adds a message to the output queue func AddMessage(symbol string, message string, messageColor color.Attribute, priority int) { mutex.Lock() defer mutex.Unlock() messages = append(messages, OutputMessage{ Symbol: symbol, Message: message, Color: messageColor, Priority: priority, }) } // PrintMessages prints all collected messages sorted by priority func PrintMessages() { mutex.Lock() defer mutex.Unlock() // Sort messages by priority (lower priority prints later) sort.Slice(messages, func(i, j int) bool { return messages[i].Priority > messages[j].Priority }) for _, msg := range messages { PrintColouredMessage(msg.Symbol, msg.Message, msg.Color) } // Clear messages after printing messages = nil } ================================================ FILE: utils/utils.go ================================================ package utils import ( "fmt" "os" "os/exec" "path/filepath" "strconv" "strings" "syscall" "unsafe" "github.com/atotto/clipboard" "github.com/fatih/color" "github.com/mitchellh/go-homedir" "github.com/schollz/progressbar/v3" ) type termSize struct { Row 
uint16 Col uint16 Xpixel uint16 Ypixel uint16 } func CopyToClipboard(rendered string) error { if isWSL() { if err := writeToWindowsClipboard(rendered); err == nil { return nil } } // fallback to default return writeClipboard(rendered) } func writeToWindowsClipboard(text string) error { // Copy using PowerShell for UTF-8 encoding psCmd := `[Console]::InputEncoding = [System.Text.Encoding]::UTF8; $s = [Console]::In.ReadToEnd(); Set-Clipboard -Value $s` cmd := exec.Command("powershell.exe", "-NoProfile", "-Command", psCmd) stdin, err := cmd.StdinPipe() if err != nil { return fmt.Errorf("failed to create stdin pipe for PowerShell: %w", err) } defer stdin.Close() if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start PowerShell: %w", err) } if _, err := stdin.Write([]byte(text)); err != nil { return fmt.Errorf("failed to write to PowerShell stdin: %w", err) } if err := stdin.Close(); err != nil { return fmt.Errorf("failed to close PowerShell stdin: %w", err) } if err := cmd.Wait(); err != nil { return fmt.Errorf("PowerShell failed to set clipboard: %w", err) } return nil } func writeClipboard(text string) error { if err := clipboard.WriteAll(text); err != nil { return fmt.Errorf("failed to copy to clipboard: %v", err) } return nil } func WriteToFile(outputPath string, rendered string) error { err := os.WriteFile(outputPath, []byte(rendered), 0644) if err != nil { return fmt.Errorf("failed to write to file: %v", err) } fmt.Printf("%s Prompt written to file: %s\n", color.GreenString("✓"), outputPath) return nil } func SetupSpinner(message string) *progressbar.ProgressBar { return progressbar.NewOptions(-1, progressbar.OptionSetDescription(message), progressbar.OptionSpinnerType(14), progressbar.OptionSetTheme(progressbar.Theme{ Saucer: "=", SaucerHead: ">", SaucerPadding: " ", BarStart: "[", BarEnd: "]", }), ) } func Label(path string) string { if path == "" { wd, err := os.Getwd() if err != nil { return "." 
		}
		return wd
	}
	return path
}

// PrintColouredMessage prints "[symbol] message" with the bracketed symbol in
// bold white and the message in the supplied colour.
func PrintColouredMessage(symbol string, message string, messageColor color.Attribute) {
	white := color.New(color.FgWhite, color.Bold).SprintFunc()
	colouredMessage := color.New(messageColor).SprintFunc()
	fmt.Printf("%s%s%s %s\n", white("["), white(symbol), white("]"), colouredMessage(message))
}

// EnsureConfigDirectories creates the ~/.config/ingest pattern directories
// (exclude globs and output templates) and seeds each with a README.md
// describing its purpose. Existing README files are left untouched.
func EnsureConfigDirectories() error {
	home, err := homedir.Dir()
	if err != nil {
		return fmt.Errorf("failed to get home directory: %w", err)
	}

	configDirs := []struct {
		path string
		desc string
	}{
		{filepath.Join(home, ".config", "ingest", "patterns", "exclude"), "Add .glob files here containing glob matches to exclude additional patterns."},
		{filepath.Join(home, ".config", "ingest", "patterns", "templates"), "Add go templates with the extension .tmpl here for different output formats."},
	}

	for _, dir := range configDirs {
		if err := os.MkdirAll(dir.path, 0755); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", dir.path, err)
		}

		readmePath := filepath.Join(dir.path, "README.md")
		// Only write the README when it does not already exist.
		if _, err := os.Stat(readmePath); os.IsNotExist(err) {
			content := fmt.Sprintf("# %s\n\n%s", filepath.Base(dir.path), dir.desc)
			if err := os.WriteFile(readmePath, []byte(content), 0644); err != nil {
				return fmt.Errorf("failed to create README.md in %s: %w", dir.path, err)
			}
		}
	}

	return nil
}

// FormatNumber renders n with comma thousands separators,
// e.g. 1234567 -> "1,234,567" and -1234 -> "-1,234".
func FormatNumber(n int) string {
	in := strconv.Itoa(n)
	// Output length adds one comma per complete group of three digits.
	// int(in[0]/'0') is 1 when the first byte is a digit and 0 when it is
	// the '-' sign, so the sign is not counted towards a digit group.
	out := make([]byte, len(in)+(len(in)-2+int(in[0]/'0'))/3)
	if in[0] == '-' {
		// Copy the sign into place and process only the digits.
		in, out[0] = in[1:], '-'
	}

	// Walk both buffers right-to-left, inserting a comma after every third digit.
	for i, j, k := len(in)-1, len(out)-1, 0; ; i, j = i-1, j-1 {
		out[j] = in[i]
		if i == 0 {
			return string(out)
		}
		if k++; k == 3 {
			j, k = j-1, 0
			out[j] = ','
		}
	}
}

// GetTerminalWidth returns the terminal column count via the TIOCGWINSZ
// ioctl on stdin, falling back to 100 when the ioctl fails (e.g. stdin is
// not a terminal). NOTE(review): Unix-specific — relies on syscall ioctl.
func GetTerminalWidth() int {
	ws := &termSize{}
	retCode, _, _ := syscall.Syscall(syscall.SYS_IOCTL, uintptr(syscall.Stdin), uintptr(syscall.TIOCGWINSZ), uintptr(unsafe.Pointer(ws)))
	if int(retCode) == -1 {
		return 100
	}
	return int(ws.Col)
}

// isWSL reports whether the process appears to be running inside Windows
// Subsystem for Linux, detected via the WSL_DISTRO_NAME environment
// variable or the kernel osrelease string.
func isWSL() bool {
	if _, ok := os.LookupEnv("WSL_DISTRO_NAME"); ok {
		return true
	}
	if out, err :=
os.ReadFile("/proc/sys/kernel/osrelease"); err == nil { s := strings.ToLower(string(out)) if strings.Contains(s, "microsoft") || strings.Contains(s, "wsl") { return true } } return false } ================================================ FILE: web/crawler.go ================================================ // web/crawler.go package web import ( "io" "net/http" "net/url" "strings" "sync" "time" md "github.com/JohannesKaufmann/html-to-markdown" "github.com/JohannesKaufmann/html-to-markdown/plugin" "github.com/PuerkitoBio/goquery" "github.com/bmatcuk/doublestar/v4" ) type CrawlOptions struct { MaxDepth int AllowedDomains []string Timeout int ConcurrentJobs int } type WebPage struct { URL string Content string Title string Links []string Depth int StatusCode int ContentType string } type Crawler struct { visited map[string]bool visitedLock sync.Mutex options CrawlOptions converter *md.Converter excludePatterns []string initialPath string // Store the initial URL path singlePageMode bool // True if crawling a specific page } func NewCrawler(options CrawlOptions, startURL string) *Crawler { parsedURL, err := url.Parse(startURL) initialPath := "/" singlePageMode := false if err == nil && parsedURL.Path != "" && parsedURL.Path != "/" { initialPath = strings.TrimSuffix(parsedURL.Path, "/") singlePageMode = true } // Create a new converter with GitHub Flavored Markdown support converter := md.NewConverter("", true, &md.Options{ // Configure the converter to handle common edge cases StrongDelimiter: "**", EmDelimiter: "*", LinkStyle: "inlined", HeadingStyle: "atx", HorizontalRule: "---", CodeBlockStyle: "fenced", BulletListMarker: "-", }) // Use GitHub Flavored Markdown plugins converter.Use(plugin.GitHubFlavored()) // Configure the converter to handle specific elements converter.Keep("math", "script[type='math/tex']") // Keep math formulas converter.Remove("script", "style", "iframe", "noscript") // Remove unwanted elements return &Crawler{ visited: make(map[string]bool), 
options: options, converter: converter, initialPath: initialPath, singlePageMode: singlePageMode, } } func (c *Crawler) SetExcludePatterns(patterns []string) { c.excludePatterns = patterns } func (c *Crawler) shouldExclude(urlStr string) bool { parsedURL, err := url.Parse(urlStr) if err != nil { return false } path := parsedURL.Path if path == "" { path = "/" } for _, pattern := range c.excludePatterns { cleanPath := strings.TrimPrefix(path, "/") if match, _ := doublestar.Match(pattern, cleanPath); match { return true } if match, _ := doublestar.Match(pattern, path); match { return true } } return false } func (c *Crawler) hasVisited(urlStr string) bool { c.visitedLock.Lock() defer c.visitedLock.Unlock() return c.visited[urlStr] } func (c *Crawler) markVisited(urlStr string) { c.visitedLock.Lock() defer c.visitedLock.Unlock() c.visited[urlStr] = true } func (c *Crawler) fetchPage(urlStr string, depth int) (*WebPage, error) { if depth > c.options.MaxDepth { return nil, nil } if c.hasVisited(urlStr) { return nil, nil } if !c.isAllowed(urlStr) { return nil, nil } if c.shouldExclude(urlStr) { return nil, nil } client := &http.Client{ Timeout: time.Duration(c.options.Timeout) * time.Second, } resp, err := client.Get(urlStr) if err != nil { return nil, err } defer resp.Body.Close() c.markVisited(urlStr) body, err := io.ReadAll(resp.Body) if err != nil { return nil, err } // Parse the HTML document doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body))) if err != nil { return nil, err } // Extract title and links title := doc.Find("title").Text() links := c.extractLinks(doc, urlStr) // Convert HTML to Markdown markdown, err := c.converter.ConvertString(string(body)) if err != nil { return nil, err } return &WebPage{ URL: urlStr, Content: markdown, Title: title, Links: links, Depth: depth, StatusCode: resp.StatusCode, ContentType: resp.Header.Get("Content-Type"), }, nil } func (c *Crawler) extractLinks(doc *goquery.Document, baseURL string) []string { 
var links []string baseURLParsed, err := url.Parse(baseURL) if err != nil { return links } doc.Find("a[href]").Each(func(_ int, s *goquery.Selection) { if href, exists := s.Attr("href"); exists { if absURL := c.resolveURL(baseURLParsed, href); absURL != "" { links = append(links, absURL) } } }) return links } func (c *Crawler) resolveURL(base *url.URL, ref string) string { refURL, err := url.Parse(ref) if err != nil { return "" } resolvedURL := base.ResolveReference(refURL) if !strings.HasPrefix(resolvedURL.Scheme, "http") { return "" } return resolvedURL.String() } func (c *Crawler) isAllowed(urlStr string) bool { parsedURL, err := url.Parse(urlStr) if err != nil { return false } // Check domain restrictions if any if len(c.options.AllowedDomains) > 0 { domainAllowed := false for _, domain := range c.options.AllowedDomains { if strings.Contains(parsedURL.Host, domain) { domainAllowed = true break } } if !domainAllowed { return false } } // If we're in single page mode, only allow the exact same path if c.singlePageMode { currentPath := strings.TrimSuffix(parsedURL.Path, "/") // Only allow the exact same path or same path with a fragment return currentPath == c.initialPath } return true } func (c *Crawler) Crawl(startURL string) ([]*WebPage, error) { var pages []*WebPage var pagesLock sync.Mutex var wg sync.WaitGroup semaphore := make(chan struct{}, c.options.ConcurrentJobs) var crawlPage func(urlStr string, depth int) crawlPage = func(urlStr string, depth int) { defer wg.Done() semaphore <- struct{}{} // Acquire defer func() { <-semaphore }() // Release page, err := c.fetchPage(urlStr, depth) if err != nil || page == nil { return } pagesLock.Lock() pages = append(pages, page) pagesLock.Unlock() if depth < c.options.MaxDepth { for _, link := range page.Links { if !c.hasVisited(link) && c.isAllowed(link) { wg.Add(1) go crawlPage(link, depth+1) } } } } wg.Add(1) go crawlPage(startURL, 0) wg.Wait() return pages, nil } ================================================ 
FILE: web/integration.go ================================================ // web/integration.go package web import ( "fmt" "net/url" "path/filepath" "strings" "github.com/sammcj/ingest/filesystem" "github.com/sammcj/ingest/pdf" ) type CrawlResult struct { TreeString string Files []filesystem.FileInfo } func ProcessWebURL(urlStr string, options CrawlOptions, excludePatterns []string) (*CrawlResult, error) { // Check if URL points to a PDF isPDF, err := pdf.IsPDF(urlStr) if err != nil { return nil, fmt.Errorf("error checking PDF: %w", err) } if isPDF { content, err := pdf.ConvertPDFToMarkdown(urlStr, true) if err != nil { return nil, fmt.Errorf("error converting PDF: %w", err) } return &CrawlResult{ TreeString: fmt.Sprintf("PDF Document: %s", urlStr), Files: []filesystem.FileInfo{{ Path: urlStr, Extension: ".md", Code: content, }}, }, nil } // Validate URL parsedURL, err := url.Parse(urlStr) if err != nil { return nil, fmt.Errorf("invalid URL: %w", err) } if !strings.HasPrefix(parsedURL.Scheme, "http") { return nil, fmt.Errorf("URL must start with http:// or https://") } // Initialize crawler with the start URL crawler := NewCrawler(options, urlStr) crawler.SetExcludePatterns(excludePatterns) // Perform crawl pages, err := crawler.Crawl(urlStr) if err != nil { return nil, fmt.Errorf("crawl failed: %w", err) } // Convert crawled pages to FileInfo format var files []filesystem.FileInfo for _, page := range pages { // Skip pages with no content or error status codes if page.StatusCode != 200 || page.Content == "" { continue } files = append(files, filesystem.FileInfo{ Path: page.URL, Extension: ".md", Code: page.Content, }) } // Generate tree representation, but only if we have more than one page var treeString string if len(files) > 1 { treeString = generateWebTree(pages) } else if len(files) == 1 { treeString = fmt.Sprintf("Web Page: %s", files[0].Path) } // If we're crawling a specific page, only return that page's content if parsedURL.Path != "/" && parsedURL.Path 
!= "" { for _, file := range files { fileURL, err := url.Parse(file.Path) if err != nil { continue } // Find the exact matching path (ignoring trailing slashes) if strings.TrimSuffix(fileURL.Path, "/") == strings.TrimSuffix(parsedURL.Path, "/") { return &CrawlResult{ TreeString: fmt.Sprintf("Web Page: %s", file.Path), Files: []filesystem.FileInfo{file}, }, nil } } } return &CrawlResult{ TreeString: treeString, Files: files, }, nil } func generateWebTree(pages []*WebPage) string { var builder strings.Builder builder.WriteString("Web Crawl Structure:\n") // Create a map of depth to pages depthMap := make(map[int][]*WebPage) for _, page := range pages { if page.StatusCode == 200 && page.Content != "" { depthMap[page.Depth] = append(depthMap[page.Depth], page) } } // Build the tree structure with indentation for depth := 0; depth <= len(depthMap); depth++ { if pages, ok := depthMap[depth]; ok { for _, page := range pages { indent := strings.Repeat(" ", depth) urlPath := getURLPath(page.URL) builder.WriteString(fmt.Sprintf("%s├── %s\n", indent, urlPath)) } } } return builder.String() } func getURLPath(urlStr string) string { parsedURL, err := url.Parse(urlStr) if err != nil { return urlStr } path := parsedURL.Path if path == "" || path == "/" { return parsedURL.Host } return filepath.Join(parsedURL.Host, path) }