Repository: alinaqi/claude-bootstrap Branch: main Commit: 57c5c839f18f Files: 501 Total size: 2.7 MB Directory structure: gitextract_idptty0p/ ├── .github/ │ └── workflows/ │ ├── skill-lint.yml │ └── skill-review.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── _project_specs/ │ ├── 00-autonomous-engineering-roadmap.md │ ├── 01-runtime-observability.md │ ├── 02-rollback-and-recovery.md │ ├── 03-verifiable-contracts.md │ ├── 04-multi-agent-coordination.md │ ├── 05-confidence-calibration.md │ ├── 06-cost-budget-awareness.md │ ├── 07-human-escalation-protocol.md │ ├── 08-auto-code-index.md │ └── 09-multimodal-ingestion.md ├── commands/ │ ├── analyze-repo.md │ ├── analyze-workspace.md │ ├── check-contributors.md │ ├── icpg-bootstrap.md │ ├── icpg-drift.md │ ├── icpg-impact.md │ ├── icpg-why.md │ ├── initialize-project.md │ ├── maggy-init.md │ ├── maggy.md │ ├── mnemos-checkpoint.md │ ├── mnemos-status.md │ ├── polyphony-init.md │ ├── polyphony-spawn.md │ ├── polyphony-status.md │ ├── spawn-team.md │ ├── sync-agents.md │ ├── sync-contracts.md │ └── update-code-index.md ├── docs/ │ ├── architecture-v5.md │ ├── benchmark-results.md │ ├── mnemos-implementation.md │ └── polyphony-spec.md ├── evals/ │ ├── README.md │ ├── agent-teams/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── base/ │ │ ├── scenario-1/ │ │ │ ├── criteria.json │ │ │ └── task.md │ │ └── scenario-2/ │ │ ├── criteria.json │ │ └── task.md │ ├── code-review/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── commit-hygiene/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── credentials/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── database-schema/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── existing-repo/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── llm-patterns/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── project-tooling/ │ │ └── scenario-1/ │ │ ├── 
criteria.json │ │ └── task.md │ ├── python/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── react-web/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── run-evals.sh │ ├── security/ │ │ ├── scenario-1/ │ │ │ ├── criteria.json │ │ │ └── task.md │ │ └── scenario-2/ │ │ ├── criteria.json │ │ └── task.md │ ├── session-management/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ ├── supabase/ │ │ └── scenario-1/ │ │ ├── criteria.json │ │ └── task.md │ └── typescript/ │ └── scenario-1/ │ ├── criteria.json │ └── task.md ├── hooks/ │ ├── post-commit-graph │ ├── pre-push │ └── workspace/ │ ├── check-contract-freshness.sh │ ├── check-graph-freshness.sh │ ├── post-commit-contracts.sh │ └── pre-push-contracts.sh ├── install.sh ├── maggy/ │ ├── .gitignore │ ├── PLAN.md │ ├── README.md │ ├── config.example.yaml │ ├── docs/ │ │ ├── benchmark-results.md │ │ └── maggy-rfc.md │ ├── install.sh │ ├── maggy/ │ │ ├── __init__.py │ │ ├── adapters/ │ │ │ ├── __init__.py │ │ │ ├── cli_discovery.py │ │ │ └── pi.py │ │ ├── api/ │ │ │ ├── __init__.py │ │ │ ├── auth.py │ │ │ ├── routes.py │ │ │ ├── routes_budget.py │ │ │ ├── routes_chat.py │ │ │ ├── routes_cikg.py │ │ │ ├── routes_deploy.py │ │ │ ├── routes_engram.py │ │ │ ├── routes_escalation.py │ │ │ ├── routes_events.py │ │ │ ├── routes_forge.py │ │ │ ├── routes_heartbeat.py │ │ │ ├── routes_history.py │ │ │ ├── routes_improve.py │ │ │ ├── routes_lexon.py │ │ │ ├── routes_mesh.py │ │ │ ├── routes_mesh_admin.py │ │ │ ├── routes_monitor.py │ │ │ ├── routes_observability.py │ │ │ ├── routes_planning.py │ │ │ ├── routes_process.py │ │ │ ├── routes_projects.py │ │ │ ├── routes_routing.py │ │ │ └── routes_setup.py │ │ ├── budget.py │ │ ├── calibration/ │ │ │ ├── __init__.py │ │ │ └── tracker.py │ │ ├── checkpoint.py │ │ ├── cikg/ │ │ │ ├── __init__.py │ │ │ ├── graph.py │ │ │ ├── models.py │ │ │ ├── queries.py │ │ │ └── storage.py │ │ ├── cli.py │ │ ├── cli_chat.py │ │ ├── cli_client.py │ │ ├── 
cli_output.py │ │ ├── cli_repl_cmds.py │ │ ├── cli_sessions.py │ │ ├── cli_welcome.py │ │ ├── config.py │ │ ├── contracts/ │ │ │ ├── __init__.py │ │ │ └── generator.py │ │ ├── coordination/ │ │ │ ├── __init__.py │ │ │ └── lock_manager.py │ │ ├── deploy.py │ │ ├── discovery.py │ │ ├── engram/ │ │ │ ├── __init__.py │ │ │ ├── diagnostics.py │ │ │ ├── record.py │ │ │ ├── retrieval.py │ │ │ ├── seed.py │ │ │ └── store.py │ │ ├── escalation/ │ │ │ ├── __init__.py │ │ │ └── protocol.py │ │ ├── event_spine/ │ │ │ ├── __init__.py │ │ │ ├── emitter.py │ │ │ ├── events.py │ │ │ ├── header.py │ │ │ └── store.py │ │ ├── fatigue.py │ │ ├── forge/ │ │ │ ├── __init__.py │ │ │ ├── connector.py │ │ │ ├── detector.py │ │ │ └── registry.py │ │ ├── heartbeat/ │ │ │ ├── __init__.py │ │ │ ├── jobs.py │ │ │ └── scheduler.py │ │ ├── history/ │ │ │ ├── __init__.py │ │ │ ├── analyzer.py │ │ │ ├── models.py │ │ │ ├── parsers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── claude.py │ │ │ │ ├── codex.py │ │ │ │ └── kimi.py │ │ │ ├── service.py │ │ │ └── store.py │ │ ├── improve/ │ │ │ ├── __init__.py │ │ │ ├── analyzer.py │ │ │ ├── models.py │ │ │ ├── service.py │ │ │ └── signals.py │ │ ├── lexon/ │ │ │ ├── __init__.py │ │ │ ├── disambiguate.py │ │ │ ├── personalization.py │ │ │ ├── record.py │ │ │ ├── router.py │ │ │ └── terminology.py │ │ ├── main.py │ │ ├── mesh/ │ │ │ ├── __init__.py │ │ │ ├── discovery.py │ │ │ ├── git_discovery.py │ │ │ ├── manager.py │ │ │ ├── memory.py │ │ │ ├── network.py │ │ │ ├── org_scanner.py │ │ │ ├── protocol.py │ │ │ ├── provenance.py │ │ │ ├── publisher.py │ │ │ ├── quarantine.py │ │ │ ├── store.py │ │ │ ├── sync.py │ │ │ ├── transport.py │ │ │ ├── ws_client.py │ │ │ └── ws_server.py │ │ ├── mnemos/ │ │ │ ├── __init__.py │ │ │ ├── fatigue.py │ │ │ └── signals.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ └── plan.py │ │ ├── observability/ │ │ │ ├── __init__.py │ │ │ └── collector.py │ │ ├── planning.py │ │ ├── process/ │ │ │ ├── __init__.py │ │ │ ├── 
discovery.py │ │ │ ├── github_prs.py │ │ │ ├── model_router.py │ │ │ ├── models.py │ │ │ ├── patterns.py │ │ │ ├── report.py │ │ │ ├── service.py │ │ │ ├── signals.py │ │ │ └── store.py │ │ ├── providers/ │ │ │ ├── __init__.py │ │ │ ├── asana.py │ │ │ ├── base.py │ │ │ ├── github_issues.py │ │ │ └── monday.py │ │ ├── recovery/ │ │ │ ├── __init__.py │ │ │ └── rollback.py │ │ ├── registry.py │ │ ├── routing.py │ │ ├── routing_rules.py │ │ ├── routing_rules_defaults.py │ │ ├── routing_rules_io.py │ │ ├── scores.py │ │ ├── services/ │ │ │ ├── __init__.py │ │ │ ├── account_guide.py │ │ │ ├── activity.py │ │ │ ├── ai_client.py │ │ │ ├── cascade.py │ │ │ ├── chat.py │ │ │ ├── chat_context.py │ │ │ ├── chat_router.py │ │ │ ├── chat_stream.py │ │ │ ├── checkpoint.py │ │ │ ├── competitor.py │ │ │ ├── context_compactor.py │ │ │ ├── convention_inferrer.py │ │ │ ├── convention_scanner.py │ │ │ ├── executor.py │ │ │ ├── executor_helpers.py │ │ │ ├── executor_prompts.py │ │ │ ├── executor_types.py │ │ │ ├── inbox.py │ │ │ ├── monitor.py │ │ │ ├── output_reviewer.py │ │ │ ├── planner.py │ │ │ ├── session_detect.py │ │ │ ├── stakes.py │ │ │ ├── tdd_verifier.py │ │ │ └── vision.py │ │ └── static/ │ │ ├── app.js │ │ └── index.html │ ├── pyproject.toml │ └── tests/ │ ├── conftest.py │ ├── integration/ │ │ ├── __init__.py │ │ ├── test_full_task_flow.py │ │ ├── test_model_fallback.py │ │ └── test_process_loop.py │ ├── test_account_guide.py │ ├── test_activity.py │ ├── test_api_endpoints.py │ ├── test_benchmark_scenario.py │ ├── test_bootstrap.py │ ├── test_budget.py │ ├── test_calibration.py │ ├── test_cascade.py │ ├── test_chat.py │ ├── test_chat_context.py │ ├── test_chat_routed.py │ ├── test_chat_router.py │ ├── test_chat_stream.py │ ├── test_checkpoint.py │ ├── test_checkpoint_mgr.py │ ├── test_cikg.py │ ├── test_cli.py │ ├── test_cli_chat.py │ ├── test_cli_discovery.py │ ├── test_cli_sessions.py │ ├── test_cli_welcome.py │ ├── test_context_compactor.py │ ├── test_contracts.py │ ├── 
test_convention_inferrer.py │ ├── test_convention_scanner.py │ ├── test_coordination.py │ ├── test_deploy.py │ ├── test_discovery.py │ ├── test_dual_planner.py │ ├── test_engram.py │ ├── test_escalation.py │ ├── test_event_spine.py │ ├── test_executor_routing.py │ ├── test_fatigue.py │ ├── test_forge.py │ ├── test_heartbeat.py │ ├── test_history.py │ ├── test_history_parsers.py │ ├── test_improve.py │ ├── test_lexon.py │ ├── test_mesh.py │ ├── test_mesh_network.py │ ├── test_mesh_store.py │ ├── test_mesh_ws.py │ ├── test_mnemos_fatigue.py │ ├── test_monday_provider.py │ ├── test_monitor.py │ ├── test_multimodel_integration.py │ ├── test_observability.py │ ├── test_output_reviewer.py │ ├── test_pi_adapter.py │ ├── test_planning.py │ ├── test_registry.py │ ├── test_repl_cmds.py │ ├── test_rollback.py │ ├── test_routes_escalation.py │ ├── test_routes_observability.py │ ├── test_routes_projects.py │ ├── test_routing_config.py │ ├── test_routing_rules.py │ ├── test_routing_service.py │ ├── test_scores.py │ ├── test_setup_routes.py │ ├── test_stakes.py │ ├── test_tdd_verifier.py │ ├── test_vision.py │ └── test_zero_config.py ├── rules/ │ ├── nodejs-backend.md │ ├── python.md │ ├── quality-gates.md │ ├── react.md │ ├── security.md │ ├── tdd-workflow.md │ └── typescript.md ├── scripts/ │ ├── convert-hooks-to-toml.sh │ ├── convert-skills-structure.sh │ ├── detect-agents.sh │ ├── icpg/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── bootstrap.py │ │ ├── contracts.py │ │ ├── drift.py │ │ ├── models.py │ │ ├── pyproject.toml │ │ ├── store.py │ │ ├── symbols.py │ │ └── vectors.py │ ├── install-graph-tools.sh │ ├── install-hooks.sh │ ├── install-skills.sh │ ├── mnemos/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── checkpoint.py │ │ ├── consolidation.py │ │ ├── fatigue.py │ │ ├── models.py │ │ ├── pyproject.toml │ │ ├── signals.py │ │ └── store.py │ ├── polyphony/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── adapters/ │ │ │ ├── __init__.py │ │ │ ├── claude.py │ │ │ ├── 
codex.py │ │ │ └── kimi.py │ │ ├── config.py │ │ ├── events.py │ │ ├── identity.py │ │ ├── models.py │ │ ├── orchestrator.py │ │ ├── pyproject.toml │ │ ├── router.py │ │ ├── runtime.py │ │ ├── scoring.py │ │ ├── sources/ │ │ │ ├── __init__.py │ │ │ ├── github.py │ │ │ └── local.py │ │ ├── state_machine.py │ │ ├── store.py │ │ └── workspace.py │ └── skill_lint/ │ ├── __init__.py │ ├── __main__.py │ ├── content.py │ ├── frontmatter.py │ ├── pyproject.toml │ ├── references.py │ ├── report.py │ └── spec.py ├── skills/ │ ├── aeo-optimization/ │ │ └── SKILL.md │ ├── agent-teams/ │ │ ├── SKILL.md │ │ └── agents/ │ │ ├── code-review.md │ │ ├── feature.md │ │ ├── merger.md │ │ ├── quality.md │ │ ├── security.md │ │ └── team-lead.md │ ├── agentic-development/ │ │ └── SKILL.md │ ├── ai-models/ │ │ └── SKILL.md │ ├── android-java/ │ │ └── SKILL.md │ ├── android-kotlin/ │ │ └── SKILL.md │ ├── aws-aurora/ │ │ └── SKILL.md │ ├── aws-dynamodb/ │ │ └── SKILL.md │ ├── azure-cosmosdb/ │ │ └── SKILL.md │ ├── base/ │ │ └── SKILL.md │ ├── cloudflare-d1/ │ │ └── SKILL.md │ ├── code-deduplication/ │ │ └── SKILL.md │ ├── code-graph/ │ │ └── SKILL.md │ ├── code-review/ │ │ └── SKILL.md │ ├── codex-review/ │ │ └── SKILL.md │ ├── commit-hygiene/ │ │ └── SKILL.md │ ├── cpg-analysis/ │ │ └── SKILL.md │ ├── credentials/ │ │ └── SKILL.md │ ├── cross-agent-delegation/ │ │ └── SKILL.md │ ├── database-schema/ │ │ └── SKILL.md │ ├── existing-repo/ │ │ └── SKILL.md │ ├── firebase/ │ │ └── SKILL.md │ ├── flutter/ │ │ └── SKILL.md │ ├── gemini-review/ │ │ └── SKILL.md │ ├── icpg/ │ │ └── SKILL.md │ ├── iterative-development/ │ │ └── SKILL.md │ ├── klaviyo/ │ │ └── SKILL.md │ ├── llm-patterns/ │ │ └── SKILL.md │ ├── maggy/ │ │ └── SKILL.md │ ├── medusa/ │ │ └── SKILL.md │ ├── mnemos/ │ │ └── SKILL.md │ ├── ms-teams-apps/ │ │ └── SKILL.md │ ├── nodejs-backend/ │ │ └── SKILL.md │ ├── playwright-testing/ │ │ └── SKILL.md │ ├── polyphony/ │ │ └── SKILL.md │ ├── posthog-analytics/ │ │ └── SKILL.md │ ├── 
project-tooling/ │ │ └── SKILL.md │ ├── pwa-development/ │ │ └── SKILL.md │ ├── python/ │ │ └── SKILL.md │ ├── react-native/ │ │ └── SKILL.md │ ├── react-web/ │ │ └── SKILL.md │ ├── reddit-ads/ │ │ └── SKILL.md │ ├── reddit-api/ │ │ └── SKILL.md │ ├── security/ │ │ └── SKILL.md │ ├── session-management/ │ │ └── SKILL.md │ ├── shopify-apps/ │ │ └── SKILL.md │ ├── site-architecture/ │ │ └── SKILL.md │ ├── supabase/ │ │ └── SKILL.md │ ├── supabase-nextjs/ │ │ └── SKILL.md │ ├── supabase-node/ │ │ └── SKILL.md │ ├── supabase-python/ │ │ └── SKILL.md │ ├── team-coordination/ │ │ └── SKILL.md │ ├── ticket-craft/ │ │ └── SKILL.md │ ├── typescript/ │ │ └── SKILL.md │ ├── ui-mobile/ │ │ └── SKILL.md │ ├── ui-testing/ │ │ └── SKILL.md │ ├── ui-web/ │ │ └── SKILL.md │ ├── user-journeys/ │ │ └── SKILL.md │ ├── web-content/ │ │ └── SKILL.md │ ├── web-payments/ │ │ └── SKILL.md │ ├── woocommerce/ │ │ └── SKILL.md │ └── workspace/ │ └── SKILL.md ├── templates/ │ ├── AGENTS.md │ ├── CLAUDE.local.md │ ├── CLAUDE.md │ ├── Dockerfile.polyphony │ ├── codex-auto-review.sh │ ├── config.toml │ ├── icpg-pre-edit.sh │ ├── icpg-stop-record.sh │ ├── mnemos-post-compact-inject.sh │ ├── mnemos-post-tool.sh │ ├── mnemos-pre-compact.sh │ ├── mnemos-pre-edit.sh │ ├── mnemos-session-start.sh │ ├── mnemos-statusline.sh │ ├── mnemos-stop-checkpoint.sh │ ├── polyphony-agents.yaml │ ├── polyphony-config.yaml │ ├── polyphony-identities.yaml │ ├── polyphony-routing.yaml │ ├── pre-compact.sh │ ├── settings.json │ └── tdd-loop-check.sh └── tests/ ├── test_cross_agent.py ├── test_cross_tool.py ├── test_polyphony_adapters.py ├── test_polyphony_config.py ├── test_polyphony_events.py ├── test_polyphony_identity.py ├── test_polyphony_models.py ├── test_polyphony_orchestrator.py ├── test_polyphony_router.py ├── test_polyphony_runtime.py ├── test_polyphony_scoring.py ├── test_polyphony_sources.py ├── test_polyphony_state.py ├── test_polyphony_store.py ├── test_polyphony_workspace.py ├── test_session_detect.py 
├── test_skill_lint.py └── validate-structure.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/skill-lint.yml ================================================ name: Skill Lint on: push: branches: [main] paths: - 'skills/**' - 'scripts/skill_lint/**' - 'tests/test_skill_lint.py' pull_request: paths: - 'skills/**' - 'scripts/skill_lint/**' - 'tests/test_skill_lint.py' jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: '3.12' - name: Install pytest run: pip install pytest - name: Run skill-lint tests run: PYTHONPATH=scripts python -m pytest tests/test_skill_lint.py -v - name: Run skill-lint (errors fail) run: PYTHONPATH=scripts python -m skill_lint --fail-on error skills/ - name: Run skill-lint (full report) if: always() run: PYTHONPATH=scripts python -m skill_lint --format json skills/ > skill-lint-report.json || true - name: Upload lint report if: always() uses: actions/upload-artifact@v4 with: name: skill-lint-report path: skill-lint-report.json ================================================ FILE: .github/workflows/skill-review.yml ================================================ name: Skill Review (Tessl + skills-ref) on: pull_request: paths: - 'skills/**' jobs: tessl: runs-on: ubuntu-latest if: ${{ vars.TESSL_ENABLED == 'true' }} steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Tessl uses: tesslio/setup-tessl@v2 with: token: ${{ secrets.TESSL_TOKEN }} - name: Detect changed skills id: changes run: | # Multi-skill PRs produce a multi-line list. Plain echo "skills=$X" # fails GHA's output parser on newlines ("Invalid format"), AND the # downstream `for skill in ${{ outputs.skills }}` breaks on newlines # because the expansion ends the `for ... in` expression. Join with # spaces so both the output format and the shell loop are happy. 
CHANGED=$(git diff --name-only origin/main...HEAD -- skills/ | cut -d'/' -f2 | sort -u | tr '\n' ' ') # Trim trailing space for clean logs CHANGED="${CHANGED%% }" echo "skills=$CHANGED" >> "$GITHUB_OUTPUT" echo "Changed skills: $CHANGED" - name: Run Tessl review on changed skills if: steps.changes.outputs.skills != '' run: | for skill in ${{ steps.changes.outputs.skills }}; do echo "=== Reviewing: $skill ===" tessl skill lint "skills/$skill" || true tessl skill review --json "skills/$skill" || true done skills-ref: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - uses: actions/setup-python@v5 with: python-version: '3.12' - name: Install skills-ref run: pip install skills-ref || echo "skills-ref not available, skipping" - name: Detect changed skills id: changes run: | # Same space-join as the tessl job — keeps both the GHA output format # and the downstream `for skill in ${{ ... }}` loop working. CHANGED=$(git diff --name-only origin/main...HEAD -- skills/ | cut -d'/' -f2 | sort -u | tr '\n' ' ') CHANGED="${CHANGED%% }" echo "skills=$CHANGED" >> "$GITHUB_OUTPUT" - name: Validate changed skills if: steps.changes.outputs.skills != '' run: | for skill in ${{ steps.changes.outputs.skills }}; do echo "=== Validating: $skill ===" skills-ref validate "skills/$skill" || true done ================================================ FILE: .gitignore ================================================ __pycache__/ .DS_Store evals/.results/ .pytest_cache/ ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to Claude Bootstrap will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
--- ## [5.8.0] - 2026-05-12 ### Fixed #### UX Fix Pass (12 issues from manual CLI testing) - **Prompt character** — Changed from `maggy:` to `>` for cleaner input (`cli_chat.py:76`) - **Ctrl+C during streaming** — Now cancels current response instead of exiting REPL. Added `except KeyboardInterrupt` in `_stream_chunks` (`cli_chat.py:161`) - **`/health` 404** — Client was calling `/api/health/memory` (non-existent). Fixed to call `/api/engram/diagnostics` (`cli_client.py:260`) - **`/route`, `/models`, `/budget`, `/stats`, `/health`, `/config` crash on server down** — Added `_call(fn, default)` safe wrapper that catches `Exception` and `SystemExit` from unreachable server. All display commands return fallback data instead of crashing (`cli_repl_cmds.py:18`) - **Models shows "0 tracked" / "No data yet"** — When heatmap is empty, now shows the 5 known model tiers (local, kimi, gpt, claude, codex) with 0 samples (`cli_repl_cmds.py:129`) - **`/use` accepts invalid model names** — Now validates against `_KNOWN_MODELS`, prints warning for unknown names while still setting the restriction (`cli_repl_cmds.py:147`) - **Dir shows "?"** — Welcome banner now falls back to `os.getcwd()` when session `working_dir` is empty (`cli_welcome.py:36`) ### Added #### Budget Subscription Awareness - **`plan` field** on `BudgetConfig` — Users set `budget.plan: subscription` in `~/.maggy/config.yaml` (`config.py:150`) - **`BudgetManager.budget_status()`** includes `plan` in response (`budget.py:163`) - **`/budget`** shows "Subscription" instead of "$0.00 / $10.00" when plan is subscription (`cli_repl_cmds.py:87`) - **Welcome banner** shows "Subscription" for subscription plans (`cli_welcome.py:54`) #### Welcome Banner Improvements - **Models count** — Shows "5 available" (known model count) instead of "0 tracked" when no heatmap data (`cli_welcome.py:62`) ### Changed - **`_HELP` compressed** — 2-column layout saves 6 lines, fits all new features within 200-line limit (`cli_repl_cmds.py:191`) 
### Tests - `test_repl_cmds.py` — +5 tests: models_empty_shows_known, use_warns_unknown_model, budget_subscription_plan, health_graceful_failure, stats_server_down - `test_cli_welcome.py` — +3 tests: dir_shows_cwd_fallback, models_shows_available_count, budget_subscription_welcome - `test_cli_chat.py` — +1 test: chat_prompt_uses_angle_bracket - **Total: 825 tests passing** (816 + 9 new) --- ## [5.7.0] - 2026-05-12 ### Added #### `/monitor` Command — Background Tracker Polling - **`maggy/services/monitor.py`** — MonitorService with SQLite-backed polling for GitHub PRs and Monday.com items. `MonitorConfig` and `MonitorEvent` dataclasses, `add/remove/list_active/is_new/mark_seen/status/poll` methods - **`maggy/providers/monday.py`** — Monday.com provider implementing `IssueTrackerProvider` protocol via GraphQL API. Maps board items to Task dataclass - **`maggy/api/routes_monitor.py`** — REST endpoints: `GET /api/monitor/status`, `POST /api/monitor/start`, `POST /api/monitor/stop` - **`/monitor` handler** in REPL — shows active monitor count (`cli_chat.py:94`) #### `/health` Command — Memory Health Dashboard - **`cmd_health()`** — Shows Engram health score (color-coded) and Mnemos fatigue state in Rich Panel (`cli_repl_cmds.py:180`) - **`health_dashboard()`** and **`engram_diagnostics()`** client methods (`cli_client.py:259`) #### Enhanced Welcome Banner - **`cli_welcome.py`** — New file with Rich Panel welcome banner showing project info, budget, models, status, and memory health score #### Search Routing to Local Model - **"search" type** added to `TYPE_KEYWORDS` in `chat_router.py` — 11 keywords (find, search, grep, where, locate, which, look, scan, show, list, read) route to local/Qwen model for free #### Account Switching Guidance - **`maggy/services/account_guide.py`** — Detects CLI auth profiles from `~/.claude/`, `~/.codex/`. 
`suggest_switch()` returns CLI instructions, `render_switch_guide()` prints Rich-formatted guidance - **Quota error detection** — `_QUOTA_MARKERS` in `cli_chat.py` triggers account switch guidance on rate limit errors ### Tests - `test_monitor.py` — 8 tests for MonitorService - `test_monday_provider.py` — 6 tests for MondayProvider - `test_account_guide.py` — 5 tests for account switching - `test_chat_router.py` — +3 tests for search type detection - `test_repl_cmds.py` — +3 tests for health command - `test_cli_welcome.py` — +2 tests for health and session history - `test_cli_chat.py` — +1 test for quota error guidance - **Total: 816 tests passing** (788 + 28 new) --- ## [5.1.0] - 2026-05-11 ### Added #### REPL Slash Commands — Stats, Routing, Model Control - **`maggy/cli_repl_cmds.py`** — 9 command handlers for the interactive REPL: - `/stats` — Budget + model performance summary (spend, status, reward heatmap) - `/budget` — Detailed per-provider breakdown with visual progress bar - `/route` — Routing rules, task type overrides, model strengths/success rates - `/models` — Full reward heatmap grid by model × task type × blast tier - `/use claude,codex` — Restrict routing to specific models for this session - `/use all` — Remove model restriction - `/config` — Configuration summary (codebases, routing mode, budget limit) - `/claude-md` — Render project's CLAUDE.md in terminal - `/help` — List all available commands - **`SessionState`** dataclass — Mutable session-level state (session_id, working_dir, allowed_models) - **`dispatch()`** router — Parses slash commands, routes to handlers, returns True if handled - **`GET /api/routing/rules`** endpoint — Exposes routing mode, task type overrides, model performance - **`allowed_models`** field on `RoutedMessageRequest` — Server-side model restriction: if routed model not in allowed list, picks first allowed model with updated reason #### Qwen3-Coder Benchmarks - **75.7 tok/s average** — 3.4× faster than Qwen2.5-Coder 
(22.1 tok/s), 2× faster than Claude API (37.4 tok/s) - MoE architecture (3.3B active / 30B total params) on M4 Max 128GB - Quality: 10/10 BST correctness, 9/10 async rate limiter (token bucket + asyncio.Lock) - Cold start: ~13s model load; hot runs: <100ms start #### mWP Mindset — Full Framework - **`skills/base/SKILL.md`** — Added complete mWP section with 11-Star Framework (Brian Chesky), mWP planning checklist (obvious → magical → multiplier) - **`routing_rules.py`** — Expanded mWP convention injected into all CLI prompts (codex, kimi, qwen3, claude) with 3-question framework and 11-star reference ### Changed - **`cli_chat.py`** — Integrated `SessionState` and `dispatch()` from `cli_repl_cmds`; passes `allowed_models` to `chat_send_routed()`; mode hint now shows `/help for commands` - **`cli_client.py`** — Added `budget_by_provider()`, `routing_rules()` methods; updated `chat_send_routed()` signature to accept `allowed_models` - **`benchmark-results.md`** — Qwen3-Coder results filled in (was TBD), quality assessment section added ### Tests - `tests/test_repl_cmds.py` — 10 tests (dispatch routing, stats, budget, route, models, use, claude-md, help) - `tests/test_cli_chat.py` — Updated 2 assertions for `allowed_models=None` parameter - **Total: 653 tests passing** (643 maggy + 10 session detect) --- ## [5.0.0] - 2026-05-10 ### Added #### Interactive Chat — Session Takeover - **`maggy/services/chat.py`** — ChatManager for interactive Claude sessions with SSE streaming - Auto-connects to all active CLI sessions (Claude, Codex, Kimi) via ActivityService process scanning - Session continuity with `--resume <session_id>` for multi-turn conversations - `CLAUDECODE` env var stripping to allow nested Claude subprocess spawning - `--verbose` flag for `--output-format stream-json` compatibility - Deduplication via dict keyed by project name - **`maggy/services/chat_context.py`** — Context builder for session enrichment - Path-based history matching (not just exact project name) via 
`_path_candidates()` - `_SKIP_DIRS` set prevents matching common system directories (Users, Documents, Library) - Recent prompt injection from activity data per project - Claude `session_id` resolution from `~/.claude/history.jsonl` for true `--resume` - **`maggy/api/routes_chat.py`** — Chat API (6 endpoints) - `POST /api/chat/auto-connect` — detect all active sessions, enrich with history context - `POST /api/chat/sessions` — create session - `GET /api/chat/sessions` — list sessions - `GET /api/chat/sessions/{id}` — get session + messages - `POST /api/chat/sessions/{id}/send` — send message, stream response via SSE - `DELETE /api/chat/sessions/{id}` — delete session - **Chat UI** in `app.js` — full web-based chat interface - Auto-connects on tab load, shows all active project sessions in sidebar - Message thread with user/Claude bubbles - SSE EventSource for real-time streaming - Session history context display - New session creation from active + configured projects #### Auto-Bootstrap — No Empty Tabs - **`_bootstrap()` in `main.py`** — seeds all services on startup - `history.analyze()` — parses CLI sessions immediately (260+ sessions, 11,994 prompts) - `introspector.analyze()` — collects signals, emits events - `_seed_cikg()` — scans configured codebases, creates nodes for repos + detected languages #### UI Navigation Cleanup - **Grouped navigation** — 9 flat tabs reorganized into 3 logical groups: - **Work** (Chat, Tasks, Watching) — things you do - **Intel** (Competitors, Insights) — things you learn - **System** (gear dropdown: Budget, Models, Forge, Settings) — things you configure - **Tab renames** — Inbox→Tasks, Followed→Watching, Process→Insights - **Chat is default tab** — loads on startup, auto-connects immediately - **Gear dropdown** — system tabs collapsed into icon menu, reduces nav clutter - **Section labels** — tiny uppercase "WORK" / "INTEL" separators #### Process Intelligence Tab Enhancement - Parallel fetch of activity, history, improve, 
events, CIKG data - Health signals display (routing, memory, reliability, cost percentages) - Live activity section showing active sessions + recent prompts - Session patterns from history analysis - Button spinner feedback + success toast on Analyze History / Self-Improve #### Infrastructure - **No-cache static middleware** — `_NoCacheStatic` adds `Cache-Control: no-store` to `/static` - **Cache-busting** — `?v=3` on script tag - **`showToast()`** — green success notification for async operations ### Security - **Chat path validation** — `project_path` now validated against configured codebase roots (blocks arbitrary filesystem access via `--dangerously-skip-permissions`) - **Chat streaming lock** — per-session `asyncio.Lock` rejects concurrent `/send` requests, preventing duplicate subprocess spawning and workspace corruption ### Fixed - Engram `expire_engrams` referencing `self` outside class context - `auto_connect` returning duplicate sessions for same project - `CLAUDECODE` env var blocking nested Claude subprocess spawning - `--verbose` flag required when using `--output-format stream-json` with `-p` - History matching missing projects stored under parent dir name (e.g. "AI-Playground" vs "claude-skills-package") - Process tab buttons doing nothing due to browser-cached old JS - 500-row limit in history store masking projects — switched to aggregated report data ### Changed - Default tab: `inbox` → `chat` - Org name in config: `"Your Org"` → read from `~/.maggy/config.yaml` - README fully rewritten to reflect current feature set (was still describing MVP) ### Tests - `tests/test_chat.py` — 17 tests (ChatManager + AutoConnect) - `tests/test_chat_context.py` — 18 tests (path candidates, history matching, prompts, session ID) - Total: **466 tests passing** --- ## [4.0.0] - 2026-05-05 ### Added #### Polyphony — Multi-Agent Orchestration (Core) - **`scripts/polyphony/`** — Full multi-agent orchestration package with container-isolated workspaces. 
Each agent session runs in its own Docker container with independent git branches. - **Domain models** (`models.py`) — Task, Identity, AgentProfile, RunSpec, Result dataclasses - **Task state machine** (`state_machine.py`) — DISCOVERED -> CLAIMED -> ROUTED -> PROVISIONED -> RUNNING -> VERIFYING -> LANDED with FAILED/BLOCKED paths - **SQLite store** (`store.py`) — Persistent CRUD for tasks, run_specs, results with state audit log - **YAML config** (`config.py`) — Configuration loading from `~/.polyphony/` with defaults merging - **5-dimension complexity scoring** (`scoring.py`) — Cyclomatic depth, fan-out, security boundary, concurrency, domain invariants (0-10 scale) - **Pure function router** (`router.py`) — Task x Policy -> RunSpec, first-match rules with fallback chains - **Identity broker** (`identity.py`) — Named credential bundles with volume mounts and env overlays - **Workspace manager** (`workspace.py`) — Per-task git clone lifecycle with `--reference`/`--dissociate` mirror support - **Docker runtime** (`runtime.py`) — Container create/start/stop/wait/logs/rm lifecycle - **Event parser** (`events.py`) — NDJSON/stream-json parsing from container stdout - **Orchestrator** (`orchestrator.py`) — Supervisor loop: discover -> claim -> route -> provision -> run -> verify -> land - **Agent adapters** (`adapters/`) — Claude (`-p --output-format stream-json`), Codex (`exec --full-auto`), Kimi (`--print -y`) - **Work sources** (`sources/`) — GitHub Issues via `gh api`, local SQLite task queue - **CLI** (`__main__.py`) — `polyphony {init|spawn|status|cleanup}` commands - **Skill** (`skills/polyphony/SKILL.md`) — Full documentation for the orchestration system - **Commands** — `/polyphony-init`, `/polyphony-spawn`, `/polyphony-status` - **Templates** — `Dockerfile.polyphony`, `polyphony-config.yaml`, `polyphony-identities.yaml`, `polyphony-agents.yaml`, `polyphony-routing.yaml` - **Spec** (`docs/polyphony-spec.md`) — Full specification reference (12 sections) - **173 
tests** across 13 test files with full TDD coverage --- ## [3.6.1] - 2026-05-04 ### Changed - **Complexity-based delegation replaces file-count heuristic** (`skills/cross-agent-delegation/SKILL.md`) — Kimi delegation now scored on 5 dimensions (cyclomatic depth, fan-out, security boundary, concurrency, domain invariants) × 0-2 each, sourced from iCPG signals + Claude reasoning. Routing: 0-3 → Kimi solo, 4-6 → Kimi + Codex auto-review, 7-10 → Claude direct. Adds trivial-case shortcut (<2 files + no risk keywords → auto-Kimi without scoring) and single-dimension override (7+ in any one dim keeps Claude). PR #16. --- ## [3.6.0] - 2026-05-03 ### Added #### Cross-Tool Compatibility (Claude + Kimi + Codex) - **`scripts/detect-agents.sh`** — Detects installed AI CLI tools (Claude Code, Kimi CLI, Codex CLI) - **`scripts/install-skills.sh`** — Reusable skill copier for any target directory - **`templates/AGENTS.md`** — Codex project instructions template (mirrors CLAUDE.md with `.agents/skills/` paths) - **`templates/config.toml`** — Hooks in TOML format for Kimi/Codex compatibility - **`scripts/convert-hooks-to-toml.sh`** — JSON to TOML hook converter (requires jq) - **`commands/sync-agents.md`** — `/sync-agents` command for cross-tool config sync - **`install.sh`** auto-detects and installs skills to `~/.kimi/skills/` and `~/.codex/skills/` - **`/initialize-project`** question 9: "Which AI CLI tools do you use?" 
with auto-detection - Cross-tool directories (`.kimi/`, `.codex/`, `.agents/`) added to `.gitignore` template #### Cross-Agent Intelligence - **`templates/codex-auto-review.sh`** — Stop hook that auto-runs Codex review on changed files - Checks for Critical/High severity issues only - Exit 0 = pass, Exit 2 = feed findings back to Claude for fixing - Truncates diff to 8000 chars to prevent Codex token overflow - Gracefully skips if Codex CLI not installed - **`skills/cross-agent-delegation/SKILL.md`** — Delegation skill with: - Tool detection (checks `command -v` for each CLI) - iCPG blast radius rules for Kimi delegation (<=3 files suggest Kimi, 4-8 offer option, 9+ stay Claude) - iCPG mandatory pre-task queries for all agents (prior, constraints, risk) - Mnemos mandatory memory lifecycle for all agents (goals, checkpoints, fatigue) - 10-step cross-agent workflow summary - **Codex auto-review Stop hook** added to `settings.json` (after TDD, before iCPG record, 120s timeout) - **Codex auto-review TOML hook** added to `config.toml` for Kimi/Codex compatibility - **Cross-Agent Workflow** section added to both `CLAUDE.md` and `AGENTS.md` templates - **`cross-agent-delegation/`** added to always-copy skill list in `/initialize-project` #### Tests - **`tests/test_cross_tool.py`** — 12 tests for cross-tool compatibility (detect-agents, install-skills, templates, sync-agents) - **`tests/test_cross_agent.py`** — 22 tests for cross-agent intelligence (codex-auto-review, delegation skill, settings.json hook ordering, config.toml, template refs) ### Changed - `install.sh` bumped to v3.6.0 - `install.sh` now makes `codex-auto-review.sh` executable during install - `tests/validate-structure.sh` includes cross-tool template validation - Total skills increased from 60 to **61 skills** - Total tests: 62 pytest + 238 validation checks --- ## [3.5.2] - 2026-04-22 ### Fixed - **Hook error behavior revised** — the 3.5.1 fix silently no-op'd missing scripts, which hid real installation 
problems. Hook commands now: - **Fail loud on real errors** — if the script exists and crashes, its stderr + non-zero exit propagate to Claude Code so you can debug - **Print one actionable line on missing installs** — `[claude-bootstrap] hook script 'X' not installed — run /install.sh …` and exit 0 (no blocking error, but you see exactly what to do) - **Use `exec` to run the resolved script** — exit code + stderr pass through unchanged - **Hook scripts stop swallowing stderr** — removed 19 instances of `2>/dev/null` across `mnemos-*.sh`, `icpg-*.sh`, and `tdd-loop-check.sh`. Python tracebacks and Python stderr now surface to Claude Code's hook diagnostics. Command substitution (`$(...)`) only captures stdout, so this doesn't affect any value parsing. ## [3.5.1] - 2026-04-21 ### Fixed - **PreToolUse hook "Bash hook error" on any tool call.** `templates/settings.json` declared hook commands as relative paths (`scripts/mnemos-*.sh`) that don't exist in most projects — the scripts live in `templates/` and nothing copies them to `<project>/scripts/`. Every tool call triggered a hook-not-found error shown as `PreToolUse:Bash hook error` in the session (non-blocking but noisy). - Hook commands now try `.claude/scripts/<name>.sh` first (project-local override), fall back to `$HOME/.claude/templates/<name>.sh` (always installed by `install.sh`), and no-op cleanly when neither exists. Applied to all 8 hook script references across `PreCompact`, `PreToolUse`, `PostToolUse`, `Stop`, and `SessionStart`. --- ## [3.5.0] - 2026-04-19 ### CI - **`skill-review.yml`**: both `tessl` and `skills-ref` jobs now space-join the detected-skills list before writing to `$GITHUB_OUTPUT`. The old plain `echo "skills=$CHANGED"` with a multi-line `$CHANGED` value failed GHA's output parser ("Invalid format") AND broke the downstream `for skill in ${{ outputs.skills }}` loop. Space-joining keeps both happy and unblocks multi-skill PRs (like this one, which touches both `maggy/` and `mnemos/`). 
### Third review pass fixes (Copilot iteration) - **Package renamed `src/` → `maggy/`.** The top-level `src` package name was a well-known Python packaging anti-pattern that collides with other projects. The Python code now lives at `claude-bootstrap/maggy/maggy/` and imports as `from maggy.X import Y` (matching the icpg/mnemos/skill_lint convention). `pyproject.toml` entrypoint + includes, `install.sh`, and the launcher commands updated to `python3 -m maggy.main`. - **SQLite PRAGMAs** — `InboxService` and `CompetitorService` open connections via a shared helper that sets `journal_mode=WAL`, `foreign_keys=ON`, and `busy_timeout=30000`. Matches the convention used by `scripts/icpg/store.py` and prevents "database is locked" errors when the FastAPI handlers race the heartbeat worker. - **Host-safety startup check** — `create_app()` now refuses to boot when `dashboard.auth_mode="local"` is combined with a non-loopback host (anything other than `127.0.0.1`/`localhost`/`::1`). Execute spawns `claude --dangerously-skip-permissions`, so binding to `0.0.0.0` with no auth would expose that to the local network. Users are directed to switch to token auth or rebind. - **`is_configured()` no longer accepts `linear`** — `providers.build()` raises `NotImplementedError` for Linear (stub), so treating it as configured would crash `create_app()` at startup. Now returns `False` cleanly. - **`providers.build()`** raises `NotImplementedError` with a clear "use github or asana" hint for `linear`. - **GitHub provider logs non-200s** in `list_tasks` — previously a 401/403/404 silently yielded an empty inbox. Now WARNING-logged with the repo slug and first 200 chars of the response body for debuggability. - **Removed unused `timedelta` import** from `inbox.py`. 
### Second review pass fixes (CodeRabbit iteration 2) - `AsyncAnthropic` used in async methods — inbox ranking + competitor discovery + daily briefing no longer block the event loop on multi-second LLM round-trips - RSS/Google News feed date handling uses `parsedate_to_datetime` + ISO parser and compares real `datetime` objects — RFC 822 strings aren't lexicographically ordered (day-of-week cycles weekly) - iCPG CLI invocation fixed: `python3 -m scripts.icpg query prior --text ...` against the real argparse entrypoint, not the utility submodule `scripts.icpg.symbols` which has no `__main__` - Background `asyncio.create_task()` reference kept in a set + `add_done_callback(discard)` so GC can't kill the TDD pipeline mid-run - `GitHubIssuesProvider.list_followed()` and `search_tasks()` refuse to run when `repos` is empty (otherwise the query has no repo filter and searches all of public GitHub) - `AsanaProvider.list_tasks()` drops the dead `completed_filter` variable and skips sending `completed_since=""` (Asana validator rejects empty string); filters `closed` state properly - `install.sh` enforces Python 3.11+ minimum (was only checking `python3` existed) - `/static/index.html`: added CSP meta tag; Font Awesome pinned with SHA-384 SRI; Tailwind Play CDN annotated with vendor-for-prod TODO - `static/app.js`: added `jsStr()` for JS-string-context escaping in inline onclick handlers (esc() alone leaves single quotes intact — XSS via ticket titles was possible) - `regenerateBriefing()` catches and displays errors instead of swallowing them - `commands/maggy.md`: reads `dashboard.host`/`dashboard.port` from config before probing health (was hardcoded 8080) - `commands/maggy-init.md`: removed the "offer to write to .env" suggestion — the runtime doesn't load that file, so it would leave tokens on disk with no reader - `config.example.yaml`: removed the Linear section (stub only, shouldn't be in the advertised selectable set) - `PLAN.md`: config sample aligned with the 
actual runtime schema (removed spurious `config:` nesting) - `maggy/README.md`: install path no longer assumes `~/Documents/AI-Playground/...`; uses relative `cd claude-bootstrap/maggy` - `providers/__init__.py`: `__all__` alphabetized (RUF022) - `skills/maggy/SKILL.md`: explicit permission-model disclosure box explaining the `--dangerously-skip-permissions` tradeoff and the `working_dir` whitelist mitigations ### Added - **Maggy — AI engineering command center** (optional extension under `maggy/`) - Local FastAPI + vanilla JS dashboard; install with `maggy/install.sh`, zero build step - Provider abstraction: `GitHubIssuesProvider`, `AsanaProvider`, `LinearProvider` (stub) implement a single `IssueTrackerProvider` Protocol — swap trackers without touching services - AI-prioritized inbox with 30-min SQLite cache; stale-cache fallback when provider is unavailable - Generic competitor discovery + RSS + Google News monitoring with daily AI briefing (cached per day) - TDD execute pipeline (plan → tests → implement) spawns `claude -p --dangerously-skip-permissions` locally in the right codebase, with iCPG context auto-injected from the bootstrap's iCPG CLI - Config-driven (`~/.maggy/config.yaml`) — no hardcoded org IDs, repo names, or competitor lists - `/maggy` command launches dashboard; `/maggy-init` runs interactive setup - `skills/maggy/SKILL.md` documents capabilities; README skills table updated - Maggy skill included in the skills table (fixes RI002 lint error for this PR) ### Fixed - Added YAML frontmatter to `skills/mnemos/SKILL.md` (fixes FM001 lint error that was blocking CI on main) - Skill lint now passes across all 60 skills ### Security (Maggy) - RSS URL validation before fetching competitor feeds — blocks loopback, link-local, private-network, and non-HTTP(S) targets (SSRF prevention) - `safeHref()` in dashboard JS — only allows `http(s)`/`mailto` schemes in external links, blocks `javascript:`/`data:` URIs that would slip past HTML escaping - 
`working_dir` validated against configured codebase roots before launching Claude Code — prevents arbitrary-cwd execution of `--dangerously-skip-permissions` - Execute-mode input validated via `Literal["tdd", "plan"]`; typos rejected at request boundary - GitHub `_decode_id()` returns `None` on malformed input instead of raising — surfaces as 404 not 500 - LLM ranking output validated (index range, numeric rank, dedupe) before applying ### Resilience (Maggy) - `provider.list_tasks` failure falls back to last cached ranking (flagged `stale=true`) instead of 500 - Route-level `_require_configured()` returns 503 + onboarding hint when `~/.maggy/config.yaml` is missing, instead of dereferencing `None` services - `is_configured()` requires provider credentials (token) in addition to org/repos; refreshes cache on each check - Claude subprocess kill on timeout (`proc.kill()` + `await proc.wait()`), non-zero exits marked as failed sessions - `_run_claude()` returns `(ok, output)` tuple — TDD pipeline now aborts chain on first-step failure - Competitor news events use deterministic SHA-256 IDs with `INSERT OR IGNORE` — prevents duplicate rows on cursor reset / overlapping scans ### Changed (Maggy) - `pyproject.toml` console script `maggy = "src.main:main"` (proper callable) instead of `"src.main:app"` (ASGI instance) --- ## [3.4.1] - 2026-04-10 ### Fixed - Fixed broken `build-backend` in all three pyproject.toml files (icpg, mnemos, skill_lint). Changed `setuptools.backends._legacy:_Backend` to `setuptools.build_meta`. 
(Community reported) ### Added - Cheeky personality section in CLAUDE.md template for new projects --- ## [3.4.0] - 2026-04-07 ### Added - **Skill Quality Gates** — Automated linter, CI integration, and behavioral evals - `scripts/skill_lint/` — Python package with 20 check rules across 4 categories: - Frontmatter (FM001-FM009): YAML validation, name/description/field checks - Spec (SP001-SP003, SR001): SKILL.md existence, line count limits, skills-ref integration - Content (CQ001-CQ006): ASCII art detection, vague phrase detection, filler intensity, code block density, stale references, H1 heading - References (RI001-RI002): Cross-skill link validation, README coverage - CLI: `PYTHONPATH=scripts python3 -m skill_lint [--format text|json] [--severity error|warning|info] [--skill NAME] [--fail-on error|warning] skills/` - Inline suppression: `` in first 10 lines - 28 unit tests covering all check modules, report formatters, and CLI - `.github/workflows/skill-lint.yml` — Runs linter + tests on PR/push to skills/ or scripts/skill_lint/ - `.github/workflows/skill-review.yml` — Tessl skill review + skills-ref validation on PRs (requires TESSL_TOKEN) - `evals/` — 18 behavioral eval scenarios for 15 skills with deterministic and LLM-judged criteria - `evals/run-evals.sh` — Eval runner with baseline comparison mode - Updated `CONTRIBUTING.md` with quality gate requirements and linter usage ### Scan Results (59 skills) - Errors: 1 (mnemos/ missing frontmatter) - Warnings: 85 (19 skills over 500 lines, 30+ with ASCII art) - Clean: 3 skills --- ## [3.3.2] - 2026-04-07 ### Fixed - Removed stale `Load with: base.md` line from all 53 skills. Since v3.0, base skill loads via `@include` in CLAUDE.md, not per-skill. The leftover line caused confusion about missing files. (Fixes #13) ### Housekeeping - Closed #10 (Gen Agent Trust Hub security audit) — false positives from scanning markdown code samples as executable code. 
- Closed #12 (Dispatch discoverability) — will address skill description metadata in a future cleanup pass. - Closed #11 (Low quality skills) — will revisit with specific eval criteria. --- ## [3.3.1] - 2026-04-03 ### Added - **Post-Compaction Task Restoration** (Two-Layer Defense) - `templates/mnemos-post-compact-inject.sh` — PreToolUse hook (no matcher, fires on ALL tools) that detects compaction via `.mnemos/just-compacted` marker and re-injects the full checkpoint into Claude's context. Fast path ~5ms when no compaction, ~100ms injection when triggered. - `build_task_narrative()` in `checkpoint.py` — Reads signals.jsonl to build human-readable summary of recent activity (files edited, read counts, focus area, error patterns). Automatically included in checkpoints. - `format_for_post_compact_injection()` in `checkpoint.py` — Formats checkpoint as structured restoration block with goal, constraints, activity narrative, progress, key files, git state. - Compaction marker system (`write_compaction_marker`, `check_compaction_marker`, `consume_compaction_marker`) — Atomic marker write/consume to prevent parallel injection. ### Changed - **`mnemos-pre-compact.sh`** — Enhanced from advisory to assertive. Now includes inline checkpoint content in preservation instructions, writes compaction marker for Layer 2, builds task narrative from signals, and uses stronger verbatim framing. - **`CheckpointNode`** — Added `task_narrative` (str) and `recent_files` (list[dict]) fields for richer checkpoint content. - **`settings.json`** — Added new PreToolUse entry (no matcher) for `mnemos-post-compact-inject.sh` before the existing Edit|Write matcher. - **`SKILL.md`** — Documented post-compaction recovery mechanism. - **`README.md`** — Rewrote Mnemos section with two-layer defense architecture, resilience failure mode table, "why not just a plain file" rationale, and post-compaction restoration flow diagram. 
## [3.3.0] - 2026-04-03 ### Added #### Mnemos — Task-Scoped Memory Lifecycle Agents crash when context fills up. Claude Code's compaction is lossy — it summarizes everything uniformly. Mnemos solves this with typed memory, continuous fatigue monitoring, and checkpoint/resume. - **`scripts/mnemos/`** — Python package (zero external dependencies) - `models.py` — MnemoNode (8 types with typed eviction policies), FatigueState, CheckpointNode - `store.py` — SQLite MnemoGraph storage with mnemo_nodes, checkpoints, fatigue_log tables - `fatigue.py` — 4-dimension fatigue model from passively observed signals (no agent cooperation needed) - `signals.py` — Behavioral signal collection from hooks (scope scatter, re-read ratio, error density) - `checkpoint.py` — CheckpointNode write/load with iCPG bridge, git state capture, formatted resume output - `consolidation.py` — Micro-consolidation: compress ResultNodes, evict cold ContextNodes, decay weights - `__main__.py` — CLI: init, status, fatigue, checkpoint, resume, consolidate, nodes, add, bridge-icpg - **4-Dimension Fatigue Model** (all passively observed from hooks): - Token utilization (0.40) — real context_window.used_percentage from statusline - Scope scatter (0.25) — unique directories in recent tool calls (from PreToolUse) - Re-read ratio (0.20) — files Read more than once, strongest signal of context loss (from PreToolUse) - Error density (0.15) — failed tool calls ratio (from PostToolUse) - States: FLOW (0-0.4), COMPRESS (0.4-0.6), PRE-SLEEP (0.6-0.75), REM (0.75-0.9), EMERGENCY (0.9+) - **Auto-Feeding Token Signal**: - `templates/mnemos-statusline.sh` — Statusline receives `context_window` JSON from Claude Code, writes `fatigue.json`, delegates display to ccusage (if installed) or shows simple context % - JSONL fallback in PostToolUse — reads conversation JSONL to estimate context usage when statusline not configured (0.75 correction factor for cache overhead, ~1-2pp accuracy) - `statusLine` config added to 
`templates/settings.json` — auto-activates on install, no separate configuration needed - **Fatigue-Aware Hook System**: - `templates/mnemos-pre-edit.sh` — PreToolUse: logs file signals, reads fatigue, auto-checkpoints at 0.60+, auto-consolidates at 0.40+, includes iCPG context - `templates/mnemos-post-tool.sh` — PostToolUse: logs tool success/failure for error density, auto-feeds token signal from JSONL when statusline is stale - `templates/mnemos-session-start.sh` — SessionStart: loads checkpoint on resume, bridges iCPG state - `templates/mnemos-pre-compact.sh` — PreCompact: emergency checkpoint + typed preservation priorities (NEVER DROP goals/constraints, OK TO DROP file contents) - `templates/mnemos-stop-checkpoint.sh` — Stop: writes final session checkpoint - **MnemoNode Eviction Policies**: - GoalNodes, ConstraintNodes, CheckpointNodes, HandoffNodes: NEVER evicted - ResultNodes, WorkingNodes, SkillNodes: compressed first (summary kept), then evictable - ContextNodes: evictable when activation weight drops below threshold - **iCPG Bridge**: `mnemos bridge-icpg` imports ReasonNodes as GoalNodes, postconditions/invariants as ConstraintNodes - **Skill + Commands**: - `skills/mnemos/SKILL.md` — Full skill documentation with fatigue states, CLI reference, agent instructions - `commands/mnemos-status.md` — `/mnemos-status` slash command - `commands/mnemos-checkpoint.md` — `/mnemos-checkpoint` slash command - **Documentation**: - `docs/mnemos-implementation.md` — Implementation addendum for the Mnemos RFC ### Changed #### iCPG Fixes - `scripts/icpg/bootstrap.py` — Fixed `_get_commits()` git log parsing (was producing 0 symbols linked) - `scripts/icpg/drift.py` — Added `check_file_drift()` for fast, file-scoped drift (O(symbols-in-file)) - `scripts/icpg/__main__.py` — Added `drift file <path>` subcommand, `_resolve_path()` for relative path handling - `templates/icpg-pre-edit.sh` — Now includes file-scoped drift detection alongside context and constraints #### Settings 
Template - `templates/settings.json` — Added `statusLine` config for auto-feeding token signal, Mnemos hooks replace standalone iCPG hooks, added PostToolUse hook, added mnemos permission allows - `templates/CLAUDE.md` — Added `@.claude/skills/mnemos/SKILL.md` to skill includes --- ## [3.2.0] - 2026-04-02 ### Added #### iCPG Full Implementation (Intent-Augmented Code Property Graph) - **`scripts/icpg/`** — Python CLI package implementing the full iCPG RFC v8 - `models.py` — ReasonNode, Symbol, Edge, DriftEvent data models with Design by Contract (preconditions, postconditions, invariants) - `store.py` — SQLite storage layer with 4 tables, WAL mode, indexed queries - `symbols.py` — Language-aware symbol extraction: Python (AST), TypeScript/JS (regex), Go, Rust, Elixir - `drift.py` — 6-dimension drift detection: spec, decision, ownership, test, usage, dependency - `contracts.py` — Design by Contract layer with LLM inference (Claude/OpenAI) and heuristic fallback - `vectors.py` — Tiered duplicate detection: ChromaDB → TF-IDF → exact match fallback - `bootstrap.py` — Git history inference: cluster commits, LLM-infer ReasonNodes, link symbols - `__main__.py` — CLI with subcommands: init, create, record, query, drift, bootstrap, status - `pyproject.toml` — pip-installable with optional deps (chromadb, sentence-transformers, openai) - **3 Canonical Pre-Task Queries** (RFC Section 2.1): - `icpg query prior ""` — Vector-based duplicate detection before starting work - `icpg query constraints ` — Get invariants/contracts for files being modified - `icpg query risk ` — Drift score, ownership history, modification count - **Hook Integration**: - `templates/icpg-pre-edit.sh` — PreToolUse hook: injects intent context + constraints before every Edit/Write - `templates/icpg-stop-record.sh` — Stop hook: auto-records symbols to active ReasonNode after implementation - **Slash Commands**: - `commands/icpg-impact.md` — `/icpg-impact ` blast radius visualization - 
`commands/icpg-why.md` — `/icpg-why <symbol>` trace symbol to creating intent - `commands/icpg-drift.md` — `/icpg-drift` full drift report across all dimensions - `commands/icpg-bootstrap.md` — `/icpg-bootstrap` infer intents from git history ### Changed #### iCPG Skill Rewrite - **`skills/icpg/SKILL.md`** — Complete rewrite aligning with RFC v8 - ReasonNode now carries formal contracts (preconditions, postconditions, invariants) - Drift formally defined as predicate failure (not vague metric) - 6-dimension drift model with 0-1 severity scores per dimension - CLI reference for all `icpg` subcommands - Hook integration documentation (PreToolUse + Stop) - Agent Teams integration section with updated pipeline #### Agent Team iCPG Integration - **`skills/agent-teams/agents/team-lead.md`** — Team lead now creates ReasonNodes and checks for duplicates before creating task chains - **`skills/agent-teams/agents/feature.md`** — Feature agents query constraints/risk before implementing, auto-record symbols after - **`skills/agent-teams/agents/quality.md`** — Quality agent runs drift checks during GREEN verify, validates spec-intent alignment - **`skills/agent-teams/SKILL.md`** — Updated "Integration with Existing Skills" table with iCPG + code-graph entries #### Settings Template - **`templates/settings.json`** — Added PreToolUse hook (icpg-pre-edit.sh), Stop hook extension (icpg-stop-record.sh), icpg permission allows --- ## [3.1.0] - 2026-04-02 ### Added #### iCPG Skill (Initial Spec) - **`skills/icpg/SKILL.md`** — Initial iCPG skill spec (now superseded by 3.2.0 full implementation) --- ## [3.0.0] - 2026-03-31 ### Breaking Changes This release aligns Claude Bootstrap with how Claude Code actually works internally. Several features that referenced non-existent infrastructure have been replaced with real Claude Code mechanisms. 
- **Ralph Wiggum plugin removed** — The `/ralph-loop` command, `claude-plugins-official` marketplace, and plugin stop-hook mechanism never existed in Claude Code. All references removed. - **TDD loops now use real Stop hooks** — Claude Code's Stop hook (exit code 2 feeds stderr back to the model) replaces the fake plugin. `scripts/tdd-loop-check.sh` runs tests/lint/typecheck after each response. - **`CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` removed** — Agent spawning and task management are standard Claude Code features, not gated behind an env var. All references removed. - **CLAUDE.md template uses `@include` directives** — Skills are loaded via `@.claude/skills/base/SKILL.md` syntax which Claude Code resolves at parse time (recursive, max depth 5, cycle detection). - **Quality gates moved from CLAUDE.md to `.claude/rules/`** — Rules use YAML frontmatter with `paths:` globs for conditional activation. - **"STRICTLY ENFORCED" / "Non-Negotiable" language removed** — Claude Code treats CLAUDE.md as user-level context (not system prompt) wrapped in `<system-reminder>` tags with "may or may not be relevant" caveat. Aggressive language wastes tokens without creating binding constraints. 
### Added #### Stop Hook TDD Loops - **`templates/tdd-loop-check.sh`** — Universal TDD loop script for Stop hooks - Runs tests, lint, typecheck after each Claude response - Exit 0 (all pass) = Claude stops; Exit 2 (failures) = stderr fed back to Claude - Iteration counter with configurable max (default 25) - Detects project type (Node.js/Python) and runs appropriate commands - Distinguishes code errors (loop) from environment errors (stop) - **`templates/settings.json`** — Pre-configured Claude Code settings - Stop hook configuration for TDD loops - SessionStart hook for auto-context injection - Permission allow rules: test runners, linters, git read commands, gh CLI - Permission deny rules: `rm -rf`, `git push --force`, writing `.env` files - Ready to copy into any project's `.claude/settings.json` #### Conditional Rules System - **`.claude/rules/` directory** with 7 rule files using proper YAML frontmatter: - `quality-gates.md` — Always active: 20 lines/function, 200 lines/file, 3 params, 80% coverage - `tdd-workflow.md` — Always active: RED-GREEN-VALIDATE workflow - `security.md` — Always active: no secrets in code, parameterized queries, bcrypt - `react.md` — Active on `**/*.tsx`, `**/*.jsx`, `src/components/**` - `typescript.md` — Active on `**/*.ts`, `**/*.tsx` - `python.md` — Active on `**/*.py` - `nodejs-backend.md` — Active on `src/api/**`, `src/routes/**`, `server/**` #### CLAUDE.local.md - **`templates/CLAUDE.local.md`** — Private developer override template - Not checked into git (higher priority than project CLAUDE.md) - Template with common overrides: preferences, local environment, quality gate tweaks #### Agent Definition Frontmatter - All 6 agent definitions now use proper Claude Code frontmatter: - `name` — Agent identifier - `description` — When-to-use hint - `model` — Model selection (sonnet, inherit) - `tools` — Tool allowlist (e.g., `[Read, Glob, Grep, TaskCreate]`) - `disallowedTools` — Tool denylist (e.g., `[Write, Edit, Bash]`) - `maxTurns` 
— Maximum agentic turns before stopping - `effort` — Thinking depth (medium/high) #### @include Directives in CLAUDE.md - CLAUDE.md template now uses `@.claude/skills/base/SKILL.md` syntax - Claude Code resolves these at load time (recursively inlined) - Skills actually become part of the prompt instead of decorative text #### CLAUDE.md Template Structure - Added **Project Structure** section — tells Claude where things live without filesystem exploration - Added **Key Decisions** section — prevents Claude from re-litigating settled architectural choices - Added **Conventions** section — patterns Claude should follow (test colocation, API shape, etc.) - Added **Don't** section — short guardrails (no .env writes, no secret leaks) - Removed Session Persistence section (belongs in skills, not root template) #### PreCompact Hook for Smarter Compaction - **`templates/pre-compact.sh`** — PreCompact hook that injects project-specific preservation priorities into the compaction summarizer - Auto-detects project type (TypeScript, Python, Next.js, FastAPI, Flutter, etc.) 
- Finds schema files (Drizzle, Prisma, SQLAlchemy) and tells summarizer to preserve all schema discussion verbatim - Finds API directories and tells summarizer to preserve exact endpoint paths, request/response shapes - Extracts Key Decisions from CLAUDE.md and tells summarizer to reference them by name - Injects live git state (branch, uncommitted changes, staged files) into summary priorities - Tells summarizer to preserve exact error messages and fix context (not paraphrased) - Tells summarizer what NOT to preserve (dead ends, full file contents, formatting noise) - Zero overhead during normal usage — only runs when compaction fires - Configured in `.claude/settings.json` under `hooks.PreCompact` #### Full Skill Frontmatter (all 57 skills) - Added undocumented-but-functional Claude Code skill frontmatter to all 57 skills: - `when-to-use` — guidance for when Claude should invoke the skill - `user-invocable` — 11 skills are user-invocable (code-review, codex-review, gemini-review, security, existing-repo, ticket-craft, workspace, cpg-analysis, playwright-testing, ai-models), 46 are model-only - `effort` — thinking depth per skill (6 high, 47 medium, 4 low) - `paths` — file glob patterns for 24 language/framework/database skills (e.g., `["**/*.py"]` for Python, `["**/*.tsx"]` for React) - `allowed-tools` — restricted tool access for 3 review/security skills (`[Read, Glob, Grep, Bash]`) ### Changed - `install.sh` now copies rules/, templates/, and no longer checks for Ralph Wiggum plugin - `iterative-development/SKILL.md` completely rewritten for Stop hooks - `base/SKILL.md` — Ralph Wiggum auto-invoke section replaced with Stop hook explanation - `agent-teams/SKILL.md` — Removed experimental env var requirement - `commands/spawn-team.md` — Removed env var check, removed Shift+Up/Down and Ctrl+T UI references - All agent definitions in `skills/agent-teams/agents/` rewritten with frontmatter - Total files: 57 skills + 7 conditional rules + 3 templates ### Removed - 
All Ralph Wiggum plugin references (`/ralph-loop`, `/plugin install`, `--completion-promise`, `<promise>` tags) - `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1` env var requirement - Plugin marketplace references (`claude-plugins-official`) - `Shift+Up/Down` and `Ctrl+T` UI interaction assumptions - "STRICTLY ENFORCED" and "Non-Negotiable" language throughout ### Migration ```bash cd "$(cat ~/.claude/.bootstrap-dir)" git pull ./install.sh # Then in each project: claude > /initialize-project # Will update to v3.0.0 structure ``` **Manual steps for existing projects:** 1. Copy `templates/settings.json` to `.claude/settings.json` 2. Copy `templates/tdd-loop-check.sh` to `scripts/tdd-loop-check.sh` and `chmod +x` 3. Replace skill listings in CLAUDE.md with `@include` directives 4. Copy `rules/` files to `.claude/rules/` 5. Add `CLAUDE.local.md` to `.gitignore` 6. Remove `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS` from environment --- ## [2.7.0] - 2026-03-23 ### Added #### Tiered Code Graph System (MCP-based) - **Code Graph skill** (`code-graph/SKILL.md`) - Always-on code intelligence via MCP - "Graph first, file second" workflow — Claude queries the graph before reading files - Integrates with codebase-memory-mcp: 14 MCP tools, 64 languages, sub-ms queries - Decision tables for when to use graph vs direct file reads - Workflow: LOCATE → UNDERSTAND → BLAST → TRACE → CHANGE → VERIFY - Anti-patterns guide for common graph-ignoring mistakes - **CPG Analysis skill** (`cpg-analysis/SKILL.md`) - Opt-in deep code analysis - Tier 2: Joern CPG via CodeBadger MCP (40+ tools, AST+CFG+CDG+DDG+PDG) - Control flow graph analysis, data flow tracing, dead code detection - CPGQL query examples for common analysis patterns - 12 language support (Java, Python, TypeScript, Go, C/C++, etc.) 
- Tier 3: CodeQL MCP for interprocedural taint analysis and security auditing - OWASP vulnerability detection, source-to-sink data flow - 10+ languages including Rust (which Joern doesn't support) - Combined workflow: Tier 1 scope → Tier 2 flow → Tier 3 security - **Graph tools installer** (`scripts/install-graph-tools.sh`) - Platform-detecting installer (macOS/Linux, ARM64/AMD64) - `--joern` flag for Tier 2 (Docker + Python setup) - `--codeql` flag for Tier 3 (CodeQL CLI + query packs) - `--all` flag for all tiers - **Post-commit graph hook** (`hooks/post-commit-graph`) - Lightweight (~10ms) hook that signals codebase-memory-mcp file watcher - Filters to code files only, never blocks git workflow - Auto-installed by `/initialize-project` - **Graph freshness check** (`hooks/workspace/check-graph-freshness.sh`) - Session-start advisory warns if graph data is stale - Cross-platform timestamp comparison (macOS/Linux) #### Initialize Project Updates - New question 4b: "Code graph analysis level?" 
(Standard/Deep/Security/Full) - New Step 4b: Automatic MCP server configuration (`.mcp.json`) - `.code-graph/` auto-added to `.gitignore` - Post-commit graph hook auto-installed - CLAUDE.md template now includes "Code Graph (MCP)" section - Summary output shows graph tier configuration ### Changed - Total skills increased from 55 to **57 skills** - `install.sh` now copies `install-graph-tools.sh` to `~/.claude/` - `install.sh` summary output includes graph tools commands --- ## [2.6.0] - 2026-02-14 ### Added #### AI-Native Ticket Writing - **Ticket Craft skill** - Write Jira/Asana/Linear tickets optimized for Claude Code execution - INVEST+C criteria: standard INVEST plus "Claude-Ready" verification - 4 ticket templates: Feature, Bug, Tech Debt, Epic Breakdown - Claude Code Context section: file refs, pattern refs, verification commands, constraints - Claude Code Ready Checklist: 16-point validation before tickets enter sprint - Anti-patterns guide: 6 common ticket-writing mistakes that cause AI agents to fail - Story point calibration for AI agents (different from human estimation) - Epic slicing techniques: by workflow, data variation, user role, CRUD, happy path - Given-When-Then acceptance criteria format - Integration guide for Jira, Asana, Linear, and GitHub Issues - Maps tickets directly to the agent-teams 10-task pipeline #### Bug Fixes - **Fix pre-push hook false positive** - Hook was blocking pushes even when review passed with 0 Critical/High issues (fixes #8, reported by @shawnyeager) - `grep` pattern matched "Critical" in table headers and pass messages - Now checks for explicit `Status: ✅ PASS` / `Status: ❌` lines instead #### Community Contributions - **Flexible install directory** - Bootstrap can now be cloned anywhere, not just `~/.claude-bootstrap` (PR #9 by @victortrac) - Install path saved to `~/.claude/.bootstrap-dir` for runtime resolution - Removes fragile symlink approach - **Workspace skill frontmatter fix** - Added missing YAML frontmatter 
to workspace skill (PR #9 by @victortrac) ### Changed - Total skills increased from 54 to **55 skills** ### Contributors - @victortrac - Flexible install path, workspace skill fix (PR #9) - @shawnyeager - Pre-push hook bug report (#8) --- ## [2.5.0] - 2026-02-07 ### Added #### Agent Teams (Default Workflow) - **Agent Teams skill** - Coordinated team of AI agents as the default development workflow - Strict TDD pipeline: Specs > Tests > Fail > Implement > Test > Review > Security > Branch > PR - Task dependency chains enforce pipeline ordering (no step can be skipped) - Multiple features run in parallel with shared verification agents - Quality gates at every stage with cross-agent verification - **Default agent roster** (5 permanent agents): - **Team Lead** - Orchestration only (delegate mode), task breakdown, feature agent spawning - **Quality Agent** - TDD verification (RED/GREEN phases), spec review, coverage >= 80% - **Security Agent** - OWASP scanning, secrets detection, dependency audit - **Code Review Agent** - Multi-engine code review (Claude/Codex/Gemini) - **Merger Agent** - Feature branches, PR creation via `gh` CLI - **Feature agents** - One per feature, each follows the strict pipeline end-to-end - Writes spec, tests, implementation, validation - Hands off to Quality, Review, Security, Merger at each gate - **Agent definition files** in `skills/agent-teams/agents/`: - `team-lead.md`, `quality.md`, `security.md`, `code-review.md`, `merger.md`, `feature.md` - Copied to `.claude/agents/` during project initialization - **`/spawn-team` command** - Spawn the agent team on any project - Checks prerequisites (env var, agent definitions, feature specs) - Spawns all agents and creates task dependency chains - Shows team status summary - **10-task dependency chain per feature**: 1. Spec → 2. Spec Review → 3. Tests → 4. RED Verify → 5. Implement → 6. GREEN Verify → 7. Validate → 8. Code Review → 9. Security Scan → 10. 
Branch+PR ### Changed - Total skills increased from 53 to **54 skills** - `/initialize-project` Phase 6 now sets up agent team by default (replaces manual next steps) - CLAUDE.md template includes agent teams section - `team-coordination.md` superseded by `agent-teams.md` for automated coordination --- ## [2.4.0] - 2026-01-20 ### Added #### Multi-Repo Workspace Awareness - **Workspace skill** - Dynamic multi-repo and monorepo awareness for Claude Code - Workspace topology discovery (monorepo, multi-repo, hybrid detection) - Dependency graph generation (who calls whom) - API contract extraction (OpenAPI, GraphQL, tRPC, TypeScript, Pydantic) - Key file identification with token estimates - Cross-repo capability index (search before reimplementing) - Token budget management (P0-P3 priority allocation) - **`/analyze-workspace` command** - Full workspace analysis - Phase 1: Topology discovery (~30s) - Phase 2: Module analysis (~60s) - Phase 3: Contract extraction (~45s) - Phase 4: Dependency graph (~30s) - Phase 5: Key file identification (~30s) - Generates TOPOLOGY.md, CONTRACTS.md, DEPENDENCY_GRAPH.md, KEY_FILES.md, CROSS_REPO_INDEX.md - **`/sync-contracts` command** - Lightweight incremental contract sync - Checks only contract source files (~15s) - Diff mode to preview changes - Validate mode to check consistency - Lightweight mode for hooks #### Contract Freshness System - **Session start hook** - Staleness check (~5s, advisory) - **Post-commit hook** - Auto-sync when contracts change (~15s) - **Pre-push hook** - Validation gate (~10s, blocking) - `.contract-sources` file to track monitored files - Freshness indicators: 🟢 Fresh, 🟡 Stale, 🔴 Outdated, ⚠️ Drift #### Cross-Repo Change Detection - Automatic detection when changes affect other modules - Impact analysis with recommended action order - Breaking change protocol ### Changed - Total skills increased from 52 to **53 skills** - Added 3 new commands: `/analyze-workspace`, `/sync-contracts`, `/workspace-status` - 
Added 3 workspace hooks for contract freshness --- ## [2.3.0] - 2026-01-17 ### Added #### Google Gemini Code Review - **Gemini Review skill** - Google Gemini CLI for code review with Gemini 2.5 Pro - 1M token context window - analyze entire repositories at once - Free tier: 1,000 requests/day with Google account - Code Review Extension: `/code-review` command in Gemini CLI - Headless mode for CI/CD: `gemini -p "prompt"` - Benchmarks: 63.8% SWE-Bench, 56.3% Qodo PR, 70.4% LiveCodeBench - **Multi-engine code review** - `/code-review` now supports up to 3 engines - Claude (built-in) - quick, context-aware reviews - OpenAI Codex - 88% security issue detection - Google Gemini - 1M token context for large codebases - Dual engine mode - run any two engines, compare findings - Triple engine mode - maximum coverage for critical/security code - **GitHub Actions workflows** for all configurations - Gemini-only workflow - Triple engine (Claude + Codex + Gemini) workflow - Updated dual engine workflow ### Changed - Total skills increased from 51 to **52 skills** - Updated `/code-review` to support engine selection: `--engine claude,codex,gemini` - Added `--gemini` and `--all` shortcuts for common configurations --- ## [2.2.0] - 2026-01-17 ### Added #### Existing Repository Support - **Existing Repo skill** - Analyze existing codebases, maintain structure, setup guardrails - Repo structure detection (monorepo, full-stack, frontend-only, backend-only) - Tech stack auto-detection (TypeScript, Python, Flutter, Android, etc.) 
- Convention detection (naming, imports, exports, test patterns) - Guardrails audit (pre-commit hooks, linting, formatting, type checking) - Structure preservation rules - work within existing patterns, don't reorganize - Gradual implementation strategy for adding guardrails to legacy projects - Cross-repo coordination for separate frontend/backend repos - **`/analyze-repo` command** - Quick analysis of any existing repository - Directory structure mapping - Guardrails status audit (Husky, pre-commit, ESLint, Ruff, commitlint, etc.) - Convention detection and documentation - Generates analysis report with recommendations - Offers to add missing guardrails - **Auto-triggered** by `/initialize-project` when existing codebase detected #### Initialize Project Enhancement - **Auto-analysis for existing codebases** - `/initialize-project` now automatically analyzes existing repos before making changes - **User choice after analysis** - Options: skills only, skills + guardrails, full setup, or just view analysis - **Existing-repo skill auto-copied** - When working with existing codebases #### Guardrails Setup (for JS/TS and Python) - **Husky + lint-staged** setup for JavaScript/TypeScript projects - **pre-commit framework** setup for Python projects - **commitlint** configuration for conventional commits - **ESLint 9 flat config** template - **Ruff + mypy** configuration for Python ### Changed - Total skills increased from 50 to **51 skills** - Updated README with `/analyze-repo` usage pattern --- ## [2.1.0] - 2026-01-17 ### Added #### Mobile Development (contributed by @tyr4n7) - **Android Java skill** - MVVM architecture, ViewBinding, Espresso testing, GitHub Actions CI - **Android Kotlin skill** - Coroutines, Jetpack Compose, Hilt DI, MockK/Turbine testing - **Flutter skill** - Riverpod state management, Freezed models, go_router, mocktail testing - **Android/Flutter auto-detection** - `/initialize-project` now detects Flutter, Android Java, and Android Kotlin projects 
#### Database Skills (addresses #7) - **Firebase skill** - Firestore, Auth, Storage, real-time listeners, security rules, offline persistence - **Cloudflare D1 skill** - Serverless SQLite with Workers, Drizzle ORM integration, migrations - **AWS DynamoDB skill** - Single-table design, GSI patterns, SDK v3 TypeScript/Python - **AWS Aurora skill** - Serverless v2, RDS Proxy, Data API, connection pooling for Lambda - **Azure Cosmos DB skill** - Partition key design, consistency levels, change feed, SDK patterns #### Code Review Enhancements - **Codex Review skill** - OpenAI Codex CLI for code review with GPT-5.2-Codex (88% detection rate) - **Code review engine choice** - `/code-review` now lets you choose: Claude, OpenAI Codex, or both engines - **Dual engine review mode** - Run both Claude and Codex, compare findings, catch more issues - **CI/CD templates** - GitHub Actions workflows for Claude, Codex, and dual-engine reviews ### Changed - Total skills increased from 44 to **50 skills** - Updated README with new database and mobile skill listings ### Contributors - @tyr4n7 - Android Java, Android Kotlin, Flutter skills and auto-detection - @johnsfuller - Feature request for database skills (#7) --- ## [2.0.0] - 2026-01-08 ### Breaking Changes - **Skills structure changed** - Skills now use folder/SKILL.md structure instead of flat .md files - Before: `~/.claude/skills/base.md` - After: `~/.claude/skills/base/SKILL.md` - All skills now require YAML frontmatter with `name` and `description` fields ### Added - **Validation test** (`tests/validate-structure.sh`) - Validates skills structure, commands, hooks - `--full` mode: All 142 checks - `--quick` mode: Essential checks for initialize-project - **Phase 0 validation** in `/initialize-project` - Checks bootstrap installation before setup - **Conversion script** (`scripts/convert-skills-structure.sh`) - Migrates flat skills to folder structure - Install script now runs validation automatically - Symlink created at 
`~/.claude-bootstrap` for easy access to validation tools ### Fixed - Skills now load properly in Claude Code (fixes #1) - Install script properly copies skill folders instead of merging contents ### Migration ```bash cd ~/.claude-bootstrap git pull ./install.sh ``` --- ## [1.5.0] - 2026-01-07 ### Added - **Code Deduplication skill** - Prevent semantic code duplication with capability index - **Team Coordination skill** - Multi-person projects with shared state and todo claiming - `/check-contributors` command - Detect solo vs team projects - `/update-code-index` command - Regenerate CODE_INDEX.md - Pre-push hook for code review enforcement ### Changed - Code reviews now mandatory before push (blocks on Critical/High issues) --- ## [1.4.0] - 2026-01-06 ### Added - **Code Review skill** - Mandatory code reviews via `/code-review` - **Commit Hygiene skill** - Atomic commits, PR size limits - Pre-push hooks installation script --- ## [1.3.0] - 2026-01-05 ### Added - **MS Teams Apps skill** - Teams bots and AI agents with Claude/OpenAI - **Reddit Ads skill** - Agentic ad optimization service - **PWA Development skill** - Service workers, caching, offline support --- ## [1.2.0] - 2026-01-04 ### Added - **Playwright Testing skill** - E2E testing with Page Objects - **PostHog Analytics skill** - Event tracking, feature flags - **Shopify Apps skill** - Remix, Admin API, checkout extensions --- ## [1.1.0] - 2026-01-03 ### Added - Session management with automatic state tracking - Decision logging for architectural choices - Code landmarks for quick navigation --- ## [1.0.0] - 2026-01-01 ### Added - Initial release with 30+ skills - `/initialize-project` command - TDD-first workflow with Ralph Wiggum loops - Security-first patterns - Support for Python, TypeScript, React, React Native - Supabase integration skills - AI/LLM patterns for Claude and OpenAI ================================================ FILE: CONTRIBUTING.md ================================================ # 
Contributing to Maggy Thanks for your interest in contributing! This project aims to make AI-assisted development more reliable and consistent. ## Philosophy Before contributing, understand the core philosophy: 1. **Complexity is the enemy** - Every line of code is a liability 2. **Measurable constraints** - Prefer specific numbers (20 lines/fn) over vague guidance 3. **Security is non-negotiable** - All projects must pass security checks 4. **AI-first thinking** - LLMs for logic, code for plumbing 5. **Spec-driven** - Define before you build ## How to Contribute ### Adding a New Skill 1. Create a directory in `skills/` with a lowercase hyphenated name 2. Add `SKILL.md` with YAML frontmatter: ```markdown --- name: my-skill description: One-line description of what this skill does when-to-use: When to activate this skill user-invocable: true effort: medium --- # My Skill ## Core Principles ... ``` 3. Include these sections: - Core principles with measurable constraints - Project structure (if applicable) - Patterns with code examples (>= 1 per 50 lines) - Anti-patterns list 4. Keep under 500 lines (ideal: under 300) 5. Run the linter before submitting: ```bash PYTHONPATH=scripts python3 -m skill_lint --skill my-skill skills/ ``` 6. 
Update `README.md` to include the new skill ### Quality Gates All skills must pass the automated linter before merge: ```bash # Lint all skills PYTHONPATH=scripts python3 -m skill_lint skills/ # Lint a single skill PYTHONPATH=scripts python3 -m skill_lint --skill python skills/ # JSON output for CI PYTHONPATH=scripts python3 -m skill_lint --format json skills/ ``` **Checks enforced:** - **FM001-FM009**: YAML frontmatter (name, description, format, fields) - **SP001-SP003**: Spec compliance (SKILL.md exists, line count limits) - **CQ001-CQ006**: Content quality (no ASCII art, no vague phrases, code examples) - **RI001-RI002**: Cross-references (valid skill links, README listing) Suppress known issues with inline comments: ```markdown ``` ### Improving Existing Skills 1. Keep changes focused on one improvement 2. Maintain the existing structure 3. Ensure examples are correct and tested 4. Update version comments if significant ### Updating the Initialize Command 1. Test changes locally before submitting 2. Ensure idempotency - running twice shouldn't break anything 3. Preserve user customizations (never overwrite `_project_specs/`) ## Skill Guidelines ### Do - Use specific, measurable constraints - Provide working code examples - Include anti-patterns with explanations - Keep skills focused on one topic - Reference other skills when building on them ### Don't - Use vague guidance ("write clean code") - Include time estimates - Add features beyond what's needed - Break existing projects when run as update ## Testing Your Changes ```bash # Install your changes ./install.sh # Test on a new project mkdir test-project && cd test-project claude > /initialize-project # Test on an existing project cd existing-project claude > /initialize-project # Should update skills without breaking existing config ``` ## Pull Request Process 1. Fork the repository 2. Create a feature branch (`git checkout -b feature/new-skill`) 3. Make your changes 4. Test locally 5. 
Submit PR with clear description of changes ## Code of Conduct - Be respectful and constructive - Focus on technical merit - Welcome newcomers - Share knowledge freely ## Questions? Open an issue for: - Bug reports - Feature requests - Clarification on philosophy - Help with implementation ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2025 Ali Naqi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Maggy > **From opinionated Claude Code setup to autonomous AI engineering platform.** Maggy started as an opinionated project initialization system for Claude Code — skills, TDD hooks, quality gates. 
It has evolved into a full autonomous engineering command center: interactive chat with session takeover, multi-agent orchestration in containers, P2P mesh networking across machines, AI-prioritized task triage, competitor intelligence, and process analytics. The guardrails that keep AI-generated code simple, secure, and verifiable are still the foundation — but now they power an end-to-end autonomous engineering workflow. **v5.0.0** — Interactive Chat (`--resume` session takeover), Polyphony (container-isolated multi-agent orchestration), P2P Mesh (cross-machine session sync), auto-bootstrap, grouped dashboard navigation. ## Core Philosophy ``` ┌────────────────────────────────────────────────────────────────┐ │ TDD LOOPS VIA STOP HOOKS │ │ ─────────────────────────────────────────────────────────────│ │ Stop hooks run tests after each Claude response. │ │ Failures feed back automatically. Claude iterates until green.│ │ Real Claude Code infrastructure — no plugins needed. │ ├────────────────────────────────────────────────────────────────┤ │ TESTS FIRST, ALWAYS │ │ ─────────────────────────────────────────────────────────────│ │ Features: Write tests → Watch them fail → Implement → Pass │ │ Bugs: Find test gap → Write failing test → Fix → Pass │ │ No code ships without a test that failed first. │ ├────────────────────────────────────────────────────────────────┤ │ SIMPLICITY IS THE GOAL │ │ ─────────────────────────────────────────────────────────────│ │ 20 lines per function │ 200 lines per file │ 3 params max │ │ Enforced via .claude/rules/ with paths: frontmatter. 
│ ├────────────────────────────────────────────────────────────────┤ │ SECURITY BY DEFAULT │ │ ─────────────────────────────────────────────────────────────│ │ No secrets in code │ Permission deny rules for .env files │ │ Dependency scanning │ Pre-commit hooks │ CI enforcement │ ├────────────────────────────────────────────────────────────────┤ │ AGENT TEAMS BY DEFAULT │ │ ─────────────────────────────────────────────────────────────│ │ Every project runs as a coordinated team of AI agents. │ │ Agent definitions use proper frontmatter: tools, model, │ │ maxTurns, effort, disallowedTools. │ ├────────────────────────────────────────────────────────────────┤ │ CONDITIONAL RULES │ │ ─────────────────────────────────────────────────────────────│ │ Rules in .claude/rules/ activate based on file paths. │ │ React rules only load when editing .tsx files. │ │ Python rules only load when editing .py files. │ │ Saves tokens. Reduces noise. More targeted guidance. │ └────────────────────────────────────────────────────────────────┘ ``` ## Quick Start ```bash # Clone and install (clone anywhere you like) git clone https://github.com/alinaqi/claude-bootstrap.git cd claude-bootstrap && ./install.sh # In any project directory claude > /initialize-project ``` Claude will: 1. **Validate tools** - Check gh, vercel, supabase CLIs 2. **Ask questions** - Language, framework, AI-first?, database, graph analysis level 3. **Set up repository** - Create or connect GitHub repo 4. **Create structure** - Skills, rules, settings, security, CI/CD, specs, todos 5. **Copy settings.json** - Pre-configured permissions and Stop hooks 6. **Generate CLAUDE.md** - With `@include` directives for modular skills 7. **Generate CLAUDE.local.md** - Template for private developer overrides 8. **Spawn agent team** - Deploy Team Lead + Quality + Security + Review + Merger + Feature agents ## Cross-Tool Compatibility (Claude + Kimi + Codex) Maggy works with **Claude Code**, **Kimi CLI**, and **OpenAI Codex CLI**. 
All three use the same `SKILL.md` format. | Feature | Claude Code | Kimi CLI | Codex CLI | |---------|-------------|----------|-----------| | Skills | `.claude/skills/` | `.kimi/skills/` (also reads `.claude/`) | `.codex/skills/` | | Project instructions | `CLAUDE.md` | (uses skills) | `AGENTS.md` | | Hooks config | `settings.json` | `config.toml` | `config.toml` | **`install.sh`** auto-detects installed tools and installs skills to all of them. **`/sync-agents`** syncs project config across tools on demand. ```bash # Install tools curl -L code.kimi.com/install.sh | bash # Kimi npm i -g @openai/codex # Codex # Reinstall to pick up new tools cd claude-bootstrap && ./install.sh # In any project, sync cross-tool config claude > /sync-agents ``` ## Cross-Agent Intelligence When multiple AI CLI tools are installed, Maggy enables intelligent collaboration between them. ### Codex Auto-Review (Stop Hook) After tests pass, Codex automatically reviews your diff for critical bugs and security issues. Runs as a Stop hook between TDD and iCPG recording. ``` Stop hook order: 1. tdd-loop-check.sh → tests pass? 2. codex-auto-review.sh → Codex reviews diff (NEW) 3. icpg-stop-record.sh → record symbols 4. mnemos-checkpoint.sh → save memory ``` - Exit 0 = no critical issues found - Exit 2 = critical/high issues feed back to Claude for fixing - Gracefully skips if Codex not installed ### Kimi Delegation (Token Optimization) Claude checks iCPG blast radius and delegates small tasks to Kimi automatically — the user doesn't run anything: | Blast Radius | Claude's Action | |-------------|----------------| | 1-3 files | Saves context via `mnemos checkpoint`, runs `kimi --print -y -p "..."` with context + task | | 4-8 files | Asks user, then delegates or handles directly | | 9+ files | Handles directly (needs full context window) | Context transfer uses structured state (mnemos checkpoints + iCPG constraints), not raw conversation.
### iCPG + Mnemos (Always-On for All Agents) All three tools run the same iCPG pre-task queries and Mnemos memory lifecycle: ```bash # Before any code change (Claude, Kimi, or Codex): icpg query prior "<task description>" # check for duplicate work icpg query constraints <file> # check invariants icpg query risk # check fragility # Memory management: mnemos add goal "<goal description>" # at task start mnemos checkpoint # at sub-goal boundaries ``` ## How TDD Loops Work (Stop Hooks) **No plugins. No fake commands.** Claude Code's Stop hook runs a script when Claude finishes a response. Exit code 2 feeds stderr back to Claude and continues the conversation. ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. You say: "Add email validation to signup" │ │ 2. Claude writes tests + implementation │ │ 3. Claude finishes response │ │ 4. Stop hook runs: npm test && npm run lint │ │ 5a. All pass (exit 0) → Done! │ │ 5b. Failures (exit 2) → stderr fed back to Claude │ │ 6. Claude sees failures, fixes, finishes again │ │ 7. Stop hook runs again → repeat until green │ └─────────────────────────────────────────────────────────────┘ ``` **Configuration** in `.claude/settings.json`: ```json { "hooks": { "Stop": [{ "hooks": [{ "type": "command", "command": "scripts/tdd-loop-check.sh", "timeout": 60, "statusMessage": "Running tests..." }] }] } } ``` The `tdd-loop-check.sh` script runs tests, lint, and typecheck. It tracks iteration count (max 25) and distinguishes code errors (loop) from environment errors (stop). ## @include Directives CLAUDE.md uses `@include` to modularly load skills: ```markdown # CLAUDE.md @.claude/skills/base/SKILL.md @.claude/skills/iterative-development/SKILL.md @.claude/skills/security/SKILL.md ``` These are **resolved at load time** by Claude Code — the content is recursively inlined (max depth 5, cycle detection built in). This means skills actually become part of the prompt instead of just being listed as text.
## Conditional Rules Rules in `.claude/rules/` use YAML frontmatter with `paths:` to activate only when relevant files are being edited: ```yaml # .claude/rules/react.md --- paths: ["src/components/**", "**/*.tsx"] --- Prefer functional components with hooks... ``` ```yaml # .claude/rules/python.md --- paths: ["**/*.py"] --- Use type hints, pytest, ruff... ``` **Included rules:** | Rule | Activates When | |------|----------------| | `quality-gates.md` | Always (no paths: filter) | | `tdd-workflow.md` | Always | | `security.md` | Always | | `react.md` | Editing .tsx/.jsx files | | `typescript.md` | Editing .ts/.tsx files | | `python.md` | Editing .py files | | `nodejs-backend.md` | Editing api/routes/server files | ## Smarter Compaction (PreCompact Hook) Claude Code's built-in compaction fires at ~83% context and summarizes everything into 20K tokens using a generic 9-section template. It doesn't know what YOUR project cares about. The PreCompact hook fixes this by injecting **project-specific preservation priorities** into the summarizer: ``` ┌─────────────────────────────────────────────────────────────┐ │ Built-in compaction: │ │ "Summarize this conversation" → generic summary │ ├─────────────────────────────────────────────────────────────┤ │ With PreCompact hook: │ │ "Summarize, but preserve ALL schema decisions verbatim, │ │ keep exact error messages, keep API contract details, │ │ reference these Key Decisions by name, and here's the │ │ current git state to include" → project-aware summary │ └─────────────────────────────────────────────────────────────┘ ``` The hook auto-detects: - **Project type** (TypeScript/Next.js, Python/FastAPI, Flutter, etc.) 
- **Schema files** (Drizzle, Prisma, SQLAlchemy) → tells summarizer to preserve schema discussion - **API directories** → tells summarizer to preserve endpoint paths and contracts - **Key Decisions from CLAUDE.md** → tells summarizer to reference them by name - **Git state** → injects branch, uncommitted changes, staged files Zero overhead during normal usage. Only runs when compaction actually fires. ## Mnemos — Task-Scoped Memory Lifecycle Claude Code's built-in compaction is lossy and unreliable. It sometimes doesn't fire, `/compact` and `/clear` can fail (especially in multi-agent executions), and crashes/restarts lose all context. Mnemos provides **disk-persistent structured state** that survives all of these failure modes. ``` ┌─────────────────────────────────────────────────────────────┐ │ DEFAULT CLAUDE CODE vs WITH MNEMOS │ ├─────────────────────────────────────────────────────────────┤ │ Blind until 83.5% Continuous 4-dim monitoring│ │ Sudden hard compaction Graduated: 40→60→75→83% │ │ Uniform summarization Typed: goals never evict │ │ No cross-session memory Auto checkpoint/resume │ │ Crash = total context loss Crash = resume from disk │ │ Multi-agent: no shared state Per-agent structured state│ │ No behavioral awareness Detects re-reads, scatter │ └─────────────────────────────────────────────────────────────┘ ``` ### Post-Compaction Task Restoration (Two-Layer Defense) When compaction fires, the built-in summarizer often drops task-specific state. 
Mnemos uses two independent layers to guarantee restoration: ``` BEFORE COMPACTION AFTER COMPACTION PreCompact hook fires First tool call → PreToolUse fires ├── Write emergency checkpoint ├── Detect ".mnemos/just-compacted" marker ├── Build task narrative from ├── Read checkpoint-latest.json │ signals.jsonl (files, tools) ├── Output full checkpoint into context ├── Output STRONG preservation ├── Delete marker (one-shot) │ instructions to summarizer └── Claude now has: summary + checkpoint └── Write ".mnemos/just-compacted" marker file = Task fully restored ``` **Layer 1** (best-effort): PreCompact tells the summarizer what to keep, including inline checkpoint content with typed eviction priorities. **Layer 2** (guaranteed): Post-compaction injection via PreToolUse re-injects the full checkpoint on the first tool call after compaction. Doesn't depend on the summarizer. Fast path ~5ms when no compaction occurred. ### Why Not Just Write to a Plain File? You could — but you'd immediately face: what format? When to update? How to distinguish "this is critical" from "this is nice to have"? The MnemoGraph's typed nodes solve this: | Node Type | Eviction Policy | Example | |-----------|----------------|---------| | GoalNode | NEVER evict | "Implement auth module" | | ConstraintNode | NEVER evict | "API backward compatibility" | | ResultNode | Compress first | "JWT middleware tested" → summary kept | | WorkingNode | Compress first | Current reasoning / in-progress analysis | | ContextNode | Evictable | File contents → re-read from disk | Without typed priorities, a checkpoint is just a blob. With them, the system knows goals > constraints > working memory > context, and makes intelligent decisions about what to restore within token budgets. 
### Resilience Beyond Normal Compaction The real value isn't the happy path — it's when things go wrong: | Failure Mode | CC Built-in | Mnemos | |---|---|---| | Session crash/collapse | Context gone | Checkpoint on disk survives | | `/compact` doesn't fire | Truncation at limit | Fatigue hooks wrote checkpoints earlier | | Multi-agent child dies | No recovery | Child's `.mnemos/` has structured state | | Forced restart | Generic summary | SessionStart reloads full checkpoint | | `/clear` fails in multi-agent | Stuck in weird state | MnemoGraph is independent of CC's state | ### Fatigue Model 4 dimensions passively observed from hooks — no agent cooperation needed: | Dimension | Weight | Signal Source | Detects | |-----------|--------|---------------|---------| | Token utilization | 0.40 | Statusline JSON | How full the context window is | | Scope scatter | 0.25 | PreToolUse file paths | Agent bouncing between directories | | Re-read ratio | 0.20 | PreToolUse Read calls | Agent re-reading files (context loss) | | Error density | 0.15 | PostToolUse outcomes | Agent struggling (high error rate) | Fatigue states: **FLOW** (0-0.4) → **COMPRESS** (0.4-0.6) → **PRE-SLEEP** (0.6-0.75) → **REM** (0.75-0.9) → **EMERGENCY** (0.9+). The fatigue model ensures checkpoints are written *before* things go wrong — so when a crash happens at 0.85, you have a recent checkpoint from 0.6. ### CLI ```bash mnemos init # Initialize .mnemos/ mnemos status # Node counts + fatigue mnemos fatigue # Detailed 4-dimension breakdown mnemos checkpoint --force # Write checkpoint now mnemos resume # Output checkpoint for session inject mnemos add goal "Build auth" # Create a GoalNode mnemos bridge-icpg # Import iCPG ReasonNodes ``` **Overhead:** ~5ms per tool call (fast path), 84KB on disk. Token signal auto-feeds via statusline. ## iCPG — Intent-Augmented Code Property Graph iCPG tracks *why* code exists, not just what it does. 
Every code change is linked to a ReasonNode that captures the intent, postconditions, and invariants. ```bash icpg create "Implement auth" --scope src/auth/ # Create intent icpg record src/auth/middleware.ts # Link symbols icpg query constraints src/auth/middleware.ts # Get invariants icpg drift # Check for drift icpg bootstrap # Infer from git history ``` **Pre-Task Queries** (injected automatically via PreToolUse hook): - `icpg query context <file>` — What intents touch this file? - `icpg query constraints <file>` — What invariants must hold? - `icpg drift file <file>` — Has this file drifted from its intent? **6-Dimension Drift Detection:** spec drift, decision drift, ownership drift, test drift, usage drift, dependency drift. ## Maggy Dashboard — AI Engineering Command Center (Optional) Maggy is a full-featured AI engineering command center. Install once, point it at your codebases and issue tracker, and get an interactive dashboard with chat, task triage, competitor intelligence, process analytics, and P2P session sync. ```bash cd maggy/maggy ./install.sh # Edit ~/.maggy/config.yaml — set your org, GitHub repos, codebase paths export GITHUB_TOKEN=ghp_... export ANTHROPIC_API_KEY=sk-ant-... python3 -m maggy.main # Open http://localhost:8080 ``` Or from inside any Claude Code session: ``` /maggy-init # Interactive setup wizard /maggy # Launch dashboard ``` ### What it does - **Interactive Chat** — auto-connects to all active Claude/Codex/Kimi sessions, SSE streaming, session continuity via `--resume`, path-based history matching - **AI-prioritized Tasks** — Claude ranks your open issues by urgency, OKR alignment, and recency. 30-min SQLite cache with stale-cache fallback. - **One-click Execute** — spawns `claude -p` locally in the right codebase, with iCPG context pre-injected. Runs a TDD pipeline, then commits locally for your review. - **Competitor Intelligence** — AI-discovered competitors in whatever domain you configure, plus daily news briefing from RSS + Google News.
- **Process Insights** — CLI session history analysis, health signals, self-improvement recommendations, event spine queries. - **P2P Mesh** — WebSocket-based multi-node session sync and handoff across machines, org-scoped networks, state quarantine. - **Auto-Bootstrap** — all services seed themselves on startup (history, CIKG, events). No empty tabs. - **Provider-agnostic** — GitHub Issues, Asana, or (stubbed) Linear. Swap trackers without touching services. ### Dashboard Navigation Navigation is grouped by intent — 3 groups instead of 9 flat tabs: | Group | Tabs | Purpose | |-------|------|---------| | **Work** | Chat, Tasks, Watching | Do things — chat with Claude, triage issues | | **Intel** | Competitors, Insights | Learn things — competitor news, session analytics | | **System** | Budget, Models, Forge, Settings | Configure — spend limits, model routing, MCP gaps | Chat is the default tab — auto-connects to all running CLI sessions on load. ### Architecture ``` maggy/ ├── maggy/ # optional dashboard — run ./install.sh to enable │ ├── maggy/ # Python package (importable as `maggy`) │ │ ├── main.py # FastAPI entry + auto-bootstrap │ │ ├── config.py # ~/.maggy/config.yaml loader │ │ ├── providers/ # GitHub, Asana, Linear (stub) │ │ ├── services/ # chat, inbox, competitor, executor, activity │ │ ├── api/ # REST endpoints (chat, mesh, process, etc.) 
│ │ ├── mesh/ # P2P networking (discovery, sync, WebSocket) │ │ ├── process/ # Process intelligence (patterns, signals, router) │ │ ├── history/ # CLI session history parsers (Claude, Codex, Kimi) │ │ ├── improve/ # Self-improvement (signals, analyzer) │ │ ├── cikg/ # Code Intelligence Knowledge Graph │ │ ├── engram/ # Memory entries (write/query/expire) │ │ ├── event_spine/ # Structured event emission + querying │ │ ├── forge/ # MCP capability gap detection │ │ ├── heartbeat/ # Scheduled jobs (history, engram, mesh sync) │ │ └── static/ # Dashboard (Tailwind + vanilla JS, no build step) │ ├── tests/ # 468 tests │ └── install.sh # one-line install ├── commands/maggy.md # /maggy command ├── commands/maggy-init.md # /maggy-init wizard └── skills/maggy/SKILL.md # skill reference ``` ### Config-driven, no hardcoded anything One `~/.maggy/config.yaml` drives everything — org name, domain, repos, codebase paths, competitor categories. No hardcoded board IDs or team lists. ```yaml org: { name: "Acme Corp", domain: "fintech" } issue_tracker: provider: "github" # or "asana" github: org: "acmecorp" repos: ["acmecorp/api", "acmecorp/web"] codebases: - { path: "~/dev/acmecorp/api", key: "api" } - { path: "~/dev/acmecorp/web", key: "web" } competitors: categories: ["fintech", "embedded-finance"] ``` ### Safety model Execute and Chat both run Claude Code with `--dangerously-skip-permissions` so subprocesses aren't blocked waiting on approval prompts with no terminal attached. 
Mitigations in place: - `working_dir` and `project_path` are **validated against configured codebase roots** — both Execute and Chat reject arbitrary filesystem paths - **Per-session streaming lock** — `asyncio.Lock` prevents concurrent subprocess spawning via the Chat API - Dashboard **refuses to boot** if `auth_mode="local"` is combined with a non-loopback host (would expose Execute on the local network) - RSS URLs **SSRF-validated** before fetching (blocks loopback, private, link-local) - `CLAUDECODE` env var stripped from subprocesses to allow nested Claude sessions - **No-cache static middleware** — `Cache-Control: no-store` prevents stale JS See `maggy/README.md` for the full hardening notes. ### P2P Mesh Network Multi-node session sync and handoff across machines. Each Maggy instance is a mesh peer that can share memory, discover other nodes, and synchronize state. | Component | What it does | |-----------|-------------| | **Peer Discovery** | Registry of known peers with address, org, last-seen tracking | | **Git Discovery** | Auto-discovers peers from shared git remotes across configured codebases | | **WebSocket Server/Client** | Bidirectional real-time communication between peers | | **Mesh Protocol** | 7 message types: `hello`, `share`, `request`, `response`, `quarantine`, `promote`, `heartbeat` | | **Quarantine** | Untrusted data from peers is quarantined until reviewed — prevents poisoned memory injection | | **Org Scoping** | Peers are filtered by org key so only your team's nodes connect | | **Provenance** | Tracks origin of shared data (which peer, when, confidence level) | Configure in `~/.maggy/config.yaml`: ```yaml mesh: enabled: true port: 8080 orgs: ["my-team"] git_discovery: true share_interval: 600 ``` ### Engram Memory Persistent memory system with typed records, namespace isolation, and multi-path retrieval. Engrams survive across sessions — they're stored in SQLite, not in-context. 
| Field | Purpose | |-------|---------| | `memory_type` | `fact`, `decision`, `code_ref`, `handoff` | | `origin` | `explicit` (user-created), `inferred` (AI-derived), `mesh` (from peer) | | `validity` | `active`, `superseded`, `expired` | | `confidence` | 0.0-1.0 trust score | | `namespace` | Project/session scoping | | `expires_at` | Optional TTL for auto-expiry | Retrieval paths: by namespace, by type, by keyword, by tag, or most recent. The heartbeat scheduler runs periodic expiry to clean stale entries. ### Event Spine Structured event emission and querying across all Maggy services. Every significant action (task executed, competitor discovered, history analyzed, self-improvement run) emits a typed event with a standard header. Events are stored in SQLite and queryable via the `/api/events` endpoint. The Insights tab visualizes event streams for debugging and auditing service behavior. ### Other Subsystems | Subsystem | Purpose | |-----------|---------| | **CIKG** | Code Intelligence Knowledge Graph — codebase nodes, technology detection, landscape queries | | **Forge** | MCP capability gap detection — scans filesystem patterns, suggests MCP tools to fill gaps | | **History** | CLI session history parsers for Claude, Codex, and Kimi — topic extraction, session patterns | | **Improve** | Self-improvement — signal collection, health scoring, actionable recommendations | | **Budget** | Daily token spend limits with per-provider breakdown | | **Model Router** | Reward-based heatmap for model selection by task type | | **Heartbeat** | Scheduled jobs — history refresh, engram expiry, self-improvement, mesh sync | ## Pre-configured Permissions `.claude/settings.json` includes permission rules so users don't get pestered for routine operations: ```json { "permissions": { "allow": [ "Bash(npm test *)", "Bash(npm run lint *)", "Bash(pytest *)", "Bash(git status *)", "Bash(gh pr *)" ], "deny": [ "Bash(rm -rf *)", "Bash(git push --force *)", "Write(.env)", "Write(.env.*)" 
] } } ``` ## CLAUDE.local.md (Private Overrides) Each developer gets a `.gitignore`'d `CLAUDE.local.md` for personal preferences: ```markdown # My Preferences - I prefer verbose explanations - My local DB runs on port 5433 - Use pnpm instead of npm ``` This loads at **higher priority** than project `CLAUDE.md` — personal preferences override team config without polluting the repo. ## Agent Teams Every project runs as a coordinated team of AI agents with **proper frontmatter definitions**: ```yaml # .claude/agents/team-lead.md --- name: team-lead description: Orchestrates the agent team model: sonnet tools: [Read, Glob, Grep, TaskCreate, TaskUpdate, TaskList, TaskGet, SendMessage] disallowedTools: [Write, Edit, Bash] maxTurns: 50 effort: high --- ``` **Default Team:** | Agent | Role | Can Edit Code? | |-------|------|----------------| | **Team Lead** | Orchestrates, assigns tasks (never writes code) | No | | **Quality Agent** | Verifies RED/GREEN TDD phases, coverage >= 80% | No | | **Security Agent** | OWASP scanning, secrets detection, dependency audit | No | | **Code Review Agent** | Multi-engine reviews | No | | **Merger Agent** | Creates feature branches and PRs via `gh` CLI | No | | **Feature Agent (x N)** | One per feature, follows strict TDD pipeline | Yes | **Pipeline (enforced by task dependencies):** ``` Spec > Spec Review > Tests > RED Verify > Implement > GREEN Verify > Validate > Code Review > Security > Branch+PR ``` ```bash # Auto-spawned by /initialize-project, or manually: /spawn-team ``` ## What Gets Created ``` your-project/ ├── .claude/ │ ├── agents/ # Agent definitions with frontmatter │ │ ├── team-lead.md # name, model, tools, disallowedTools, maxTurns │ │ ├── quality.md │ │ ├── security.md │ │ ├── code-review.md │ │ ├── merger.md │ │ └── feature.md │ ├── rules/ # Conditional rules (paths: frontmatter) │ │ ├── quality-gates.md # Always active │ │ ├── tdd-workflow.md # Always active │ │ ├── security.md # Always active │ │ ├── react.md # Active 
on .tsx/.jsx files │ │ ├── typescript.md # Active on .ts/.tsx files │ │ ├── python.md # Active on .py files │ │ └── nodejs-backend.md # Active on api/routes/server files │ ├── skills/ # Skills loaded via @include │ │ ├── base/SKILL.md │ │ ├── iterative-development/SKILL.md │ │ ├── security/SKILL.md │ │ ├── mnemos/SKILL.md │ │ ├── cross-agent-delegation/SKILL.md │ │ └── [framework]/SKILL.md │ └── settings.json # Permissions + hooks + statusline ├── scripts/ │ ├── tdd-loop-check.sh # Stop hook script for TDD loops │ ├── icpg/ # Intent-Augmented Code Property Graph │ └── mnemos/ # Task-Scoped Memory Lifecycle ├── .mnemos/ # Mnemos state (auto-created, gitignored) │ ├── mnemo.db # SQLite MnemoGraph │ ├── fatigue.json # Live fatigue signal │ ├── signals.jsonl # Behavioral signal log │ └── checkpoint-latest.json # Most recent checkpoint ├── .github/workflows/ │ ├── quality.yml │ └── security.yml ├── _project_specs/ │ ├── features/ │ └── todos/ ├── CLAUDE.md # @include directives, project context └── CLAUDE.local.md # Private developer overrides (gitignored) ``` ## Commit Hygiene ``` ┌─────────────────────────────────────────────────────────────┐ │ COMMIT SIZE THRESHOLDS │ ├─────────────────────────────────────────────────────────────┤ │ OK: ≤ 5 files, ≤ 200 lines │ │ WARN: 6-10 files, 201-400 lines → "Commit soon" │ │ STOP: > 10 files, > 400 lines → "Commit NOW" │ └─────────────────────────────────────────────────────────────┘ ``` ## Skills Included (62 Skills) ### Core Skills | Skill | Purpose | |-------|---------| | `base.md` | Universal patterns, constraints, TDD workflow, atomic todos | | `iterative-development.md` | TDD loops via Stop hooks (replaces Ralph Wiggum) | | `mnemos.md` | Task-scoped memory lifecycle — fatigue monitoring, checkpoints, typed compaction | | `icpg.md` | Intent-augmented code property graph — track why code exists, detect drift | | `code-review.md` | Mandatory code reviews - Claude, Codex, Gemini, or multi-engine | | `codex-review.md` | OpenAI 
Codex CLI code review | | `gemini-review.md` | Google Gemini CLI code review, 1M token context | | `workspace.md` | Multi-repo workspace awareness, contract tracking | | `commit-hygiene.md` | Atomic commits, PR size limits | | `code-deduplication.md` | Prevent semantic duplication with capability index | | `agent-teams.md` | Agent team workflow with proper frontmatter definitions | | `ticket-craft.md` | AI-native ticket writing optimized for Claude Code | | `maggy.md` | Optional local AI command center — AI-prioritized inbox, one-click TDD execute, competitor intelligence. See the [Maggy section](#maggy-dashboard--ai-engineering-command-center-optional) for the full docs | | `team-coordination.md` | Multi-person projects, shared state, handoffs | | `code-graph.md` | Persistent code graph via MCP | | `cpg-analysis.md` | Deep CPG analysis - Joern + CodeQL | | `security.md` | OWASP patterns, secrets management | | `credentials.md` | Centralized API key management | | `session-management.md` | Context preservation, resumability | | `project-tooling.md` | gh, vercel, supabase CLI + deployment | | `existing-repo.md` | Analyze existing repos, setup guardrails | | `cross-agent-delegation.md` | Cross-agent task routing, Codex auto-review, Kimi delegation | | `polyphony.md` | Multi-agent orchestration with container-isolated workspaces | ### Language & Framework Skills | Skill | Purpose | |-------|---------| | `python.md` | Python + ruff + mypy + pytest | | `typescript.md` | TypeScript strict + eslint + jest | | `nodejs-backend.md` | Express/Fastify patterns, repositories | | `react-web.md` | React + hooks + React Query + Zustand | | `react-native.md` | Mobile patterns, platform-specific code | | `android-java.md` | Android Java with MVVM, ViewBinding, Espresso | | `android-kotlin.md` | Android Kotlin with Coroutines, Jetpack Compose, Hilt | | `flutter.md` | Flutter with Riverpod, Freezed, go_router | ### UI Skills | Skill | Purpose | |-------|---------| | `ui-web.md` | Web UI -
Tailwind, dark mode, accessibility | | `ui-mobile.md` | Mobile UI - React Native, iOS/Android patterns | | `ui-testing.md` | Visual testing | | `playwright-testing.md` | E2E testing - Playwright, Page Objects | | `user-journeys.md` | User experience flows | | `pwa-development.md` | Progressive Web Apps - service workers, offline | ### Database & Backend Skills | Skill | Purpose | |-------|---------| | `database-schema.md` | Schema awareness | | `supabase.md` | Core Supabase CLI, migrations, RLS | | `supabase-nextjs.md` | Next.js + Supabase + Drizzle ORM | | `supabase-python.md` | FastAPI + Supabase | | `supabase-node.md` | Express/Hono + Supabase | | `firebase.md` | Firebase Firestore, Auth, Storage | | `cloudflare-d1.md` | Cloudflare D1 SQLite with Workers | | `aws-dynamodb.md` | AWS DynamoDB single-table design | | `aws-aurora.md` | AWS Aurora Serverless v2 | | `azure-cosmosdb.md` | Azure Cosmos DB | ### AI & Agentic Skills | Skill | Purpose | |-------|---------| | `agentic-development.md` | Build AI agents | | `llm-patterns.md` | AI-first apps, LLM testing | | `ai-models.md` | Latest models reference | ### Content, Integration & Other Skills | Skill | Purpose | |-------|---------| | `aeo-optimization.md` | AI Engine Optimization | | `web-content.md` | SEO + AI discovery | | `site-architecture.md` | Technical SEO | | `web-payments.md` | Stripe Checkout, subscriptions | | `reddit-api.md` | Reddit API | | `reddit-ads.md` | Reddit Ads API + agentic optimization | | `ms-teams-apps.md` | Microsoft Teams bots | | `posthog-analytics.md` | PostHog analytics | | `shopify-apps.md` | Shopify app development | | `woocommerce.md` | WooCommerce REST API | | `medusa.md` | Medusa headless commerce | | `klaviyo.md` | Klaviyo email/SMS marketing | ## Usage Patterns ### New Project ```bash mkdir my-new-app && cd my-new-app claude > /initialize-project ``` ### Existing Project ```bash cd my-existing-app claude > /initialize-project # Auto-detects existing code → runs analysis first 
``` ### Update Skills Globally ```bash cd "$(cat ~/.claude/.bootstrap-dir)" git pull ./install.sh ``` ## Prerequisites ```bash # GitHub CLI brew install gh && gh auth login # Vercel CLI (optional) npm i -g vercel && vercel login # Supabase CLI (optional) brew install supabase/tap/supabase && supabase login ``` ## Evolution | Version | Date | What Changed | |---------|------|-------------| | **v1.0** | Jan 2026 | Initial release — 30+ skills, `/initialize-project`, TDD via Ralph Wiggum loops, Python/TypeScript/React support | | **v2.0** | Jan 2026 | Skills restructured (`folder/SKILL.md`), YAML frontmatter, validation tests, 60+ skills across 10 categories | | **v3.0** | Mar 2026 | **Real Claude Code infrastructure** — Ralph Wiggum replaced with Stop hooks, `@include` directives, conditional rules (`paths:` frontmatter), agent teams via `.claude/agents/`, pre-configured permissions | | **v3.3** | Apr 2026 | Mnemos (task-scoped memory), iCPG (intent tracking + drift detection), Maggy dashboard MVP (inbox, execute, competitors) | | **v3.5** | Apr 2026 | PreCompact hook for smarter compaction, fatigue model (4 dimensions), hook error resilience | | **v3.6** | May 2026 | Cross-tool compatibility (Claude + Kimi + Codex), cross-agent intelligence (Codex auto-review, Kimi delegation), complexity-based routing | | **v4.0** | May 2026 | **Polyphony** — multi-agent orchestration with container isolation, 5-dimension complexity scoring, Docker runtime, 3 agent adapters, state machine task lifecycle | | **v5.0** | May 2026 | **Autonomous command center** — Interactive Chat with `--resume` takeover, P2P Mesh networking, process intelligence, auto-bootstrap, grouped UI (Work/Intel/System), 468 tests, security hardening (path validation, streaming lock) | ### Where we started vs where we are | Area | v1 (Jan 2026) | v5 (May 2026) | |------|---------------|---------------| | **Scope** | Claude Code project setup tool | Autonomous AI engineering platform | | **TDD** | Ralph Wiggum 
plugin (didn't exist) | Real Stop hooks with iteration tracking | | **Skills** | 30 flat `.md` files | 62 skills with `@include`, conditional rules | | **Memory** | None (lost on compaction) | Mnemos typed graph + fatigue model | | **Intent** | None | iCPG with 6-dimension drift detection | | **Agents** | Single Claude session | Polyphony containers + cross-agent delegation | | **Models** | Claude only | Claude + Codex + Kimi + complexity routing | | **Dashboard** | None | Maggy — chat, tasks, competitors, insights, mesh | | **Networking** | None | P2P Mesh (WebSocket sync, org-scoped) | | **Tests** | Shell validation script | 468 pytest tests + integration suite | ## Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. ## Changelog See [CHANGELOG.md](CHANGELOG.md) for version history. ## License MIT - See [LICENSE](LICENSE) ## Credits Built on learnings from 100+ projects across customer experience management, agentic AI platforms, mobile apps, and full-stack web applications. --- **Need help scaling AI in your org?** [Claude Code & MCP experts](https://leanai.ventures/aiops/claude) ================================================ FILE: _project_specs/00-autonomous-engineering-roadmap.md ================================================ # Autonomous Engineering Roadmap A set of specs closing the gaps between claude-bootstrap's current code-intelligence stack (`codebase-memory-mcp` + `iCPG` + `Joern/CodeQL` + `Mnemos` + `code-deduplication`) and what an autonomous coding agent actually needs to ship changes without supervision. ## Why these? Autonomous agents fail in 10 specific, repeatable ways (see [comparison doc in chat history — 2026-04-20](#)). Our current stack addresses 9 of them. The specs below close the remaining agent-observable gaps and add the two "frontier" capabilities (multimodal ingestion, verifiable contracts). 
## Priority order **Tier 1 — highest leverage, unlocks the rest:** | # | Spec | Why it matters | |---|---|---| | 01 | [Runtime observability](01-runtime-observability.md) | Drift detection is static — an agent that ships code needs a production feedback signal to know if the change actually worked | | 03 | [Verifiable contracts](03-verifiable-contracts.md) | iCPG postconditions are currently natural-language. Generating property-based tests from them makes them machine-checkable. | | 07 | [Human escalation protocol](07-human-escalation-protocol.md) | When the agent is stuck, it needs a formal "page a human with this packet" channel | **Tier 2 — valuable, not blocking:** | # | Spec | Why it matters | |---|---|---| | 08 | [Auto CODE_INDEX](08-auto-code-index.md) | The capability index currently depends on humans maintaining it. Auto-derive from the graph. | | 04 | [Multi-agent coordination](04-multi-agent-coordination.md) | When two agents touch the same area, we need locking / negotiation | | 02 | [Rollback & recovery](02-rollback-and-recovery.md) | Drift flags a problem; we still need automated revert paths | **Tier 3 — frontier / optional:** | # | Spec | Why it matters | |---|---|---| | 05 | [Confidence calibration](05-confidence-calibration.md) | Reinforcement loop — learn from past agent actions which patterns fail | | 06 | [Cost / budget awareness](06-cost-budget-awareness.md) | Agents stuck in loops burn real money. Hard budget stops. | | 09 | [Multimodal ingestion](09-multimodal-ingestion.md) | Graphify-style. Only matters if your repos include docs/images/video. | ## What each spec contains - **Context** — the failure mode being addressed - **Goal** — one-sentence outcome - **Approach** — concrete integration points with existing skills/scripts - **Success criteria** — how we know it works - **Effort** — rough size (small / medium / large) - **Depends on** — other specs that should land first ## Implementation convention When picking up a spec: 1. 
Create a feature branch `feat/spec-XX-<slug>` 2. Add an entry to `CHANGELOG.md` under an "Unreleased" section 3. Write the feature following TDD (as the rest of the project does) 4. Update the spec file's `Status` field when merged Status values: `pending` · `in-progress` · `in-review` · `done` · `deferred` ================================================ FILE: _project_specs/01-runtime-observability.md ================================================ # Spec 01: Runtime Observability for Drift Detection **Status:** pending **Priority:** Tier 1 (highest leverage) **Effort:** Medium ## Context `iCPG` detects drift **statically** — it can tell you a symbol's checksum changed, its tests disappeared, or a postcondition's predicate no longer holds against the current codebase. What it cannot tell you is whether the running system still delivers what the intent promised. An autonomous agent that ships code needs a feedback signal after deploy. Otherwise: - A refactor passes all tests and drift checks but tanks p99 latency in production → agent has no signal - A bug fix validates against one invariant but introduces regressions users hit → silent - An intent's postcondition is "<500ms response" — a static graph can't verify this ## Goal Bridge `iCPG` to runtime telemetry so drift detection includes post-deploy signals, not just pre-commit signals.
## Approach ### Step 1 — Define a runtime-signal abstraction Add a new edge type to iCPG: ``` VALIDATED_IN_PROD Reason → Metric (intent's postcondition has a runtime check) ``` A `Metric` node references an observability query: ```yaml metric: id: "checkout_p99_under_500ms" source: "datadog" # datadog | sentry | honeycomb | prometheus query: "avg:trace.checkout.latency.p99{env:prod}" predicate: "value < 500" window: "1h" ``` ### Step 2 — Pluggable observability adapters One-file adapters per backend (`scripts/icpg/observability/`): - `datadog_adapter.py` — query API key from env, return metric value - `sentry_adapter.py` — query event frequency for a given issue - `honeycomb_adapter.py` — run a Honeycomb query and extract result - `prometheus_adapter.py` — PromQL - `stub_adapter.py` — for testing, reads from a JSON file Each exposes `fetch(metric_id, window) -> float | None`. ### Step 3 — Extend `icpg drift check` with `--include-runtime` When the flag is set, evaluate every `VALIDATED_IN_PROD` edge by calling its adapter. Runtime predicate failure adds a 7th drift dimension: ``` Runtime drift Postcondition metric violates its predicate in production ``` ### Step 4 — Hook into claude-bootstrap's post-commit flow The `hooks/post-commit-graph` script runs `icpg record`. Add an optional `--check-runtime` step that queries the adapters for any symbols touched in this commit, so the agent sees drift before the change ships. ## Integration points - `scripts/icpg/models.py` — add `MetricNode`, `RuntimeEdge` - `scripts/icpg/drift.py` — add `check_runtime_drift()` - `scripts/icpg/__main__.py` — wire `drift check --include-runtime` flag - `skills/icpg/SKILL.md` — document the pattern - `templates/icpg-metric.yaml` — template for declaring metrics ## Success criteria 1. `icpg drift check --include-runtime` queries configured adapters and reports runtime-dimension drift 2. At least one adapter (Datadog or Sentry) ships with docs + example config 3. 
A test harness using `stub_adapter` verifies runtime drift triggers correctly 4. Agent receives runtime signal in pre-task query output (`icpg query risk` includes current runtime state) 5. Zero network calls when no `VALIDATED_IN_PROD` edges exist — backward compatible ## Depends on None — can be built independently on top of current iCPG. ## Follow-ups - Spec 02 (rollback) uses the same signal to auto-revert on severe drift - Spec 05 (confidence calibration) learns from runtime failures ================================================ FILE: _project_specs/02-rollback-and-recovery.md ================================================ # Spec 02: Rollback & Recovery **Status:** pending **Priority:** Tier 2 **Effort:** Medium ## Context When `iCPG` detects drift or a runtime signal (Spec 01) indicates a shipped change broke something, the agent has no automated path to recover. It knows the problem exists but still has to manually coordinate a revert — find the right commit, check for downstream work, revert, re-verify. For autonomous engineering this needs to be a first-class operation. The agent should be able to say "revert intent R-abc because its postcondition failed in production" and get a safe, auditable rollback. ## Goal Add an `icpg revert` command that safely undoes all commits attributed to a given ReasonNode, handling downstream dependencies and leaving a verifiable audit trail. ## Approach ### Step 1 — Track commit SHAs on intents iCPG already has `CREATES` / `MODIFIES` edges between ReasonNodes and Symbols. Extend the `record` command to also store the commit SHA that made the change: ``` CREATES Reason → Symbol [commit_sha, timestamp] MODIFIES Reason → Symbol [commit_sha, timestamp] ``` ### Step 2 — `icpg revert <intent-id>` The command: 1. Collects all commit SHAs attributed to this intent (from its edges) 2. Checks for downstream `REQUIRES` intents whose postconditions depend on this one 3.
If downstream intents exist and aren't in `drifted`/`abandoned` status → refuse revert, explain the chain 4. Otherwise: `git revert --no-commit <sha>...` in reverse chronological order 5. Runs the intent's `VALIDATED_BY` tests to confirm pre-intent state is reached 6. Updates the intent status to `reverted` (new status) 7. Emits a `REVERTED` edge linking the revert commit to the original commits ### Step 3 — Auto-revert on severe drift (opt-in) Wire into drift detection: when `Runtime drift` severity > 0.9 AND drift age < 1h AND `auto_revert: true` is set on the intent → trigger `icpg revert` automatically and page a human (Spec 07). Config per-project in `.icpg/config.yaml`: ```yaml auto_revert: enabled: false # opt-in per project severity_threshold: 0.9 max_age_minutes: 60 require_test_pass: true ``` ### Step 4 — Recovery for partial failures If `git revert` fails mid-way (conflicts, missing commits), leave the tree in a clean state (`git revert --abort`) and report exactly which commit failed + why. ## Integration points - `scripts/icpg/__main__.py` — add `revert` subcommand - `scripts/icpg/models.py` — add `commit_sha` field on edges, `reverted` status, `REVERTED` edge type - `scripts/icpg/drift.py` — optional auto-revert trigger for severe runtime drift - `hooks/post-commit-graph` — capture SHA when recording - `skills/icpg/SKILL.md` — add revert section ## Success criteria 1. `icpg revert <intent-id>` reverts all commits attributed to that intent cleanly or explains why it can't 2. Downstream `REQUIRES` intents block the revert with a clear message 3. Auto-revert is opt-in per-intent and only fires on high-severity runtime drift 4. Every revert is logged in the graph with `REVERTED` edges pointing to the original commits 5.
A test harness verifies revert correctness against a scripted intent lifecycle ## Depends on - Spec 01 (runtime observability) — the auto-revert signal comes from runtime drift ================================================ FILE: _project_specs/03-verifiable-contracts.md ================================================ # Spec 03: Verifiable Contracts (Property-Based Test Generation) **Status:** pending **Priority:** Tier 1 (highest leverage) **Effort:** Large ## Context iCPG's ReasonNodes already carry formal contracts: ``` preconditions: What must be true before execution postconditions: What must be true when fulfilled invariants: What must remain true ``` Today these are natural-language strings. Drift detection matches commit patterns and checksums against them heuristically. That's good but not verifiable — the agent can't prove a postcondition still holds after a change. For autonomous engineering, we want machine-checkable contracts: the agent writes a postcondition, and the system generates tests that will fail if the postcondition is ever violated. ## Goal Generate property-based tests from iCPG postconditions so drift detection becomes "did the test pass?" instead of "does the string still plausibly match?" ## Approach ### Step 1 — Structured postconditions (optional schema) Let authors write postconditions in either natural language (current) or a structured form that's machine-generatable: ```yaml postconditions: - type: "returns" of: "save_response" shape: "Response" properties: - "response.id is not null" - "response.org_id == input.org_id" - "len(response.answers) == len(input.answers)" - type: "invariant" holds: "during_save" assertion: "db.responses.count() increases by 1" ``` The structured form compiles to tests; natural language fallback uses LLM-assisted generation (Step 2). 
### Step 2 — Pluggable property-based test generators One generator per language/framework: - `scripts/icpg/codegen/hypothesis_python.py` — Hypothesis (Python) - `scripts/icpg/codegen/fastcheck_ts.py` — fast-check (TypeScript) - `scripts/icpg/codegen/proptest_rust.py` — proptest (Rust) - Natural-language postconditions use LLM generation, structured ones compile directly Each takes a `ReasonNode` and returns a test file with a `# @icpg-generated from R-abc123` header so the agent knows not to hand-edit. ### Step 3 — `icpg contracts generate <intent-id>` CLI command that: 1. Reads the intent's postconditions 2. Detects the language of the scope files (already tracked) 3. Invokes the right generator 4. Writes tests to `tests/generated/contracts/<intent-id>.test.py` (or equivalent) 5. Adds a `VALIDATED_BY` edge automatically `icpg contracts generate --all` regenerates every intent's tests (bulk operation for upgrading existing projects). ### Step 4 — Drift check gains a "contract-verified" signal Existing drift detection checks whether `VALIDATED_BY` tests exist and pass. With this spec, those tests are now *derived from the postconditions* rather than hand-written, so failure is a direct postcondition violation signal — not just "a test broke." ### Step 5 — Regenerate on intent edit When a ReasonNode's postconditions change, stale generated tests are flagged. Agent can run `icpg contracts sync` to regenerate; humans can review the diff. ## Integration points - `scripts/icpg/models.py` — add structured `postcondition` variants alongside existing strings - `scripts/icpg/codegen/` — new package, one module per language/framework - `scripts/icpg/__main__.py` — `contracts generate`, `contracts sync` subcommands - `skills/icpg/SKILL.md` — document how to write structured postconditions - `templates/reasonnode-structured.yaml` — template showing both forms ## Success criteria 1.
Given an intent with structured postconditions, `icpg contracts generate` produces a runnable property-based test in Hypothesis/fast-check 2. The generated test has a header marking it as machine-generated 3. Running the test suite fails immediately when a postcondition is violated in the actual implementation 4. Natural-language postconditions fall back to LLM generation cleanly (doesn't silently skip) 5. Drift detection differentiates "stale test" from "postcondition violation" in its severity score ## Depends on None (iCPG only). But pairs well with: - Spec 01 — runtime postconditions (metric predicates) complement code-level postconditions - Spec 02 — a generated test failure is a strong auto-revert signal ================================================ FILE: _project_specs/04-multi-agent-coordination.md ================================================ # Spec 04: Multi-Agent Coordination (Symbol-Level Locks) **Status:** pending **Priority:** Tier 2 **Effort:** Medium ## Context claude-bootstrap already has `agent-teams` and `team-coordination` skills, and Maggy ships with a P2P session-handoff pattern. But when two agents (or two sessions of the same agent) want to modify the same area of code, there's no coordination protocol. First-to-commit wins, which creates silent merge conflicts, duplicated work, and lost intent tracking. For autonomous engineering at team scale (multiple agents, or one agent coordinating long-running subtasks), we need intent-level and symbol-level locks. ## Goal Agents claim exclusive work on an intent or set of symbols before modifying, negotiate with holders of conflicting locks, and release on completion or timeout. ## Approach ### Step 1 — Lock primitive in iCPG Add a `lock` table and edge type: ``` LOCKED_BY Reason | Symbol → Agent [acquired_at, expires_at, purpose] ``` Locks are scoped to an intent (broadest), a set of files, or a set of symbols (finest). 
A lock has: - `holder_id` — agent or session identifier - `scope` — intent id | files[] | symbols[] - `purpose` — one-line description ("refactor auth service") - `acquired_at` / `expires_at` — auto-expire to prevent orphans (default 30 min) - `heartbeat_at` — renewed periodically by the holder ### Step 2 — `icpg lock` / `icpg unlock` commands ```bash icpg lock intent R-abc --purpose "refactor auth" --expires 30m icpg lock symbols auth.login,auth.logout --purpose "rate-limiting fix" icpg locks list # show all active locks icpg unlock R-abc # release icpg locks prune # remove expired ``` Lock attempts on a held scope return the holder's info so the requesting agent can decide what to do (wait, negotiate, defer). ### Step 3 — Pre-task query integration Extend the 3 canonical pre-task queries with a 4th: | Query | What It Answers | |---|---| | `icpg query locks <scope>` | Is someone else working on this right now? | The PreToolUse hook adds this to the injected context before any Edit/Write call. ### Step 4 — Negotiation protocol When an agent wants a held lock, it sends a `negotiation_request` to the holder (Mnemos message): - Requester states: intent, priority, estimated duration - Holder responds: `accept` (release), `defer` (hold until completion), `split` (narrow the lock to specific symbols) If no response within 5 minutes, the requester either takes the lock (if the holder's heartbeat is stale) or escalates (Spec 07). ### Step 5 — Conflict prevention at commit time Post-commit hook verifies the committing agent holds the right lock for all symbols the commit modified. If not, the commit is logged as `unauthorized_modification` and the drift check flags it. 
## Integration points - `scripts/icpg/models.py` — `Lock`, `LockedByEdge` - `scripts/icpg/store.py` — `acquire_lock`, `release_lock`, `prune_locks`, `list_locks` - `scripts/icpg/__main__.py` — `lock`, `unlock`, `locks` subcommands - `hooks/pre-tool-use` — inject active-lock context - `hooks/post-commit-graph` — verify lock matches modified symbols - `skills/agent-teams/SKILL.md` — add locking discipline section - `skills/icpg/SKILL.md` — document the 4th pre-task query ## Success criteria 1. Two concurrent agents attempting to modify the same symbol can't both succeed — the second sees the held lock 2. Locks auto-expire 30 min after last heartbeat (agents don't have to remember to release) 3. Pre-task queries include active-lock info 4. Commits violating lock ownership are flagged in drift reports 5. Negotiation protocol works: requester gets a structured response from holder, or escalation fires ## Depends on - Spec 07 (escalation) — when negotiation fails, escalation fires - Builds on existing `agent-teams` and Maggy P2P patterns ================================================ FILE: _project_specs/05-confidence-calibration.md ================================================ # Spec 05: Confidence Calibration (Reinforcement Loop) **Status:** pending **Priority:** Tier 3 (frontier) **Effort:** Medium ## Context iCPG's `get_risk_profile` query today classifies symbols as fragile/stable based on ownership history and drift count. It doesn't learn from what actually failed when agents touched it. An agent that tried refactoring this file three times and failed gets the same risk score as one that hasn't been tried yet. For autonomous engineering, we want a reinforcement loop: past agent failures against a symbol or pattern should raise its risk score for future agents. ## Goal Track agent actions and their outcomes against symbols/patterns, and use that history to calibrate confidence for future pre-task queries. 
## Approach ### Step 1 — Action-outcome tracking Add two new node types to iCPG: ``` AgentAction { id, agent, intent, scope[], timestamp } Outcome { action_id, result, evidence } ``` Result types: - `success` — tests passed, intent fulfilled, no drift - `partial` — intent fulfilled but introduced drift elsewhere - `failure_test` — tests failed, rolled back - `failure_runtime` — shipped, runtime drift detected (Spec 01) - `abandoned` — agent gave up Evidence is a pointer — commit SHA, test output, drift report. ### Step 2 — Hook into existing flows Automatic capture: - Pre-task query writes an open `AgentAction` node - Post-commit: matches to the most recent pending action and records outcome based on test results - Drift check: if a `VALIDATED_BY` test fails on an intent, the agent action tied to that intent's commit is marked `failure_test` - Spec 01 runtime drift: marks `failure_runtime` - Spec 02 auto-revert: marks `abandoned` ### Step 3 — Risk score now includes success rate `icpg query risk ` returns a calibrated score: ``` Historical success rate for this symbol: 40% (2 of 5 attempts successful) Pattern complexity: high (10+ dependents, 3 owners, drifted twice) Recommendation: treat as fragile — consider smaller changes or pair ``` Calibration uses a simple Bayesian update: prior = structural risk (current method), likelihood = recent action outcomes. ### Step 4 — Pattern-level learning (stretch) For autonomous agents, single-symbol history is too narrow — we want "refactors of dataclasses with >5 fields fail 60% of the time." This requires clustering actions by pattern, not just symbol. Defer this to a v2 of this spec; first ship the single-symbol version. ### Step 5 — Privacy & data hygiene Action history is sensitive (could leak intent details). 
Make it: - Opt-out per project (`.icpg/config.yaml: track_outcomes: false`) - Redact content, keep structure only (symbol ids, outcome types, timestamps) - Never exported outside the `.icpg/` directory ## Integration points - `scripts/icpg/models.py` — `AgentAction`, `Outcome` node types - `scripts/icpg/store.py` — outcome-tracking tables - `scripts/icpg/drift.py` — risk scoring gains history term - `hooks/pre-tool-use` — record `AgentAction` before Edit/Write calls - `hooks/post-commit-graph` — finalize the outcome - `skills/icpg/SKILL.md` — document calibrated risk semantics ## Success criteria 1. Every agent Edit/Write action is automatically logged (no manual reporting) 2. `icpg query risk <symbol>` returns a score incorporating historical outcomes 3. Risk score equals the structural risk (the prior) when action history is empty, so existing behavior does not regress 4. Privacy opt-out works — no history written when disabled 5. A test harness replays an action sequence and verifies calibrated scores update correctly ## Depends on - Spec 01 (runtime observability) — feeds `failure_runtime` signal - Spec 02 (rollback) — feeds `abandoned` signal - Spec 03 (verifiable contracts) — feeds high-signal `failure_test` from postcondition failures ================================================ FILE: _project_specs/06-cost-budget-awareness.md ================================================ # Spec 06: Cost / Budget Awareness **Status:** pending **Priority:** Tier 3 (frontier) **Effort:** Small ## Context Autonomous agents stuck in loops burn real money. Mnemos's fatigue detection (4-dim: tokens, scatter, re-reads, error density) is a *behavioral* proxy for "the agent is struggling" but it isn't a hard stop. An agent that's actually wasting tokens or API calls needs a budget ceiling. 
This matters especially for: - `/improve-maggy`, self-improvement flows, anything that spawns subagents - Team runs where one misbehaving agent shouldn't bankrupt the whole run - Maggy's TDD execute pipeline (up to 3 Claude Code invocations per ticket) ## Goal Add per-task and per-session budget limits with hard stops and a budget-aware fatigue state. ## Approach ### Step 1 — Declare a budget in intent config Extend ReasonNode: ```yaml budget: tokens: 100000 api_calls: 50 wall_clock_minutes: 30 usd: 5.00 ``` All fields optional. `usd` is calculated from model pricing tables (current Sonnet/Opus rates, refreshed quarterly). ### Step 2 — Track spend via hooks PostToolUse hook accumulates: - Tokens consumed (from `transcript_path` JSON blobs) - Claude API calls (by counting tool uses) - Wall clock elapsed since intent started Stored in `.icpg/budgets/<intent-id>.json` with heartbeats. ### Step 3 — Budget-aware fatigue state Add a 5th Mnemos fatigue dimension: `budget_burn_rate`. If the agent has consumed 70% of its token budget at 40% progress, that's a signal to compress / consolidate / consider abandoning. Threshold behavior: | Budget consumed | Action | |---|---| | <60% | Normal | | 60-85% | Mnemos COMPRESS state forced | | 85-100% | Mnemos REM state forced, agent warned to wrap up | | >100% | Hard stop — PreToolUse hook rejects further Edit/Write/Bash | ### Step 4 — Graceful stop behavior When budget is exceeded: 1. PreToolUse hook returns `budget_exceeded` error with context about remaining work 2. Agent is expected to write a handoff Mnemos checkpoint before exiting 3. Intent status flips to `deferred_budget` 4. Human (or another agent with a fresh budget) can resume from the checkpoint ### Step 5 — Budget override A human can set `allow_overage: true` on an intent or raise the limit mid-run. Override requires a commit to the intent's config (auditable). 
## Integration points - `scripts/icpg/models.py` — `Budget` field on ReasonNode - `scripts/icpg/budget.py` — new module for tracking and enforcement - `hooks/pre-tool-use` — budget check before Edit/Write/Bash - `hooks/post-tool-use` — accumulate spend - `templates/pricing.yaml` — model → $/token table, refreshed quarterly - `skills/mnemos/SKILL.md` — document the 5th fatigue dimension - `skills/icpg/SKILL.md` — document budget declaration ## Success criteria 1. An intent with a 10k-token budget hard-stops at 10k tokens via PreToolUse rejection 2. Mnemos fatigue state reflects budget consumption (COMPRESS / REM / EMERGENCY) 3. Budget overruns leave a Mnemos handoff checkpoint so work can resume 4. `icpg budgets list` shows current spend vs limit per active intent 5. No budget declared → no enforcement (backward compatible) ## Depends on - Mnemos fatigue model (already exists) - Nothing else ================================================ FILE: _project_specs/07-human-escalation-protocol.md ================================================ # Spec 07: Human-in-the-Loop Escalation Protocol **Status:** pending **Priority:** Tier 1 (highest leverage) **Effort:** Small-Medium ## Context When an autonomous agent hits a wall it can't resolve — drift it can't fix, a contract violation with no clear cause, lock negotiation failure, budget exceeded — there's no formal protocol for raising the problem to a human. The hooks infrastructure exists, the discipline doesn't. Today the agent might: - Silently continue and compound the issue - Write a confused summary and exit, leaving no actionable packet - Page every minor issue, creating alert fatigue None of these scale to autonomous engineering at a team level. ## Goal A standard escalation protocol: the agent packages a context packet (what it tried, what went wrong, what it needs a human to decide) and delivers it through a configured channel. 
## Approach ### Step 1 — Escalation packet schema ```yaml escalation: id: "esc-abc123" agent: "claude-opus-4.7" intent: "R-auth-refactor" severity: "blocking" # blocking | high | medium | low category: "drift_unresolvable" # or: contract_violation, lock_conflict, # budget_exceeded, taint_detected, unknown summary: "Two-sentence description of the situation" what_was_tried: - "Attempted X — result: failed because Y" - "Attempted Z — result: partial" proposed_options: - "Option A: revert to sha abc, human makes a decision" - "Option B: accept the drift, update postcondition" context_refs: - "commit: sha-latest" - "intent: R-auth-refactor" - "drift_report: path/to/drift.json" - "mnemos_checkpoint: path/to/checkpoint.json" awaiting: "resolution" ``` ### Step 2 — `icpg escalate` CLI ```bash icpg escalate --intent R-auth-refactor \ --category drift_unresolvable \ --severity blocking \ --summary "Cannot resolve postcondition drift" \ --context drift.json ``` Writes the packet to `.icpg/escalations/.yaml` and fires the configured delivery channel. 
### Step 3 — Pluggable delivery channels One adapter per channel (`scripts/icpg/escalation/`): - `slack_adapter.py` — post to configured channel with packet fields - `github_issue_adapter.py` — create issue with the packet - `email_adapter.py` — SendGrid / SMTP - `file_adapter.py` — default; writes to `.icpg/escalations/` only (for local/dev) Config in `.icpg/config.yaml`: ```yaml escalation: channels: - type: slack webhook_url_env: SLACK_ESCALATION_WEBHOOK min_severity: high - type: github_issue repo: "org/repo" min_severity: blocking ``` ### Step 4 — Auto-trigger from known conditions Wire automatic escalations: | Condition | Severity | Category | |---|---|---| | Drift severity >0.8, auto-revert failed | blocking | drift_unresolvable | | Contract violation caught by generated test (Spec 03) | high | contract_violation | | Lock negotiation timeout (Spec 04) | medium | lock_conflict | | Budget exceeded without handoff checkpoint (Spec 06) | high | budget_exceeded | | CodeQL finds new taint path | blocking | taint_detected | Each hook module calls `icpg escalate` with the right packet when its trigger fires. ### Step 5 — Resolution tracking When a human responds (comment on the GitHub issue, Slack thread reply with a resolution marker like `resolved: revert`), an `EscalationResolution` node is written and any pending agent waiting on the packet can resume. Agents consult `icpg escalations list --pending` as part of their pre-task queries. ### Step 6 — Rate limiting / dedup Don't spam. If the same intent + category has an open escalation, merge into it (append to `what_was_tried`) instead of creating a new one. Escalation adapter respects a per-channel rate limit. 
## Integration points - `scripts/icpg/models.py` — `Escalation`, `EscalationResolution` - `scripts/icpg/escalation/` — new package, one module per channel - `scripts/icpg/__main__.py` — `escalate`, `escalations list/resolve` subcommands - `hooks/post-tool-use` — auto-escalate on trigger conditions - `skills/icpg/SKILL.md` — document when agents should manually call it - `templates/escalation-config.yaml` — example config ## Success criteria 1. Agent can manually escalate a situation with `icpg escalate` and humans receive it through at least one channel (Slack preferred) 2. Auto-escalations fire for all 5 trigger conditions above 3. Dedup works — same intent + category doesn't spam 4. Human resolution flows back as `EscalationResolution` node, pending agents can detect it 5. Local/dev config uses file-only adapter (no external calls), never breaks tests ## Depends on None directly — builds on existing hook infrastructure. Integrates with: - Spec 02 (rollback) — failed auto-revert triggers escalation - Spec 03 (contracts) — test failures trigger escalation - Spec 04 (locks) — negotiation timeout triggers escalation - Spec 06 (budget) — overrun without handoff triggers escalation ================================================ FILE: _project_specs/08-auto-code-index.md ================================================ # Spec 08: Auto-Derived CODE_INDEX from Graph **Status:** pending **Priority:** Tier 2 **Effort:** Small-Medium ## Context The `code-deduplication` skill requires a `CODE_INDEX.md` in the project root — a capability index that tells the agent "this already exists, don't reimplement it." The current design asks humans (or agents) to maintain it manually. 
In practice: - Agents don't reliably update the index when they add capabilities - Humans forget to update it - The index drifts from reality fast - Agents that check the index get stale info and duplicate anyway Since we already have `codebase-memory-mcp` (symbol graph) and `iCPG` (intent graph), we can derive the capability index from them instead of hand-maintaining it. ## Goal Auto-generate `CODE_INDEX.md` from the graph, refreshed on every commit, organized by capability so agents can check-before-write reliably. ## Approach ### Step 1 — Capability extraction pass A new pass over the combined graph: 1. Read all `ReasonNode`s with status `fulfilled` (iCPG) 2. For each, pull the symbols they `CREATE` or `MODIFY` 3. Group by capability domain (inferred from: - intent's `scope` path prefixes — `app/auth/*` → "auth" - intent's `decision_type` — `business_goal` and `arch_decision` are top-level, `task` and `workaround` are subcategories - common tag patterns in the codebase) 4. For each capability, collect the main entry points (public classes/functions that serve that capability) ### Step 2 — Emit CODE_INDEX.md ```markdown # Code Capability Index Auto-generated from iCPG + codebase-memory-mcp. Last updated: 2026-04-20. Run `icpg index build` to regenerate. ## Authentication **Capability:** user auth, session management, token handling **Entry points:** - `app.auth.login_user()` [app/auth/login.py:42] — primary login - `app.auth.session.SessionManager` [app/auth/session.py] — session lifecycle **Intents:** R-auth-base, R-jwt-refactor, R-rate-limit ## Survey responses **Capability:** create, validate, persist, query survey responses **Entry points:** ... ``` Output is deterministic — same graph state produces the same output. ### Step 3 — Hook into post-commit Every commit that records new iCPG edges triggers a regeneration. Runs in under a second for typical repo sizes since it's a DB scan + markdown emit. 
### Step 4 — `icpg index` subcommand ```bash icpg index build # regenerate CODE_INDEX.md icpg index check # verify CODE_INDEX.md matches graph state (for CI) icpg index query auth # query a specific capability section ``` The `check` subcommand lets CI reject commits that leave an out-of-sync CODE_INDEX. ### Step 5 — Agent workflow integration The `code-deduplication` skill's pre-write discipline stays the same, but the data source changes from "human-maintained CODE_INDEX.md" to "graph-derived CODE_INDEX.md." Update the skill to: 1. Call `icpg query prior ""` (iCPG's existing prior-work query) 2. If no match, consult the index sections matching the intent's scope 3. Only create new code if both checks are dry Also add `icpg query capability ""` — a semantic search over capability descriptions in the index, not just symbol names. ### Step 6 — Keep hand-written sections (optional) Let humans add non-derived sections (architecture notes, business domain glossary) in a separate file — `CODE_INDEX.human.md` — and `icpg index build` appends it. Auto-derived + human annotations cleanly separated. ## Integration points - `scripts/icpg/index.py` — new module, grouping + emit logic - `scripts/icpg/__main__.py` — `index build`, `index check`, `index query` subcommands - `hooks/post-commit-graph` — call `icpg index build` - `skills/code-deduplication/SKILL.md` — update to reference auto-derived index - `templates/CODE_INDEX.md` — deprecate the hand-maintained template; add note pointing to the auto-generated path ## Success criteria 1. On any repo with iCPG populated, `icpg index build` produces a grouped, readable CODE_INDEX.md 2. `icpg index check` detects drift between graph and markdown (for CI) 3. Agents find existing capabilities via semantic search (`icpg query capability "rate limiting"`) 4. Generation is deterministic — same graph → same markdown 5. Backward compatible: projects without iCPG continue to hand-maintain; projects with iCPG get the auto version 6. 
Regeneration is <2s on a 10k-symbol repo ## Depends on - iCPG (required) - codebase-memory-mcp (preferred — used for richer capability grouping) ================================================ FILE: _project_specs/09-multimodal-ingestion.md ================================================ # Spec 09: Multimodal Ingestion (Optional Graphify-Style Extension) **Status:** pending **Priority:** Tier 3 (frontier / optional) **Effort:** Large ## Context Our stack is code-only. Some repos carry essential context in non-code artifacts: - Product specs in PDFs or Google Docs exports - Architecture diagrams in PNG / Miro / whiteboard photos - Engineering demos in MP4 - Research papers in PDF When an autonomous agent works on such a repo, it currently ignores these artifacts. That's a real gap — the agent makes code decisions without knowing the intent captured in the diagrams or docs. Graphify (github.com/safishamsi/graphify) solves this: it ingests docs, images, audio, and video into the same knowledge graph as code. We don't need to rebuild their work — we can adopt their approach as an optional extension to claude-bootstrap. **This spec is optional** — only valuable if your repos actually carry non-code context. Most don't. ## Goal Let claude-bootstrap ingest non-code artifacts into the iCPG graph so agents can reason about code + docs + images in the same queries. ## Approach ### Step 1 — Artifact node type Extend iCPG with a new node: ``` Artifact { id, path, kind, content_hash, ingested_at, extracted_concepts: [] // concept strings } ``` Kinds: `pdf`, `markdown`, `image`, `diagram`, `video`, `audio`, `slides`. 
### Step 2 — Ingestion pipeline `icpg ingest <path>` — one command, pluggable extractors: - `pdf_extractor.py` — text via `pypdf` or `pdfplumber`, then LLM to extract key concepts - `markdown_extractor.py` — parse headings, blockquotes, pull out "key decision" patterns - `image_extractor.py` — Claude multimodal: "describe this diagram; list entities and relationships" - `video_extractor.py` — `faster-whisper` transcription with domain-aware prompt, then concept extraction - `audio_extractor.py` — same as video, skip video decode Each extractor emits concept nodes + relationships back into iCPG using three new edge types: - `DESCRIBES` — Artifact → Symbol / Reason (this doc describes this code) - `MENTIONS` — Artifact → Concept (looser reference) - `DECIDES` — Artifact → Reason (this doc made an architectural decision that became an intent) ### Step 3 — `.icpgignore` for ingest paths Respect a per-project `.icpgignore` like graphify's `.graphifyignore`, using `.gitignore` syntax. Default excludes: `node_modules/`, `dist/`, `.venv/`, `*.generated.*`, binary builds. ### Step 4 — Incremental refresh Track content hashes per artifact. Re-ingest only when hash changes. Bulk re-ingest via `icpg ingest --refresh`. ### Step 5 — Extend pre-task queries Add a 5th canonical query: ```bash icpg query docs "<topic>" # Find artifacts relevant to this topic ``` Returns: artifact paths, extracted concepts, relationships to code symbols. The PreToolUse hook includes this in the injected context when the agent is about to write code in a scope touched by `DESCRIBES` edges. ### Step 6 — Transparent honesty about inference Adopt graphify's `EXTRACTED` / `INFERRED` / `AMBIGUOUS` edge labeling. PDF text → EXTRACTED. Image concept → INFERRED with confidence. Whiteboard smudged text → AMBIGUOUS, flagged for review. ### Step 7 — Cost control LLM-based extractors (images, video transcripts) are expensive. Respect Spec 06 budgets. 
`icpg ingest` without a budget flag runs only the free extractors (markdown, PDF text). Image / video / audio require `--enable-llm` explicit flag. ### Step 8 — Distribution Ship this as a **separate installable package** — `claude-bootstrap-multimodal` on PyPI. Base claude-bootstrap stays code-only. Users opt in: ```bash pip install claude-bootstrap-multimodal icpg ingest docs/ specs/ ``` ## Integration points - `scripts/icpg/models.py` — `Artifact`, new edge types (`DESCRIBES`, `MENTIONS`, `DECIDES`) - `scripts/icpg/ingest/` — new package (could live in a separate repo) - `scripts/icpg/__main__.py` — `ingest` subcommand - `skills/icpg/SKILL.md` — document the 5th pre-task query - `skills/multimodal/SKILL.md` — new skill describing when to use ingestion ## Success criteria 1. `icpg ingest docs/` processes markdown + PDF without LLM and creates artifact nodes 2. `icpg ingest --enable-llm specs/` processes images and videos, with the budget flag respected 3. Pre-task queries surface relevant documentation when the agent is about to modify code touched by `DESCRIBES` edges 4. Re-ingestion only processes changed files (hash-based cache) 5. Base claude-bootstrap doesn't require multimodal deps to work — installed separately ## Depends on - Spec 06 (budget) — LLM extractors must respect budget caps ## Alternative: adopt graphify directly Instead of building this, we could document "for multimodal, run graphify alongside" and provide a conversion tool that imports graphify's `graph.json` into iCPG as Artifact nodes. This is faster to ship and avoids duplicating graphify's work. **Recommendation:** ship the conversion tool first (1-2 days of work), observe adoption, build native ingestion only if real demand emerges. ================================================ FILE: commands/analyze-repo.md ================================================ # Analyze Repository Analyze an existing repository's structure, conventions, and guardrails. 
**This command runs automatically** when `/initialize-project` detects an existing codebase without Claude setup. You can also run it standalone anytime. **Use this command standalone when:** - You want to re-analyze after making changes - You want analysis without running `/initialize-project` - Auditing code quality and guardrails on any repo - Reviewing a codebase without adding Claude skills **Automatic trigger:** - `/initialize-project` on existing codebase → auto-runs this analysis first --- ## Phase 1: Repository Detection Run these checks to understand the repo: ```bash # Git info echo "=== Git Status ===" && \ git remote -v 2>/dev/null && \ git branch -a 2>/dev/null | head -10 && \ git log --oneline -5 2>/dev/null # Config files echo "=== Config Files ===" && \ ls -la *.json *.toml *.yaml *.yml 2>/dev/null # Directory structure (3 levels, excluding noise) echo "=== Directory Structure ===" && \ find . -type d -maxdepth 3 \ -not -path "*/node_modules/*" \ -not -path "*/.git/*" \ -not -path "*/venv/*" \ -not -path "*/__pycache__/*" \ -not -path "*/dist/*" \ -not -path "*/build/*" \ 2>/dev/null | head -40 ``` --- ## Phase 2: Tech Stack Detection Identify the primary technologies: ```bash # JavaScript/TypeScript if [ -f "package.json" ]; then echo "=== Package.json ===" && \ cat package.json | head -50 fi # Python if [ -f "pyproject.toml" ]; then echo "=== pyproject.toml ===" && \ cat pyproject.toml fi # Mobile ls pubspec.yaml android/build.gradle ios/*.xcodeproj 2>/dev/null ``` Based on findings, determine: | File | Technology | |------|------------| | package.json + tsconfig.json | TypeScript | | package.json (no tsconfig) | JavaScript | | pyproject.toml | Python | | pubspec.yaml | Flutter (Dart) | | android/build.gradle | Android Native | | Cargo.toml | Rust | | go.mod | Go | --- ## Phase 3: Repo Structure Type Classify the repository: ```bash # Check structure type echo "=== Repo Structure Type ===" && \ if [ -d "packages" ] || [ -d "apps" ] || grep -q 
'"workspaces"' package.json 2>/dev/null; then echo "MONOREPO - Multiple packages/apps with shared tooling" elif [ -d "frontend" ] && [ -d "backend" ]; then echo "FULL-STACK MONOLITH - Frontend + Backend in same repo" elif [ -d "src" ] && grep -q '"react\|vue\|angular"' package.json 2>/dev/null; then echo "FRONTEND - Single frontend application" elif [ -d "src" ] && grep -q '"express\|fastify\|koa"' package.json 2>/dev/null; then echo "BACKEND - Single backend application" elif [ -f "pyproject.toml" ] && grep -q "fastapi\|django\|flask" pyproject.toml 2>/dev/null; then echo "BACKEND (Python) - Single backend application" else echo "STANDARD - Single-purpose repository" fi ``` --- ## Phase 4: Guardrails Audit Check existing code quality tools: ```bash echo "=== Guardrails Audit ===" && \ # Pre-commit hooks echo "Pre-commit Hooks:" && \ [ -d ".husky" ] && echo " [x] Husky installed" || echo " [ ] Husky NOT installed" && \ [ -f ".pre-commit-config.yaml" ] && echo " [x] pre-commit framework" || echo " [ ] pre-commit framework NOT installed" && \ [ -f ".git/hooks/pre-commit" ] && echo " [x] Git hooks present" || echo " [ ] No git hooks" # Linting echo "Linting:" && \ (grep -q '"eslint"' package.json 2>/dev/null && echo " [x] ESLint") || \ (grep -q '"biome"' package.json 2>/dev/null && echo " [x] Biome") || \ (grep -q "ruff" pyproject.toml 2>/dev/null && echo " [x] Ruff") || \ echo " [ ] No linter detected" # Formatting echo "Formatting:" && \ (grep -q '"prettier"' package.json 2>/dev/null && echo " [x] Prettier") || \ (grep -q "black" pyproject.toml 2>/dev/null && echo " [x] Black") || \ (grep -q "ruff" pyproject.toml 2>/dev/null && echo " [x] Ruff (formatting)") || \ echo " [ ] No formatter detected" # Type checking echo "Type Checking:" && \ ([ -f "tsconfig.json" ] && echo " [x] TypeScript") || \ (grep -q "mypy" pyproject.toml 2>/dev/null && echo " [x] mypy") || \ (grep -q "pyright" pyproject.toml 2>/dev/null && echo " [x] pyright") || \ echo " [ ] No type checker 
detected" # Testing echo "Testing:" && \ (grep -q '"jest\|vitest"' package.json 2>/dev/null && echo " [x] Jest/Vitest") || \ (grep -q "pytest" pyproject.toml 2>/dev/null && echo " [x] pytest") || \ echo " [ ] No test framework detected" # Commit validation echo "Commit Validation:" && \ ([ -f "commitlint.config.js" ] && echo " [x] commitlint") || \ (grep -q "conventional-pre-commit" .pre-commit-config.yaml 2>/dev/null && echo " [x] conventional-pre-commit") || \ echo " [ ] No commit validation" # CI/CD echo "CI/CD:" && \ [ -d ".github/workflows" ] && echo " [x] GitHub Actions" || echo " [ ] No GitHub Actions" && \ [ -f ".gitlab-ci.yml" ] && echo " [x] GitLab CI" || true && \ [ -f "Jenkinsfile" ] && echo " [x] Jenkins" || true ``` --- ## Phase 5: Convention Detection Identify existing code patterns: ```bash echo "=== Convention Detection ===" && \ # File naming echo "File Naming:" && \ ls src/**/*.ts 2>/dev/null | head -5 && \ ls src/**/*.py 2>/dev/null | head -5 # Import style (JS/TS) echo "Import Style:" && \ grep -h "^import" src/**/*.ts 2>/dev/null | head -5 # Export style (JS/TS) echo "Export Style:" && \ grep -h "^export" src/**/*.ts 2>/dev/null | head -5 # Test file location echo "Test Location:" && \ find . -name "*.test.*" -o -name "*.spec.*" -o -name "test_*.py" 2>/dev/null | head -5 ``` --- ## Phase 6: Generate Report Based on all findings, generate this report structure: ```markdown # Repository Analysis Report **Generated:** [timestamp] **Repository:** [name from git remote or directory] ## Overview | Attribute | Value | |-----------|-------| | Type | [Monorepo / Full-Stack / Frontend / Backend] | | Language | [TypeScript / Python / ...] | | Framework | [React / FastAPI / ...] 
| | Package Manager | [npm / pnpm / uv / pip] | ## Directory Structure [Simplified tree output] ## Tech Stack | Category | Technology | Config | |----------|------------|--------| | Language | X | X | | Framework | X | X | | Testing | X | X | | Linting | X | X | | Formatting | X | X | ## Guardrails Status ### Present - [x] Item 1 - [x] Item 2 ### Missing (Recommended to Add) - [ ] Item 1 - [brief reason] - [ ] Item 2 - [brief reason] ## Conventions Observed | Pattern | Observed Value | Example | |---------|----------------|---------| | Naming | camelCase / snake_case | file.ts | | Imports | Absolute / Relative | @/components | | Tests | Colocated / Separate | *.test.ts | | Exports | Named / Default | export { X } | ## Recommendations 1. **High Priority** - [Recommendation with reason] 2. **Medium Priority** - [Recommendation with reason] 3. **Low Priority / Nice to Have** - [Recommendation with reason] ## Key Files to Review | File | Purpose | Why Review | |------|---------|------------| | src/index.ts | Entry point | Understand app bootstrap | | src/config.ts | Configuration | Understand env handling | | tests/setup.ts | Test setup | Understand test patterns | ``` --- ## Phase 7: Offer Next Steps After generating the report, offer these options: > **Analysis complete!** Here's what I found: [summary] > > What would you like to do next? > 1. **Add missing guardrails** - Set up pre-commit hooks, linting, etc. > 2. **Generate detailed conventions doc** - Document patterns for team > 3. **Set up Claude integration** - Run `/initialize-project` to add Claude skills > 4. **Start working on code** - I'll follow the conventions I detected > 5. 
**Something else** --- ## Quick Analysis (One Command) For a quick overview without the full report: ```bash echo "=== Quick Analysis ===" && \ echo "Repo: $(basename "$(pwd)")" && \ echo "Type: $([ -d packages ] && echo 'Monorepo' || ([ -d frontend ] && [ -d backend ] && echo 'Full-Stack') || echo 'Standard')" && \ echo "Tech: $([ -f package.json ] && echo 'JS/TS' || ([ -f pyproject.toml ] && echo 'Python') || echo 'Other')" && \ echo "Guardrails: $([ -d .husky ] || [ -f .pre-commit-config.yaml ] && echo 'Present' || echo 'Missing')" && \ echo "CI/CD: $([ -d .github/workflows ] && echo 'GitHub Actions' || echo 'None')" ``` ================================================ FILE: commands/analyze-workspace.md ================================================ # /analyze-workspace > Full dynamic analysis of workspace topology, dependencies, and contracts. ## Trigger Run this command when: - First time setting up workspace awareness - Major refactor or new module added - Weekly scheduled refresh - `/sync-contracts` reports too much drift - Switching to work on a different workspace ## Behavior ### Phase 1: Topology Discovery (~30 seconds) ``` 🔍 Analyzing workspace topology... Checking workspace indicators: ✓ Found turbo.json (Turborepo) ✓ Found pnpm-workspace.yaml ✗ No nx.json ✗ No lerna.json Workspace type: Monorepo (Turborepo) Root: /Users/ali/code/myapp Discovering modules... ✓ apps/web (package.json found) ✓ apps/api (pyproject.toml found) ✓ packages/shared-types (package.json found) ✓ packages/db (package.json found) Modules found: 4 ``` ### Phase 2: Module Analysis (~60 seconds) For each module, analyze: ``` 📦 Analyzing apps/web... Tech stack: Next.js 14, TypeScript, TailwindCSS Entry point: src/app/layout.tsx Key directories: src/lib/, src/components/, src/types/ Dependencies: @repo/shared-types, @repo/ui External calls: fetch → apps/api (15 files) Token estimate: 18K full, 5K summarized 📦 Analyzing apps/api...
Tech stack: FastAPI, Python 3.12, SQLAlchemy Entry point: app/main.py Key directories: app/routes/, app/schemas/, app/models/ Dependencies: packages/db (internal) Exposes: OpenAPI spec (47 endpoints) Token estimate: 24K full, 7K summarized 📦 Analyzing packages/shared-types... Tech stack: TypeScript Entry point: src/index.ts Exports: 34 types Consumed by: apps/web, apps/api (codegen) Token estimate: 3K 📦 Analyzing packages/db... Tech stack: Drizzle ORM, TypeScript Entry point: src/index.ts Tables: 12 Migrations: 23 Token estimate: 8K full, 2K schema only ``` ### Phase 3: Contract Extraction (~45 seconds) ``` 📜 Extracting contracts... OpenAPI Detection: ✓ apps/api/openapi.json (47 endpoints, 23 schemas) GraphQL Detection: ✗ No GraphQL schemas found TypeScript Types: ✓ packages/shared-types/src/index.ts (34 exports) Pydantic Schemas: ✓ apps/api/app/schemas/ (23 models) Database Schema: ✓ packages/db/schema/ (12 tables) Contract sources registered: 5 files ``` ### Phase 4: Dependency Graph (~30 seconds) ``` 🔗 Building dependency graph... Internal dependencies: apps/web → packages/shared-types (23 imports) apps/web → apps/api (15 API calls) apps/api → packages/db (12 imports) apps/api → packages/shared-types (codegen) packages/db → (none) packages/shared-types → (none) Dependency order (for changes): 1. packages/shared-types (leaf) 2. packages/db (leaf) 3. apps/api (depends on db, shared-types) 4. apps/web (depends on api, shared-types) ``` ### Phase 5: Key File Identification (~30 seconds) ``` 📁 Identifying key files... 
High priority (always relevant): ✓ apps/api/openapi.json ✓ packages/shared-types/src/index.ts ✓ apps/web/src/lib/api/client.ts Context-specific: ✓ API work: apps/api/app/routes/*.py ✓ DB work: packages/db/schema/*.ts ✓ Auth work: apps/api/app/routes/auth.py + deps ✓ Frontend: apps/web/src/components/** Token budget by context: Frontend API: ~8K tokens Backend endpoints: ~12K tokens Database changes: ~6K tokens Shared types: ~4K tokens ``` ### Phase 6: Generate Artifacts ``` 📝 Generating workspace artifacts... Created: ✓ _project_specs/workspace/TOPOLOGY.md ✓ _project_specs/workspace/CONTRACTS.md ✓ _project_specs/workspace/DEPENDENCY_GRAPH.md ✓ _project_specs/workspace/KEY_FILES.md ✓ _project_specs/workspace/CROSS_REPO_INDEX.md ✓ _project_specs/workspace/.contract-sources ``` ## Final Output ``` ════════════════════════════════════════════════════════════════ WORKSPACE ANALYSIS COMPLETE ════════════════════════════════════════════════════════════════ Workspace: myapp Type: Monorepo (Turborepo) Modules: 4 (2 apps, 2 packages) ┌─────────────────────────────────────────────────┐ │ apps/web (Next.js) ←──── apps/api (FastAPI) │ │ │ │ │ │ ▼ ▼ │ │ packages/shared-types packages/db │ └─────────────────────────────────────────────────┘ Contracts: REST API: 47 endpoints Shared types: 34 interfaces DB tables: 12 Token Estimates: Current module only: ~20K tokens With cross-module context: ~45K tokens Full workspace: ~53K tokens Budget remaining: ~100K tokens ✓ Artifacts generated in: _project_specs/workspace/ Next steps: • Contracts will auto-sync on commit (if changed) • Run /sync-contracts manually to refresh • Run /workspace-status for quick check ════════════════════════════════════════════════════════════════ ``` ## Flags | Flag | Description | |------|-------------| | `--force` | Regenerate all artifacts even if recent | | `--type ` | Override auto-detection: `monorepo`, `multi-repo`, `hybrid` | | `--repos ` | For multi-repo: comma-separated paths to related repos | | 
`--skip-contracts` | Skip contract extraction (faster) | | `--verbose` | Show detailed analysis output | | `--json` | Output as JSON (for tooling) | ## Multi-Repo Mode For workspaces with separate git repositories: ```bash # Auto-detect sibling repos /analyze-workspace --type multi-repo # Specify repo locations explicitly /analyze-workspace --type multi-repo --repos "../backend,../shared,../mobile" ``` Claude will: 1. Detect related repos in parent directory 2. Set up symlinks in `.workspace/repos/` if needed 3. Analyze each repo 4. Build cross-repo dependency graph 5. Extract contracts from each ## Integration Points ### On First Run Creates the full workspace context structure: ``` _project_specs/ └── workspace/ ├── TOPOLOGY.md ├── CONTRACTS.md ├── DEPENDENCY_GRAPH.md ├── KEY_FILES.md ├── CROSS_REPO_INDEX.md ├── .contract-sources └── cache/ # Cached cross-repo files ``` ### Updates CLAUDE.md Adds workspace skill reference: ```markdown ## Skills - .claude/skills/workspace.md ``` ### Sets Up Hooks Installs contract freshness hooks: - Session start: Staleness check - Post-commit: Auto-sync trigger - Pre-push: Validation gate ## Error Handling ### No Workspace Detected ``` ⚠️ No workspace configuration detected This appears to be a single-repo project. Use /analyze-repo for single repository analysis. Or specify workspace type manually: /analyze-workspace --type monorepo /analyze-workspace --type multi-repo --repos "../other-repo" ``` ### Access Denied to Related Repo ``` ⚠️ Cannot access related repository: ../backend Options: 1. Ensure the repo exists at that path 2. Create symlink: ln -s /path/to/backend .workspace/repos/backend 3. Skip this repo: /analyze-workspace --skip-repo backend ``` ### Contract Extraction Failed ``` ⚠️ Failed to extract contracts from apps/api Reason: openapi.json not found Suggestions: 1. Generate OpenAPI spec: cd apps/api && python -m app.generate_openapi 2. Skip contract extraction: /analyze-workspace --skip-contracts 3. 
Use inferred contracts: /analyze-workspace --infer-contracts ``` ## When to Re-run | Scenario | Action | |----------|--------| | Added new module/package | Full `/analyze-workspace` | | Changed API endpoints | `/sync-contracts` (lightweight) | | Major refactor | Full `/analyze-workspace --force` | | Weekly maintenance | Full `/analyze-workspace` | | Quick check | `/workspace-status` | ================================================ FILE: commands/check-contributors.md ================================================ # Check Contributors Checks who's working on the project and optionally converts to a multi-person project with team state management. --- ## What This Command Does 1. **Detect current state** - Is this a solo or team project? 2. **Show active contributors** - Who's working on what right now? 3. **Offer conversion** - Convert solo → team project if needed --- ## Phase 1: Detect Project Type Check for team structure: ```bash # Check if team coordination exists ls _project_specs/team/state.md 2>/dev/null ls _project_specs/team/contributors.md 2>/dev/null # Check git contributors git shortlog -sn --all 2>/dev/null | head -10 # Check recent activity git log --oneline --since="7 days ago" --format="%an" | sort | uniq -c | sort -rn ``` ### If Team Structure Exists Report current state: ``` 📊 Team Project Detected Contributors: ┌──────────┬────────────────┬──────────┐ │ Handle │ Focus Area │ Status │ ├──────────┼────────────────┼──────────┤ │ @alice │ Backend, Auth │ 🟢 Active │ │ @bob │ Frontend │ 🟡 Paused │ └──────────┴────────────────┴──────────┘ Active Sessions: • @alice working on TODO-042 (src/auth/*) • No conflicts detected Claimed Todos: • TODO-042 - @alice (since 2024-01-15) • TODO-038 - @bob (since 2024-01-14) Recent Decisions: • [2024-01-15] JWT vs Sessions - chose JWT (@alice) Run 'cat _project_specs/team/state.md' for full details. 
``` ### If Solo Project ``` 👤 Solo Project Detected Git contributors found: • alice@example.com (142 commits) • bob@example.com (38 commits) ← Recent activity This project has multiple git contributors but no team coordination. Would you like to: 1. Convert to team project (adds team state management) 2. Keep as solo project (no changes) ``` --- ## Phase 2: Convert to Team Project If user chooses to convert: ### Step 1: Create Team Structure ```bash mkdir -p _project_specs/team/handoffs ``` ### Step 2: Create state.md ```markdown # Team State *Last synced: [TIMESTAMP]* ## Active Sessions | Contributor | Working On | Started | Files Touched | Status | |-------------|------------|---------|---------------|--------| | - | - | - | - | - | ## Claimed Todos | Todo | Claimed By | Since | ETA | |------|------------|-------|-----| | - | - | - | - | ## Recently Completed (Last 48h) | Todo | Completed By | When | PR | |------|--------------|------|-----| | - | - | - | - | ## Conflicts to Watch | Area | Contributors | Notes | |------|--------------|-------| | - | - | - | ## Announcements - [DATE] Project converted to team coordination mode ``` ### Step 3: Create contributors.md Ask user about team members: ``` Who are the team members? (I'll help you fill this out) For each person, I need: - Handle (e.g., @alice) - Name - Focus areas (e.g., Backend, Auth) - Timezone - Status (Active/Part-time) ``` Then create: ```markdown # Contributors ## Team Members | Handle | Name | Focus Areas | Timezone | Status | |--------|------|-------------|----------|--------| | @[handle] | [name] | [areas] | [tz] | Active | ## Ownership | Area | Primary | Backup | Notes | |------|---------|--------|-------| | - | - | - | Define as you work | ## Communication - Slack: #[channel] - PRs: Tag area owner for review ``` ### Step 4: Update active.md Add claim annotation format to existing todos: ```markdown ## [TODO-XXX] Description **Status:** pending **Claimed:** - ... 
``` ### Step 5: Update CLAUDE.md Add team-coordination.md to skills list: ```markdown ## Skills Read and follow these skills before writing any code: - .claude/skills/base.md - .claude/skills/team-coordination.md ← Add this ... ``` ### Step 6: Copy Skill ```bash cp ~/.claude/skills/team-coordination.md .claude/skills/ ``` --- ## Phase 3: Summary After conversion: ``` ✅ Converted to Team Project Created: • _project_specs/team/state.md • _project_specs/team/contributors.md • _project_specs/team/handoffs/ • .claude/skills/team-coordination.md Updated: • _project_specs/todos/active.md (added claim format) • CLAUDE.md (added team-coordination skill) Next steps: 1. Fill out contributors.md with your team 2. Each team member should read team-coordination.md 3. Claim todos before starting work 4. Update state.md at start/end of each session Commit these changes: git add _project_specs/team .claude/skills/team-coordination.md CLAUDE.md git commit -m "Enable team coordination for multi-person project" git push origin main ``` --- ## Quick Check Mode For quick status without conversion prompt: ``` /check-contributors --status ``` Output: ``` 📊 Quick Status Type: Team Project / Solo Project Contributors: 3 (2 active this week) Active Now: @alice (TODO-042) Claimed: 2 todos Conflicts: None Last state update: 2 hours ago ``` --- ## Reverting to Solo If team coordination is no longer needed: ``` /check-contributors --solo ``` This: 1. Archives `_project_specs/team/` to `_project_specs/team-archive-[date]/` 2. Removes claim annotations from todos 3. Removes team-coordination.md from CLAUDE.md skills 4. 
Keeps decisions.md (valuable history) --- ## Usage ```bash # Check who's working and see options /check-contributors # Quick status only /check-contributors --status # Force conversion to team project /check-contributors --team # Revert to solo project /check-contributors --solo ``` ================================================ FILE: commands/icpg-bootstrap.md ================================================ # /icpg-bootstrap — Bootstrap from Git History Infer ReasonNodes from existing git commit history. One-time setup for existing codebases. --- ## Usage `/icpg-bootstrap [--days N]` Default: 90 days of history. --- ## Steps ### 1. Initialize iCPG if needed ```bash icpg init ``` ### 2. Run bootstrap ```bash icpg bootstrap --days 90 --verbose ``` If no LLM API key available: ```bash icpg bootstrap --days 90 --verbose --no-llm ``` ### 3. Show results ``` iCPG BOOTSTRAP COMPLETE ═══════════════════════ History scanned: {N} days ({M} commits) Commit clusters: {K} ReasonNodes created: {R} Symbols linked: {S} Duplicates skipped: {D} TOP INFERRED INTENTS: 1. [0.80] "Add JWT authentication" — 12 symbols, 5 files 2. [0.75] "Refactor payment processing" — 8 symbols, 3 files 3. [0.65] "Fix rate limiting bug" — 3 symbols, 2 files ... LOW CONFIDENCE (review recommended): - [0.55] "Update dependencies" — may be too generic - [0.50] "Misc fixes" — commit message unclear ``` ### 4. Offer review Ask the user: > {N} ReasonNodes were inferred from git history. > {M} are low-confidence and may need review. > > Would you like to: > 1. Keep all (proceed with current quality) > 2. Review low-confidence intents (I'll show each one) > 3. Run drift scan now (`icpg drift check`) ### 5. Post-bootstrap drift scan ```bash icpg drift check ``` Show any immediate drift detected. 
================================================ FILE: commands/icpg-drift.md ================================================ # /icpg-drift — Show All Drift Run a full drift scan and display all unresolved drift events, grouped by dimension and sorted by severity. --- ## Usage `/icpg-drift` --- ## Steps ### 1. Run drift scan ```bash icpg drift check ``` ### 2. Also show existing unresolved drift ```bash icpg status ``` ### 3. Display results ``` DRIFT REPORT ═══════════════ {N} unresolved drift events across {M} symbols BY SEVERITY: [0.85] spec(0.9) + decision(0.8) — validateToken drifted from "JWT auth" [0.60] ownership(0.7) + test(0.5) — UserService has 4 owners, tests stale ... BY DIMENSION: Spec drift: {count} events Decision drift: {count} events Ownership drift: {count} events Test drift: {count} events Usage drift: {count} events Dependency drift: {count} events TOP ACTIONS: 1. Fix spec drift in validateToken — checksum changed without MODIFIES edge 2. Add tests for UserService — VALIDATED_BY tests are missing 3. Assign single owner to PaymentProcessor — 5 different owners ``` ### 4. Offer resolution For each event, suggest: - `icpg drift resolve <event-id>` to mark resolved - Create a new MODIFIES ReasonNode if the change was intentional - Write missing tests if test drift detected ================================================ FILE: commands/icpg-impact.md ================================================ # /icpg-impact — Show Blast Radius Show the blast radius of a ReasonNode or symbol — what depends on it, what breaks if it changes. --- ## Usage `/icpg-impact <reason-id|symbol>` - If argument looks like a UUID (contains `-`), treat as ReasonNode ID - Otherwise, treat as symbol name and find its creating ReasonNode --- ## Steps ### 1. Resolve target ```bash # If ReasonNode ID icpg query blast <reason-id> # If symbol name icpg query risk <symbol> # Then get the creating reason from the output icpg query blast <reason-id> ``` ### 2.
Display results Format the output as: ``` BLAST RADIUS: <target> ═══════════════════════════════════ Symbols ({N}): function validateToken (src/auth/service.ts) class AuthMiddleware (src/auth/middleware.ts) ... Dependent Intents ({N}): a1b2c3d4 — Dashboard user session management e5f6g7h8 — Payment authorization flow ... Contracts: INV: file_exists("src/auth/middleware.ts") POST: test_exists("src/auth/__tests__/service.test.ts") Risk: {HIGH|MEDIUM|LOW} based on dependent count + drift history ``` ### 3. Recommendations If high risk (>5 dependents or active drift): - Suggest running full test suite before changes - Suggest creating a new ReasonNode with MODIFIES edge - Warn about function signatures to preserve ================================================ FILE: commands/icpg-why.md ================================================ # /icpg-why — Why Does This Code Exist? Trace any symbol back to its creating ReasonNode — show the original goal, who wrote it, and whether it's still doing what it was made for. --- ## Usage `/icpg-why <symbol>` --- ## Steps ### 1. Find the symbol ```bash icpg query risk <symbol> ``` If not found, search more broadly: ```bash icpg query context <symbol> ``` ### 2. Show the full trace ``` WHY: <symbol> ═══════════════════ Symbol: <name> (<file>) Signature: <signature> Checksum: <checksum> CREATING INTENT: ID: <reason-id> Goal: <goal> Type: <type> Owner: <owner> Status: <status> Created: <date> CONTRACTS: PRE: <preconditions> POST: <postconditions> INV: <invariants> MODIFICATION HISTORY: 1. <goal> (by <owner>) 2. <goal> (by <owner>) DRIFT STATUS: {CLEAN | DRIFTED} Dimensions: <dimensions> Severity: <severity> ``` ### 3. If no ReasonNode found Symbol exists but has no iCPG tracking: ``` ⚠ No ReasonNode found for <symbol>. This code has no tracked intent — consider creating one: icpg create "<goal>" --scope <path> ``` ================================================ FILE: commands/initialize-project.md ================================================ # Initialize Project Full project setup with Claude coding guardrails. Works for both new and existing projects. **This command is idempotent** - run it anytime to update skills, add missing structure, or reconfigure.
--- ## Phase 0: Validate Bootstrap Installation **FIRST**, verify Maggy is properly installed: ```bash # Read bootstrap directory (saved during install) BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) # Run quick validation "$BOOTSTRAP_DIR/tests/validate-structure.sh" --quick ``` This checks: - Skills are installed with correct structure (folder/SKILL.md) - Commands are installed (~/.claude/commands/) - Hooks are installed (~/.claude/hooks/) **If validation fails:** - Show the error to user - Suggest running: `cd "$BOOTSTRAP_DIR" && git pull && ./install.sh` - Offer to continue anyway or abort **If validation passes:** - Continue to Phase 1 --- ## Phase 1: Detect Project State First, check what already exists: ```bash # Check for existing Claude setup ls -la .claude/skills/ 2>/dev/null ls -la CLAUDE.md 2>/dev/null ls -la _project_specs/ 2>/dev/null # Check for cross-tool setup (Kimi CLI, Codex CLI) ls -la .kimi/skills/ 2>/dev/null ls -la .codex/skills/ 2>/dev/null ls -la .agents/skills/ 2>/dev/null ls -la AGENTS.md 2>/dev/null # Detect installed AI CLI tools BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) DETECTED_AGENTS=$("$BOOTSTRAP_DIR/scripts/detect-agents.sh" 2>/dev/null || echo "claude") echo "Detected AI CLI tools: $DETECTED_AGENTS" # Check for existing git repo git remote -v 2>/dev/null # Check for existing package files ls package.json pyproject.toml 2>/dev/null # Check for Flutter project ls pubspec.yaml 2>/dev/null # Check for Android project ls android/build.gradle android/app/build.gradle 2>/dev/null # Check for native language in Android projects find android -name "*.java" -type f 2>/dev/null | head -1 find android -name "*.kt" -type f 2>/dev/null | head -1 ``` Based on findings, determine: - **New project**: No CLAUDE.md, no .claude/skills/, no code files - **Existing project with skills**: Has .claude/skills/ - offer to UPDATE - **Existing codebase without skills**: Has code but no Claude setup - **AUTO-RUN ANALYSIS** Inform the 
user: - "Detected new project - will do full setup" - "Detected existing Claude project - will update skills and add any missing structure" - "Detected existing codebase - **analyzing before making changes...**" **For existing codebases without Claude setup, AUTOMATICALLY proceed to Phase 1b.** --- ## Phase 1b: Analyze Existing Codebase (Auto-triggered) **This phase runs automatically when an existing codebase is detected without Claude setup.** ### Step 1: Repository Structure Detection ```bash echo "=== Analyzing Repository Structure ===" && \ # Detect repo type if [ -d "packages" ] || [ -d "apps" ] || grep -q '"workspaces"' package.json 2>/dev/null; then REPO_TYPE="MONOREPO" elif [ -d "frontend" ] && [ -d "backend" ]; then REPO_TYPE="FULL_STACK" elif [ -d "src" ] && grep -qE '"(react|vue|angular|@angular/core)"' package.json 2>/dev/null; then REPO_TYPE="FRONTEND" elif [ -f "pyproject.toml" ] || grep -qE '"(express|fastify)"' package.json 2>/dev/null; then REPO_TYPE="BACKEND" else REPO_TYPE="STANDARD" fi echo "Repo Type: $REPO_TYPE" # Directory structure (3 levels, excluding noise) find .
-type d -maxdepth 3 \ -not -path "*/node_modules/*" \ -not -path "*/.git/*" \ -not -path "*/venv/*" \ -not -path "*/__pycache__/*" \ -not -path "*/dist/*" \ -not -path "*/build/*" \ 2>/dev/null | head -30 ``` ### Step 2: Tech Stack Detection ```bash echo "=== Tech Stack ===" && \ # Primary language/framework [ -f "package.json" ] && echo "JavaScript/TypeScript project" [ -f "tsconfig.json" ] && echo " → TypeScript configured" [ -f "pyproject.toml" ] && echo "Python project" [ -f "pubspec.yaml" ] && echo "Flutter project" [ -d "android" ] && echo "Android project" # Frameworks (from package.json) if [ -f "package.json" ]; then grep -q '"react"' package.json && echo " → React" grep -q '"next"' package.json && echo " → Next.js" grep -q '"express"' package.json && echo " → Express" grep -q '"fastify"' package.json && echo " → Fastify" fi # Frameworks (from pyproject.toml) if [ -f "pyproject.toml" ]; then grep -q "fastapi" pyproject.toml && echo " → FastAPI" grep -q "django" pyproject.toml && echo " → Django" grep -q "flask" pyproject.toml && echo " → Flask" fi ``` ### Step 3: Guardrails Audit ```bash echo "=== Guardrails Status ===" && \ # Pre-commit hooks echo "Pre-commit Hooks:" [ -d ".husky" ] && echo " ✓ Husky installed" || echo " ✗ Husky NOT installed" [ -f ".pre-commit-config.yaml" ] && echo " ✓ pre-commit framework" || echo " ✗ pre-commit NOT installed" # Linting echo "Linting:" (grep -q '"eslint"' package.json 2>/dev/null && echo " ✓ ESLint") || \ (grep -q "ruff" pyproject.toml 2>/dev/null && echo " ✓ Ruff") || \ echo " ✗ No linter detected" # Formatting echo "Formatting:" (grep -q '"prettier"' package.json 2>/dev/null && echo " ✓ Prettier") || \ (grep -q "ruff\|black" pyproject.toml 2>/dev/null && echo " ✓ Ruff/Black") || \ echo " ✗ No formatter detected" # Type checking echo "Type Checking:" ([ -f "tsconfig.json" ] && echo " ✓ TypeScript") || \ (grep -q "mypy" pyproject.toml 2>/dev/null && echo " ✓ mypy") || \ echo " ✗ No type checker detected" # Commit 
validation echo "Commit Validation:" ([ -f "commitlint.config.js" ] && echo " ✓ commitlint") || \ (grep -q "conventional-pre-commit" .pre-commit-config.yaml 2>/dev/null && echo " ✓ conventional-pre-commit") || \ echo " ✗ No commit validation" # CI/CD echo "CI/CD:" [ -d ".github/workflows" ] && echo " ✓ GitHub Actions" || echo " ✗ No GitHub Actions" ``` ### Step 4: Convention Detection ```bash echo "=== Conventions Detected ===" && \ # File naming pattern (fallback is grouped before head: a pipeline's status is head's, so `a | head || b` never runs b) echo "File Naming:" { ls src/**/*.ts 2>/dev/null || ls src/**/*.py 2>/dev/null; } | head -3 # Import style echo "Import Style:" { grep -h "^import" src/**/*.ts 2>/dev/null || \ grep -h "^from\|^import" src/**/*.py 2>/dev/null; } | head -3 # Test location echo "Test Location:" [ -d "tests" ] && echo " Separate tests/ directory" [ -d "__tests__" ] && echo " __tests__/ directory" find . \( -name "*.test.*" -o -name "*.spec.*" \) -not -path "*/node_modules/*" 2>/dev/null | head -1 | grep . && echo " Colocated tests" ``` ### Step 5: Generate Analysis Summary After running the analysis, present this summary to the user: ```markdown ## Repository Analysis Complete **Type:** [Monorepo | Full-Stack | Frontend | Backend | Standard] **Language:** [TypeScript | Python | Flutter | ...] **Framework:** [React | FastAPI | ...] ### Guardrails Status | Category | Status | Recommendation | |----------|--------|----------------| | Pre-commit hooks | ✗ Missing | Add Husky (JS) or pre-commit (Python) | | Linting | ✓ ESLint | - | | Formatting | ✗ Missing | Add Prettier | | Type checking | ✓ TypeScript | - | | Commit validation | ✗ Missing | Add commitlint | ### Conventions I'll Follow - File naming: camelCase - Imports: Absolute (@/...) - Tests: Colocated (*.test.ts) ``` ### Step 6: Present Options After showing the analysis, ask: > **I've analyzed this codebase. Here's what I found:** [summary above] > > What would you like me to do? > 1. **Add Claude skills only** - Add skills, preserve everything else > 2.
**Add skills + missing guardrails** - Also setup Husky/pre-commit, commitlint, etc. > 3. **Full setup** - Skills, guardrails, project specs structure, CI/CD > 4. **Just show analysis** - Don't change anything yet **Based on user choice:** - Option 1 → Skip to Phase 4, only copy skills - Option 2 → Phase 4 + guardrails setup from `existing-repo` skill - Option 3 → Full Phase 4 execution - Option 4 → End here, user can run `/initialize-project` again later --- ## Phase 2: Validate CLI Tools Check required CLI tools are installed and authenticated: ```bash # Check GitHub CLI gh auth status # Check Vercel CLI vercel whoami # Check Supabase CLI supabase projects list ``` If any tool fails, inform the user and offer to skip: - "GitHub CLI not authenticated. Run: `gh auth login` (or skip if not using GitHub)" - "Vercel CLI not authenticated. Run: `vercel login` (or skip if not using Vercel)" - "Supabase CLI not authenticated. Run: `supabase login` (or skip if not using Supabase)" --- ## Phase 3: Project Questions **For existing projects with CLAUDE.md**: Read existing config first, then ask what to update. **For new or unconfigured projects**: Ask these questions one at a time: ### 1. What are you building? Ask for a brief description (1-2 sentences). *Skip if CLAUDE.md exists and has Project Overview - show current and ask if they want to update.* ### 2. What language/runtime? - Python - TypeScript - JavaScript (Node) - Android Java - Android Kotlin - Flutter (Dart) - Multiple (specify which) *Auto-detect from package.json, pyproject.toml, pubspec.yaml, or android/ directory if present.* ### 3. What type of project? - Backend API - Frontend Web (React) - Mobile App (React Native) - Mobile App (Android Native) - Mobile App (Flutter) - Mobile App (Flutter + Native Android) - Full Stack (Backend + Frontend) - CLI Tool - Library/Package *Auto-detect from dependencies if possible.* ### 4. Is this an AI-first application? 
- Yes (LLMs handle core logic) - No (traditional application) *Check for anthropic/openai in dependencies.* ### 4b. Code graph analysis level? - **Standard** (default) - Lightweight AST graph with symbol lookup, dependency analysis, blast radius - **Deep analysis** - Also enable Joern CPG (control flow, data flow, dead code detection) - **Security audit** - Also enable CodeQL (taint analysis, vulnerability detection) - **Full** - All three tiers *Tier 1 (codebase-memory-mcp) is always enabled for all projects. This question determines opt-in tiers.* *Auto-suggest: If security skill is included, suggest "Security audit". If AI-first, suggest "Deep analysis".* ### 5. What framework? (based on previous answers) **Backend:** - Python: FastAPI, Flask, Django - Node: Express, Fastify, Hono **Frontend Web:** - React (Vite, Next.js) **Mobile:** - React Native, Expo *Auto-detect from dependencies.* ### 6. What database? - Supabase (Postgres) - None / SQLite - Other (specify) *Skip if supabase/ directory exists.* ### 7. Where will this be deployed? - Vercel - Render - Other (specify) *Skip if vercel.json or render.yaml exists.* ### 8. Repository setup? (skip if git remote already configured) - Create new repository - Connect to existing repository - Skip (local only for now) If creating new: - What should the repo be named? - Public or private? ### 9. Which AI CLI tools do you use? (auto-detect) - Claude Code only (default) - Claude Code + Kimi CLI - Claude Code + Codex CLI - All three (Claude + Kimi + Codex) *Auto-detect using `$BOOTSTRAP_DIR/scripts/detect-agents.sh`. Pre-select based on what's installed. If only Claude is detected, skip this question and default to Claude-only.* ### 10. Enable container isolation for parallel agents? (auto-detect) - **Yes** (default if Docker/OrbStack detected) — Each feature agent runs in its own container - **No** — Agents share the workspace (native Agent tool) *Auto-detect Docker/OrbStack. 
If available, default to Yes and skip this question. Only ask if Docker IS available and you want to confirm, or if Docker is NOT available (inform user and default to No).* ```bash if echo "$DETECTED_AGENTS" | grep -qE "docker|orbstack"; then echo "Docker detected — container isolation enabled by default" USE_POLYPHONY="true" else echo "Docker not found — agents will share the workspace" USE_POLYPHONY="false" fi ``` --- ## Phase 4: Execute Setup ### Step 1: Create/update directory structure ```bash mkdir -p .claude/skills mkdir -p docs mkdir -p _project_specs/features mkdir -p _project_specs/todos mkdir -p _project_specs/prompts mkdir -p _project_specs/session/archive mkdir -p scripts # Cross-tool directories (if selected in question 9) if [ "$USE_KIMI" = "true" ]; then mkdir -p .kimi/skills fi if [ "$USE_CODEX" = "true" ]; then mkdir -p .codex/skills fi # Generic .agents/ always created for cross-tool compat mkdir -p .agents/skills ``` ### Step 2: Update skill files from ~/.claude/skills/ **Skills use folder structure:** Each skill is a folder containing `SKILL.md`. 
```bash # Copy skill folders (not flat .md files); no trailing slash on the source, since BSD/macOS cp would copy only the folder's contents cp -r ~/.claude/skills/base .claude/skills/ cp -r ~/.claude/skills/security .claude/skills/ cp -r ~/.claude/skills/project-tooling .claude/skills/ cp -r ~/.claude/skills/session-management .claude/skills/ cp -r ~/.claude/skills/code-graph .claude/skills/ cp -r ~/.claude/skills/cross-agent-delegation .claude/skills/ ``` **Always copy (overwrite with latest):** - `base/` → `.claude/skills/base/` - `security/` → `.claude/skills/security/` - `project-tooling/` → `.claude/skills/project-tooling/` - `session-management/` → `.claude/skills/session-management/` - `code-graph/` → `.claude/skills/code-graph/` - `cross-agent-delegation/` → `.claude/skills/cross-agent-delegation/` **If deep analysis or security audit selected (question 4b):** - `cpg-analysis/` → `.claude/skills/cpg-analysis/` ```bash # Copy CPG analysis skill if Tier 2 or 3 selected if [ "$GRAPH_TIER" != "standard" ]; then cp -r ~/.claude/skills/cpg-analysis .claude/skills/ fi ``` **For existing codebases (detected in Phase 1b):** - `existing-repo/` → `.claude/skills/existing-repo/` - Structure preservation, guardrails setup **Based on language:** - Python → copy `python/` - TypeScript/JavaScript → copy `typescript/` **Based on project type:** - React Native → copy `typescript/` AND `react-native/` - React Web → copy `typescript/` AND `react-web/` - Node Backend → copy `typescript/` AND `nodejs-backend/` - Full Stack (Node + React) → copy `typescript/`, `nodejs-backend/`, AND `react-web/` **For Android/Flutter projects (auto-detect from project structure):** | Detection | Skills to Copy | |-----------|---------------| | `pubspec.yaml` exists | `flutter/` | | `android/*.java` exists | `android-java/` | | `android/*.kt` exists | `android-kotlin/` | | Flutter + Java files | `flutter/` + `android-java/` | | Flutter + Kotlin files | `flutter/` + `android-kotlin/` | | Flutter + Both | `flutter/` + `android-java/` + `android-kotlin/` | ```bash # Detect and copy
Android/Flutter skills if [ -f "pubspec.yaml" ]; then cp -r ~/.claude/skills/flutter/ .claude/skills/ fi if find android -name "*.java" -type f 2>/dev/null | head -1 | grep -q .; then cp -r ~/.claude/skills/android-java/ .claude/skills/ fi if find android -name "*.kt" -type f 2>/dev/null | head -1 | grep -q .; then cp -r ~/.claude/skills/android-kotlin/ .claude/skills/ fi ``` **If AI-first:** - Copy `llm-patterns/` **If container isolation enabled (question 10):** - Copy `polyphony/` ```bash if [ "$USE_POLYPHONY" = "true" ]; then cp -r ~/.claude/skills/polyphony/ .claude/skills/ fi ``` **Note:** Skills are always overwritten with the latest version from ~/.claude/skills/. This ensures updates propagate when user updates their global skills. ### Step 2b: Cross-tool skill sync (if Kimi or Codex selected) After copying skills to `.claude/skills/`, sync to other tool directories: ```bash # Sync skills to all selected tools for skill_dir in .claude/skills/*/; do [ -d "$skill_dir" ] || continue # Kimi CLI if [ "$USE_KIMI" = "true" ]; then cp -r "$skill_dir" .kimi/skills/ fi # Codex CLI if [ "$USE_CODEX" = "true" ]; then cp -r "$skill_dir" .codex/skills/ fi # Generic .agents/ (always) cp -r "$skill_dir" .agents/skills/ done echo "Skills synced to cross-tool directories" ``` ### Step 2c: Generate AGENTS.md (if Codex selected) If Codex was selected in question 9, generate `AGENTS.md` alongside `CLAUDE.md`: **If AGENTS.md exists:** Preserve customizations, update skill references to `.agents/skills/` paths. **If new:** Generate from `CLAUDE.md` content, replacing `.claude/skills/` references with `.agents/skills/` paths. The structure mirrors CLAUDE.md but uses the generic skill path that Codex reads. ```bash if [ "$USE_CODEX" = "true" ] && [ ! 
-f "AGENTS.md" ]; then if [ -f "CLAUDE.md" ]; then # Generate from existing CLAUDE.md sed 's|\.claude/skills/|.agents/skills/|g' CLAUDE.md > AGENTS.md echo "Generated AGENTS.md from CLAUDE.md" else # Copy template cp "$BOOTSTRAP_DIR/templates/AGENTS.md" ./AGENTS.md echo "Created AGENTS.md from template" fi fi ``` ### Step 2d: Generate config.toml hooks (if Kimi or Codex selected) ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) if [ "$USE_KIMI" = "true" ]; then cp "$BOOTSTRAP_DIR/templates/config.toml" .kimi/config.toml echo "Created .kimi/config.toml with hooks" fi if [ "$USE_CODEX" = "true" ]; then cp "$BOOTSTRAP_DIR/templates/config.toml" .codex/config.toml echo "Created .codex/config.toml with hooks" fi ``` ### Step 3: Create/update .gitignore (if missing or incomplete) Ensure these security-critical entries exist: ```gitignore # Environment files - NEVER commit .env .env.* !.env.example # Secrets *.pem *.key *.p12 credentials.json secrets.json service-account*.json # Dependencies node_modules/ __pycache__/ *.pyc .venv/ venv/ # Build outputs dist/ build/ # Code graph data (auto-generated) .code-graph/ # Cross-tool agent dirs (derived from .claude/skills/, regenerated by /sync-agents) .kimi/ .codex/ .agents/ # IDE .idea/ .vscode/settings.json .DS_Store ``` ### Step 4: Create .env.example (if missing) Based on project type: ```bash # .env.example - Copy to .env and fill in values # Server-side only (NEVER prefix with VITE_ or NEXT_PUBLIC_) DATABASE_URL= ANTHROPIC_API_KEY= # Client-side safe (public, non-sensitive) VITE_SUPABASE_URL= VITE_SUPABASE_ANON_KEY= ``` ### Step 4b: Configure Code Graph MCP Servers **This step runs for ALL projects** (Tier 1 is always-on). 
#### Create/merge .mcp.json ```bash # Check if .mcp.json exists if [ -f ".mcp.json" ]; then echo "Existing .mcp.json found - will merge code graph config" else echo "Creating .mcp.json for code graph MCP servers" fi ``` **Always add (Tier 1 — codebase-memory-mcp):** ```json { "mcpServers": { "codebase-memory": { "command": "codebase-memory-mcp", "args": [] } } } ``` **If Tier 2 selected (deep analysis / full), also add:** ```json { "mcpServers": { "codebadger": { "url": "http://localhost:4242/mcp", "type": "http" } } } ``` **If Tier 3 selected (security audit / full), also add:** ```json { "mcpServers": { "codeql": { "command": "codeql-mcp", "args": ["--database", ".code-graph/codeql-db"] } } } ``` **Merge strategy:** If `.mcp.json` already exists, read it, merge new `mcpServers` entries without overwriting existing ones, write back. #### Add .code-graph/ to .gitignore Ensure this entry exists in `.gitignore`: ```gitignore # Code graph data (auto-generated, machine-specific) .code-graph/ ``` #### Auto-install codebase-memory-mcp (if not found) ```bash if ! command -v codebase-memory-mcp &> /dev/null; then echo "" echo "Installing codebase-memory-mcp (Tier 1 code graph)..." 
# Run the graph tools installer (Tier 1 only by default) if [ -f "$HOME/.claude/install-graph-tools.sh" ]; then bash "$HOME/.claude/install-graph-tools.sh" else # Fallback: inline install INSTALL_DIR="$HOME/.local/bin" mkdir -p "$INSTALL_DIR" OS=$(uname -s | tr '[:upper:]' '[:lower:]') ARCH=$(uname -m) case "$ARCH" in aarch64|arm64) ARCH="arm64" ;; x86_64|amd64) ARCH="amd64" ;; esac DOWNLOAD_URL="https://github.com/DeusData/codebase-memory-mcp/releases/latest/download/codebase-memory-mcp-${OS}-${ARCH}.tar.gz" TEMP_DIR=$(mktemp -d) if curl -fsSL "$DOWNLOAD_URL" -o "$TEMP_DIR/codebase-memory-mcp.tar.gz"; then tar xzf "$TEMP_DIR/codebase-memory-mcp.tar.gz" -C "$TEMP_DIR" mv "$TEMP_DIR/codebase-memory-mcp" "$INSTALL_DIR/codebase-memory-mcp" chmod +x "$INSTALL_DIR/codebase-memory-mcp" echo "✓ Installed codebase-memory-mcp to $INSTALL_DIR" # Auto-configure for Claude Code "$INSTALL_DIR/codebase-memory-mcp" install 2>/dev/null || true else echo "⚠ Failed to download codebase-memory-mcp" echo " Manual install: ~/.claude/install-graph-tools.sh" fi rm -rf "$TEMP_DIR" fi else echo "✓ codebase-memory-mcp already installed" fi ``` #### Auto-install Tier 2/3 tools (if selected) ```bash # Tier 2: Joern CPG (if deep analysis or full selected) if [ "$GRAPH_TIER" = "deep" ] || [ "$GRAPH_TIER" = "full" ]; then if [ -f "$HOME/.claude/install-graph-tools.sh" ]; then echo "" echo "Installing Joern CPG (Tier 2)..." bash "$HOME/.claude/install-graph-tools.sh" --joern fi fi # Tier 3: CodeQL (if security audit or full selected) if [ "$GRAPH_TIER" = "security" ] || [ "$GRAPH_TIER" = "full" ]; then if [ -f "$HOME/.claude/install-graph-tools.sh" ]; then echo "" echo "Installing CodeQL (Tier 3)..." 
bash "$HOME/.claude/install-graph-tools.sh" --codeql fi fi ``` #### Enable auto-indexing and build initial graph ```bash if command -v codebase-memory-mcp &> /dev/null; then # Enable auto-index so graph stays fresh across sessions codebase-memory-mcp config set auto_index true 2>/dev/null || true # Build initial graph index for this project echo "" echo "Building code graph index (first time may take a moment)..." codebase-memory-mcp index --project-dir . 2>/dev/null || { echo "⚠ Initial index failed - graph will be built on first MCP query" } echo "✓ Code graph indexed" fi ``` #### Install post-commit graph update hook ```bash if [ -d ".git" ]; then # Append to existing post-commit hook (don't overwrite) if [ -f ".git/hooks/post-commit" ]; then if ! grep -q "code-graph" ".git/hooks/post-commit"; then echo "" >> .git/hooks/post-commit echo "# Code graph incremental update" >> .git/hooks/post-commit cat ~/.claude/hooks/post-commit-graph >> .git/hooks/post-commit fi else cp ~/.claude/hooks/post-commit-graph .git/hooks/post-commit chmod +x .git/hooks/post-commit fi echo "✓ Post-commit graph update hook installed" fi ``` ### Step 5: Create/update verification script Create or overwrite `scripts/verify-tooling.sh`: ```bash #!/bin/bash set -e echo "Verifying project tooling..." # GitHub CLI if command -v gh &> /dev/null; then if gh auth status &> /dev/null; then echo "✓ GitHub CLI authenticated" else echo "✗ GitHub CLI not authenticated. Run: gh auth login" exit 1 fi else echo "⚠ GitHub CLI not installed. Run: brew install gh" fi # Vercel CLI if command -v vercel &> /dev/null; then if vercel whoami &> /dev/null; then echo "✓ Vercel CLI authenticated" else echo "✗ Vercel CLI not authenticated. Run: vercel login" exit 1 fi else echo "⚠ Vercel CLI not installed. Run: npm i -g vercel" fi # Supabase CLI if command -v supabase &> /dev/null; then if supabase projects list &> /dev/null 2>&1; then echo "✓ Supabase CLI authenticated" else echo "✗ Supabase CLI not authenticated. 
Run: supabase login" exit 1 fi else echo "⚠ Supabase CLI not installed. Run: brew install supabase/tap/supabase" fi echo "" echo "Tooling verification complete!" ``` ```bash chmod +x scripts/verify-tooling.sh ``` ### Step 6: Create security check script Create `scripts/security-check.sh`: ```bash #!/bin/bash set -e echo "Running security checks..." # Check .env is not staged if git diff --cached --name-only | grep -E '^\.env$|^\.env\.' | grep -v '\.example$'; then echo "ERROR: .env file is staged for commit!" exit 1 fi # Check for common secret patterns STAGED_FILES=$(git diff --cached --name-only --diff-filter=ACM) if [ -n "$STAGED_FILES" ]; then if echo "$STAGED_FILES" | xargs grep -l -E '(password|secret|api_key|apikey|token)\s*[:=]\s*["\047][^"\047]{8,}["\047]' 2>/dev/null; then echo "WARNING: Possible secrets found in staged files - please verify" fi fi # Check for VITE_* secrets (common mistake) if [ -n "$STAGED_FILES" ]; then if echo "$STAGED_FILES" | xargs grep -l -E 'VITE_.*SECRET|VITE_.*KEY.*=.*[a-zA-Z0-9]{20,}' 2>/dev/null; then echo "ERROR: Secrets in VITE_* env vars are exposed to client!" exit 1 fi fi # Dependency audit if [ -f "package.json" ]; then echo "Checking npm dependencies..." npm audit --audit-level=high 2>/dev/null || echo "Warning: npm audit found issues" fi if [ -f "pyproject.toml" ] || [ -f "requirements.txt" ]; then if command -v safety &> /dev/null; then echo "Checking Python dependencies..." safety check 2>/dev/null || echo "Warning: safety found issues" fi fi echo "Security checks complete!" 
``` ```bash chmod +x scripts/security-check.sh ``` ### Step 7: Create/update CLAUDE.md **If CLAUDE.md exists:** - Preserve Project Overview, Tech Stack, and Project-Specific Patterns sections - Update Skills list to reference current .claude/skills/ contents - Update Key Commands section with latest **If new:** ```markdown # CLAUDE.md ## Skills Read and follow these skills before writing any code: - .claude/skills/base/SKILL.md - .claude/skills/security/SKILL.md - .claude/skills/project-tooling/SKILL.md - .claude/skills/session-management/SKILL.md - .claude/skills/code-graph/SKILL.md - .claude/skills/cross-agent-delegation/SKILL.md - .claude/skills/cpg-analysis/SKILL.md (if deep analysis or security audit) - .claude/skills/[language]/SKILL.md - .claude/skills/[framework]/SKILL.md (if applicable) - .claude/skills/llm-patterns/SKILL.md (if AI-first) ## Project Overview [Description from question 1] ## Tech Stack - Language: [X] - Framework: [X] - Database: [X] - Deployment: [X] - Testing: [X] ## Key Commands ```bash # Verify all CLI tools are working ./scripts/verify-tooling.sh # Install dependencies npm install # or: pip install -e ".[dev]" # Run tests npm test # or: pytest # Lint npm run lint # or: ruff check . # Type check npm run typecheck # or: mypy src/ # Pre-commit hooks (run once after clone) npx husky init # or: pre-commit install # Database (if using Supabase) npm run db:start # Start local Supabase npm run db:migrate # Push migrations # Deploy npm run deploy:preview # Deploy to preview npm run deploy:prod # Deploy to production ``` ## Documentation - `docs/` - Technical documentation - `_project_specs/` - Project specifications and todos ## Atomic Todos All work is tracked in `_project_specs/todos/`: - `active.md` - Current work - `backlog.md` - Future work - `completed.md` - Done (for reference) Every todo must have validation criteria and test cases. See base.md skill for format. 
## Session Management ### State Tracking Maintain session state in `_project_specs/session/`: - `current-state.md` - Live session state (update every 15-20 tool calls) - `decisions.md` - Key architectural/implementation decisions (append-only) - `code-landmarks.md` - Important code locations for quick reference - `archive/` - Past session summaries ### Automatic Updates Update `current-state.md`: - After completing any todo item - Every 15-20 tool calls during active work - Before any significant context shift - When encountering blockers ### Decision Logging Log to `decisions.md` when: - Choosing between architectural approaches - Selecting libraries or tools - Making security-related choices - Deviating from standard patterns ### Context Compression When context feels heavy (~50+ tool calls): 1. Summarize completed work in current-state.md 2. Archive verbose exploration notes to archive/ 3. Keep only essential context for next steps ### Session Handoff When ending a session or approaching context limits, update current-state.md with: - What was completed this session - Current state of work - Immediate next steps (numbered, specific) - Open questions or blockers - Files to review first when resuming ### Resuming Work When starting a new session: 1. Read `_project_specs/session/current-state.md` 2. Check `_project_specs/todos/active.md` 3. Review recent entries in `decisions.md` if context needed 4. Continue from "Next Steps" in current-state.md ## Code Graph (MCP) This project uses MCP-based code graph for optimized code navigation. ### Available Tiers - **Tier 1** (always on): `codebase-memory-mcp` - AST graph, symbol lookup, blast radius - **Tier 2** (opt-in): Joern/CodeBadger - Full CPG, control/data flow analysis - **Tier 3** (opt-in): CodeQL - Taint analysis, security vulnerability detection ### Usage Priority 1. **Graph first** - Use MCP graph tools for symbol search, dependency tracing, impact analysis 2. 
**File read second** - Only read full files when you need to modify code or need full context 3. **Grep last** - Avoid grep when graph tools can answer the question faster ### Configuration - MCP config: `.mcp.json` (project root, committed) - Graph data: `.code-graph/` (gitignored, auto-updated) - Post-commit hook: auto-updates graph on code changes ### Key Graph Commands ```bash # Install graph tools (run once per machine) ~/.claude/install-graph-tools.sh # Install with deep CPG analysis ~/.claude/install-graph-tools.sh --joern # Install with security auditing ~/.claude/install-graph-tools.sh --codeql ``` ## Project-Specific Patterns [Any specific patterns for this project] ``` ### Step 8: Create project specs structure (if missing) Only create files that don't exist - never overwrite existing specs. **_project_specs/overview.md** (if missing): ```markdown # Project Overview ## Vision [Description from question 1] ## Goals - [ ] Goal 1 - [ ] Goal 2 ## Non-Goals - What this project will NOT do ## Success Metrics - How we measure success ``` **_project_specs/todos/active.md** (if missing): ```markdown # Active Todos Current work in progress. Each todo follows the atomic todo format from base.md skill. --- ``` **_project_specs/todos/backlog.md** (if missing): ```markdown # Backlog Future work, prioritized. Move to active.md when starting. --- ``` **_project_specs/todos/completed.md** (if missing): ```markdown # Completed Done items for reference. Move here from active.md when complete.
--- ``` **_project_specs/session/current-state.md** (if missing): ```markdown # Current Session State *Last updated: [timestamp]* ## Active Task [What are we working on right now - one sentence] ## Current Status - **Phase**: exploring | planning | implementing | testing | debugging - **Progress**: [X of Y steps, or description] - **Blocking Issues**: None ## Context Summary [2-3 sentences summarizing current state of work] ## Files Being Modified | File | Status | Notes | |------|--------|-------| | - | - | - | ## Next Steps 1. [ ] First next action 2. [ ] Second next action ## Key Context to Preserve - [Important decisions or context for this task] ## Resume Instructions To continue this work: 1. [Specific starting point] 2. [What to check/read first] ``` **_project_specs/session/decisions.md** (if missing): ```markdown # Decision Log Track key architectural and implementation decisions. ## Format ``` ## [YYYY-MM-DD] Decision Title **Decision**: What was decided **Context**: Why this decision was needed **Options Considered**: What alternatives existed **Choice**: Which option was chosen **Reasoning**: Why this choice was made **Trade-offs**: What we gave up **References**: Related code/docs ``` --- ``` **_project_specs/session/code-landmarks.md** (if missing): ```markdown # Code Landmarks Quick reference to important parts of the codebase.
## Entry Points | Location | Purpose | |----------|---------| | - | Main application entry | ## Core Business Logic | Location | Purpose | |----------|---------| | - | - | ## Configuration | Location | Purpose | |----------|---------| | - | Environment/app config | ## Key Patterns | Pattern | Example Location | Notes | |---------|------------------|-------| | - | - | - | ## Testing | Location | Purpose | |----------|---------| | tests/ | Test files | ## Gotchas & Non-Obvious Behavior | Location | Issue | Notes | |----------|-------|-------| | - | - | - | ``` ### Step 9: Create/update GitHub Actions workflows **Quality workflow** (`.github/workflows/quality.yml`): Create based on language (copy from the relevant skill file). **Security workflow** (`.github/workflows/security.yml`): ```yaml name: Security on: push: branches: [main] pull_request: branches: [main] schedule: - cron: '0 9 * * 1' # Weekly on Monday jobs: secrets-scan: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Detect secrets uses: trufflesecurity/trufflehog@main with: path: ./ dependency-audit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup Node if: hashFiles('package.json') != '' uses: actions/setup-node@v4 with: node-version: '20' - name: NPM Audit if: hashFiles('package.json') != '' run: npm audit --audit-level=high - name: Setup Python if: hashFiles('pyproject.toml') != '' uses: actions/setup-python@v5 with: python-version: '3.12' - name: Safety check if: hashFiles('pyproject.toml') != '' run: pip install safety && safety check ``` ### Step 10: Set up pre-commit hooks (if not already configured) **For Python projects** (if .pre-commit-config.yaml missing): Create `.pre-commit-config.yaml` **For TypeScript/JavaScript projects** (if .husky/ missing): Set up Husky + lint-staged ### Step 10b: Install pre-push code review hook **Always install the pre-push hook for code review enforcement:** ```bash # Check if .git exists if [ -d ".git" ];
then # Copy pre-push hook from ~/.claude/hooks/ cp ~/.claude/hooks/pre-push .git/hooks/pre-push chmod +x .git/hooks/pre-push echo "✓ Pre-push code review hook installed" fi ``` This hook: - Runs `/code-review` before every `git push` - Blocks push if 🔴 Critical or 🟠 High severity issues found - Allows push with advisory for 🟡 Medium and 🟢 Low issues To disable: `rm .git/hooks/pre-push` ### Step 11: GitHub repository setup (if selected and not already configured) **Create new repository:** ```bash git init # if needed git add . git commit -m "Initial project setup" gh repo create [repo-name] --[public|private] --source=. --remote=origin --push ``` **Connect to existing:** ```bash git remote add origin https://github.com/[owner]/[repo].git git push -u origin main ``` ### Step 12: Initialize deployment (if not already configured) **Vercel** (if vercel.json missing): ```bash vercel link ``` **Supabase** (if supabase/ missing): ```bash supabase init ``` --- ## Phase 5: Summary After setup, show what was done: ### For Updates (existing project): ``` Updated: ✓ Skills updated to latest versions - base.md (updated) - typescript.md (updated) - react-web.md (updated) - code-graph.md (updated) ✓ Pre-push code review hook (installed/updated) Added: ✓ llm-patterns.md (new skill added) ✓ _project_specs/prompts/ (new directory) Code Graph (fully automated): ✓ codebase-memory-mcp installed and configured ✓ .mcp.json configured (Tier 1: codebase-memory-mcp) ✓ Auto-indexing enabled (graph stays fresh across sessions) ✓ Initial graph index built ✓ Post-commit graph update hook installed [✓ Tier 2: Joern CPG installed and configured (if selected)] [✓ Tier 3: CodeQL installed and configured (if selected)] Cross-Tool Compatibility (if selected): [✓ Skills synced to .kimi/skills/ (Kimi CLI)] [✓ Skills synced to .codex/skills/ (Codex CLI)] [✓ Skills synced to .agents/skills/ (generic)] [✓ AGENTS.md created (Codex project instructions)] [✓ .kimi/config.toml created (Kimi hooks)] [✓
.codex/config.toml created (Codex hooks)] Unchanged: - CLAUDE.md (preserved your customizations) - _project_specs/todos/ (preserved your todos) - Git repository (already configured) ``` ### For New Projects: ``` Created: ✓ .claude/skills/ with [N] skill files (including code-graph) ✓ CLAUDE.md ✓ _project_specs/ structure ✓ scripts/verify-tooling.sh ✓ .github/workflows/quality.yml ✓ Pre-commit hooks configured ✓ Pre-push code review hook (blocks on Critical/High issues) ✓ GitHub repository: https://github.com/[owner]/[repo] Code Graph (fully automated): ✓ codebase-memory-mcp installed ✓ .mcp.json configured Tier 1: codebase-memory-mcp (always on - AST graph, 64 langs) [Tier 2: Joern CPG (control flow, data flow)] [Tier 3: CodeQL (taint analysis, security)] ✓ Auto-indexing enabled ✓ Initial graph index built ([N] files, [N] symbols) ✓ .code-graph/ added to .gitignore ✓ Post-commit graph update hook installed Cross-Tool Compatibility (if selected): ✓ Skills synced to .kimi/skills/, .codex/skills/, .agents/skills/ ✓ AGENTS.md created (Codex project instructions) ✓ .kimi/config.toml + .codex/config.toml (hooks) ✓ .kimi/, .codex/, .agents/ added to .gitignore ``` ### Quick Start ```bash # Verify setup ./scripts/verify-tooling.sh # Install dependencies [appropriate command] # Start development [appropriate command] ``` --- ## Phase 5b: Polyphony Setup (Container Isolation) **This phase runs automatically when Docker/OrbStack is detected (question 10) and the user hasn't opted out.** ### Step 1: Check prerequisites ```bash # Verify Docker is running if echo "$DETECTED_AGENTS" | grep -qE "docker|orbstack"; then docker info &>/dev/null && echo "✓ Docker running" || echo "⚠ Docker installed but not running" fi # Check polyphony CLI command -v polyphony &>/dev/null && echo "✓ polyphony CLI available" || echo "⚠ polyphony not on PATH" ``` ### Step 2: Initialize Polyphony config (if missing) ```bash if [ ! 
-d "$HOME/.polyphony" ]; then polyphony init echo "✓ Created ~/.polyphony/ config" else echo "✓ ~/.polyphony/ already exists" fi ``` ### Step 3: Build worker image (if not present) ```bash if ! docker image inspect polyphony-worker:latest &>/dev/null 2>&1; then BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) if [ -f "$BOOTSTRAP_DIR/templates/Dockerfile.polyphony" ]; then echo "Building polyphony-worker image..." docker build -t polyphony-worker:latest -f "$BOOTSTRAP_DIR/templates/Dockerfile.polyphony" "$BOOTSTRAP_DIR" echo "✓ Built polyphony-worker:latest" fi else echo "✓ polyphony-worker:latest image exists" fi ``` ### Step 4: Add polyphony skill to project ```bash # Copy polyphony skill to project cp -r ~/.claude/skills/polyphony/ .claude/skills/ ``` Add to CLAUDE.md Skills section: ```markdown - .claude/skills/polyphony/SKILL.md ``` Add to CLAUDE.md Cross-Agent Workflow section: ```markdown ### Container Isolation (Polyphony) When Docker is available, each feature agent runs in its own container with an independent git branch. - `/spawn-team` uses Polyphony by default (fallback to native agents if no Docker) - `polyphony status` to see running agents - `polyphony cleanup` after completion ``` ### Step 5: Show Polyphony status in summary Add to the Phase 5 summary output: ``` Container Isolation (Polyphony): ✓ Docker/OrbStack detected ✓ polyphony CLI available ✓ ~/.polyphony/ config ready ✓ polyphony-worker:latest image built ✓ Polyphony skill added to project → /spawn-team will use container isolation by default ``` **If Docker not available:** ``` Container Isolation: ⚠ Docker not found — /spawn-team will use native agents (shared workspace) Install Docker: brew install --cask docker ``` --- ## Phase 6: Agent Team Setup (Default Workflow) Every project uses Claude Agent Teams by default. This phase sets up the team infrastructure and spawns agents to implement features in parallel. 
### Step 1: Set Environment Variable Ensure the agent teams experimental flag is set: ```bash export CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 ``` Also add to the project's `.env.example` if not present: ``` # Agent Teams (required for Maggy team workflow) CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 ``` ### Step 2: Copy Agent Definitions Copy agent definitions from the agent-teams skill to the project: ```bash mkdir -p .claude/agents cp ~/.claude/skills/agent-teams/agents/*.md .claude/agents/ ``` This creates: ``` .claude/agents/ team-lead.md # Orchestration only, delegate mode quality.md # TDD verification (RED/GREEN phases) security.md # OWASP scanning, secrets detection code-review.md # Multi-engine code review merger.md # Branch creation, PR management feature.md # Feature implementation template ``` ### Step 3: Add Agent Teams to CLAUDE.md Add the agent-teams skill to the Skills section in CLAUDE.md: ``` - .claude/skills/agent-teams/SKILL.md ``` Add a new section to CLAUDE.md: ```markdown ## Agent Teams (Default Workflow) This project uses Claude Code Agent Teams as the default development workflow. Every feature is implemented by a dedicated agent following a strict TDD pipeline. 
### Strict Pipeline (per feature) Spec > Spec Review > Tests > RED Verify > Implement > GREEN Verify > Validate > Code Review > Security Scan > Branch + PR ### Team Roster - **Team Lead**: Orchestrates, breaks work into features, assigns tasks (NEVER writes code) - **Quality Agent**: Verifies TDD discipline - RED/GREEN phases, coverage >= 80% - **Security Agent**: OWASP scanning, secrets detection, dependency audit - **Code Review Agent**: Multi-engine code reviews (Claude/Codex/Gemini) - **Merger Agent**: Creates feature branches and PRs via gh CLI - **Feature Agents**: One per feature, follows strict TDD pipeline ### Commands - `/spawn-team` - Spawn the agent team (auto-run after init, or run manually) ### Required Environment export CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 ``` ### Step 4: Prompt for Features **For new projects:** > **Project initialized! Ready to deploy the agent team.** > > The agent team implements features in parallel using a strict TDD pipeline: > ``` > Spec > Tests > Verify Fail > Implement > Verify Pass > Review > Security > PR > ``` > > What are the key features of this project? List them and I'll create a spec > skeleton for each, then spawn the team to implement them in parallel. > > Example: "user authentication, dashboard, payment processing" For each feature the user lists: 1. Create `_project_specs/features/{feature-name}.md` with skeleton spec 2. Include: description (from user input), empty acceptance criteria, empty test cases table **For existing projects:** > **Project updated with latest skills and agent team support!** > > I've added agent team infrastructure. Your options: > 1. Define features and spawn the team now > 2. Continue working on existing todos (solo mode) > 3. Review what's new in skills ### Step 5: Spawn Team After the user provides features (or if feature specs already exist), automatically run the `/spawn-team` workflow: 1. Create the team (TeamCreate) 2. 
Spawn 5 default agents (team-lead, quality-agent, security-agent, review-agent, merger-agent) 3. Spawn 1 feature agent per feature 4. Team lead creates 10-task dependency chains per feature 5. Work begins automatically ### Step 6: Show Team Status ``` ┌─────────────────────────────────────────────────────────────────┐ │ AGENT TEAM DEPLOYED │ │ ────────────────────────────────────────────────────────────── │ │ │ │ Team: {project-name} │ │ Features: {N} │ │ Total tasks: {N * 10} │ │ Agents: {5 + N} │ │ │ │ PIPELINE (per feature) │ │ Spec > Review > Tests > RED > Implement > GREEN > │ │ Validate > Code Review > Security > Branch+PR │ │ │ │ Use Shift+Up/Down to select and message agents. │ │ Use Ctrl+T to toggle the shared task list. │ │ The team runs autonomously until all PRs are created. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Updating Skills System-Wide To update skills for all future projects: ```bash # Pull latest skills cd "$(cat ~/.claude/.bootstrap-dir)" git pull # Reinstall ./install.sh # Validate installation ./tests/validate-structure.sh ``` Then in any existing project: ``` /initialize-project ``` Skills will be updated while preserving project-specific configuration. 
## Troubleshooting If `/initialize-project` shows validation errors: ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) # Full validation to see all issues "$BOOTSTRAP_DIR/tests/validate-structure.sh" --full # Quick validation (what initialize-project runs) "$BOOTSTRAP_DIR/tests/validate-structure.sh" --quick ``` Common issues: - **Flat .md files**: Skills should be folders with SKILL.md, not flat files - **Missing commands**: Reinstall with `./install.sh` - **Missing hooks**: Reinstall with `./install.sh` ================================================ FILE: commands/maggy-init.md ================================================ # /maggy-init — Set Up Maggy for This Team Interactive wizard that configures Maggy for the user's org, issue tracker, and codebases. Writes `~/.maggy/config.yaml` and ensures deps are installed. --- ## Usage `/maggy-init` — run the full setup wizard --- ## Steps ### 1. Check prerequisites - Python 3.11+ available - `claude` CLI on PATH (warn but don't block) - Maggy installed (check `~/.claude/.bootstrap-dir`) ### 2. Run installer ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir) cd "$BOOTSTRAP_DIR/maggy" ./install.sh ``` This installs Python deps and copies the config template to `~/.maggy/config.yaml`. ### 3. Interactive config wizard Ask the user: 1. **Org name** — human-readable name (e.g. "Acme Corp") 2. **Domain** — primary competitive domain (e.g. "fintech", "devtools", "cx", "healthcare"). This drives competitor discovery. 3. **Issue tracker** — `github` (default) or `asana`. Linear is a stub. 4. **For GitHub:** org name + comma-separated repo list (`acmecorp/api, acmecorp/web`) 5. **For Asana:** workspace ID + project GID for their default board 6. **Codebases** — paths to each repo Maggy should execute in. Prompt key per path (short name like `api`, `web`). 7. **Competitor categories** — comma-separated (can match domain; encourages 1-3 categories) 8. 
**OKRs** — "skip" or "yaml" (paste OKRs inline if yaml) ### 4. Write config Patch `~/.maggy/config.yaml` with the user's answers using a Python helper: ```python import yaml from pathlib import Path cfg_path = Path.home() / ".maggy" / "config.yaml" cfg = yaml.safe_load(cfg_path.read_text()) cfg["org"]["name"] = "" cfg["org"]["domain"] = "" cfg["issue_tracker"]["provider"] = "" # ... set github/asana section accordingly cfg["codebases"] = [{"path": "", "key": ""}, ...] cfg["competitors"]["categories"] = ["", ...] cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False)) ``` ### 5. Credentials check Tell the user to export these in their shell and source them when starting Maggy: ``` export GITHUB_TOKEN=ghp_... # repo + issues scopes export ANTHROPIC_API_KEY=sk-ant-... ``` **Do not write tokens to `~/.maggy/.env`** — the Maggy server does not load that file automatically, so credentials would sit on disk in plaintext with no code reading them. Use your shell's standard secret store (e.g. `.zshrc`, `direnv`, `op run`, a secrets manager) or export them inline when launching Maggy. ### 6. Test the connection ```bash cd "$BOOTSTRAP_DIR/maggy" python3 -c "from src import config, providers; cfg = config.load(); p = providers.build(cfg); import asyncio; print('Found', len(asyncio.run(p.list_tasks(limit=5))), 'tasks')" ``` If this returns tasks, setup is working. ### 7. Offer to launch > Maggy is configured. Run `/maggy` to launch the dashboard, or: > > ``` > cd $BOOTSTRAP_DIR/maggy && python3 -m maggy.main > ``` > > Then open http://127.0.0.1:8080 --- ## Related - `/maggy` — launch dashboard - `/icpg-bootstrap` — index your codebases so Execute gets rich context ================================================ FILE: commands/maggy.md ================================================ # /maggy — Launch Maggy Dashboard Start Maggy (the AI engineering command center) and open the dashboard in a browser. 
--- ## Usage `/maggy` — start server if not running, open dashboard `/maggy stop` — stop running server `/maggy status` — show whether server is running + config summary --- ## Steps ### 1. Check config ```bash if [ ! -f ~/.maggy/config.yaml ]; then echo "Maggy not configured yet. Run /maggy-init first." exit 1 fi ``` ### 2. Resolve host/port from config (don't hardcode 8080) ```bash # Read dashboard.host and dashboard.port from ~/.maggy/config.yaml. # Falls back to 127.0.0.1:8080 only if keys are missing. HOST=$(python3 -c "import yaml; d=yaml.safe_load(open('$HOME/.maggy/config.yaml'))or{}; print((d.get('dashboard') or {}).get('host') or '127.0.0.1')") PORT=$(python3 -c "import yaml; d=yaml.safe_load(open('$HOME/.maggy/config.yaml'))or{}; print((d.get('dashboard') or {}).get('port') or 8080)") URL="http://${HOST}:${PORT}" ``` ### 3. Check if already running ```bash if curl -sf "${URL}/api/health" >/dev/null 2>&1; then echo "Maggy is already running at ${URL}" open "${URL}" 2>/dev/null || xdg-open "${URL}" 2>/dev/null || true exit 0 fi ``` ### 4. Start in background The Maggy install lives at `<bootstrap-dir>/maggy`. Resolve the bootstrap directory from `~/.claude/.bootstrap-dir`: ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null || echo "") MAGGY_DIR="$BOOTSTRAP_DIR/maggy" if [ ! -d "$MAGGY_DIR" ]; then echo "Maggy not installed. Run: cd <bootstrap-dir>/maggy && ./install.sh" exit 1 fi cd "$MAGGY_DIR" mkdir -p "$HOME/.maggy" nohup python3 -m maggy.main > "$HOME/.maggy/maggy.log" 2>&1 & echo $! > "$HOME/.maggy/maggy.pid" ``` ### 5. Wait for health check ```bash for i in {1..15}; do if curl -sf "${URL}/api/health" >/dev/null 2>&1; then echo "✓ Maggy ready at ${URL}" open "${URL}" 2>/dev/null || true exit 0 fi sleep 1 done echo "Maggy didn't come up in 15s. Check ~/.maggy/maggy.log" ``` ### 6.
Report status Show: ``` Maggy is running: Dashboard: <URL resolved from config in step 2, e.g. http://127.0.0.1:8080> Logs: ~/.maggy/maggy.log PID: <contents of ~/.maggy/maggy.pid> ``` --- ## Related - `/maggy-init` — first-time setup wizard - `/icpg-bootstrap` — Maggy's Execute button uses iCPG context from this ================================================ FILE: commands/mnemos-checkpoint.md ================================================ # /mnemos-checkpoint — Write Mnemos Checkpoint Write a checkpoint capturing the current session state so it can be resumed later. ## Steps 1. Run `python3 -m mnemos checkpoint --force` to write the checkpoint 2. Report what was captured (goal, constraints, results, fatigue level) 3. Show the checkpoint file location ================================================ FILE: commands/mnemos-status.md ================================================ # /mnemos-status — Show Mnemos Memory Status Show current Mnemos fatigue level, active node counts, and checkpoint status. ## Steps 1. Run `python3 -m mnemos status` in the project directory 2. Run `python3 -m mnemos fatigue` for a detailed breakdown 3. Report the fatigue state and any recommended actions 4. If fatigue >= 0.60, suggest writing a checkpoint with `python3 -m mnemos checkpoint --force` ================================================ FILE: commands/polyphony-init.md ================================================ # /polyphony-init — Setup Wizard Initialize the Polyphony multi-agent orchestration environment. --- ## Steps ### 1. Check Prerequisites ```bash command -v docker &>/dev/null || command -v orbctl &>/dev/null ``` If neither Docker nor OrbStack is available, inform the user: > Docker or OrbStack is required for Polyphony container isolation. Install one first. ### 2. Create Config Directory ```bash mkdir -p ~/.polyphony ``` ### 3.
Copy Config Templates Copy default configuration files from the templates directory: ```bash TEMPLATES="$(dirname "$(realpath "$0")")/../templates" cp -n "$TEMPLATES/polyphony-config.yaml" ~/.polyphony/config.yaml cp -n "$TEMPLATES/polyphony-identities.yaml" ~/.polyphony/identities.yaml cp -n "$TEMPLATES/polyphony-agents.yaml" ~/.polyphony/agents.yaml cp -n "$TEMPLATES/polyphony-routing.yaml" ~/.polyphony/routing.yaml ``` ### 4. Build Worker Image ```bash docker build -t polyphony-worker:latest -f templates/Dockerfile.polyphony . ``` ### 5. Detect Available Agents ```bash command -v claude &>/dev/null && echo "claude: available" command -v codex &>/dev/null && echo "codex: available" command -v kimi &>/dev/null && echo "kimi: available" ``` ### 6. Confirm Print a summary of what was initialized and which agents are available. ================================================ FILE: commands/polyphony-spawn.md ================================================ # /polyphony-spawn — Spawn Task Create a new task in the Polyphony orchestrator and route it to an agent. --- ## Usage ``` /polyphony-spawn <title> [--type <task_type>] [--risk <risk>] [--source <source>] ``` ## Steps ### 1. Parse Arguments - `title`: Required task description - `--type`: Task type (feature, bugfix, docs, refactor, etc.). Default: feature - `--risk`: Risk level (low, medium, high). Default: low - `--source`: Work source (local, github). Default: local ### 2. Create Task ```bash PYTHONPATH=scripts python3 -m polyphony spawn "$TITLE" --type "$TYPE" ``` ### 3. Route Task The orchestrator will automatically: 1. Score task complexity (5-dimension scoring) 2. Match against routing rules 3. Select agent and fallback chain 4. Provision container with workspace 5. Start agent execution ### 4. Report Print the task ID and routing decision.
================================================ FILE: commands/polyphony-status.md ================================================ # /polyphony-status — Show State Display the current state of all Polyphony tasks and running containers. --- ## Steps ### 1. Show Task States ```bash PYTHONPATH=scripts python3 -m polyphony status ``` ### 2. Show Running Containers ```bash docker ps --filter "name=polyphony-" --format "table {{.Names}}\t{{.Status}}\t{{.RunningFor}}" ``` ### 3. Show Workspace Usage ```bash du -sh ~/polyphony/workspaces/* 2>/dev/null || echo "No workspaces" ``` ================================================ FILE: commands/spawn-team.md ================================================ # /spawn-team - Spawn Agent Team Spawn the default agent team for this project. Creates a coordinated team of agents that implement features in parallel following the strict TDD pipeline. **Pipeline:** Specs > Tests > Ensure tests fail > Implement > Test again > Code Review > Security > Create branch > Create PR --- ## Phase 1: Prerequisites Check ### 1.1 Detect Container Mode Check if Polyphony container isolation is available. 
**Container mode is the default when both Docker and polyphony CLI are present.** ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) DETECTED_AGENTS=$("$BOOTSTRAP_DIR/scripts/detect-agents.sh" 2>/dev/null || echo "claude") CONTAINER_MODE="false" if echo "$DETECTED_AGENTS" | grep -qE "docker|orbstack"; then if command -v polyphony &>/dev/null; then CONTAINER_MODE="true" echo "✓ Container mode: ON (Docker + polyphony detected)" echo " Each feature agent will run in its own isolated container" else echo "⚠ Docker found but polyphony CLI missing" echo " Run: cd \$(cat ~/.claude/.bootstrap-dir) && ./install.sh" echo " Falling back to native agents (shared workspace)" fi else echo "ℹ Docker not found — using native agents (shared workspace)" echo " Install Docker for container isolation: brew install --cask docker" fi ``` ### 1.2 Check Agent Definitions Verify `.claude/agents/` exists and has the required agent definitions: ```bash ls .claude/agents/ ``` Required files (with proper frontmatter: name, description, model, tools, disallowedTools, maxTurns): - `team-lead.md` - `quality.md` - `security.md` - `code-review.md` - `merger.md` - `feature.md` If missing, copy from the agent-teams skill: ```bash cp -r ~/.claude/skills/agent-teams/agents/ .claude/agents/ ``` ### 1.3 Check Feature Specs ```bash ls _project_specs/features/ ``` If no feature specs exist, ask the user: > **No feature specs found.** The agent team needs features to implement. > > What are the key features of this project? I'll create a spec file for each one. For each feature the user lists, create `_project_specs/features/{feature-name}.md` with a skeleton spec. ### 1.4 Check GitHub CLI ```bash gh auth status ``` Needed by the merger agent for PR creation. Warn if not authenticated but don't block. ### 1.5 Ensure Worker Image (container mode only) ```bash if [ "$CONTAINER_MODE" = "true" ]; then if ! 
docker image inspect polyphony-worker:latest &>/dev/null 2>&1; then echo "Building polyphony-worker image..." docker build -t polyphony-worker:latest \ -f "$BOOTSTRAP_DIR/templates/Dockerfile.polyphony" "$BOOTSTRAP_DIR" echo "✓ Built polyphony-worker:latest" else echo "✓ polyphony-worker:latest image ready" fi fi ``` --- ## Phase 2: Spawn Default Agents Spawn the 5 permanent agents **natively** (these are coordination agents — they read/verify, not write code). Each agent reads `.claude/agents/{type}.md` for its full definition including frontmatter (tools, model, maxTurns, etc.). > **Note:** Permanent agents always run natively regardless of container mode. Only feature agents get containers. ### 2.1 Team Lead ``` Agent tool: name: "team-lead" subagent_type: "team-lead" prompt: "You are the team lead. Read .claude/agents/team-lead.md for your full instructions. Start by reading _project_specs/features/*.md to identify features, then create task chains and spawn feature agents." ``` ### 2.2 Quality Agent ``` Agent tool: name: "quality-agent" subagent_type: "quality-agent" prompt: "You are the quality agent. Read .claude/agents/quality.md for your instructions. Watch TaskList for tasks assigned to you. Process them in task ID order." ``` ### 2.3 Security Agent ``` Agent tool: name: "security-agent" subagent_type: "security-agent" prompt: "You are the security agent. Read .claude/agents/security.md for your instructions. Watch TaskList for security-scan tasks assigned to you." ``` ### 2.4 Code Review Agent ``` Agent tool: name: "review-agent" subagent_type: "review-agent" prompt: "You are the code review agent. Read .claude/agents/code-review.md for your instructions. Watch TaskList for code-review tasks assigned to you." ``` ### 2.5 Merger Agent ``` Agent tool: name: "merger-agent" subagent_type: "merger-agent" prompt: "You are the merger agent. Read .claude/agents/merger.md for your instructions. Watch TaskList for branch-pr tasks assigned to you." 
``` --- ## Phase 3: Spawn Feature Agents ### Container Mode (default when Docker + polyphony available) For each feature spec in `_project_specs/features/`: ```bash # Polyphony creates a container with its own git clone + branch, # then starts the agent CLI inside polyphony spawn "{feature-name}: implement feature per _project_specs/features/{feature-name}.md" \ --type feature --risk low ``` This does everything in one command: 1. Creates a task in Polyphony's store 2. Routes it to an agent via the routing policy 3. Provisions a Docker container with a full git clone 4. Creates a feature branch (`feature/{feature-name}`) 5. Starts the agent CLI inside the container Check running containers: ```bash polyphony status ``` ### Fallback Mode (no Docker) If container mode is not available, spawn feature agents natively (shared workspace): ``` Agent tool: name: "feature-{feature-name}" subagent_type: "feature-agent" prompt: "You are the feature agent for {feature-name}. Read .claude/agents/feature.md for your instructions. Your feature spec is at _project_specs/features/{feature-name}.md. Start by checking TaskList for your first task." ``` > **Advisory:** Running without container isolation (Docker not found). Agents share the workspace — coordinate carefully to avoid file conflicts. 
--- ## Phase 4: Team Status Summary Show the user: ### Container Mode: ``` AGENT TEAM DEPLOYED (Container Isolation ON) ───────────────────────────────────────────── Team: {project-name} Features: {N} Isolation: Polyphony containers (each feature has its own branch) NATIVE AGENTS (coordination) ───────────────────────────── Team Lead Orchestrating Quality Agent Watching for verification tasks Security Agent Watching for security scan tasks Code Review Watching for review tasks Merger Agent Watching for branch/PR tasks CONTAINER AGENTS (isolated) ──────────────────────────── feature-{name1} Container running — branch: feature/{name1} feature-{name2} Container running — branch: feature/{name2} PIPELINE (per feature) ────────────────────── Spec > Review > Tests > RED Verify > Implement > GREEN Verify > Validate > Code Review > Security > Branch+PR Monitor: polyphony status Cleanup: polyphony cleanup (after all PRs created) ``` ### Fallback Mode: ``` AGENT TEAM DEPLOYED (Shared Workspace) ─────────────────────────────────────── ⚠ Docker not available — agents share the workspace Team: {project-name} Features: {N} Total tasks: {N * 10} AGENTS ────── Team Lead Orchestrating Quality Agent Watching for verification tasks Security Agent Watching for security scan tasks Code Review Watching for review tasks Merger Agent Watching for branch/PR tasks feature-{name1} Starting spec for {name1} feature-{name2} Starting spec for {name2} PIPELINE ──────── Spec > Review > Tests > RED Verify > Implement > GREEN Verify > Validate > Code Review > Security > Branch+PR The team runs autonomously until all PRs are created. 
``` --- ## Monitoring After the team is spawned, the user can: - **Check progress:** Ask team lead for status, or run `polyphony status` (container mode) - **Message agents:** Use SendMessage to contact any agent - **View container logs:** `docker logs polyphony-{feature-name}` (container mode) - **Handle blockers:** Message the blocked agent or team lead The team runs autonomously until all PRs are created, then the team lead shuts everything down. ### Cleanup (container mode) After all PRs are created: ```bash polyphony cleanup ``` This removes completed containers and workspaces. Branches and PRs are preserved on the remote. ================================================ FILE: commands/sync-agents.md ================================================ # Sync Agents Sync project configuration between Claude Code, Kimi CLI, and Codex CLI. Run this after `/initialize-project` or anytime you want to ensure all installed AI CLI tools have matching skills, project instructions, and hooks. --- ## Phase 1: Detect Installed Tools ```bash BOOTSTRAP_DIR=$(cat ~/.claude/.bootstrap-dir 2>/dev/null) if [ -z "$BOOTSTRAP_DIR" ]; then echo "Error: Maggy not installed. Run install.sh first." 
exit 1 fi DETECTED=$("$BOOTSTRAP_DIR/scripts/detect-agents.sh" 2>/dev/null || echo "claude") echo "Detected AI CLI tools: $DETECTED" ``` --- ## Phase 2: Show Current State Check what exists for each tool and present a status table: ```bash echo "=== Current State ===" # Claude echo "Claude Code:" [ -d ".claude/skills" ] && echo " Skills: .claude/skills/ ($(ls -d .claude/skills/*/ 2>/dev/null | wc -l | tr -d ' ') skills)" || echo " Skills: NOT SET UP" [ -f "CLAUDE.md" ] && echo " Instructions: CLAUDE.md" || echo " Instructions: NOT SET UP" [ -f ".claude/settings.json" ] && echo " Hooks: .claude/settings.json" || echo " Hooks: NOT SET UP" # Kimi echo "Kimi CLI:" [ -d ".kimi/skills" ] && echo " Skills: .kimi/skills/ ($(ls -d .kimi/skills/*/ 2>/dev/null | wc -l | tr -d ' ') skills)" || echo " Skills: NOT SET UP" echo " Instructions: (Kimi uses skills directly, no project file needed)" [ -f ".kimi/config.toml" ] && echo " Hooks: .kimi/config.toml" || echo " Hooks: NOT SET UP" # Codex echo "Codex CLI:" [ -d ".codex/skills" ] && echo " Skills: .codex/skills/ ($(ls -d .codex/skills/*/ 2>/dev/null | wc -l | tr -d ' ') skills)" || echo " Skills: NOT SET UP" [ -f "AGENTS.md" ] && echo " Instructions: AGENTS.md" || echo " Instructions: NOT SET UP" [ -f ".codex/config.toml" ] && echo " Hooks: .codex/config.toml" || echo " Hooks: NOT SET UP" ``` Present the status table to the user, then ask what they want to do. --- ## Phase 3: Offer Sync Actions Ask the user which actions to perform: > **Current state shown above.** What would you like to sync? > > 1. **Sync all** - Copy skills + generate instructions + hooks for all detected tools > 2. **Skills only** - Copy .claude/skills/ to .kimi/skills/, .codex/skills/, and .agents/skills/ > 3. **Generate AGENTS.md** - Create Codex project instructions from CLAUDE.md > 4. **Generate config.toml** - Create Kimi/Codex hooks config from the bootstrap template (`templates/config.toml`) > 5.
**Show diff** - Show what differs between tool configs --- ## Phase 4: Execute Sync ### Option 1: Sync All (or individual options below) ### Skills Sync ```bash # Source of truth is .claude/skills/ if [ -d ".claude/skills" ]; then # Sync to Kimi if echo "$DETECTED" | grep -q "kimi"; then rm -rf .kimi/skills mkdir -p .kimi/skills cp -r .claude/skills/*/ .kimi/skills/ 2>/dev/null || true echo "Synced skills to .kimi/skills/" fi # Sync to Codex if echo "$DETECTED" | grep -q "codex"; then rm -rf .codex/skills mkdir -p .codex/skills cp -r .claude/skills/*/ .codex/skills/ 2>/dev/null || true echo "Synced skills to .codex/skills/" fi # Sync to generic .agents/ (works for any tool) rm -rf .agents/skills mkdir -p .agents/skills cp -r .claude/skills/*/ .agents/skills/ 2>/dev/null || true echo "Synced skills to .agents/skills/ (generic)" else echo "No .claude/skills/ found. Run /initialize-project first." fi ``` ### Generate AGENTS.md (from CLAUDE.md) If CLAUDE.md exists, generate AGENTS.md by: 1. Reading CLAUDE.md content 2. Replacing `.claude/skills/` paths with `.agents/skills/` paths 3. Writing as AGENTS.md **Important:** AGENTS.md should reference `.agents/skills/` (generic path) since Codex reads from `.codex/skills/` and `.agents/skills/`. The `.agents/skills/` path is the cross-compatible choice. If CLAUDE.md does not exist, copy from the bootstrap template: ```bash cp "$BOOTSTRAP_DIR/templates/AGENTS.md" ./AGENTS.md echo "Created AGENTS.md from template (customize for your project)" ``` ### Generate config.toml ```bash # For Kimi if echo "$DETECTED" | grep -q "kimi"; then mkdir -p .kimi cp "$BOOTSTRAP_DIR/templates/config.toml" .kimi/config.toml echo "Created .kimi/config.toml with hooks" fi # For Codex if echo "$DETECTED" | grep -q "codex"; then mkdir -p .codex cp "$BOOTSTRAP_DIR/templates/config.toml" .codex/config.toml echo "Created .codex/config.toml with hooks" fi ``` --- ## Phase 5: Summary ``` Sync complete! 
Skills synced: .claude/skills/ -> .kimi/skills/ (N skills) .claude/skills/ -> .codex/skills/ (N skills) .claude/skills/ -> .agents/skills/ (N skills, generic) Project instructions: CLAUDE.md (Claude Code) AGENTS.md (Codex CLI) Hooks config: .claude/settings.json (Claude Code) .kimi/config.toml (Kimi CLI) .codex/config.toml (Codex CLI) You can now run any of these in this project: claude # Claude Code kimi # Kimi CLI codex # Codex CLI ``` --- ## Phase 6: Update .gitignore Ensure cross-tool directories are properly handled in .gitignore: ```bash # Add to .gitignore if not present for entry in ".kimi/" ".codex/" ".agents/"; do if ! grep -qF "$entry" .gitignore 2>/dev/null; then echo "$entry" >> .gitignore fi done ``` **Note:** Unlike `.claude/`, which is typically committed, the `.kimi/`, `.codex/`, and `.agents/` project dirs should generally be gitignored since they're derived from `.claude/skills/`. The `/sync-agents` command regenerates them. AGENTS.md **should** be committed (it's the Codex equivalent of CLAUDE.md). ================================================ FILE: commands/sync-contracts.md ================================================ # /sync-contracts > Lightweight incremental update of workspace contracts without full re-analysis. ## Purpose Fast contract synchronization that: - Checks only contract source files (not full workspace) - Updates CONTRACTS.md with changes - Validates consistency - Takes ~15 seconds instead of ~2 minutes ## When to Use | Scenario | Command | |----------|---------| | After modifying API endpoints | `/sync-contracts` | | After changing shared types | `/sync-contracts` | | Session start shows stale contracts | `/sync-contracts` | | Post-commit hook (automatic) | `/sync-contracts --lightweight` | | Before pushing changes | `/sync-contracts --validate` | | See what changed without updating | `/sync-contracts --diff` | ## Behavior ### Step 1: Load Existing Topology ``` 🔄 Loading workspace context...
Workspace: myapp (Monorepo) Last full analysis: 2026-01-18T10:00:00Z Last sync: 2026-01-20T14:32:00Z ``` Does NOT re-discover workspace structure - uses existing TOPOLOGY.md. ### Step 2: Check Contract Sources ``` 📋 Checking contract sources... Monitored files (from .contract-sources): ✓ apps/api/openapi.json (modified 2h ago) ✓ packages/shared-types/src/index.ts (modified 2h ago) ○ packages/db/schema/campaigns.ts (unchanged) ○ packages/db/schema/users.ts (unchanged) ○ apps/api/app/schemas/campaign.py (unchanged) Changes detected: 2 files ``` ### Step 3: Extract Changes ``` 📝 Extracting contract changes... apps/api/openapi.json: + POST /api/campaigns/bulk (new endpoint) ~ GET /api/campaigns (added 'status' query param) packages/shared-types/src/index.ts: ~ Campaign interface (added 'tags: string[]' field) + CampaignBulkCreate interface (new) ``` ### Step 4: Update Artifacts ``` ✏️ Updating workspace artifacts... Updated: _project_specs/workspace/CONTRACTS.md - Added POST /api/campaigns/bulk to endpoints - Updated Campaign type definition - Added CampaignBulkCreate type Updated: _project_specs/workspace/CROSS_REPO_INDEX.md - Added bulk create capability Timestamps updated: Last sync: 2026-01-20T16:45:00Z ``` ### Step 5: Validate Consistency ``` ✅ Validating contract consistency... 
Checks: ✓ OpenAPI endpoint count matches routes (48/48) ✓ All Pydantic models have TypeScript equivalents ✓ No orphaned types in shared-types ⚠️ Frontend types may need regeneration Validation: PASSED (1 warning) ``` ## Final Output ``` ════════════════════════════════════════════════════════════════ CONTRACT SYNC COMPLETE ════════════════════════════════════════════════════════════════ Sources checked: 5 Changes detected: 2 Files updated: 2 Changes Summary: + POST /api/campaigns/bulk (new endpoint) ~ Campaign interface (added 'tags' field) + CampaignBulkCreate interface (new) Freshness: 🟢 Fresh Last sync: 2026-01-20T16:45:00Z ⚠️ Note: Frontend types may need regeneration Run: cd apps/web && npm run generate:types ════════════════════════════════════════════════════════════════ ``` ## Flags | Flag | Description | |------|-------------| | `--lightweight` | Skip validation, minimal output (for hooks) | | `--diff` | Show changes without updating files | | `--validate` | Only validate, don't update | | `--force` | Update even if no changes detected | | `--verbose` | Show detailed extraction output | ## Diff Mode Preview changes without applying: ```bash /sync-contracts --diff ``` Output: ``` 📋 Contract Changes (not applied) apps/api/openapi.json: + POST /api/campaigns/bulk Request: CampaignBulkCreate[] Response: Campaign[] ~ GET /api/campaigns + query param: status (string, optional) packages/shared-types/src/index.ts: ~ interface Campaign { id: string; name: string; + tags: string[]; // NEW status: CampaignStatus; } + interface CampaignBulkCreate { campaigns: CampaignCreate[]; } To apply these changes: /sync-contracts ``` ## Validate Mode Check consistency without updating: ```bash /sync-contracts --validate ``` Output: ``` 🔍 Contract Validation Endpoint Consistency: ✓ OpenAPI spec: 48 endpoints ✓ Route files: 48 handlers ✓ Match: YES Type Consistency: ✓ Pydantic models: 23 ✓ TypeScript types: 34 ✓ Shared types exported: 34 ⚠️ 2 types only in backend (internal) 
Cross-Module References: ✓ Frontend imports valid types: YES ✓ Backend codegen up to date: YES Overall: ✅ VALID (2 warnings) ``` ## Lightweight Mode For hooks - minimal output, fast execution: ```bash /sync-contracts --lightweight ``` Output: ``` ✓ Contracts synced (2 changes) ``` Or if no changes: ``` ✓ Contracts up to date ``` ## Contract Sources File The sync uses `.contract-sources` to know what to check: ```bash # _project_specs/workspace/.contract-sources # Auto-generated by /analyze-workspace # Edit to add/remove monitored files # OpenAPI specs apps/api/openapi.json # Type definitions packages/shared-types/src/index.ts packages/shared-types/src/api.ts packages/shared-types/src/campaign.ts # Pydantic schemas (Python) apps/api/app/schemas/campaign.py apps/api/app/schemas/user.py apps/api/app/schemas/auth.py # Database schema packages/db/schema/campaigns.ts packages/db/schema/users.ts ``` To add a new source: ```bash echo "apps/api/app/schemas/new_model.py" >> _project_specs/workspace/.contract-sources ``` ## Error Handling ### No Contract Sources ``` ⚠️ No contract sources configured Run /analyze-workspace first to set up contract monitoring. ``` ### Source File Missing ``` ⚠️ Contract source not found: apps/api/openapi.json Options: 1. Generate it: cd apps/api && python -m app.generate_openapi 2. Remove from monitoring: Edit .contract-sources 3. Skip this file: /sync-contracts --skip apps/api/openapi.json ``` ### Validation Failed ``` ❌ Contract validation failed Issues found: 1. OpenAPI has 48 endpoints, routes have 47 Missing: DELETE /api/campaigns/:id (in spec, not in routes) 2. Type mismatch: Campaign.status OpenAPI: "draft" | "active" | "paused" TypeScript: "draft" | "active" | "paused" | "archived" Fix these issues, then run /sync-contracts again. 
Or force update: /sync-contracts --force ``` ## Integration with Hooks ### Post-Commit Hook Automatically runs after commits that touch contract sources: ```bash # hooks/post-commit CONTRACT_SOURCES=$(cat _project_specs/workspace/.contract-sources 2>/dev/null) COMMITTED=$(git diff-tree --no-commit-id --name-only -r HEAD) for source in $CONTRACT_SOURCES; do if echo "$COMMITTED" | grep -q "$source"; then echo "📝 Contract source changed, syncing..." claude --silent "/sync-contracts --lightweight" break fi done ``` ### Pre-Push Hook Validates before push: ```bash # hooks/pre-push echo "🔍 Validating contracts..." claude --silent "/sync-contracts --validate" if [ $? -ne 0 ]; then echo "❌ Contract validation failed" echo "Run /sync-contracts to fix" exit 1 fi ``` ## Comparison: sync-contracts vs analyze-workspace | Aspect | /sync-contracts | /analyze-workspace | |--------|-----------------|-------------------| | Time | ~15 seconds | ~2 minutes | | Scope | Contract files only | Full workspace | | Discovers new modules | No | Yes | | Updates TOPOLOGY.md | No | Yes | | Updates CONTRACTS.md | Yes | Yes | | Rebuilds dependency graph | No | Yes | | When to use | Frequent (daily) | Occasional (weekly) | ================================================ FILE: commands/update-code-index.md ================================================ # Update Code Index Regenerates `CODE_INDEX.md` by scanning the codebase for all functions, classes, hooks, and components. Organizes by capability to prevent semantic duplication. --- ## What This Command Does 1. **Scans source files** - Finds all exported functions, classes, hooks, components 2. **Extracts docstrings** - Gets descriptions from JSDoc/docstrings 3. **Categorizes by capability** - Groups by what things DO, not where they live 4. 
**Generates CODE_INDEX.md** - Creates/updates the semantic index --- ## Phase 1: Detect Project Type ```bash # Check language ls package.json pyproject.toml 2>/dev/null # Check source directories ls -d src/ lib/ app/ 2>/dev/null ``` --- ## Phase 2: Scan Codebase ### For TypeScript/JavaScript Scan for exports: ```bash # Find all exported functions grep -rn "export function\|export const\|export class\|export default" src/ --include="*.ts" --include="*.tsx" --include="*.js" --include="*.jsx" # Find React hooks grep -rn "export function use[A-Z]\|export const use[A-Z]" src/ --include="*.ts" --include="*.tsx" # Find React components (PascalCase exports) grep -rn "export function [A-Z]\|export const [A-Z].*=.*=>" src/ --include="*.tsx" --include="*.jsx" ``` ### For Python ```bash # Find all function definitions grep -rn "^def \|^async def \|^class " src/ --include="*.py" # Check __all__ exports grep -rn "__all__" src/ --include="*.py" ``` --- ## Phase 3: Extract Documentation For each found export, extract: 1. **Name** - Function/class name 2. **Location** - File path and line number 3. **Description** - From JSDoc `@description` or first line of docstring 4. **Parameters** - Function signature 5. **Returns** - Return type if available ### TypeScript Example ```typescript /** * Formats a date into a human-readable relative string. * @param date - The date to format * @returns Relative time string like "2 days ago" */ export function formatRelative(date: Date): string { ``` Extract: - Name: `formatRelative` - Description: "Formats a date into a human-readable relative string" - Params: `(date: Date)` - Returns: `string` ### Python Example ```python def format_relative(date: datetime) -> str: """Formats a date into a human-readable relative string. 
Args: date: The date to format Returns: Relative time string like "2 days ago" """ ``` Extract: - Name: `format_relative` - Description: "Formats a date into a human-readable relative string" - Params: `(date: datetime)` - Returns: `str` --- ## Phase 4: Categorize by Capability Group functions by what they DO: | Category | Keywords to Match | |----------|-------------------| | **Date/Time** | date, time, format, parse, duration, relative, timestamp | | **Validation** | validate, is*, check, verify, sanitize | | **String Operations** | string, text, format, parse, slug, truncate, capitalize | | **API Clients** | fetch, get, post, put, delete, api, request | | **Authentication** | auth, login, logout, session, token, user | | **Error Handling** | error, exception, handle, catch, throw | | **Database** | db, query, find, create, update, delete, repository | | **Hooks (React)** | use* | | **Components (React)** | PascalCase in .tsx/.jsx | | **Utilities** | util, helper, common (catch-all) | --- ## Phase 5: Generate CODE_INDEX.md Create or overwrite `CODE_INDEX.md`: ```markdown # Code Index *Auto-generated by /update-code-index* *Last updated: [TIMESTAMP]* > ⚠️ **Before writing new code, search this index first!** > Find similar functionality? Use or extend it instead of creating new. ## Quick Stats | Category | Count | Main Location | |----------|-------|---------------| | Date/Time | X | src/utils/dates.ts | | Validation | X | src/utils/validate.ts | | API Clients | X | src/api/*.ts | | Hooks | X | src/hooks/*.ts | | Components | X | src/components/*.tsx | --- ## Date/Time Operations | Function | Location | Description | Signature | |----------|----------|-------------|-----------| | `formatDate()` | utils/dates.ts:15 | Formats Date to locale string | `(date: Date, opts?)` | | `formatRelative()` | utils/dates.ts:32 | Formats as "2 days ago" | `(date: Date)` | | ... | ... | ... | ... 
| --- ## Validation | Function | Location | Description | Signature | |----------|----------|-------------|-----------| | `isEmail()` | utils/validate.ts:10 | Validates email format | `(email: string)` | | ... | ... | ... | ... | --- [Continue for each category...] ``` --- ## Phase 6: Report Changes After generating, report: ``` 📊 Code Index Updated Scanned: • 45 TypeScript files • 12 React components • 8 custom hooks • 156 exported functions Categories: • Date/Time: 5 functions • Validation: 8 functions • API Clients: 23 functions • Hooks: 8 hooks • Components: 12 components • Utilities: 42 functions New since last run: • + fetchOrders() in api/orders.ts • + useCart() in hooks/useCart.ts • + OrderCard component in components/OrderCard.tsx Possible duplicates detected: • ⚠️ formatDate() and displayDate() - similar purpose? • ⚠️ isValid() and validate() - review these Updated: CODE_INDEX.md ``` --- ## Handling Missing Documentation If a function lacks documentation: ```markdown | `myFunction()` | utils/helpers.ts:42 | ⚠️ *No description - add JSDoc* | `(a, b, c)` | ``` Report at end: ``` ⚠️ 12 functions missing documentation: • myFunction() in utils/helpers.ts:42 • anotherFunc() in services/user.ts:88 • ... Run with --add-docs to prompt for descriptions. ``` --- ## Options ```bash # Basic update /update-code-index # Include private/non-exported functions /update-code-index --include-private # Prompt to add missing docs /update-code-index --add-docs # Only scan specific directory /update-code-index src/utils # Output as JSON (for vector DB ingestion) /update-code-index --json > code_index.json # Detect duplicates only (no index update) /update-code-index --audit-only ``` --- ## Audit Mode When run with `--audit-only` or as `/audit-duplicates`: ```markdown ## Duplicate Audit Report - [DATE] ### 🔴 High Confidence Duplicates 1. 
**formatDate / displayDate / showDate** - `formatDate()` at utils/dates.ts:15 - `displayDate()` at components/Header.tsx:42 - `showDate()` at pages/Profile.tsx:28 - Similarity: 89% (same logic, different names) - **Recommendation:** Consolidate into utils/dates.ts 2. **isEmail / validateEmail / checkEmail** - `isEmail()` at utils/validate.ts:10 - `validateEmail()` at forms/signup.ts:55 - `checkEmail()` at api/users.ts:30 - Similarity: 95% (identical regex) - **Recommendation:** Use isEmail() everywhere ### 🟡 Possible Duplicates (Review) 1. **fetchUser / getUser / loadUser** - Different implementations but same purpose - May be intentional (different contexts) - **Action:** Document if intentional, merge if not ### 🟢 Similar But Distinct 1. **Button / IconButton / LinkButton** - Related components with different purposes - **Status:** OK - documented variants ``` --- ## Integration with Vector DB If vector DB is set up, also update embeddings: ```bash /update-code-index --vector ``` This: 1. Generates CODE_INDEX.md (as usual) 2. Creates embeddings for each function description 3. Stores in `.chroma/` or `.lancedb/` 4. Enables semantic search: "find functions that validate user input" --- ## Suggested Workflow ### Daily - Index auto-updates on significant code changes - Claude checks index before writing new code ### Weekly - Run `/update-code-index --audit-only` - Review duplicate report - Merge or document similar functions ### After Major Features - Full index regeneration - Vector DB re-embedding (if used) --- ## File Output Creates/updates: - `CODE_INDEX.md` - Human-readable index - `.code-index.json` (optional) - Machine-readable for tooling --- ## Claude Instructions When user runs `/update-code-index`: 1. Detect project type (TS/JS/Python) 2. Scan source directories 3. Extract all exports with documentation 4. Categorize by capability 5. Generate CODE_INDEX.md 6. Report stats and potential duplicates 7. 
Commit the updated index After running, remind user: > "Index updated! I'll check this before writing any new code to avoid duplicating existing functionality." ================================================ FILE: docs/architecture-v5.md ================================================ # Maggy v5 Architecture — Multi-Project, Multi-Model Command Center ## 1. Executive Summary v5 transforms Maggy from a single-project, single-model toolkit into a **multi-project, multi-model orchestration platform**. Pi replaces per-CLI adapters as the universal agent harness. Maggy becomes the central web dashboard. Token budgets are managed dynamically across providers. New features are validated against the competitive intelligence graph before engineering begins. --- ## 2. What Changed: Before and After ### v3.x (Single-Model, Single-Project) ``` User → Claude Code → single project → single model │ ├── CLAUDE.md (project config) ├── skills/ (TDD, security, etc.) ├── iCPG (blast radius, drift) ├── Mnemos (memory, fatigue) └── hooks (PreToolUse, Stop, etc.) 
``` - One project at a time - One model (Claude) for everything - When Claude tokens ran out, work stopped - Agents shared a filesystem (conflict-prone) - No market validation for new features ### v4.0 (Container Isolation, Cross-Agent) ``` User → Claude Code → /spawn-team → Polyphony containers │ ├── Container 1 (claude CLI) ├── cross-agent-delegation ├── Container 2 (codex CLI) │ (complexity scoring) └── Container 3 (kimi CLI) ├── iCPG + Mnemos └── 3 separate CLI adapters ``` - Container isolation per agent (own git clone + branch) - Cross-agent delegation via complexity scoring - Still one project at a time - Still separate CLI tools (claude, codex, kimi) - Token exhaustion on one provider = manual switch ### v5.0 (Multi-Project, Multi-Model, Market-Validated) ``` User → Maggy Web Dashboard → multiple projects → multiple models │ │ │ ┌───────────────────┼───────────────────┐ │ │ Project A │ Project B │ │ │ zensurveys │ chief-of-staff │ │ │ │ │ │ │ ┌─Pi agent─┐ │ ┌─Pi agent─┐ │ │ │ │ claude │ │ │ gpt-4o │ │ │ │ │ → gpt-4o │ │ │ → gemini │ │ │ │ │ → gemini │ │ │ → qwen │ │ │ │ └──────────┘ │ └──────────┘ │ │ └───────────────────┼───────────────────┘ │ │ ├── codebase-memory-mcp (structural graph — 36 projects) ├── CIKG (market graph) │ iCPG (intent graph, layers on code graph) ├── Mnemos (cross-model fatigue) └── Token Budget Manager (auto-rotate) ``` --- ## 3. Core Components ### 3.1 Pi — Universal Agent Harness **Replaces:** `ClaudeAdapter`, `CodexAdapter`, `KimiAdapter` Pi is an open-source (MIT) terminal coding agent that supports 20+ model providers through a single interface. 
It runs in three modes: | Mode | Use Case | |------|----------| | **Interactive** | Human at terminal | | **RPC** | Headless JSONL over stdin/stdout — for container agents | | **SDK** | Embedded in Maggy's orchestrator | **Provider support:** | Tier | Providers | Auth | |------|-----------|------| | Subscription | Claude Pro/Max, ChatGPT Plus/Pro, GitHub Copilot | OAuth | | API Key | Anthropic, OpenAI, Google, DeepSeek, Mistral, Groq, xAI | Env var | | Cloud | Azure OpenAI, Amazon Bedrock, Cloudflare Workers | Platform | | Local | Ollama (Qwen, Llama, etc.) | None | **Key capability:** Runtime model switching via RPC without restarting: ```json {"command": "set_model", "provider": "openai", "model": "gpt-4o"} ``` ### 3.2 Maggy v2 — Multi-Project Command Center **Extends:** Maggy v1 (single-project inbox + execute) Maggy v2 is a web dashboard (FastAPI + React) that orchestrates work across multiple GitHub repos from a single browser tab. ``` ┌─────────────────────────────────────────────────────────────┐ │ MAGGY v2 — Web Dashboard │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ PROJECT REGISTRY (~/.maggy/projects.yaml) │ │ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ │ │ Project │ │ Project │ │ Project │ │ Project │ │ │ │ │ │ A │ │ B │ │ C │ │ D │ │ │ │ │ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │ │ │ └───────┼───────────┼───────────┼───────────┼─────────┘ │ │ │ │ │ │ │ │ ┌───────▼───────────▼───────────▼───────────▼─────────┐ │ │ │ ORCHESTRATOR │ │ │ │ ┌────────────┐ ┌─────────────┐ ┌────────────────┐ │ │ │ │ │ Planning │ │ Decision │ │ Execution │ │ │ │ │ │ Layer │ │ Layer │ │ Layer │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ Claude │ │ iCPG blast │ │ Pi agents in │ │ │ │ │ │ plans │ │ radius → │ │ Polyphony │ │ │ │ │ │ Codex │ │ model tier │ │ containers │ │ │ │ │ │ counter- │ │ │ │ │ │ │ │ │ │ checks │ │ CIKG market │ │ Token budget │ │ │ │ │ │ │ │ validation │ │ auto-rotation │ │ │ │ │ └────────────┘ └─────────────┘ 
└────────────────┘ │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ CODE INTELLIGENCE (codebase-memory-mcp) │ │ │ │ 36 projects indexed │ 700K+ nodes │ 1.4M+ edges │ │ │ │ Structural graph powering iCPG, blast radius, │ │ │ │ cross-project deps, agent context │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ DEPLOY LAYER │ │ │ │ 4 isolated browser containers (Playwright) │ │ │ │ Each with its own Vercel auth session │ │ │ └──────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────┘ ``` **New capabilities over v1:** - Multi-project view (registry of repos + branches) - Cross-project ticket triage - Token budget dashboard (usage per model per project) - Deploy status per project (isolated Vercel sessions) ### 3.3 Token Budget Manager **New component.** Manages model selection based on blast radius and token availability. #### Model Tiering by Composite Risk Score Model selection uses iCPG's **5-dimension complexity scoring**, not just file count. Each dimension is scored 0-2, total 0-10: | Dimension | What It Measures | Examples | |-----------|-----------------|----------| | **Cyclomatic** | Control flow complexity of touched code | Nested conditionals, state machines | | **Fan-out** | How many other modules depend on the change | Shared utilities, API contracts | | **Security** | Whether auth, crypto, permissions, or PII are involved | Auth policy, token validation | | **Concurrency** | Race conditions, locks, async coordination | Queue workers, websocket handlers | | **Domain** | Business logic criticality | Pricing, billing, compliance | Plus 6-dimension drift detection (spec, decision, ownership, test, usage, dependency) and constraint checking from active ReasonNodes. 
This means a one-file auth policy change scores high (security=2, domain=2) while a five-file CSS refactor scores low (cyclomatic=0, fan_out=1). The routing is risk-aware, not file-count-aware. ``` iCPG composite risk score → model tier ┌─────────────┬──────────────────────┬─────────────────────────┐ │ Score │ Model Tier │ Rationale │ ├─────────────┼──────────────────────┼─────────────────────────┤ │ 0-3 (low) │ Qwen local / DeepSeek│ Bounded scope, no │ │ │ via Ollama │ security/concurrency/ │ │ │ │ domain risk │ ├─────────────┼──────────────────────┼─────────────────────────┤ │ 4-6 (medium)│ Kimi / Gemini Flash │ Real risk but bounded; │ │ │ │ + high-tier post-review │ │ │ │ on output (catch subtle │ │ │ │ bugs cheap models miss) │ ├─────────────┼──────────────────────┼─────────────────────────┤ │ 7-10 (high) │ Claude / GPT-4o │ Full context needed — │ │ │ │ cross-cutting, security,│ │ │ │ concurrency, or domain │ │ │ │ critical changes │ └─────────────┴──────────────────────┴─────────────────────────┘ ``` **Dimension overrides:** Regardless of total score, if `security >= 2` or `concurrency >= 2`, the task is always routed to the high tier. These dimensions are too dangerous for cheap models. #### Low-Tier Output Verification When a task is handled by a cheap/local model (score 0-6), its output goes through additional verification before landing: | Gate | What It Catches | |------|----------------| | iCPG drift check | Scope drift, constraint violations, invariant breakage | | iCPG constraint assertions | Postconditions from ReasonNodes evaluated against output | | High-tier spot review | Claude/GPT-4o reviews the diff (cheaper than writing it) | | Static analysis | Linter + type checker catch mechanical errors | This prevents the failure class Codex identified: code that passes tests but has subtle logical regressions. #### Fallback Chain When the primary model hits quota, the budget manager rotates. 
Model switching is an **explicit handoff with verification**, not a silent swap: 1. Current model hits quota or rate limit 2. Mnemos writes checkpoint with full execution state 3. Pi switches to next model via RPC `set_model` 4. Checkpoint is re-injected as structured context 5. New model verifies it understands the task before continuing 6. If verification fails, escalate to next tier (don't retry on weaker model) ``` Claude (quota hit) → checkpoint + handoff → GPT-4o (quota hit) → checkpoint + handoff → Gemini 2.5 Pro (quota hit) → checkpoint + handoff → Kimi (quota hit) → checkpoint + handoff → DeepSeek (quota hit) → checkpoint + handoff → Qwen local (unlimited, always available) ``` #### Budget Tracking ```yaml # ~/.maggy/token-budget.yaml providers: anthropic: daily_limit_usd: 50.00 used_today_usd: 32.15 model_preference: claude-sonnet-4-20250514 openai: daily_limit_usd: 30.00 used_today_usd: 5.20 model_preference: gpt-4o local: daily_limit_usd: 0 # free model_preference: qwen2.5-coder:32b ollama_endpoint: http://localhost:11434 ``` ### 3.4 Planning Layer — Dual-Model Review Every plan goes through a two-model review before execution: ``` Feature Request / Ticket │ ▼ ┌─────────────────┐ │ Claude Plans │ Primary model creates architecture plan │ (full context) │ with file list, approach, risks └────────┬────────┘ │ ▼ ┌─────────────────┐ │ Codex Counter- │ Second model independently reviews: │ Checks │ - Missing edge cases? │ (independent) │ - Over-engineering? │ │ - Security gaps? │ │ - Simpler approach? └────────┬────────┘ │ ▼ ┌─────────────────┐ │ Diff View │ Maggy shows both perspectives │ in Maggy UI │ User approves/resolves conflicts └────────┬────────┘ │ ▼ Execution begins ``` ### 3.5 Decision Layer — iCPG + CIKG Two graphs feed the orchestrator's decisions: #### iCPG (Code Graph) — "Should we change this?" Per-project, SQLite-backed. Layers intent and constraints on top of the structural graph from **codebase-memory-mcp** (Section 3.8). 
Answers: | Query | What It Returns | |-------|----------------| | `icpg query blast <id>` | Files affected, downstream dependencies | | `icpg query risk <symbol>` | Drift history, ownership changes, fragility | | `icpg query constraints <file>` | Invariants that must be preserved | | `icpg drift check` | 6-dimension drift across spec, decision, ownership, test, usage, dependency | The blast radius score (0-10) determines: - Which model tier handles the task - How deep the architecture review goes - Whether dual-model planning is required #### CIKG (Competitive Intelligence Knowledge Graph) — "Should we build this?" Supabase-backed. Node types: `competitor`, `feature`, `market_segment`, `technology`, `trend`, `product`. Edge types: `has_feature`, `competes_with`, `targets_market`, `uses_technology`, `protaige_has`, `protaige_lacks`, `threatens`. Used for **new feature validation** before engineering begins: ``` New Feature Idea │ ▼ ┌────────────────────┐ │ CIKG: find_gaps() │ Who has this? Who lacks it? │ compare_entities() │ Competitive advantage or table stakes? │ get_landscape() │ Market trend alignment? └────────┬───────────┘ │ ▼ ┌────────────────────┐ │ Market Score │ │ │ │ gap_count: 3 │ 3 competitors lack this → opportunity │ threat_level: high │ 2 competitors actively building → urgent │ trend_align: yes │ Aligns with "AI voice" trend → proceed └────────┬───────────┘ │ ▼ Requirements validated → proceed to iCPG blast radius ``` ### 3.6 Execution Layer — Polyphony + Pi Updated container architecture. 
Each feature agent runs Pi in RPC mode inside a Polyphony container: ``` ┌──────────────────────────────────────────────────────┐ │ Polyphony Container (per feature) │ │ │ │ ┌─────────────────────────────────────────────────┐ │ │ │ Pi Agent (RPC mode over stdin/stdout) │ │ │ │ │ │ │ │ Current model: claude-sonnet-4-20250514 │ │ │ │ Fallback chain: gpt-4o → gemini → kimi → qwen │ │ │ │ │ │ │ │ Tools: read, write, edit, bash │ │ │ │ Extensions: skills, hooks, MCP servers │ │ │ └──────────────────────────────┬──────────────────┘ │ │ │ │ │ ┌──────────┐ ┌────────────┐ │ ┌──────────────┐ │ │ │ Git clone│ │ .mnemos/ │ │ │ .icpg/ │ │ │ │ own │ │ fatigue │ │ │ blast radius │ │ │ │ branch │ │ checkpoint │ │ │ constraints │ │ │ └──────────┘ └────────────┘ │ └──────────────┘ │ │ │ │ │ ┌─────────────────────────────▼──────────────────┐ │ │ │ RPC Bridge (Maggy ↔ Pi) │ │ │ │ • Send prompts │ │ │ │ • Receive streaming events │ │ │ │ • Switch models on quota hit │ │ │ │ • Steer/follow-up mid-task │ │ │ └────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────┘ ``` **Coordination model (hybrid — option 2):** Claude Code's native Task tool spawns agents that keep full team coordination (SendMessage, TaskList, UI visibility). Each agent controls a Pi instance inside a Polyphony container via RPC. The agent has Claude's brain for coordination but Pi's body for execution. **Why this is not a split-brain problem:** This concern is addressed by Mnemos, which serves as a **shared memory layer that both sides can read**: - **Mnemos checkpoint** persists goal, constraints, progress, and working state to disk (`.mnemos/`) - **iCPG state** persists intent, constraints, and drift to disk (`.icpg/`) - **Signal log** (`.mnemos/signals.jsonl`) persists behavioral signals across model switches - All three are inside the container volume — they survive model swaps The coordination agent (Claude Task tool) handles team communication. 
The execution agent (Pi) handles code work. The shared disk state (Mnemos + iCPG) is the single source of truth. There's no split brain because there's no duplicated state — each layer owns a distinct concern with shared persistence. ``` Claude Code Task tool agent (coordination — messaging, tasks, UI) │ ├── SendMessage to team lead ✓ ├── TaskUpdate progress ✓ ├── Visible in tmux/iTerm ✓ │ └── Executes code work via: docker exec polyphony-feature-X \ pi --mode rpc --provider anthropic │ ├── stdin: {"command": "prompt", "content": "implement auth"} ├── stdout: streaming events (text, tool calls, completion) ├── stdin: {"command": "set_model", ...} when quota hits │ └── Shared persistence (inside container volume): ├── .mnemos/checkpoint-latest.json ← goal, constraints, progress ├── .mnemos/signals.jsonl ← behavioral signals ├── .mnemos/fatigue.json ← model-normalized fatigue └── .icpg/reason.db ← intent, constraints, drift ``` ### 3.7 Deploy Layer — Isolated Vercel Sessions Four Docker containers, each running a headless browser with its own Vercel auth session: ``` ┌────────────────────────────┐ │ vercel-session-A │ │ Playwright + Chrome │ │ Auth: vercel.com (session) │ │ Project: zensurveys-backend│ │ No local `vercel login` │ ├────────────────────────────┤ │ vercel-session-B │ │ Own Chrome profile │ │ Project: zensurveys-fe │ ├────────────────────────────┤ │ vercel-session-C │ │ Own Chrome profile │ │ Project: chief-of-staff │ ├────────────────────────────┤ │ vercel-session-D │ │ Own Chrome profile │ │ Project: rodcast │ └────────────────────────────┘ ``` Each container persists its Chrome profile to a Docker volume. No local directory conflicts. Deploys are triggered from Maggy's web UI or via git push (Vercel auto-deploy). ### 3.8 Code Intelligence Layer — codebase-memory-mcp **Foundation layer.** Every component above — iCPG, blast radius scoring, Maggy's orchestrator, Pi agents — depends on a structural understanding of the code. 
codebase-memory-mcp is the AST-based knowledge graph that provides it. ``` ┌──────────────────────────────────────────────────────────────┐ │ codebase-memory-mcp │ │ ─────────────────────────────────────────────────────────── │ │ │ │ 36 projects indexed │ 14 MCP tools │ 64 languages │ │ 700K+ nodes │ 1.4M+ edges │ auto-updated via file watcher │ │ │ │ Node Types: │ │ Function, Method, Class, Variable, Route, │ │ File, Module, Folder, Section, Project │ │ │ │ Edge Types: │ │ CALLS, IMPORTS, USAGE, DEFINES, DEFINES_METHOD, │ │ TESTS, WRITES, HANDLES, HTTP_CALLS, CONFIGURES, │ │ SEMANTICALLY_RELATED, SIMILAR_TO, CONTAINS_* │ │ │ │ Search Modes: │ │ BM25 full-text │ regex pattern │ semantic vector │ │ │ │ Trace Modes: │ │ calls (callers/callees) │ data_flow (value propagation) │ │ cross_service (HTTP/async through Routes) │ └──────────────────────────────────────────────────────────────┘ ``` #### How Each Component Uses It | Component | Graph Queries | Purpose | |-----------|--------------|---------| | **iCPG blast radius** | `trace_path(fn, mode=calls, risk_labels=true)` | Fan-out scoring — how many callers/callees, at what hop distance | | **iCPG drift** | `detect_changes` + `query_graph` | Detect which functions changed, trace impact to dependents | | **Token budget routing** | `trace_path` depth + edge count | Feed fan-out dimension of 5-dimension complexity score | | **Pi agents (pre-task)** | `search_graph` + `get_architecture` | Understand codebase before making changes — no blind edits | | **Pi agents (post-task)** | `detect_changes` | Verify scope of changes matches intent | | **Maggy orchestrator** | `search_graph` across projects | Map ticket descriptions → relevant code across all repos | | **Dual-model planning** | `get_architecture` + `trace_path` | Give both Claude and Codex the same structural context | | **Reward registry** | `detect_changes` | Measure actual blast radius of completed work for reward signals | | **Cross-project deps** | `query_graph` 
with HTTP_CALLS/IMPORTS | If zensurveys-backend changes an API route, trace consumers in frontend | #### Multi-Project Graph Topology Each project has its own indexed graph. Maggy queries across them: ``` ┌─────────────────────────────────────────────────────────────┐ │ codebase-memory-mcp — Cross-Project Graph │ │ │ │ ┌──────────────────┐ ┌──────────────────┐ │ │ │ zensurveys │ │ zensurveys-fe │ │ │ │ 7,644 nodes │ │ 11,168 nodes │ │ │ │ 25,866 edges │ │ 16,876 edges │ │ │ │ │ │ │ │ │ │ Route: /api/v1/* │──│ HTTP_CALLS: fetch│ │ │ └──────────────────┘ └──────────────────┘ │ │ │ │ ┌──────────────────┐ ┌──────────────────┐ │ │ │ chief-of-staff │ │ maggy │ │ │ │ 2,687 nodes │ │ 4,692 nodes │ │ │ │ 6,958 edges │ │ 7,459 edges │ │ │ └──────────────────┘ └──────────────────┘ │ │ │ │ ┌──────────────────┐ ┌──────────────────┐ │ │ │ protaige-backend │ │ protaige-frontend│ │ │ │ 26,832 nodes │ │ 8,630 nodes │ │ │ │ 92,174 edges │ │ 14,539 edges │ │ │ └──────────────────┘ └──────────────────┘ │ │ │ │ + 30 more indexed projects │ └─────────────────────────────────────────────────────────────┘ ``` #### Integration with iCPG iCPG and codebase-memory-mcp are **complementary, not redundant**: | Layer | What It Knows | Storage | |-------|--------------|---------| | **codebase-memory-mcp** | Structure — what calls what, who imports whom, where routes go | `.code-graph/` (AST-derived) | | **iCPG** | Intent — WHY code exists, what constraints it must obey, what decisions shaped it | `.icpg/reason.db` (human/AI-derived) | ``` codebase-memory-mcp (structural) iCPG (intentional) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ━━━━━━━━━━━━━━━━━━━━ Function: handleAuth() ReasonNode: "handles OAuth" CALLS → validateToken() Constraint: "must check exp" CALLS → refreshSession() Decision: "chose PKCE over implicit" USAGE → from 14 callers Drift: "spec says mTLS, code uses JWT" Route: POST /api/auth/login 5-dimension score: 8/10 trace_path → 3-hop blast radius (security=2, domain=2, fan_out=2) ``` The 
structural graph provides the "what and where." iCPG provides the "why and what-must-hold." Together they give the token budget manager a complete risk picture. #### Freshness Guarantees ``` ┌────────────────┬──────────────────────────────────────────┐ │ Layer │ How It Stays Fresh │ ├────────────────┼──────────────────────────────────────────┤ │ File watcher │ Re-indexes changed files on save (~10ms) │ │ Auto-index │ Ensures currency on Claude Code startup │ │ Post-commit │ git hook triggers incremental re-index │ │ detect_changes │ Diff-aware — shows what changed since │ │ │ last index, not full re-scan │ └────────────────┴──────────────────────────────────────────┘ ``` No manual re-indexing needed for normal development. Only `index_repository` after major restructures (branch switches with large diffs, directory renames). --- ## 4. Mnemos in a Multi-Model World ### The Problem Mnemos v1 tracks fatigue for a single Claude Code session. In v5, a task might start on Claude, switch to GPT-4o mid-session, then fall back to Qwen. 
Each model has different: - Context window sizes (200K Claude vs 128K GPT-4o vs 32K Qwen local) - Compaction behavior - Tool call patterns ### The Solution: Model-Aware Fatigue Extend the 4-dimension fatigue model with model-relative normalization: ``` ┌──────────────────────────────────────────────────────┐ │ Mnemos v2 — Cross-Model Fatigue │ │ │ │ Current model: gpt-4o (128K context) │ │ Previous model: claude (200K context) │ │ Model switches this session: 1 │ │ │ │ Fatigue dimensions (model-normalized): │ │ │ │ Token utilization: 0.65 │ │ → 83K / 128K (gpt-4o window, not claude's 200K) │ │ │ │ Scope scatter: 0.30 │ │ → Carried over from pre-switch signal log │ │ │ │ Re-read ratio: 0.45 ← ELEVATED │ │ → Model switch caused context loss, agent is │ │ re-reading files it already read under Claude │ │ │ │ Error density: 0.20 │ │ → New model still learning the codebase │ │ │ │ Composite: 0.43 (COMPRESS state) │ │ → Auto-consolidation triggered │ └──────────────────────────────────────────────────────┘ ``` ### Key Extensions | Extension | Description | |-----------|-------------| | **Model-relative token %** | Normalize against current model's context window, not a fixed 200K | | **Switch penalty** | When model switches, add +0.15 to re-read ratio (context was lost) | | **Cross-model checkpoint** | Checkpoint includes model history so the new model knows what was done | | **Shared signal log** | `.mnemos/signals.jsonl` persists across model switches (it's on disk) | | **Budget-aware thresholds** | If running on free tier (Qwen local), relax fatigue thresholds (no cost pressure) | ### Checkpoint Format — Extended for Multi-Model ```json { "goal": "Implement voice surveys", "model_history": [ {"provider": "anthropic", "model": "claude-sonnet", "tokens_used": 145000, "duration_s": 420}, {"provider": "openai", "model": "gpt-4o", "tokens_used": 83000, "duration_s": 180} ], "switch_reason": "anthropic quota exceeded", "active_constraints": ["..."], "active_results": 
["..."], "current_subgoal": "...", "fatigue_at_checkpoint": 0.43, "icpg_state": {"..."}, "cikg_context": { "market_validation": "3 competitors have voice — table stakes", "gap_id": "uuid-of-cikg-gap-node" } } ``` --- ## 5. Data Flow — End to End ``` 1. USER opens Maggy dashboard → Sees all projects, token budgets, active agents 2. USER selects ticket from inbox (or creates feature idea) │ ▼ 3. CIKG VALIDATION (new features only) → find_gaps(): who has this? competitive pressure? → get_landscape(): market trend alignment? → Output: market score + competitive context │ ▼ 4. STRUCTURAL ANALYSIS (codebase-memory-mcp) → search_graph: locate relevant symbols across projects → trace_path: map call chains and fan-out (with risk labels) → get_architecture: understand module boundaries → Output: structural dependency map │ ▼ 5. iCPG ANALYSIS (layers on structural graph) → query blast: which files are affected? → query risk: are they fragile? → query constraints: what invariants exist? → Output: blast radius score (0-10) │ ▼ 5.5 LEXON TOOL RESOLUTION (when tool count > 20 — requires Lexon, Section 16) → Structured intent from iCPG fed to Lexon two-tier routing → Tier A: fast LLM router (<300ms) selects from compact tool manifest → Tier B: multilingual semantic retriever (vector search over tool registry) → Union candidates, filter through Terminology Map (user > org > system) → If confidence < 0.82 or top-2 gap < 0.15: trigger clarify_intent → Output: selected tool with confidence score + LexonRecord logged │ ▼ 6. MODEL SELECTION (from blast score + budget) → Score 0-3: Qwen local / DeepSeek (free tier) → Score 4-6: Kimi / Gemini Flash (cheap tier) → Score 7-10: Claude / GPT-4o (full tier) → Check token budget: rotate if primary is exhausted │ ▼ 7. PLANNING (score 7+ only) → Claude creates architecture plan → Codex independently counter-checks → Both get structural context from codebase-memory-mcp → Maggy shows diff in UI → User approves │ ▼ 8. 
EXECUTION → Polyphony provisions Docker container → Pi starts in RPC mode with selected model → Pi queries codebase-memory-mcp for context before editing → Claude Code Task agent controls Pi via RPC → Mnemos tracks fatigue (model-normalized) → If quota hits: Pi switches model, Mnemos logs switch │ ▼ 9. VERIFICATION → Tests pass in container → detect_changes: verify actual scope matches intended scope → iCPG drift check: no unintended scope drift → Code review (can use second model for independence) │ ▼ 10. DEPLOY → Changes on feature branch → PR created → Vercel preview deploy via isolated browser container → User reviews in Maggy dashboard │ ▼ 11. PROCESS LEARNING (async, post-merge) → Collect PR review comments + CodeRabbit findings → Collect CI pass/fail results for Maggy-written code → Track review rounds, time-to-merge, post-merge incidents → Update process_patterns.db, ci_patterns.db, pr_patterns.db → Feed reward registry: +0.5 first-round approval, -0.4 critical finding → Adjust policy: add pre-checks, evolve skills, tune PR sizing │ ▼ 11.5 ENGRAM PERSISTENCE (async, post-task — requires Engram, Section 15) → Mnemos scans completed task graph for high-confidence memories → Promote to EngramRecord: conventions, patterns, preferences with confidence > 0.8 → Namespace-isolate per project (project A's patterns never contaminate project B) → Apply temporal validity windows (patterns expire unless revalidated) → Track Origin: source channel, evidence count, last verified timestamp → Feed Amnesia Score diagnostic: measure retention across 7 dimensions │ ▼ 12. 
MESH SYNC (async, background — requires Maggy Mesh, Section 14) → Broadcast L1 score updates to connected peers (lightweight, one message per task) → Merge incoming peer data: scores weighted by sample count, patterns quarantined → Surface team-wide insights: "3 peers confirm: Claude best for auth" → Propose cross-team policy changes when backtesting passes on team-wide data → New peers receive full sync on connect — instant collective intelligence ``` --- ## 6. Project Registry ```yaml # ~/.maggy/projects.yaml projects: - name: zensurveys-backend repo: zenloopGmbH/surveys-backend path: ~/Documents/protaige/projects/zensurveys default_branch: staging-v2 vercel_session: vercel-session-A icpg: true cikg: false # not a product repo - name: zensurveys-frontend repo: zenloopGmbH/main-frontend-clean path: ~/Documents/protaige/projects/main-frontend-clean default_branch: main vercel_session: vercel-session-B icpg: true cikg: false - name: chief-of-staff repo: alinaqi/chief-of-staff path: ~/Documents/protaige/projects/chief-of-staff default_branch: main vercel_session: vercel-session-C icpg: true cikg: true # has competitive intelligence graph - name: rodcast repo: alinaqi/rodcast path: ~/Documents/AI-Playground/rodcast default_branch: main vercel_session: vercel-session-D icpg: true cikg: false ``` --- ## 7. 
Component Map ``` maggy/ ├── dashboard/ # Maggy v2 — web dashboard │ ├── src/ │ │ ├── api/ # FastAPI routes │ │ ├── providers/ # GitHub, Asana, Linear │ │ ├── services/ │ │ │ ├── inbox.py # AI-prioritized ticket inbox │ │ │ ├── executor.py # Execute pipeline (now via Pi) │ │ │ ├── competitor.py # Daily briefing │ │ │ ├── planner.py # NEW: dual-model planning │ │ │ ├── budget.py # NEW: token budget manager │ │ │ ├── deploy.py # NEW: isolated Vercel deploys │ │ │ ├── process.py # NEW: process intelligence (env discovery, signal collection) │ │ │ └── forge.py # NEW: MCP Forge integration (capability expansion) │ │ └── orchestrator.py # NEW: multi-project orchestrator │ └── frontend/ # React dashboard │ ├── ProjectRegistry.tsx # NEW: multi-project view │ ├── TokenBudget.tsx # NEW: usage per model │ ├── PlanReview.tsx # NEW: dual-model plan diff │ └── DeployStatus.tsx # NEW: per-project deploy │ ├── scripts/ │ ├── polyphony/ # Container orchestration │ │ ├── adapters/ │ │ │ ├── pi.py # NEW: PiAdapter (replaces claude/codex/kimi) │ │ │ ├── claude.py # DEPRECATED: kept for fallback │ │ │ ├── codex.py # DEPRECATED: kept for fallback │ │ │ └── kimi.py # DEPRECATED: kept for fallback │ │ ├── budget.py # NEW: token budget + model routing │ │ ├── runtime.py # Docker container lifecycle │ │ ├── orchestrator.py # Supervisor loop │ │ └── ... │ ├── icpg/ # Code graph (per-project) │ ├── mnemos/ # Memory + fatigue │ │ ├── fatigue.py # EXTENDED: model-normalized │ │ ├── checkpoint.py # EXTENDED: cross-model state │ │ └── ... 
│ ├── cikg/ # NEW: extracted from chief-of-staff │ │ ├── __init__.py │ │ ├── graph.py # KnowledgeGraphService │ │ ├── models.py # Node/Edge types │ │ └── __main__.py # CLI: cikg query/traverse/gaps │ ├── engram/ # NEW: cross-session memory persistence │ │ ├── __init__.py │ │ ├── record.py # EngramRecord schema │ │ ├── store.py # SQLite persistence + namespace isolation │ │ ├── retrieval.py # Multi-path retrieval (semantic+temporal+causal) │ │ └── diagnostics.py # Amnesia Score computation (7 dimensions) │ ├── lexon/ # NEW: semantic tool binding │ │ ├── __init__.py │ │ ├── record.py # LexonRecord schema │ │ ├── router.py # Two-tier routing (fast LLM + vector) │ │ ├── terminology.py # Terminology Map (system/org/user) │ │ ├── disambiguate.py # Confidence-gated clarification (self/user modes) │ │ └── personalization.py # Implicit learning from user behavior │ └── event_spine/ # NEW: canonical event flow │ ├── __init__.py │ ├── events.py # Typed event dataclasses (8 event types) │ ├── header.py # Common EventHeader │ ├── emitter.py # Event emission API (used by all components) │ └── store.py # SQLite append-only event log + archive │ ├── skills/ │ ├── polyphony/SKILL.md # Updated for Pi │ ├── mnemos/SKILL.md # Updated for multi-model │ ├── icpg/SKILL.md # Unchanged │ ├── code-graph/SKILL.md # codebase-memory-mcp integration │ ├── cikg/SKILL.md # NEW: competitive intelligence skill │ ├── engram/SKILL.md # NEW: cross-session memory instructions │ └── lexon/SKILL.md # NEW: tool binding instructions │ ├── templates/ │ ├── Dockerfile.polyphony # Updated: includes Pi │ ├── Dockerfile.vercel-session # NEW: Playwright + Chrome │ └── ... │ └── docs/ ├── architecture-v5.md # THIS DOCUMENT ├── polyphony-spec.md # Container orchestration spec └── mnemos-implementation.md # Memory lifecycle spec ``` --- ## 8. 
Migration Path | Phase | What | Depends On | |-------|------|-----------| | **Phase 1** | PiAdapter + token budget manager | Pi installed | | **Phase 2** | Model-tiered routing (blast score → model) | Phase 1 + iCPG | | **Phase 3** | Mnemos multi-model fatigue | Phase 1 | | **Phase 4** | Extract CIKG from chief-of-staff | Supabase access | | **Phase 5** | Maggy v2 multi-project UI | Phases 1-4 | | **Phase 6** | Dual-model planning (Claude + Codex) | Phase 1 | | **Phase 7** | Isolated Vercel deploy containers | Docker | | **Phase 8** | Process intelligence (env discovery + signal collection) | Phase 5 + GitHub API | | **Phase 9** | MCP Forge integration (capability expansion) | Phase 5 + mcp_forge | | **Phase 10** | Integration testing + docs | All phases | | **Phase 11** | Maggy Mesh — P2P team intelligence | Phase 5 + Phase 8 | | **Phase 12** | Engram — Cross-session memory persistence | Phase 3 + Phase 5 | | **Phase 13** | Lexon — Semantic tool binding | Phase 9 + Phase 12 | | **Phase 14** | Event Spine — Canonical event flow | Phase 12 + Phase 13 | --- ## 9. Security Considerations | Concern | Mitigation | |---------|-----------| | API keys across models | Pi's auth.json + env vars, never in code | | Container escape | Polyphony containers run unprivileged, no host network | | Vercel session theft | Each browser container has isolated Chrome profile in Docker volume | | CIKG data sensitivity | Competitive intelligence stays in Supabase with RLS | | Local model data leaks | Qwen/Ollama runs fully local, no data leaves machine | | Token budget manipulation | Budget file is local YAML, not exposed via API | --- ## 10. Core Principle — mWp (Minimum Wowable Product) Every component in this architecture must be designed to wow, not just work. > **mWp > MVP**: We don't ship "minimum viable." We ship "minimum wowable." The bar is: would this make someone stop scrolling and say "wait, how did it do that?" 
### What mWp means for each component | Component | MVP (don't ship this) | mWp (ship this) | |-----------|----------------------|-----------------| | Token budget | Show remaining tokens | Auto-rotate models mid-task, user never notices the switch | | Blast radius | Show a score number | Score drives model selection, review depth, and plan complexity automatically | | CIKG validation | "3 competitors have this" | "Here's the competitive gap map, market trend alignment, and suggested positioning — before you write a line of code" | | Mnemos fatigue | "Context 80% full" | Silently checkpoints, switches models, re-injects context — user's train of thought is never interrupted | | Vercel deploy | "Run vercel deploy" | 4 projects deploy in parallel with zero auth conflicts, preview links appear in Maggy dashboard | | Code graph | "We indexed your repo" | "Maggy already knows every function, every caller, every route across all 36 projects — before you ask. It traced the blast radius in 10ms, not 10 minutes of grepping." | | Process intelligence | "Here are your CI results" | "Maggy learned that your reviewer always flags missing error handling — it added it before the PR was created. CI pass rate went from 72% to 97%. Review rounds dropped from 2.8 to 1.1. It didn't just fix the code, it fixed the process." | | Capability expansion | "We don't support that integration" | "Maggy built a Linear MCP server from the API docs, registered the tools, and pulled your sprint data — all within the same conversation." | | Dual-model planning | Two plans side by side | Conflicts highlighted, trade-offs explained, one-click approval with merged approach | ### The 5-second test for Maggy v2 A developer opens Maggy in the morning. 
Within 5 seconds they see: - Inbox ranked by urgency across all 4 projects - Token budget status (green/yellow/red per provider) - Active agents and their progress - Yesterday's competitive intelligence briefing - Process health: CI pass rate, review rounds trend, CodeRabbit findings trend - One-click "Execute" on any ticket with the right model auto-selected That's the wow. --- ## 11. Maggy as a Self-Improving System Maggy is not a tool that waits for instructions. It's an autonomous agent with a single objective function: **maximize user development efficiency**. It observes, measures, optimizes, and evaluates itself — continuously, without asking for permission. ### The Objective Function ``` efficiency = (value_delivered / time_spent) × quality_multiplier where: value_delivered = tickets landed + features shipped + bugs fixed time_spent = wall clock from ticket selection to merge quality_multiplier = 1.0 - (bug_escape_rate + revert_rate + incident_rate) ``` Maggy optimizes this function across all projects, all models, all workflows. Everything it does — model routing, inbox ordering, workflow tuning, fatigue management — feeds back into this single metric. ### Reward Registry Every action Maggy takes generates a reward signal. Positive rewards reinforce. Negative rewards suppress. The registry is the memory of what works. 
``` ┌─────────────────────────────────────────────────────────────┐ │ REWARD REGISTRY │ │ │ │ ┌─────────────────────────────────────────────────────┐ │ │ │ POSITIVE REWARDS (reinforce) │ │ │ │ │ │ │ │ +1.0 Ticket lands without human intervention │ │ │ │ +0.8 Tests pass on first attempt │ │ │ │ +0.5 Time-to-merge below rolling average │ │ │ │ +0.3 No bug escapes at 2-week mark │ │ │ │ +0.2 User doesn't re-do the work manually │ │ │ │ +0.1 Model switch was seamless (no re-reads spike) │ │ │ └─────────────────────────────────────────────────────┘ │ │ │ │ ┌─────────────────────────────────────────────────────┐ │ │ │ NEGATIVE REWARDS (suppress) │ │ │ │ │ │ │ │ -1.0 User reverts the change │ │ │ │ -0.8 Bug escape discovered post-merge │ │ │ │ -0.5 User manually re-does the task │ │ │ │ -0.3 Tests fail after model switch │ │ │ │ -0.2 User overrides Maggy's model/routing choice │ │ │ │ -0.1 Time-to-merge above rolling average │ │ │ │ -0.1 iCPG drift detected after task completion │ │ │ │ -0.1 detect_changes shows scope exceeded intent │ │ │ └─────────────────────────────────────────────────────┘ │ │ │ │ Rewards decay: 0.95^(days_since_event) │ │ Window: 60-day rolling │ │ Cold start: hardcoded defaults until 30+ events per signal │ └─────────────────────────────────────────────────────────────┘ ``` ### Multi-Level Closed-Loop Control The previous version of this section described a flat observe → measure → adjust → evaluate loop. That's not a closed-loop system — that's batch processing with hope. A bad model routing decision on Monday would serve degraded output to every task until the weekly evaluation catches it. **Control theory insight: inner loops provide stability, outer loops provide optimization.** Level 0 keeps individual tasks from going off the rails. Level 2 keeps tools and models healthy day-to-day. Level 3 makes Maggy smarter week-over-week. Each level's output becomes an input signal for the level above it. 
``` ┌──────────────────────────────────────────────────────────────┐ │ MULTI-LEVEL CLOSED-LOOP CONTROL │ │ │ │ Level 4 ─── Monthly (evolutionary) ──────────────────────── │ │ │ Sensor: cross-project trends, platform trajectory │ │ │ Actuator: new reward signals, new process patterns, │ │ │ blast→tier recalibration, exploration rate │ │ │ Bandwidth: weeks │ │ │ │ │ │ Level 3 ─── Weekly (strategic) ──────────────────────── │ │ │ │ Sensor: worst/best task patterns, score deltas, │ │ │ │ process pattern analysis, capability gaps │ │ │ │ Actuator: skill evolution, workflow step changes, │ │ │ │ model routing thresholds, MCP Forge, │ │ │ │ PR strategy, prompt patches │ │ │ │ Bandwidth: days │ │ │ │ │ │ │ │ Level 2 ─── Daily (operational) ────────────────── │ │ │ │ │ Sensor: CI pass rates, review round trends, │ │ │ │ │ CodeRabbit findings, model failure rates, │ │ │ │ │ token budget burn rate │ │ │ │ │ Actuator: pre-commit check toggles, lint rules, │ │ │ │ │ model enable/disable, routing weights │ │ │ │ │ Bandwidth: hours │ │ │ │ │ │ │ │ │ │ Level 1 ─── Task (post-completion) ───────────── │ │ │ │ │ │ Sensor: task reward score, CI results, │ │ │ │ │ │ iCPG drift, detect_changes scope, │ │ │ │ │ │ review comments on PR │ │ │ │ │ │ Actuator: update model scores, log process │ │ │ │ │ │ signals, update fatigue profile │ │ │ │ │ │ Bandwidth: minutes │ │ │ │ │ │ │ │ │ │ │ │ Level 0 ─── Real-time (within task) ──────────│ │ │ │ │ │ │ Sensor: tool success/fail, test pass/fail, ││ │ │ │ │ │ │ lint errors, Pi RPC events, ││ │ │ │ │ │ │ model response quality, fatigue ││ │ │ │ │ │ │ Actuator: switch model, retry with context, ││ │ │ │ │ │ │ adjust verification depth, ││ │ │ │ │ │ │ abort + re-plan, checkpoint ││ │ │ │ │ │ │ Bandwidth: seconds ││ │ │ │ │ │ └───────────────────────────────────────────────┘│ │ │ │ │ └──────────────────────────────────────────────────┘│ │ │ │ └─────────────────────────────────────────────────────┘│ │ │ 
└────────────────────────────────────────────────────────┘│ │ └───────────────────────────────────────────────────────────┘│ └──────────────────────────────────────────────────────────────┘ Signal cascade (inner → outer): L0 events aggregate into → L1 task reward L1 task rewards aggregate into → L2 daily trends L2 daily trends feed → L3 weekly pattern analysis L3 weekly patterns feed → L4 monthly trajectory ``` #### Level 0 — Real-Time (Within Task Execution) This is the **stability loop** — the most critical and currently missing level. It keeps individual tasks from going off the rails *as they happen*, not after the damage is done. ``` ┌──────────────────────────────────────────────────────────────┐ │ LEVEL 0 — REAL-TIME CONTROL (seconds) │ │ │ │ Pi agent executing task inside Polyphony container │ │ │ │ │ ├── Tool call fails (file not found, API error) │ │ │ → Retry with adjusted path/params (not new model) │ │ │ → If 3 consecutive fails: escalate model tier │ │ │ │ │ ├── Test fails during TDD green phase │ │ │ → Analyze error: syntax? logic? missing import? │ │ │ → If model is struggling (3+ failed attempts): │ │ │ checkpoint + switch to higher-tier model │ │ │ │ │ ├── Lint error on written code │ │ │ → Auto-fix (ruff --fix / eslint --fix) │ │ │ → If pattern repeats: flag for L2 (add pre-check) │ │ │ │ │ ├── Fatigue signal crosses threshold │ │ │ → Mnemos auto-checkpoint │ │ │ → If mid-task: consolidate context, continue │ │ │ → If near completion: push through, checkpoint after│ │ │ │ │ ├── Model response quality degrades │ │ │ → Detected by: repeated re-reads, circular edits, │ │ │ tool calls that undo previous tool calls │ │ │ → Action: checkpoint + model switch immediately │ │ │ │ │ └── Scope drift detected (iCPG) │ │ → Agent touching files outside blast radius │ │ → Action: warn → constrain → abort if persistent │ │ │ │ All L0 events are logged to signals.jsonl with timestamps. │ │ They aggregate into the L1 task reward score. 
│ └──────────────────────────────────────────────────────────────┘ ``` **Why L0 matters more than any weekly patch:** If Maggy can detect mid-task that the current model is struggling and switch to a stronger one *within seconds*, that's worth more than a hundred policy adjustments. A user whose task fails experiences -1.0 reward. A user whose task recovers mid-flight via model switch experiences +0.1. The delta between "fail and retry tomorrow" and "hiccup and recover" is the entire product experience. **L0 signal types:** | Signal | Detection Method | Response Time | Action | |--------|-----------------|---------------|--------| | Tool failure | Pi RPC error event | < 1s | Retry with adjusted params | | Test failure | Exit code from test runner | < 5s | Analyze, fix, or escalate model | | Lint error | ruff/eslint output on written code | < 2s | Auto-fix or flag for L2 | | Fatigue spike | Mnemos threshold breach | < 1s | Checkpoint, consolidate, or switch | | Quality degradation | Circular edits, re-reads, undo patterns | ~30s | Checkpoint + model switch | | Scope drift | iCPG blast radius check on file access | < 1s | Warn → constrain → abort | | Model quota hit | Pi RPC quota/rate error | < 1s | Fallback chain activation | #### Level 1 — Task (Post-Completion, Minutes) After each task completes, compute the task reward score and update the per-model, per-task-type scores. This is the **learning loop** — every completed task teaches Maggy something. 
``` Task completes (PR created or code landed) │ ├── Compute task reward from L0 signals: │ reward = Σ(signal_weight × signal_value) │ adjusted for: model used, blast tier, task type │ ├── Update model_scores.db: │ (claude, auth, high) → new running average │ ├── Update fatigue_profile: │ session duration, checkpoint timing, recovery reads │ ├── Log L0 events summary → L2 aggregation: │ "3 tool retries, 1 model switch, 0 scope drifts" │ └── Emit task_completed event → Maggy dashboard ``` #### Level 2 — Daily (Operational, Hours) Runs on a daily schedule (or triggered when a threshold is breached). Catches degradation before it compounds. This is the **operational health loop**. ``` Daily aggregation job: │ ├── CI pass rate today vs 7-day average │ → If dropped >10%: disable the model causing failures │ ├── Review rounds today vs 7-day average │ → If increased: check which code patterns are new │ ├── CodeRabbit critical findings today │ → If >0 on Maggy-written code: add pattern to pre-check │ ├── Model failure rate by tier │ → If a model's L0 failure signals spike: demote it │ ├── Token budget burn rate │ → If burning faster than expected: adjust routing to cheaper tier │ └── Emergency trigger: if any metric drops >15% in one day → Halt exploration, revert last policy change, alert ``` **Why L2 exists separately from L3:** A weekly batch can't catch a model that started failing on Tuesday. By Friday, that's 3 days of degraded tasks, 3 days of negative rewards accumulating. L2's daily check catches it within hours and disables the failing model before the damage compounds. #### Level 3 — Weekly (Strategic, Days) The deliberate optimization loop. Analyzes patterns across the week, proposes and applies policy changes with rollback windows. This is where skill evolution, workflow step changes, and MCP Forge generation happen. ``` Weekly strategic analysis: │ ├── Worst 10 tasks this week: what went wrong? 
│ → Common patterns → skill file patches │ → Recurring reviewer comments → add to review prevention │ ├── Best 10 tasks this week: what went right? │ → Reinforce: model, workflow, blast tier settings │ ├── Score deltas from last week's modifications │ → delta < -0.2: auto-revert │ → delta > +0.2: reinforce + expand to similar task types │ ├── Process pattern analysis │ → New (code_pattern, review_feedback) entries │ → PR sizing effectiveness │ → CI failure patterns │ ├── Capability gap analysis │ → Top unresolvable requests → trigger MCP Forge │ └── Exploration candidates → Select 10% of low-blast task types for next week's exploration ``` #### Level 4 — Monthly (Evolutionary, Weeks) The meta-optimization loop. Evaluates whether the control system itself is improving. Changes the reward signals, recalibrates tier boundaries, adjusts exploration rates. This is the loop that improves the improvement process. ``` Monthly evolution review: │ ├── Cross-project patterns │ → Are skills learned in project A useful in project B? │ → Promote project-specific skills to global skills │ ├── Reward signal effectiveness │ → Is any signal consistently noisy? Reduce its weight │ → Is a new signal needed? (e.g., deploy success rate) │ → Add, remove, or reweight signals │ ├── Tier boundary recalibration │ → If blast 4-6 tasks are consistently handled well by │ the cheap tier, widen the cheap tier: 0-4 = cheap │ → If blast 3 tasks keep failing on cheap models, │ narrow it: 0-2 = cheap, 3+ = medium │ ├── Exploration rate adjustment │ → If exploration success rate > 40%: increase to 15% │ → If exploration success rate < 10%: decrease to 5% │ ├── Control loop tuning │ → Is L2 catching issues that should be caught at L0? │ → Are L0 model switches too aggressive or too cautious? │ → Adjust L0 thresholds based on L1 outcome data │ └── Platform trajectory → Efficiency trend: improving, flat, or declining?
→ If flat for 2+ months: the system has saturated current strategy — try structural change ``` #### Signal Cascade — How Levels Feed Each Other ``` ┌──────────────────────────────────────────────────────────────┐ │ SIGNAL CASCADE │ │ │ │ L0: tool_fail, test_fail, lint_error, model_switch │ │ │ (raw events, seconds) │ │ ▼ │ │ L1: task_reward = f(L0_signals) │ │ │ model_score[claude, auth, 8] += task_reward │ │ │ (per-task aggregation, minutes) │ │ ▼ │ │ L2: daily_ci_rate = mean(L1.ci_pass for today) │ │ │ daily_model_health[claude] = mean(L1.rewards for claude) │ │ │ (daily aggregation, hours) │ │ │ ACTION: disable model if health < threshold │ │ ▼ │ │ L3: weekly_pattern = cluster(L2.failures + L1.review_comments│ │ │ score_delta = this_week.reward - last_week.reward │ │ │ (weekly analysis, days) │ │ │ ACTION: evolve skills, adjust routing, trigger Forge │ │ ▼ │ │ L4: monthly_trajectory = trend(L3.score_deltas) │ │ reward_signal_weights = recalibrate(L3.signal_noise) │ │ (monthly meta-analysis, weeks) │ │ ACTION: change reward function itself, adjust L0-L3 │ │ │ │ Key: outer loops NEVER override inner loop stability. │ │ L3 can change routing policy, but L0 still catches in-task │ │ failures regardless of what L3 decided. │ └──────────────────────────────────────────────────────────────┘ ``` ### What Gets Optimized (and How) #### 1. Model Routing Maggy tracks reward per `(model × task_type × blast_tier)` triple: ``` reward_table: (claude, auth, high): +0.92 ← claude is great at auth (claude, docs, low): +0.40 ← claude works but wasteful (qwen, docs, low): +0.85 ← qwen is faster + free (qwen, auth, medium): -0.30 ← qwen failed auth tasks (gpt-4o, frontend, medium):+0.78 ← gpt-4o is strong on frontend (kimi, tests, low): +0.70 ← kimi writes good tests cheaply ``` Maggy routes new tasks to the model with the highest reward for that `(task_type, blast_tier)`. No human in the loop — the reward table decides. 
If a model has no data for a task type, Maggy uses the tier default (hardcoded) until it collects 30+ data points. #### 2. Inbox Ordering Inbox priority is a weighted score that Maggy continuously adjusts: ```python priority = ( w_urgency * urgency_score + w_okr * okr_alignment + w_recency * recency + w_type * type_weight[ticket.type] + w_project * project_weight[ticket.project] ) ``` The weights (`w_urgency`, `w_okr`, etc.) are updated based on which tickets the user actually executes first. If the user consistently picks security tickets despite Maggy ranking them 5th, the type weight for security increases automatically. Maggy never asks: the reward signal registers "user overrode my ranking" (-0.2), and the resulting weight update moves the ranking closer to what the user actually does. #### 3. Workflow Steps Some workflow steps add value, some don't. Maggy measures reward per step: ``` workflow_rewards: codex_counter_check: blast_0_3: -0.1 # adds latency, never catches issues blast_4_6: +0.2 # catches real issues sometimes blast_7_10: +0.6 # catches critical issues often icpg_drift_check: all_tiers: +0.4 # consistently prevents regressions high_tier_post_review: after_qwen: +0.7 # catches qwen mistakes frequently after_kimi: +0.3 # kimi output is cleaner, fewer catches after_claude: +0.0 # reviewing claude with claude is redundant ``` Maggy skips steps with consistently negative reward. No permission needed — if Codex counter-check never catches issues on blast 0-3 tasks, it gets dropped from that tier. If it starts catching issues again (maybe the codebase grew more complex), the reward changes and it gets re-enabled. #### 4. Fatigue Thresholds Different users fatigue differently.
Maggy learns the user's fatigue curve: ``` fatigue_profile: avg_productive_session_minutes: 47 pre_checkpoint_optimal_minutes: 42 model_switch_recovery_reads: 3.2 # avg re-reads after switch best_model_for_recovery: gpt-4o # fastest context rebuild ``` Maggy pre-checkpoints at 42 minutes (not at the generic 0.60 threshold) because it learned this user's fatigue pattern. No question asked — the reward signal showed that checkpoints at 42 minutes led to better post-checkpoint output (+0.3 reward) than checkpoints at 50 minutes (-0.2 reward from quality drop). #### 5. Process Intelligence — Learning from the Full SDLC Maggy doesn't just optimize code output. It optimizes the **entire development process** by observing what happens to code after it's written: PR reviews, CI results, CodeRabbit findings, reviewer feedback, merge patterns, and post-deploy incidents. ##### 5a. Environment Discovery On first run per project, Maggy auto-discovers the developer's workflow. No configuration — it reads what's already there. ``` ┌──────────────────────────────────────────────────────────────┐ │ ENVIRONMENT DISCOVERY (auto, per project) │ │ │ │ Ticketing: │ │ gh api repos/{owner}/{repo}/issues → GitHub Issues? │ │ .asana.yml / .linear/* / jira.config → which tracker? │ │ Maggy Inbox providers config → already connected? 
│ │ │ │ GitHub Integrations: │ │ gh api repos/{owner}/{repo}/hooks → webhooks │ │ gh api repos/{owner}/{repo}/installation → GitHub Apps │ │ PR comment authors → detect bots: coderabbitai[bot], │ │ dependabot[bot], renovate[bot], github-actions[bot] │ │ │ │ CI/CD: │ │ .github/workflows/*.yml → GitHub Actions │ │ Jenkinsfile / .circleci/ / .gitlab-ci.yml → other CI │ │ gh api repos/{owner}/{repo}/actions/runs → run history │ │ │ │ Code Quality: │ │ .eslintrc* / ruff.toml / .prettierrc → lint config │ │ mypy.ini / tsconfig.json → type checking │ │ .pre-commit-config.yaml → pre-commit hooks │ │ codecov.yml / .nycrc → coverage config │ │ │ │ Review Process: │ │ gh api repos/{owner}/{repo}/branches/{b}/protection │ │ → required reviewers, status checks, merge rules │ │ CODEOWNERS → who reviews what │ │ Average PR review rounds from git history │ │ │ │ Output: ~/.maggy/environments/{project}.yaml │ └──────────────────────────────────────────────────────────────┘ ``` ```yaml # ~/.maggy/environments/zensurveys-backend.yaml (auto-generated) ticketing: github_issues github_integrations: - coderabbitai # CodeRabbit AI reviews - dependabot # dependency updates - vercel # preview deploys ci: provider: github_actions workflows: - test.yml # pytest + coverage - lint.yml # ruff + mypy - deploy.yml # staging deploy lint: python: [ruff, mypy] config_files: [ruff.toml, mypy.ini] review: required_approvals: 1 codeowners: true branch_protection: staging-v2 ``` ##### 5b. 
Process Signal Collection Maggy subscribes to signals from every stage of the SDLC pipeline: ``` ┌─────────────────────────────────────────────────────────────┐ │ PROCESS SIGNALS (collected per PR / per task) │ │ │ │ ┌─── REVIEW SIGNALS ────────────────────────────────────┐ │ │ │ │ │ │ │ PR reviewer comments (human) │ │ │ │ → "missing error handling in /api/surveys" │ │ │ │ → "this should be a transaction" │ │ │ │ → "add tests for edge case" │ │ │ │ │ │ │ │ CodeRabbit findings (automated) │ │ │ │ → severity: critical/warning/suggestion │ │ │ │ → category: security/performance/style/bug │ │ │ │ → file + line + specific suggestion │ │ │ │ │ │ │ │ Review rounds │ │ │ │ → PR needed 3 rounds before approval │ │ │ │ → First round had 8 comments, second had 2 │ │ │ │ │ │ │ └────────────────────────────────────────────────────────┘ │ │ │ │ ┌─── CI SIGNALS ────────────────────────────────────────┐ │ │ │ │ │ │ │ GitHub Actions results │ │ │ │ → test.yml: PASS (42s) │ │ │ │ → lint.yml: FAIL — ruff: 3 errors, mypy: 1 error │ │ │ │ → deploy.yml: PASS (preview URL generated) │ │ │ │ │ │ │ │ Failure patterns │ │ │ │ → lint failures in files Maggy touched │ │ │ │ → test failures from code Maggy wrote │ │ │ │ → flaky tests (pass/fail on same code) │ │ │ │ │ │ │ └────────────────────────────────────────────────────────┘ │ │ │ │ ┌─── POST-MERGE SIGNALS ────────────────────────────────┐ │ │ │ │ │ │ │ Revert within 48h → code was bad │ │ │ │ Hotfix within 7d → code had latent bug │ │ │ │ Incident linked to PR → production impact │ │ │ │ Dependency alert (Dependabot/Renovate) → stale deps │ │ │ │ │ │ │ └────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────┘ ``` New reward signals for the registry: ``` PROCESS REWARD SIGNALS +0.5 PR approved on first review round +0.3 CI passes on first push (no re-push needed) +0.2 CodeRabbit: zero critical/warning findings +0.1 PR merged within 24h of creation -0.8 PR reverted within 
48h -0.5 CI fails on Maggy-written code (lint or test) -0.4 CodeRabbit critical finding on Maggy-written code -0.3 PR requires 3+ review rounds -0.2 Reviewer flags same issue type Maggy was warned about before -0.1 CodeRabbit warning finding on Maggy-written code ``` ##### 5c. Process Learning Maggy tracks patterns across three dimensions: **Code Pattern → Review Feedback:** ``` process_patterns.db: (api_route, missing_error_handling): occurrences: 7 reviewers: ["alice", "coderabbitai"] fix_pattern: "add try/except with proper HTTP error codes" → LEARNED: always add error handling to API routes (database_query, missing_transaction): occurrences: 4 reviewers: ["bob"] fix_pattern: "wrap multi-table writes in transaction" → LEARNED: multi-table writes need transactions (test_file, missing_edge_case): occurrences: 12 reviewers: ["alice", "bob", "coderabbitai"] fix_pattern: "test empty input, null, boundary values" → LEARNED: always test edge cases (empty, null, boundary) ``` **File → CI Failure:** ``` ci_patterns.db: src/api/surveys.py: lint_failures: 5 (ruff E501, E741) type_errors: 2 (mypy: missing return type) → LEARNED: this file needs strict lint pre-check tests/test_integration.py: flaky_rate: 0.15 (fails 15% of runs on same code) → LEARNED: mark as flaky, don't block on single failure src/services/auth.py: ci_failures: 0 in 30 days → LEARNED: auth code is well-tested, low CI risk ``` **PR Characteristics → Merge Velocity:** ``` pr_patterns.db: (size < 200 lines, single_concern): avg_review_rounds: 1.2 avg_time_to_merge: 4h → LEARNED: small focused PRs merge fast (size > 500 lines, multi_concern): avg_review_rounds: 3.1 avg_time_to_merge: 48h → LEARNED: split large PRs into stacked PRs (has_tests, covers_new_code): approval_rate_first_round: 0.78 → LEARNED: tests increase first-round approval (no_tests, new_feature): reviewer_comment_rate: 0.95 most_common: "please add tests" → LEARNED: never submit new features without tests ``` ##### 5d. 
Process Optimization — What Maggy Changes Based on learned patterns, Maggy autonomously adjusts its own behavior: | What Changes | Based On | Example | |-------------|---------|---------| | **Pre-task lint** | CI failure patterns | Maggy runs `ruff check` + `mypy` on its output before committing — prevents CI failures it has seen before | | **Skill evolution** | Recurring review comments | If reviewers flag "missing error handling" 7 times, Maggy adds the pattern to its skill files — future code includes error handling by default | | **PR sizing** | Merge velocity data | If PRs > 500 lines take 3x longer to merge, Maggy splits tasks into stacked PRs automatically | | **Test generation** | Reviewer feedback | If "add tests" is the most common review comment, Maggy ensures every PR includes tests for new code | | **CodeRabbit pre-check** | CodeRabbit finding patterns | If CodeRabbit consistently flags the same security issue, Maggy pre-validates against that pattern before pushing | | **Commit hygiene** | CI config + branch rules | Maggy matches commit message format, branch naming, and PR template to whatever the project enforces | ```yaml # Added to ~/.maggy/policy.yaml process: pre_commit_checks: ruff: true # learned: lint failures cost -0.5 mypy: true # learned: type errors caught by CI test_coverage_min: 80 # learned: PRs without coverage get rejected pr_strategy: max_lines: 400 # learned: optimal size for this team stacked_prs: true # learned: large changes split = faster merge require_tests: true # learned: "add tests" is #1 review comment review_prevention: error_handling_api_routes: true # learned from 7 review comments transaction_multi_writes: true # learned from 4 review comments edge_case_tests: true # learned from 12 review comments coderabbit_precheck: security_scan: true # learned: CodeRabbit catches these unused_imports: true # learned: CodeRabbit flags these ``` ##### 5e. 
The Process Intelligence Flywheel ``` ┌──────────────────────────────────────────────────────────────┐ │ PROCESS INTELLIGENCE FLYWHEEL │ │ │ │ Week 1: Maggy discovers environment, starts collecting │ │ → Sees 5 lint failures, 3 "add tests" comments │ │ → Learns: run lint before push, always include tests │ │ │ │ Week 2: Maggy applies learned patterns │ │ → Lint failures drop to 0 (pre-checked) │ │ → "Add tests" comments drop to 1 (edge case missed) │ │ → Review rounds drop from 2.8 to 1.6 avg │ │ │ │ Week 4: Maggy has enough data for deeper patterns │ │ → Learns that PRs touching auth need 2 reviewers │ │ → Learns that Friday PRs take 2x longer to merge │ │ → Starts scheduling auth PRs for Monday-Wednesday │ │ │ │ Week 8: Maggy evolves its own skills │ │ → Writes new lint rules based on recurring review comments │ │ → Generates pre-commit hooks for patterns that always fail │ │ → Review round avg: 1.1 (down from 2.8) │ │ → CI first-pass rate: 97% (up from 72%) │ │ → Time-to-merge: 6h avg (down from 36h) │ │ │ │ The wow: Maggy didn't just write better code. │ │ It made the entire development process faster. │ └──────────────────────────────────────────────────────────────┘ ``` #### 6. Capability Expansion — MCP Forge Integration When Maggy encounters a capability gap — a workflow integration that doesn't exist — it doesn't stop. It builds one. **Source:** MCP Forge (`~/Documents/protaige/mcp_forge`) generates TypeScript MCP servers from API documentation. ``` Maggy task requires Mailchimp subscriber data │ ├── search existing MCP tools → no Mailchimp tool found │ ├── Forge: search registry (500+ APIs) → Mailchimp API found │ ├── Forge: generate MCP server │ → TypeScript MCP server with validated tool schemas │ → Tools: list_segments, get_subscribers, campaign_stats │ ├── Register new tools with Pi agent's MCP config │ ├── Execute original task using new tools │ └── Reward signal: did it work? 
→ +1.0: task completed with new tool → -0.5: tool generated but failed at runtime ``` **Weekly gap analysis:** ``` capability_gaps.db: This week's unresolvable requests: "check Linear sprint progress" → 8 occurrences "pull Slack channel activity" → 5 occurrences "get Figma design specs" → 3 occurrences Top 3 gaps → trigger Forge generation: 1. Linear MCP server (sprint, issues, labels) 2. Slack MCP server (channels, messages, threads) 3. Figma MCP server (files, components, comments) After generation: capability surface grows autonomously. Hibernation policy: tools with < 3 uses in 14 days → disabled. ``` ### Self-Evaluation Maggy evaluates its own optimization quality on a weekly cycle: ``` ┌──────────────────────────────────────────────────────────┐ │ MAGGY SELF-EVALUATION (weekly) │ │ │ │ Efficiency trend: │ │ Week 1: 2.3 tickets/day, 0.92 quality multiplier │ │ Week 2: 2.7 tickets/day, 0.94 quality multiplier ↑ │ │ Week 3: 3.1 tickets/day, 0.91 quality multiplier ↑↓ │ │ Week 4: 3.0 tickets/day, 0.95 quality multiplier →↑ │ │ │ │ Adjustments this week: 6 │ │ ✓ Promoted kimi for test-writing (reward +0.7) │ │ ✓ Dropped codex review for blast < 3 (reward +0.1) │ │ ✗ Tried qwen for API routes — auto-rolled back │ │ (reward -0.4, 2 bug escapes detected at day 12) │ │ ✓ Pre-checkpoint moved to 40min (reward +0.3) │ │ ✓ Added error handling to API routes (review feedback) │ │ ✓ Enabled ruff pre-check (CI failure prevention) │ │ │ │ Process intelligence: │ │ CI first-pass rate: 94% (up from 72% at week 1) │ │ Review rounds avg: 1.3 (down from 2.8 at week 1) │ │ CodeRabbit critical findings: 0 (down from 4 at week 1)│ │ Capability gaps filled: 2 (Linear, Slack via Forge) │ │ │ │ Auto-rollbacks this week: 1 │ │ qwen for API routes: reverted to kimi after 3 failures │ │ │ │ Overall efficiency delta: +18% vs 4 weeks ago │ └──────────────────────────────────────────────────────────┘ ``` When an adjustment makes things worse, Maggy doesn't wait for the user to notice. 
It detects the reward drop and **auto-rolls back**. When an adjustment works, it reinforces and looks for similar task types to expand to. ### Exploration vs Exploitation Maggy needs to try new things (exploration) while mostly doing what works (exploitation): ``` exploration_rate: 0.10 # 10% of tasks try a new model/workflow # 90% use the current best policy exploration_rules: - Never explore on blast >= 7 (too risky) - Never explore on security/concurrency tasks - Explore on docs, tests, low-blast refactors (low cost of failure) - If exploration succeeds 3x in a row, promote to exploitation - If exploration fails 2x in a row, abandon and try different hypothesis ``` ### Storage ``` ~/.maggy/ reward_registry.db # SQLite: (action, context, reward, timestamp) model_scores.db # SQLite: (model, task_type, blast_tier, reward_avg, n_samples) workflow_scores.db # SQLite: (workflow_step, tier, reward_avg, n_samples) process_patterns.db # SQLite: (code_pattern, review_feedback, occurrences, fix_pattern) ci_patterns.db # SQLite: (file, failure_type, count, flaky_rate) pr_patterns.db # SQLite: (size_bucket, concern_count, avg_rounds, avg_merge_time) capability_gaps.db # SQLite: (request_type, occurrences, forge_status, tool_name) improvement_ledger.db # SQLite: all self-modifications with config snapshots + backtesting task_history.db # SQLite: every task with L0 events, reward, CI/review outcomes fatigue_profile.yaml # Learned fatigue curve for this user policy.yaml # Current active policy (model routing, inbox weights, process rules) policy_history/ # Timestamped snapshots for rollback (also in ledger.db) self_eval.jsonl # Weekly self-evaluation log environments/ # Auto-discovered per-project workflow configs mesh.yaml # Mesh config (org_key, port, manual peers) mesh_state.db # SQLite: peer registry, sync timestamps, message log peer_id # This instance's stable UUID (generated on install) quarantine.db # Patterns from peers awaiting local validation engram.db # SQLite: 
EngramRecords with namespace, origin, confidence, temporal validity engram_namespaces.yaml # Per-project namespace config (isolation boundaries) lexon.db # SQLite: LexonRecords, terminology map entries, personalization data lexon_embeddings/ # Tool registry vector index (multilingual) events.db # SQLite: append-only Event Spine log (all 8 event types) events_archive/ # Compressed JSONL archives for events older than 90 days ``` ```yaml # ~/.maggy/policy.yaml (Maggy-managed, not user-edited) version: 47 # auto-incremented on every policy update updated_at: "2026-05-10T03:00:00Z" model_routing: blast_0_3: primary: qwen-local except: api_routes: kimi # learned: qwen bad at API routes auth: claude # override: security dimension >= 2 blast_4_6: primary: kimi post_review: true # high-tier spot check on output blast_7_10: primary: claude fallback: gpt-4o counter_check: codex # dual-model planning inbox_weights: urgency: 0.30 okr_alignment: 0.20 recency: 0.15 type: security: 1.8 bug: 1.2 feature: 1.0 docs: 0.6 project: zensurveys-backend: 1.3 # learned: user prioritizes this project chief-of-staff: 1.0 rodcast: 0.8 workflow: codex_counter_check: enabled_above_blast: 5 # learned: no value below 5 pre_checkpoint_minutes: 40 # learned: user's fatigue curve exploration_rate: 0.10 process: pre_commit_checks: ruff: true # learned: CI catches these mypy: true # learned: type errors in CI test_coverage_min: 80 # learned: PRs without coverage rejected pr_strategy: max_lines: 400 # learned: optimal for this team stacked_prs: true # learned: faster merge for large changes require_tests: true # learned: #1 review comment is "add tests" review_prevention: # patterns learned from reviewer feedback error_handling_api_routes: true transaction_multi_writes: true edge_case_tests: true coderabbit_precheck: # patterns learned from CodeRabbit security_scan: true unused_imports: true scheduling: avoid_friday_auth_prs: true # learned: Friday auth PRs take 2x to merge forge: auto_expand: true # 
generate new MCP tools for capability gaps hibernation_days: 14 # disable unused forge tools after 14 days min_gap_requests: 5 # require 5+ requests before triggering forge ``` ### Optimization Targets Mapped to Control Levels Each optimization target from Sections 1-6 now maps to a specific control level: | Target | L0 (seconds) | L1 (minutes) | L2 (hours) | L3 (days) | L4 (weeks) | |--------|:---:|:---:|:---:|:---:|:---:| | **1. Model routing** | Switch on failure/fatigue | Update (model,task,tier) score | Disable failing model | Adjust tier boundaries | Recalibrate blast→tier map | | **2. Inbox ordering** | — | — | — | Adjust type/project weights | Reweight signals | | **3. Workflow steps** | — | Log step value for task | — | Enable/disable steps by tier | Add/remove signal types | | **4. Fatigue** | Checkpoint on threshold | Update fatigue profile | — | Adjust checkpoint timing | Tune L0 thresholds | | **5. Process intelligence** | Lint before commit | Log CI/review signals | Toggle pre-checks | Evolve skills from patterns | Recalibrate process signals | | **6. Capability expansion** | — | Log capability gap | — | Forge top 3 gaps | Prune/archive unused tools | **L0 handles stability** (don't let a task fail). **L1-L2 handle health** (don't let bad patterns accumulate). **L3-L4 handle strategy** (make the system smarter over time). ### Improvement Ledger — Full Auditability + Backtesting Every self-modification Maggy makes is recorded in the improvement ledger with full state snapshots. This serves three purposes: auditability (what changed and why), rollback (revert any change), and **backtesting** (would a policy have worked better on historical data?). #### Ledger Schema ```sql -- ~/.maggy/improvement_ledger.db CREATE TABLE modifications ( id INTEGER PRIMARY KEY, timestamp TEXT NOT NULL, control_level INTEGER NOT NULL, -- 0-4 category TEXT NOT NULL, -- model_routing, process, workflow, etc. 
description TEXT NOT NULL, -- human-readable what changed reasoning TEXT NOT NULL, -- why the change was made (signal data) config_before TEXT NOT NULL, -- full policy.yaml snapshot (JSON) config_after TEXT NOT NULL, -- full policy.yaml snapshot (JSON) score_before REAL, -- avg reward in measurement window before score_after REAL, -- avg reward in measurement window after delta REAL, -- score_after - score_before status TEXT DEFAULT 'active', -- active, rolled_back, superseded rolled_back_at TEXT, -- timestamp if reverted rollback_reason TEXT -- why it was reverted ); CREATE TABLE task_history ( id INTEGER PRIMARY KEY, timestamp TEXT NOT NULL, project TEXT NOT NULL, task_type TEXT NOT NULL, -- auth, api_route, test, docs, etc. blast_tier INTEGER NOT NULL, -- 0-10 model_used TEXT NOT NULL, policy_version INTEGER NOT NULL, -- which policy was active l0_events TEXT NOT NULL, -- JSON array of L0 signals l1_reward REAL NOT NULL, -- computed task reward ci_passed BOOLEAN, review_rounds INTEGER, coderabbit_findings INTEGER, time_to_merge_h REAL, reverted BOOLEAN DEFAULT FALSE, bug_escape BOOLEAN DEFAULT FALSE ); ``` #### Backtesting: "Would This Policy Have Worked?" Before deploying an L3/L4 policy change, Maggy can **replay historical tasks** against the proposed policy to predict the outcome: ``` ┌──────────────────────────────────────────────────────────────┐ │ BACKTEST: proposed policy v48 vs current policy v47 │ │ │ │ Replaying 200 tasks from last 30 days...
│ │ │ │ Proposed change: route blast 3 tasks to qwen instead of kimi │ │ │ │ Historical tasks at blast 3 (n=47): │ │ Under kimi (actual): │ │ avg reward: +0.62 │ │ CI pass rate: 91% │ │ review rounds: 1.4 │ │ │ │ Under qwen (backtest simulation): │ │ predicted reward: +0.38 ← LOWER │ │ predicted CI pass rate: 78% ← based on qwen's L0 data │ │ predicted review rounds: 2.1 ← based on qwen's L1 data │ │ │ │ VERDICT: DO NOT APPLY — backtest predicts -0.24 reward drop │ │ │ │ Alternative explored: route blast 1-2 to qwen, keep 3 on │ │ kimi. Backtest on blast 1-2 tasks (n=31): │ │ kimi actual: +0.58 │ │ qwen predicted: +0.71 ← HIGHER (simpler tasks = qwen OK) │ │ │ │ VERDICT: APPLY partial — blast 1-2 to qwen, blast 3 stays │ └──────────────────────────────────────────────────────────────┘ ``` **How backtesting works:** 1. **Query `task_history`** for all tasks matching the target criteria (e.g., blast tier, task type) 2. **For each historical task**, look up the proposed model's performance on similar `(task_type, blast_tier)` combinations from `model_scores.db` 3. **Predict reward** using the proposed model's historical L0 signals (failure rate, lint errors, test pass rate) on similar tasks 4. **Compare** predicted vs actual reward across the full set 5. **Decision**: apply if predicted delta > +0.1, reject if < -0.1, flag for exploration if between **Backtesting is required for L3 and L4 changes.** L0-L2 changes are reactive (stability and health) and don't need backtesting — they respond to immediate signals. L3-L4 changes are strategic and can be validated against historical data first. #### Auto-Seeding: Maggy Bootstraps Herself Maggy has Pi agents. She has access to Claude, Codex, Kimi, Qwen — whatever models are configured. There is no reason for a manual `maggy seed` command. The moment a project is registered in `~/.maggy/projects.yaml`, Maggy spawns a Pi agent to analyze the project's history and seed her own databases. No user action required. 
``` ┌──────────────────────────────────────────────────────────────┐ │ AUTO-SEED (triggered on project registration) │ │ │ │ 1. Maggy detects new project in registry │ │ │ │ │ 2. Spawns Pi agent (cheapest available model — qwen/kimi) │ │ Task: "Analyze project history and extract patterns" │ │ │ │ │ 3. Agent executes via gh CLI + git log: │ │ │ │ │ ├── gh pr list --state merged --limit 200 --json │ │ │ → PR sizes, review rounds, time-to-merge │ │ │ → Reviewers, approval patterns │ │ │ │ │ ├── gh pr view {n} --comments --json │ │ │ → Review comments categorized by pattern │ │ │ → CodeRabbit findings by severity + category │ │ │ → Bot authors detected (coderabbitai, dependabot) │ │ │ │ │ ├── gh api repos/{owner}/{repo}/actions/runs │ │ │ → CI pass/fail rates per workflow │ │ │ → Failure patterns per file │ │ │ → Flaky test detection │ │ │ │ │ ├── git log --format='%H %s' --since='6 months ago' │ │ │ → Revert detection (commit messages with "revert") │ │ │ → Commit patterns, branch naming conventions │ │ │ │ │ ├── codebase-memory-mcp: get_architecture + search_graph │ │ │ → Module structure, hot files, dependency depth │ │ │ → Fan-out scores for initial blast radius calibration │ │ │ │ │ └── Environment discovery (Section 5a) │ │ → Ticketing, CI, lint, review process auto-detected │ │ │ │ 4. Agent writes structured analysis to Maggy's databases: │ │ process_patterns.db: seeded with review comment patterns │ │ ci_patterns.db: seeded with CI failure history │ │ pr_patterns.db: seeded with merge velocity data │ │ task_history.db: synthetic entries from git log │ │ environments/{project}.yaml: workflow config │ │ │ │ 5. Agent computes initial policy.yaml from patterns: │ │ → "PRs > 400 lines take 3x review rounds → set max 400" │ │ → "ruff failures in 40% of PRs → enable pre-check" │ │ → "auth files have 0% CI failures → low risk" │ │ → "CodeRabbit flags unused imports 60% of PRs → pre-fix" │ │ │ │ 6. 
Maggy logs seed as modification #1 in improvement_ledger │ │ config_before: empty (default policy) │ │ config_after: data-derived initial policy │ │ score_before: null (no baseline) │ │ → All future modifications measured against this seed │ │ │ │ Total cost: ~$0.10-0.50 on a cheap model (one-time) │ │ Total time: background task, user doesn't wait │ │ User action required: zero │ └──────────────────────────────────────────────────────────────┘ ``` **Why this works:** The seed analysis is exactly the kind of task cheap models are good at — structured data extraction, pattern counting, statistical aggregation. No creative reasoning needed. Qwen local can do it for free. And the Pi agent already has all the tools: `gh` CLI for GitHub data, `git` for history, codebase-memory-mcp for structural analysis. **Why manual seed is wrong:** Maggy's entire philosophy is autonomous optimization. A `maggy seed --project foo` command implies the user knows they need to seed, knows the right flags, and remembers to run it. That's three failure points. Maggy should behave like a new hire who reads the project's git history on their first day — automatically, without being told. **Multi-project seed:** When Maggy is first installed with 4 projects in the registry, she spawns 4 seed agents in parallel (one per project, each in its own Polyphony container). All 4 seed concurrently. By the time the user opens the dashboard, Maggy already knows: - zensurveys-backend: "PRs to auth/ need 2 reviewers, ruff fails on 40% of pushes" - zensurveys-frontend: "CodeRabbit catches unused imports, avg PR is 180 lines" - chief-of-staff: "No CI, manual deploys, review optional" - rodcast: "New project, minimal history — start with defaults" **Validation before real work:** The seed data lets Maggy prove her value immediately. 
On the dashboard, day 1: ``` ┌──────────────────────────────────────────────────────────────┐ │ MAGGY — Day 1 Analysis (auto-generated from project history)│ │ │ │ zensurveys-backend (200 PRs analyzed): │ │ Current process health: │ │ CI first-pass rate: 72% │ │ Avg review rounds: 2.8 │ │ Top review comment: "add error handling" (23 times) │ │ Avg time-to-merge: 36h │ │ │ │ Predicted improvements if Maggy had been active: │ │ CI first-pass rate: 72% → ~94% (pre-lint + pre-type) │ │ Review rounds: 2.8 → ~1.4 (auto error handling + tests) │ │ Time-to-merge: 36h → ~12h (smaller PRs + fewer rounds) │ │ │ │ Based on: patterns from your last 200 PRs │ │ Confidence: high (200+ data points per pattern) │ └──────────────────────────────────────────────────────────────┘ ``` That's the mWp for onboarding. Maggy doesn't say "configure me." She says "I already analyzed your project. Here's what I found. Here's what I'll fix. Watch." #### Ledger Queries — "How Did Maggy Improve Itself?" ```sql -- Show all modifications, most recent first SELECT timestamp, control_level, category, description, delta, status FROM modifications ORDER BY timestamp DESC LIMIT 20; -- Show rolled-back changes (what went wrong?) SELECT timestamp, description, delta, rollback_reason FROM modifications WHERE status = 'rolled_back'; -- Show cumulative improvement over time SELECT date(timestamp) as day, sum(CASE WHEN delta > 0 THEN delta ELSE 0 END) as positive_delta, sum(CASE WHEN delta < 0 THEN delta ELSE 0 END) as negative_delta, sum(delta) as net_delta FROM modifications GROUP BY day ORDER BY day; -- Backtest baseline: actual reward per (task_type, blast_tier) under policy v47 over the last 30 days, to compare against v48 predictions
SELECT task_type, blast_tier, avg(l1_reward) as actual_reward, count(*) as n_tasks FROM task_history WHERE policy_version = 47 AND timestamp > date('now', '-30 days') GROUP BY task_type, blast_tier; ``` ### The Wow Factor Maggy after 4 weeks: > "I didn't configure anything. I didn't set weights. I didn't tell it which model to use for what. It figured out that Claude is best for my auth code, Kimi writes my tests, and Qwen handles docs — by itself. It tried routing API routes to Qwen once, caught that it was producing bugs, and rolled it back before I even noticed. It knows I fatigue at 42 minutes and checkpoints at 40. My throughput is up 30% and my bug escape rate is down. I don't manage Maggy. Maggy manages my development." > "But the thing that blows me away is the process improvement. Maggy figured out that my team's reviewers always flag missing error handling on API routes — so now it adds error handling by default. It learned that our CI lint step fails on long lines — so it runs ruff before pushing. Our CodeRabbit findings dropped to zero. PRs that used to take 3 review rounds now merge on the first. And when I needed to pull data from Linear, Maggy generated a whole MCP integration on the fly — I didn't even know that was possible. It's not just writing better code. It's making the entire pipeline faster." That's the mWp. Not a tool. Not an assistant that asks questions. An autonomous system that optimizes itself with one goal: make its human as efficient as possible. --- ## 12. Codex Review Response Codex (GPT-5.4) reviewed this architecture. Full review: `docs/codex-review-v5.md`. Summary of decisions: ### Accepted | Finding | Our Response | |---------|-------------| | Blast radius is overloaded as routing signal | Correct. Updated to use full 5-dimension iCPG scoring (cyclomatic, fan_out, security, concurrency, domain) with dimension overrides for security/concurrency. 
| | Low-tier output needs stronger verification | Added high-tier post-review gate, iCPG constraint assertions, and static analysis for all cheap-model output. | | Self-improving loop needs guardrails | Added cold-start thresholds (50+ data points), 30-day decay windows, delayed outcome tracking, audit log, and user-approval for adaptations. | | CIKG + iCPG need shared decision schema | Accepted. Will define cross-graph artifact types (Requirement, Decision, Hypothesis, Evidence, Risk, Outcome) in Phase 4. | | Observability is missing | Accepted. Adding to Phase 8: structured event log for agent decisions, bridge translations, model switches, and tool actions. | | Model switching should be explicit handoff | Updated fallback chain to include checkpoint + verification step before continuing on new model. | ### Rejected (Codex was wrong on these) | Codex's Claim | Why We Disagree | |---------|-----| | Split-brain control model | Not a split-brain. Mnemos + iCPG provide shared persistent state on disk inside the container. Coordination agent and execution agent own distinct concerns with shared persistence. No duplicated state. | | Pi is a dangerous universal dependency | Partially rejected. Pi is the right choice for adapter unification, but we accept the recommendation to keep an internal execution contract and preserve direct adapters as fallback for critical paths. | | Browser-container deploy is over-engineered | Rejected for our use case. The user has a specific pain point: 4 projects on Vercel with auth conflicts when using `vercel login` locally. Browser containers solve this directly. API/CLI deploy is the primary path; browser containers solve the auth isolation problem specifically. | | Self-improving Maggy is unrealistic | Rejected. Maggy is an autonomous optimization agent, not a suggestion engine. 
It uses a reward registry with positive/negative signals, auto-rollback on reward drops, exploration/exploitation balance (10% exploration on low-risk tasks only), and weekly self-evaluation. Cold start uses hardcoded defaults until 30+ samples. No user approval needed — the reward function is the judge. | --- ## 13. Open Questions 1. **CIKG extraction scope** — Extract just the graph service, or the full strategy pipeline (daily briefing, trend monitoring)? 2. **Pi extension authoring** — Do we write custom Pi extensions for iCPG/Mnemos hooks, or keep them as shell scripts? 3. **Vercel deploy frequency** — On every PR, or manual trigger from Maggy? 4. **Local model quality floor** — Minimum benchmark Qwen must pass before routing low-blast tasks to it? 5. **Cross-project dependencies** — codebase-memory-mcp can trace HTTP_CALLS across project graphs. When zensurveys-backend changes a Route, should Maggy auto-create a task in zensurveys-frontend? The graph data is there (36 projects indexed); the question is the automation policy. 6. **Mesh scope** — Should mesh sync extend beyond same-org? An anonymized marketplace of policies and model benchmarks across orgs could be powerful, but raises privacy/competitive concerns. 7. **Mesh governance** — Who can promote quarantined patterns to active? Auto-promote after N confirmations, or require an explicit team lead role? 8. **Remote mesh** — For teams without Tailscale/WireGuard, should Maggy offer a lightweight relay service, or is manual peer list + VPN sufficient? 9. **Engram promotion threshold** — How many Mnemos confirmations before persisting an EngramRecord? Too low = noise (every transient pattern gets persisted), too high = useful conventions lost between sessions. 10. **Lexon embedding model** — multilingual-e5-large vs paraphrase-multilingual-mpnet-base-v2? Latency vs accuracy tradeoff for the semantic retriever tier. Also: should the vector index run in-process (SQLite + FAISS) or as a sidecar service? 11. 
**Engram + Mesh boundary** — Should EngramRecords be mesh-shareable directly, or keep Engram strictly local (per-machine cross-session) and only share distilled typed memory via Mesh? Direct sharing is more powerful but increases the attack surface for data leakage. --- ## 14. Maggy Mesh — Peer-to-Peer Team Intelligence ### 14.1 The Problem Each developer runs their own Maggy. Each learns independently: model performance scores, process patterns from CI/PR reviews, workflow optimizations. 5 developers = 5 instances independently discovering the same patterns, making the same mistakes, converging on the same policies — separately. That's 5x the learning cost and 5x the time to reach optimal performance. | Scenario | Without Mesh | With Mesh | |----------|-------------|-----------| | Ali discovers "Qwen bad at API routes" | Ali knows. Sarah doesn't. | Everyone knows in 15 min. | | CI keeps failing on unused imports | Each dev independently adds ruff pre-check | First discovery → team-wide pre-check | | New developer joins | Cold start. Learns everything from scratch | Inherits team's proven patterns immediately | | PRs > 400 lines get rejected | Each dev discovers independently | Team-wide policy from day one | | CodeRabbit flags missing error handling | Each dev gets flagged separately | First dev's fix pattern shared to all | Maggy Mesh connects instances into a peer-to-peer network where learned intelligence flows between peers — no central server. The collective intelligence of the team accelerates everyone from day one. 
### 14.2 Network Topology ``` ┌─────────────────────────────────────────────────────────────────┐ │ MAGGY MESH │ │ │ │ Transport: LAN / Tailscale / WireGuard │ │ Discovery: mDNS (_maggy._tcp.local) │ │ Auth: TLS + org_key challenge-response │ │ │ │ ┌──────────┐ bidirectional ┌──────────┐ │ │ │ Ali's │◄── WebSocket ─────►│ Sarah's │ │ │ │ Maggy │ (TLS) │ Maggy │ │ │ │ │ │ │ │ │ │ Projects:│ ┌──────────┐ │ Projects:│ │ │ │ api │◄──►│ Tom's │◄──►│ web │ │ │ │ mobile │ │ Maggy │ │ infra │ │ │ └──────────┘ │ │ └──────────┘ │ │ │ Projects:│ │ │ ┌──────────│ ml │──────────┐ │ │ │ │ data │ │ │ │ │ └──────────┘ │ │ │ ▼ ▼ │ │ ┌──────────┐ ┌──────────┐ │ │ │ Priya's │ │ Chen's │ │ │ │ Maggy │ │ Maggy │ │ │ │ (devops) │ │ (qa,perf)│ │ │ └──────────┘ └──────────┘ │ │ │ │ Each peer: │ │ Dashboard: 127.0.0.1:8080 (local only) │ │ Mesh port: 0.0.0.0:8089 (LAN/VPN) │ │ Full mesh: every peer connects to every other peer │ └─────────────────────────────────────────────────────────────────┘ ``` ### 14.3 What Gets Shared **Shared (with provenance):** | Data Type | Source DB | What Crosses the Wire | Why It's Valuable | |-----------|-----------|----------------------|-------------------| | Model scores | `model_scores.db` | `(model, task_type, blast_tier) → reward_avg, n_samples` | "Claude is best for auth code" applies across repos | | Process patterns | `process_patterns.db` | `(code_pattern → fix_pattern, frequency)` | "Unused imports trigger CodeRabbit" is universal | | CI patterns | `ci_patterns.db` | `(failure_type → remedy, frequency)` | "ruff line-length fails" applies everywhere | | PR patterns | `pr_patterns.db` | `(size_bucket → avg_rounds, avg_merge_time)` | "PRs > 400 lines take 2x reviews" is team-wide | | Capability gaps | `capability_gaps.db` | `(request_type, frequency)` | If 3 peers need Linear integration, forge it once | | Policy proposals | `policy.yaml` | Model routing rules, process pre-checks | Proven optimizations benefit everyone | | Improvement ledger 
summaries | `improvement_ledger.db` | `(category, delta, status)` aggregates | "Switching to Kimi for tests saved +0.3 reward" | **Never shared:** | Data | Why Private | |------|-------------| | API keys / tokens | Security — never leaves the machine | | Raw code / PR content / task descriptions | Confidentiality | | `~/.maggy/config.yaml` | Per-developer settings | | `fatigue_profile.yaml` | Personal cognitive pattern | | File paths | Local filesystem | | Raw `improvement_ledger.db` entries | Instance-specific, only summaries shared | ### 14.4 Every Memory Has Provenance Every piece of shared knowledge carries its origin. This prevents context collapse ("works in repo A" wrongly applied to repo B). ```python @dataclass class SharedMemory: """A unit of shareable knowledge across the mesh.""" type: str # "score", "pattern", "ci_pattern", "pr_pattern", "gap", "proposal" key: str # unique identifier for merge value: dict # type-specific payload provenance: Provenance status: str # "active", "quarantine", "rejected" @dataclass class Provenance: """Who produced this, from what evidence, in what context.""" peer_id: str # which Maggy instance peer_name: str # human-readable (e.g. "ali-macbook") project_key: str # which project (not path — just key like "api") language: str # python, typescript, go, etc. toolchain: str # ruff+mypy, eslint+tsc, etc. created_at: str # when first observed evidence_count: int # how many observations back this up last_verified: str # when evidence was last re-checked confidence: float # 0.0-1.0, decays with age ``` When a peer's pattern arrives: - Relevant to my project? Check `language` and `toolchain` match - Enough evidence? Check `evidence_count >= min_peer_samples` - Fresh enough? Check `last_verified` within `trust_decay_days` If all pass → quarantine first, then active once confirmed locally or by 3+ peers (Section 14.9). If wrong context or too little evidence → ignored.
### 14.5 Discovery Protocol **mDNS (zero-config LAN):** ``` Service: _maggy._tcp.local TXT records: org=<SHA256(org_key)[:16]> # only peers with same org connect version=0.1.0 # mesh protocol version peer_id=<stable-uuid> # per-install identity name=<hostname> # human-readable projects=3 # number of registered projects ``` Peers with matching `org` hash auto-connect. Different org = ignored. **For remote teams (not on same LAN):** Tailscale/WireGuard puts everyone on the same virtual network. Because mDNS is link-local multicast and Tailscale does not carry multicast traffic (plain WireGuard does not by default either), mDNS discovery will not find peers across the VPN — list them under `peers` in the manual fallback below; Tailscale MagicDNS hostnames work as `host` values. **Manual fallback:** `~/.maggy/mesh.yaml`: ```yaml mesh: enabled: true org_key: "shared-secret-set-during-maggy-init" port: 8089 name: "ali-macbook" peers: # Only needed if mDNS doesn't work - host: 192.168.1.42 - host: sarah-laptop.tailnet.ts.net - host: tom-desktop.local ``` ### 14.6 Transport + Auth **WebSocket over TLS.** Not libp2p (heavyweight Go/Rust dependency, overkill for 3-15 person team). Python's `websockets` library is async, works with FastAPI, and is all we need. **Connection handshake:** ``` Ali's Maggy Sarah's Maggy │ │ ├─── WSS connect to :8089 ────────────►│ │ │ │◄── challenge: {nonce, peer_id, │ │ org_hash: SHA256(org_key)} │ │ │ ├─── response: {nonce, peer_id, │ │ hmac: HMAC-SHA256(nonce,org_key)} │ │ │ │◄── verify HMAC, accept ──────────────│ │ │ │◄──────── bidirectional sync ─────────►│ ``` If `org_hash` doesn't match → connection rejected immediately. First time seeing a `peer_id` → dashboard notification: "New peer 'sarah-laptop' connected."
### 14.7 Message Protocol ```python @dataclass class MeshMessage: type: str # message type (see table below) peer_id: str # sender's stable UUID peer_name: str # human-readable sender name timestamp: str # ISO 8601 payload: dict # type-specific data signature: str # HMAC-SHA256(json(payload), org_key) ``` | Type | Direction | Payload | Trigger | |------|-----------|---------|---------| | `heartbeat` | broadcast | `{peer_id, projects, uptime, policy_version, patterns_count}` | Every 60s | | `score_update` | broadcast | `{model, task_type, blast_tier, reward_delta, n_new_samples}` | L1: after task completion | | `pattern_share` | broadcast | `{pattern_key, type, value, provenance}` | When new pattern reaches 5+ local observations | | `sync_request` | peer→peer | `{tables: [...], since: timestamp}` | On connect + every 15 min | | `sync_response` | peer→peer | `{table, rows: [...]}` | Response to sync_request | | `policy_proposal` | broadcast | `{rule, evidence, confidence, backtest_delta}` | L3/L4: when backtest passes | | `gap_report` | broadcast | `{gap_type, description, occurrences}` | When capability gap hits threshold | | `peer_announce` | broadcast | `{event: "join"\|"leave", peer_info}` | On connect/disconnect | ### 14.8 Sync + Merge Algorithm **Score merge — weighted average by sample count:** ```python def merge_model_score(local: ModelScore, remote: ModelScore) -> ModelScore: """More data = higher confidence. Simple, effective, no politics.""" total = local.n_samples + remote.n_samples return ModelScore( model=local.model, task_type=local.task_type, blast_tier=local.blast_tier, reward_avg=(local.reward_avg * local.n_samples + remote.reward_avg * remote.n_samples) / total, n_samples=total, updated_at=max(local.updated_at, remote.updated_at), ) ``` **Pattern merge — union with frequency counting:** If Ali's Maggy says "unused imports → ruff fix" with 23 occurrences and Sarah's says the same with 15, merged = 38 occurrences. 
Higher frequency = higher confidence = more likely to be auto-applied as a pre-check. **Policy merge — NEVER applied without a local backtest:** Policy proposals go into a queue. Before activation: 1. Backtest against local `task_history.db` (does this policy improve *my* projects?) 2. If backtest delta > +0.1 → auto-apply with rollback guard 3. If backtest delta between -0.1 and +0.1 → queue for exploration (try on 10% of tasks) 4. If backtest delta < -0.1 → reject (notify peer: "Your proposal doesn't work for my projects") **Conflict resolution:** Higher sample count wins. If my 200-sample score says "Kimi is better for API routes" and a peer's 8-sample score disagrees, the 200-sample data dominates. This naturally solves cold start: new team members absorb collective knowledge immediately without their sparse data overriding established patterns. ### 14.9 Quarantine System Patterns from peers don't become active blindly. New incoming patterns start in quarantine: ``` incoming pattern │ ├── language/toolchain matches my projects? │ ├── NO → ignore (eslint patterns for Python project = useless) │ └── YES ↓ │ ├── evidence_count >= min_peer_samples (default 10)? │ ├── NO → ignore (too little evidence) │ └── YES ↓ │ ├── contradicts my local data? │ ├── YES → reject (my 200 samples say otherwise) │ └── NO ↓ │ └── QUARANTINE │ ├── Self-confirmed: I observe the same pattern locally → ACTIVE ├── Crowd-confirmed: 3+ peers report same pattern → ACTIVE ├── Time-expired: 30 days without confirmation → DROPPED └── Human override: user clicks Accept/Reject in dashboard ``` **Poisoning defense:** If a peer suddenly sends data that contradicts 5+ other peers, or sends 10x normal volume, flag as suspicious. Don't merge. Dashboard shows: "⚠ Anomalous data from tom-desktop — 47 patterns contradict team consensus."
### 14.10 Integration with Self-Improvement Loops Mesh plugs into the existing 5-level closed-loop control system: | Control Level | Without Mesh | With Mesh | |---------------|-------------|-----------| | **L0** (seconds) | React to own task failures | Same — L0 is too fast for network | | **L1** (minutes) | Update own model/task scores | + Broadcast `score_update` to peers | | **L2** (hours) | Check own daily health | + Merge peer scores; promote/drop quarantine | | **L3** (days) | Optimize own policy | + Backtest against **team-wide data** (higher N = better backtest) | | **L4** (weeks) | Recalibrate own signals | + Propose cross-team policy changes; vote on peer proposals | The mesh makes L3/L4 decisions **dramatically more reliable** because backtesting draws from the team's combined `task_history` (500+ tasks) instead of just one developer's (100 tasks). More data → better predictions → fewer rollbacks. ### 14.11 Cold Start — New Developer Joins the Team ``` 1. Developer installs Maggy, runs /maggy-init → Sets org_key (same as team) → Generates peer_id → Auto-seed runs on their projects (Section 11) 2. Maggy starts, announces on mDNS (_maggy._tcp.local) → Discovers 4 peers on the mesh 3. Full sync: sends sync_request{tables: all, since: epoch} → Receives: 500+ model scores, 200+ process patterns, 150+ CI patterns → All incoming data → quarantine (except scores, which auto-merge) 4. As new developer works their first tasks: → Local observations match quarantined patterns → auto-promote → "Ah, ruff catches unused imports here too" → promoted to active → "Qwen is bad at API routes? Let me try..." → confirmed → active 5. 
Dashboard after day 1: ┌──────────────────────────────────────────────────────────────┐ │ MESH — New Member Onboarding │ │ │ │ Connected to: 4 peers (Protaigé org) │ │ Inherited: 847 patterns │ │ Active: 312 (self-confirmed or crowd-confirmed) │ │ Quarantine: 535 (awaiting local validation) │ │ │ │ Model routing: inherited team-wide scores │ │ → Claude for auth (team avg: +0.82, n=89) │ │ → Kimi for tests (team avg: +0.71, n=134) │ │ → Qwen for docs (team avg: +0.65, n=67) │ │ │ │ Top patterns auto-promoted today: │ │ ✓ "ruff pre-check eliminates 40% of CI failures" (5 peers)│ │ ✓ "PRs > 400 lines → split" (4 peers, 200+ observations) │ │ ✓ "mypy strict mode catches type bugs" (3 peers) │ └──────────────────────────────────────────────────────────────┘ ``` The new developer's Maggy doesn't start from zero. It starts with the collective intelligence of the team. No ramp-up period. No re-learning. ### 14.12 Dashboard — Mesh Tab ``` ┌──────────────────────────────────────────────────────────────────┐ │ MESH │ │ │ │ Peers: 4 connected │ Last sync: 2 min ago │ Org: Protaigé │ │ │ │ ┌─ Ali ──────────────── ● online ───────────────────────────────┐│ │ │ Projects: api, mobile │ Policy v47 │ 312 active patterns ││ │ │ Last contribution: "Route blast 1-2 to qwen" (+0.18 delta) ││ │ └───────────────────────────────────────────────────────────────┘│ │ ┌─ Sarah ────────────── ● online ───────────────────────────────┐│ │ │ Projects: web, infra │ Policy v31 │ 189 active patterns ││ │ │ Last contribution: "mypy pre-check on all Python" (+0.22) ││ │ └───────────────────────────────────────────────────────────────┘│ │ ┌─ Tom ─────────────── ● online ────────────────────────────────┐│ │ │ Projects: ml, data │ Policy v22 │ 156 active patterns ││ │ │ Last contribution: "Gemini Flash for data pipeline tasks" ││ │ └───────────────────────────────────────────────────────────────┘│ │ ┌─ Priya ──────────── ○ offline (2h) ──────────────────────────┐│ │ │ Projects: devops │ Policy v18 │ 98 
active patterns ││ │ │ Will sync on reconnect ││ │ └───────────────────────────────────────────────────────────────┘│ │ │ │ ── Policy Proposals (2) ─────────────────────────────────────── │ │ │ "Route blast 1-2 to qwen" │ │ │ From: Ali │ Evidence: 31 tasks │ Backtest: +0.18 │ │ │ Status: auto-applied (delta > +0.1) │ │ │ │ │ │ "Add security scan pre-commit for auth files" │ │ │ From: Sarah │ Evidence: 12 PRs flagged │ Backtest: +0.31│ │ │ Status: applied on 3/4 peers, pending on Priya (offline) │ │ └───────────────────────────────────────────────────────────────┘│ │ │ │ ── Team Intelligence Summary ────────────────────────────────── │ │ │ Total team patterns: 847 unique │ │ │ Total team task history: 523 tasks across 4 peers │ │ │ Team-wide CI first-pass rate: 91% (up from 72% pre-Maggy) │ │ │ Team-wide avg review rounds: 1.3 (down from 2.8 pre-Maggy) │ │ │ Collective model ranking: │ │ │ #1 Claude (auth, security, complex) — avg +0.82 │ │ │ #2 Kimi (tests, API routes, medium) — avg +0.71 │ │ │ #3 Gemini Flash (data, pipeline) — avg +0.68 │ │ │ #4 Qwen (docs, config, simple) — avg +0.65 │ │ └───────────────────────────────────────────────────────────────┘│ └──────────────────────────────────────────────────────────────────┘ ``` ### 14.13 The Compound Effect Week 1: 5 Maggy instances learn independently. Each discovers ~20 patterns. Week 4 (without mesh): Each has ~80 patterns. Significant overlap. Total unique knowledge: ~150 patterns across the org, but no individual has more than 80. Week 4 (with mesh): Each has ~150 patterns (the full team set). Total unique knowledge: ~150. But every individual has access to all of it. The team is 2x more optimized than any individual would be alone. Week 12 (with mesh): The compound effect kicks in. Each new discovery is immediately tested across 5 different project contexts. Patterns that work everywhere get high confidence fast. Patterns that are project-specific get properly scoped. 
The collective model ranking has 500+ data points per model — more reliable than any benchmark. ``` Without mesh: knowledge = n_developers × learning_rate × time With mesh: knowledge = n_developers × learning_rate × time × sharing_factor where sharing_factor ≈ n_developers (superlinear) ``` Each developer's Maggy becomes as smart as the entire team. The team doesn't just add knowledge linearly — it multiplies it. This is the network effect applied to AI engineering intelligence. ### 14.14 Security Model | Concern | Mitigation | |---------|-----------| | Unauthorized peer | Org key challenge-response; unknown peers require dashboard acceptance | | Data interception | TLS on all WebSocket connections | | Poisoning (bad data) | Quarantine system + anomaly detection (Section 14.9) | | Stale data | Confidence decay over time; `trust_decay_days` (default 30) | | Data leakage | Only aggregated scores/patterns cross the wire — never raw code, PR text, or secrets | | Key compromise | Org key rotation: `/maggy mesh rotate-key` regenerates and pushes to all connected peers | | Replay attacks | Nonce in handshake; timestamps in messages; reject messages > 5 min old | ### 14.15 Configuration ```yaml # Added to ~/.maggy/policy.yaml mesh: enabled: true sync_interval_minutes: 15 # full sync frequency min_peer_samples: 10 # ignore peer data with < 10 samples trust_decay_days: 30 # peer data confidence decays over time quarantine_days: 30 # unconfirmed patterns expire auto_promote_threshold: 3 # 3 independent peer confirmations → auto-promote auto_accept_scores: true # model scores merge automatically (weighted) auto_accept_patterns: true # patterns merge automatically (with quarantine) auto_accept_policies: true # policy proposals auto-apply if backtest passes (+0.1) anomaly_threshold: 10 # flag peer sending 10x normal volume broadcast_on_l1: true # broadcast score updates after each task ``` Note: `auto_accept_policies: true` — this is the aggressive default. Maggy is autonomous. 
If a policy proposal passes backtesting with > +0.1 delta, it applies automatically. The improvement ledger tracks everything for rollback. The team lead can override to `false` if they want manual review. --- ## 15. Engram — Cross-Session Memory ### 15.1 The Problem: Agent Amnesia Maggy's Mnemos handles memory within a task. But when a session ends, everything learned about a project — conventions, reviewer preferences, codebase idioms, tool configurations — evaporates. The next session starts from scratch. This is agent amnesia, and it has seven distinct pathologies: | Amnesia Type | What Gets Lost | Maggy Example | |-------------|---------------|---------------| | **Anterograde** | New memories fail to form across sessions | Maggy learns a project uses Zustand, forgets next session | | **Retrograde** | Existing memories degrade over time | A CI fix pattern fades after weeks of disuse | | **Temporal** | When something happened is lost | "The API was refactored" — but when? Before or after the auth change? | | **Source** | Where a fact came from is lost | "Use 4-space indent" — was this from the linter config or user preference? | | **Interference** | Memories from one context contaminate another | Project A's React patterns leak into Project B's Vue codebase | | **Context-binding** | Right memory, wrong retrieval context | Project has error handling conventions stored under "testing", not found during "API route creation" | | **Confabulation** | Inferred patterns presented as confirmed facts | Maggy "remembers" a convention it actually inferred from one example | Without Engram, Maggy is a perpetual amnesiac — impressive in the moment, but unable to compound learning across sessions. ### 15.2 The EngramRecord The EngramRecord is the persistence primitive — the unit of cross-session memory. 
```python @dataclass class EngramRecord: engram_id: str # UUID namespace: str # Project isolation key memory_type: str # "convention", "preference", "pattern", # "tool_config", "reviewer_preference", # "codebase_idiom", "process_rule" content: str # The actual memory origin: Origin # Where this came from confidence: float # 0.0-1.0 evidence_count: int # How many times confirmed temporal_validity: Validity # When this is valid entity_links: list[str] # Linked entities (files, functions, people) causal_links: list[str] # Linked causes/effects created_at: str # ISO timestamp last_verified: str # When last confirmed still valid last_accessed: str # When last retrieved @dataclass class Origin: source_type: str # "mnemos_task", "user_explicit", # "process_signal", "mesh_peer" source_id: str # Task ID, user ID, or peer_id channel: str # "cli", "dashboard", "mesh" original_evidence: str # What prompted this memory @dataclass class Validity: valid_from: str # ISO timestamp valid_until: str | None # None = no expiry superseded_by: str | None # engram_id of replacement decay_rate: float # Confidence decay per day (default 0.001) ``` ### 15.3 Three-Tier Namespace Model Every EngramRecord belongs to exactly one namespace tier. 
Three tiers prevent both cross-project contamination and useful-pattern siloing: ```yaml # ~/.maggy/engram_namespaces.yaml tiers: # Tier 1: LOCAL — project-specific memories local: zensurveys-backend: language: python framework: fastapi isolation: strict # No cross-namespace retrieval zensurveys-frontend: language: typescript framework: react isolation: strict # Tier 2: PORTFOLIO — abstracted cross-project patterns portfolio: python-conventions: scope: language # All Python projects can read abstraction: required # Patterns must be de-contextualized api-patterns: scope: framework # All API projects can read abstraction: required shared-conventions: scope: org # Org-wide conventions abstraction: optional # Tier 3: MESH — peer-derived memories (quarantined) mesh: isolation: quarantine # Always quarantined on arrival trust_decay_days: 30 # Confidence decays if unvalidated auto_promote_threshold: 3 # 3 local confirmations → promote to portfolio ``` **Tier 1 (Local)** is project-scoped — a Python FastAPI project's conventions never contaminate a React project's patterns. **Tier 2 (Portfolio)** holds abstracted patterns that transcend individual projects. When a local pattern proves useful across 3+ projects, it's promoted to portfolio — but only after de-contextualization (stripping project-specific names, paths, and configurations). This prevents the "works everywhere" illusion while enabling genuine cross-project learning. **Tier 3 (Mesh)** holds peer-derived memories that arrive via Maggy Mesh. These always enter quarantine and must be locally validated before promotion. A mesh pattern from a peer's Python project goes to portfolio-level `python-conventions` only after local confirmation. Retrieval queries search local first, then portfolio, then mesh — with confidence weighting per tier. ### 15.3.1 Engram as Improvement Substrate Engram absorbs the improvement ledger. 
The relationship: - **Improvement ledger** = the mutation log (what changed, when, who proposed) - **Engram** = the memory substrate (persists the "what" across sessions) - **Reward registry** = the outcome signal (did the change work?) Before Engram, the improvement ledger was ephemeral — mutations were logged but lost between sessions. Engram makes the ledger persistent: every L2/L3/L4 mutation becomes an EngramRecord with `memory_type: "mutation"` (an additional `memory_type` value beyond those listed in Section 15.2), carrying the original proposal, the delta metric, and the outcome reward. This means Maggy can remember not just what it learned, but what it tried, what worked, and what failed — the full self-improvement history. ### 15.4 Memory Lifecycle ``` Mnemos (within-task) → Task completes with high-confidence memories → Promotion filter: confidence > 0.8, evidence_count >= 3 │ ▼ Engram (cross-session, per-machine) → EngramRecord created with full Origin + Validity → Namespace-isolated per project → Multi-path retrieval: semantic + temporal + causal + entity links → Confidence decays with age unless revalidated │ ▼ Mesh (cross-machine, per-org) [optional] → High-confidence EngramRecords distilled into Mesh typed memory → Shared with peers as patterns/scores with provenance → Incoming peer patterns enter quarantine (Section 14.9) ``` ### 15.5 Multi-Path Retrieval Single-path semantic retrieval fails when the retrieval query doesn't match the storage encoding.
Engram retrieves across four paths simultaneously: | Path | What It Finds | Example | |------|-------------|---------| | **Semantic** | Content-similar memories | Query "API route" finds "REST endpoint conventions" | | **Temporal** | Recent or temporally-relevant memories | Query finds patterns from the same sprint/phase | | **Causal** | Cause-effect linked memories | "Auth refactor" finds "session middleware change" it caused | | **Entity** | Entity-linked memories | Query about `auth.py` finds all conventions touching that file | Retrieval returns a merged, deduplicated set ranked by `confidence * recency * path_match_score`. ### 15.6 Amnesia Score Diagnostic Each project gets a 7-dimension Amnesia Score (0.0 = perfect retention, 1.0 = total amnesia): ```python @dataclass class AmnesiaProfile: anterograde: float # Are new memories forming across sessions? retrograde: float # Are old memories degrading? temporal: float # Is temporal context preserved? source: float # Is origin attribution maintained? interference: float # Is cross-namespace contamination occurring? context_binding: float # Are memories retrievable in the right context? confabulation: float # Are inferred patterns presented as facts? 
@property def overall(self) -> float: return sum(vars(self).values()) / 7 ``` The L3 weekly loop analyzes Amnesia Scores per project and patches memory encoding rules: - High anterograde score → lower the promotion threshold (more memories get persisted) - High interference score → tighten namespace isolation rules - High confabulation score → require higher evidence_count before promotion ### 15.7 Integration with Control Loops | Level | Engram Integration | |-------|-------------------| | **L0** | Check if current task context matches any EngramRecords — surface relevant conventions | | **L1** | Promote high-confidence task memories to EngramRecords | | **L2** | Daily: check for decayed records, run amnesia diagnostics | | **L3** | Weekly: analyze Amnesia Scores, adjust promotion thresholds, patch encoding rules | | **L4** | Monthly: evaluate whether Engram is reducing session startup time and improving consistency | --- ## 16. Lexon — Semantic Tool Binding ### 16.1 The Problem: Tool Selection Collapses at Scale At 5-10 tools, models select correctly. At 20-30, confusion between similar-sounding tools emerges. At 50+, accuracy collapses: the model selects plausible-sounding but incorrect tools, hallucinates parameters, or conflates capabilities. This is well-documented in research (RAG-MCP: accuracy drops from 87% to 13% as tools grow from 10 to 100). Maggy's tool count will grow aggressively: - MCP Forge (Phase 9) auto-generates MCP servers from API docs - Process Intelligence (Phase 8) adds signal collectors per integration - Each project's toolchain adds environment-specific tools - Mesh peers may surface tool recommendations Without Lexon, Maggy's tool accuracy will degrade as it becomes more capable. 
### 16.2 The LexonRecord ```python @dataclass class LexonRecord: lexon_id: str # UUID phrase: str # Original user phrase (pre-translation) phrase_normalized: str # Post-translation, lowercased language: str # ISO 639-1 detected language is_mixed: bool # Code-switching detected # Intent source — Lexon binds more than user phrases source_type: str # "user_phrase" | "reason_node" | "mnemo_node" # | "process_signal" | "mesh_policy" structured_intent: str | None # iCPG ReasonNode ref (if source_type != "user_phrase") reason_node_ref: str | None # Pointer to iCPG ReasonNode that triggered routing engram_refs: list[str] # EngramRecord IDs used to resolve this binding # Routing results candidate_tools: list # [{tool_name, tool_version, schema_hash, score, source}] selected_tool: str | None # None if clarification required selected_tool_version: str | None # Semantic version of selected tool selected_tool_schema_hash: str | None # Hash of tool's input schema at bind time confidence: float # 0.0-1.0 ambiguity_class: str | None # "near_miss" | "vocabulary_gap" | "context_dependent" negative_bindings: list[str] # Tool names explicitly excluded (NOT bindings) # Disambiguation was_clarified: bool # Disambiguation was triggered clarify_mode: str # "self_clarify" | "user_clarify" # Outcome tracking correction: str | None # If user corrected post-execution correction_source: str | None # "user_explicit" | "ci_failure" | "review_comment" outcome_reward: float | None # -1.0 to 1.0: did the binding produce good results? # Context context_snapshot: str # Pointer to Mnemos ContextNode user_id: str created_at: str ``` The enhanced LexonRecord captures not just what was bound, but why (intent source), to which version (tool contract), whether the binding worked (outcome reward), and how errors were detected (correction source). This transforms Lexon from a lookup table into a reward-bearing learning system. 
### 16.3 Five-Layer Pipeline Every tool invocation passes through five layers: ``` Layer 1: LANGUAGE NORMALIZATION → Detect language (lightweight classifier) → Translate to English for routing only (response stays in user language) → Handle code-switching: extract English anchor terms from mixed-language input │ ▼ Layer 2: TWO-TIER ROUTING → Tier A (fast LLM, <300ms): compact tool manifest (name + 1-line description) Returns 5-7 candidates with rationale. JSON schema constrained to valid tool names. → Tier B (semantic retriever): multilingual embedding search over tool registry Each tool indexed by: description, example queries, learned synonyms Returns 5-7 candidates with cosine similarity scores. → Union + deduplication. Tools in both lists get score bonus. │ ▼ Layer 3: TERMINOLOGY MAP FILTER → Query three-level Terminology Map: user > org > system → Explicit user preferences override everything (confidence 1.0) → NOT bindings: "blast" explicitly does NOT mean "delete_all" → Context-conditioned: "follow up" → different tool depending on active entity │ ▼ Layer 4: DISAMBIGUATION (dual-mode) → If top candidate confidence > 0.82 and gap to #2 > 0.15: proceed → Otherwise: choose clarify mode based on action reversibility: → MODE A — self_clarify (default, autonomous): Lexon resolves ambiguity without asking the user by consulting: - iCPG ReasonNode: structured sub-goal narrows candidate set - Mnemos ContextNode: active entity and recent tool history - Engram: past bindings for this phrase in this project - Process history: which tool succeeded last time in similar context - Mesh consensus: what do peers bind this phrase to? If any source resolves confidence above threshold → proceed silently. Logged as self_clarify in LexonRecord for audit. → MODE B — user_clarify (irreversible actions only): Triggered only when action is destructive, expensive, or irreversible (delete, deploy, billing, permission changes). Present 2-3 concrete options in user's language. 
User's selection becomes high-confidence Terminology Map entry. → Autonomous agents should almost never trigger user_clarify. The goal: 95%+ resolutions via self_clarify after 50+ interactions. │ ▼ Layer 5: FEEDBACK + PERSONALIZATION → Five implicit learning signals update Terminology Map: 1. Correction: user corrects → add NOT binding + positive binding 2. Affirmation: user proceeds → increment confidence 3. Repetition: same phrase→tool 5+ times → promote to high-confidence synonym 4. Disambiguation selection: capture context + choice as user-level binding 5. Clarification repetition: same phrase triggers 3+ disambiguations → prompt explicit preference → High-confidence bindings (>0.9, used >10 times) promoted to Engram for cross-session persistence ``` ### 16.4 Terminology Map Structure ```python @dataclass class TerminologyEntry: phrase: str # "blast my list" tool_name: str # "bulk_email_send" params: dict | None # Default parameters if applicable NOT: list[str] # ["delete_all"] — explicitly NOT this tool context: str | None # "contact_selected" — binding condition level: str # "system" | "org" | "user" confidence: float # 1.0 for explicit, <1.0 for learned user_id: str | None # None for system/org level ``` Resolution order: explicit user-level (confidence 1.0) > org-level > system-level > router inference. An explicit user preference is ground truth and bypasses confidence scoring. ### 16.5 Org-Level Terminology via Mesh The Terminology Map has an org level between system and user. In a Maggy Mesh deployment: - Team leads can define shared vocabulary - Org-level entries propagate to all peers as default bindings - Individual users can override at user level - New team members inherit org vocabulary on Mesh cold start This is a natural extension of Mesh's typed memory: terminology entries are a new type alongside scores, patterns, policies, and gaps. 
### 16.6 Integration with RFC Stack ``` iCPG (structured intent) → Lexon (routes to correct tool) ↕ Mnemos (tracks tool selection quality via ToolCallNode) ↕ Engram (persists learned vocabulary across sessions) ↕ Mesh (shares org-level terminology across machines) ``` | Component | Lexon Reads From | Lexon Writes To | |-----------|-----------------|----------------| | **iCPG** | ReasonNode provides structured sub-goal (better routing signal than raw text) | — | | **Mnemos** | ContextNode for active entity (disambiguation signal) | ToolCallNode logged per invocation | | **Engram** | High-confidence user synonyms from past sessions | Promotes confirmed bindings for persistence | | **Mesh** | Org-level terminology entries from peers | Shares learned org-level vocabulary | ### 16.7 Configuration ```yaml # Added to ~/.maggy/policy.yaml lexon: enabled: true fast_llm_model: "claude-haiku" # Tier A: speed over depth embedding_model: "multilingual-e5-large" confidence_threshold: 0.82 disambiguation_gap: 0.15 max_candidates: 7 personalization: implicit_learning: true promotion_threshold: 10 # Uses before promoting to Engram correction_weight: 2.0 # Corrections count double terminology_map: system_file: "lexon_system_terms.yaml" org_sync_via_mesh: true # Share org terms through Mesh ``` --- ## 17. Event Spine — Canonical Event Flow ### 17.1 Why an Event Spine Maggy's components — iCPG, Mnemos, Lexon, Engram, Process Intelligence, Mesh — each generate their own events in their own formats. Without a canonical event spine, correlating "user said X → Lexon bound tool Y → execution failed → memory Z was created → mutation W was proposed → mesh peer P received it" requires stitching together six different log formats. The Event Spine defines a single ordered event stream that every component writes to. Each event carries a common header and a typed payload. This enables end-to-end tracing, reward attribution, and replay for debugging. 
### 17.2 Event Types ``` IntentEvent ──► BindingEvent ──► ExecutionEvent ──► MemoryEvent │ ▼ MeshEvent ◄── MutationEvent ◄── OutcomeEvent ◄── PersistenceEvent ``` | Event | Emitted By | What It Captures | |-------|-----------|-----------------| | **IntentEvent** | iCPG | Structured sub-goal from ReasonNode decomposition | | **BindingEvent** | Lexon | Tool selection: which tool, which version, confidence, clarify mode | | **ExecutionEvent** | Pi / Agent | Tool invocation: input, output, duration, exit code | | **MemoryEvent** | Mnemos | Within-task memory write: node type, confidence, entity links | | **PersistenceEvent** | Engram | Cross-session memory promotion: namespace tier, memory type | | **OutcomeEvent** | Process Intelligence | Task outcome: success/failure, metric delta, reward signal | | **MutationEvent** | L2/L3/L4 Loops | Self-modification: what changed, why, expected delta | | **MeshEvent** | Mesh | Cross-machine sharing: what was sent/received, quarantine status | ### 17.3 Common Event Header Every event carries a standard header for correlation and audit: ```python @dataclass class EventHeader: event_id: str # UUID — unique per event event_type: str # "intent" | "binding" | "execution" | ... 
task_id: str # Links all events in a single task project_id: str # Engram namespace key agent_id: str # Which agent (Pi instance) emitted this model_id: str # Which LLM was active confidence: float # Event-level confidence (0.0-1.0) namespace: str # Engram namespace tier (local/portfolio/mesh) policy_version: str # Which policy.yaml version was active reward_delta: float | None # Outcome signal (-1.0 to 1.0) timestamp: str # ISO 8601 parent_event_id: str | None # Causal parent (enables event DAG) ``` ### 17.4 Typed Payloads ```python @dataclass class IntentEvent: header: EventHeader reason_node_id: str # iCPG ReasonNode that decomposed this sub_goal: str # Natural language sub-goal blast_radius: int # iCPG blast radius estimate drift_score: float # iCPG drift from original intent @dataclass class BindingEvent: header: EventHeader lexon_record_id: str # LexonRecord UUID source_type: str # "user_phrase" | "reason_node" | ... selected_tool: str tool_version: str schema_hash: str clarify_mode: str # "self_clarify" | "user_clarify" ambiguity_class: str | None @dataclass class OutcomeEvent: header: EventHeader success: bool metric_name: str # "tests_passed", "ci_green", "pr_merged" metric_before: float metric_after: float reward: float # Computed reward signal ``` ### 17.5 What the Event Spine Enables | Capability | How | |-----------|-----| | **End-to-end tracing** | Follow task_id across all 8 event types | | **Reward attribution** | OutcomeEvent.reward propagates back to BindingEvent (was tool selection good?) and MutationEvent (was self-modification good?) 
| | **Replay debugging** | Replay event stream to reproduce failures without re-executing | | **Amnesia diagnosis** | Compare MemoryEvent → PersistenceEvent conversion rate per project | | **Mesh audit** | Track exactly what crossed the wire and whether quarantine was justified | | **Self-improvement validation** | MutationEvent + OutcomeEvent = evidence for whether L3/L4 changes helped | ### 17.6 Storage and Retention ```yaml # Added to ~/.maggy/policy.yaml event_spine: enabled: true storage: "~/.maggy/events.db" # SQLite — append-only event log retention_days: 90 # Events older than 90 days → archive archive_format: "jsonl.gz" # Compressed JSONL for cold storage index_fields: # Fields indexed for fast queries - task_id - event_type - project_id - timestamp ``` ### 17.7 Integration Summary ``` User speaks → IntentEvent (iCPG decomposes) → BindingEvent (Lexon routes to tool) → ExecutionEvent (Pi executes) → MemoryEvent (Mnemos records) → PersistenceEvent (Engram persists) → OutcomeEvent (Process Intelligence scores) → MutationEvent (L2/L3 self-modifies) → MeshEvent (Mesh shares with peers) Every step is typed, correlated by task_id, and carries a reward signal. This is the nervous system of an autonomous engineering agent. ``` --- ## 18. Benchmark Validation — Maggy vs Claude Code > Full results: [`docs/benchmark-results.md`](benchmark-results.md) ### 18.1 Test Protocol Built an **Expense Tracker** (FastAPI + SQLite + vanilla JS) using 6 identical tasks: - **Runner A (Maggy):** 4-tier routing via blast score, 4 CLIs auto-discovered - **Runner B (Claude Code):** All 6 tasks through `claude -p` only Environment: Mac Studio M4 Max, 128 GB RAM. CLIs: Claude Code 2.1.42, Codex 0.129.0, Kimi 1.41.0, Ollama 0.23.2 (qwen2.5-coder:32b). 
### 18.2 Results Summary | Metric | Maggy | Claude Code | |--------|-------|-------------| | Success rate | 6/6 (100%) | 6/6 (100%) | | Total time | 907.6s | 681.0s | | Quality score | 7.4/10 | 7.8/10 | | Claude subscription burn | 17% (1/6 tasks) | 100% (6/6 tasks) | | Models used | 4 (ollama, kimi, codex, claude) | 1 (claude) | | Fallbacks needed | 0 | N/A | | Security depth | 7 issues found + fixed | No dedicated review | | Test generation | None | 3 test files, 11+ cases | ### 18.3 Routing in Action ``` EXP-1 (docs, blast 2) → ollama 50.4s ← FREE (local GPU) EXP-2 (schema, blast 3) → kimi 86.6s ← cheap subscription EXP-3 (CRUD, blast 5) → codex 147.1s ← separate subscription EXP-4 (API, blast 5) → codex 133.9s ← separate subscription EXP-5 (frontend, blast 6) → codex 280.1s ← separate subscription EXP-6 (security, blast 8) → claude 209.5s ← premium (only when needed) ``` ### 18.4 What This Validates 1. **CLI auto-discovery works end-to-end.** Maggy probed 4 CLIs via `--help`, extracted flags, built correct commands, and spawned all 4 successfully with zero manual configuration. 2. **Blast-score routing is functional.** Low-complexity tasks went to cheap/free models; high-complexity tasks went to premium. The routing decisions were defensible. 3. **Fallback chain was not needed.** Zero fallbacks were triggered — all 4 CLIs completed their assigned tasks — so this run did not exercise the chain. It is wired and ready for quota exhaustion scenarios, but its reliability is not yet validated by this benchmark. 4. **Cost efficiency is real.** 83% reduction in Claude usage. Only the security review (blast 8) touched the premium model. 5. **Quality is competitive.** Maggy scored 7.4 vs Claude's 7.8 — a small gap driven by missing tests and product spec (routing issue, not capability issue).
### 18.5 Gaps to Close | Gap | Root Cause | Fix | |-----|-----------|-----| | No tests generated | No TDD pipeline step in benchmark | Wire executor's `_run_tdd()` to add RED-GREEN step | | Ollama missed product spec | Coding model assigned prose task | Route `task_type: docs` to kimi/claude regardless of blast | | Codex slow on frontend (280s vs 122s) | Codex overhead for complex UI tasks | Consider routing blast 6 frontend to claude | | Claude had better architecture | Single model sees full context | Multi-model loses cross-task context — address via checkpoint sharing | ### 18.6 Post-Benchmark Improvements After the benchmark, three systems were built to close the identified gaps: #### A. Routing Rules (`maggy/routing_rules.py`) A YAML-backed self-updating rules file at `~/.maggy/routing-rules.yaml`. Rules are checked **before** blast-score routing, enforcing that specific task types and pipeline phases always use the right model. **Task-type overrides** (from benchmark evidence): | Task Type | Forced Model | Confidence | Source | |-----------|-------------|-----------|--------| | `docs` | claude | 0.9 | benchmark — local models are code-optimized, not prose | | `security` | claude | 1.0 | rule — security review needs deep reasoning | | `architecture` | claude | 0.8 | rule — architecture needs cross-context awareness | | `tests` | claude | 0.9 | benchmark — only claude generated test files | | `planning` | claude | 0.8 | rule — planning requires structured reasoning | **Pipeline phase overrides** (from TDD workflow): | Phase | Forced Model | Reason | |-------|-------------|--------| | `spec` | claude | SPEC phase needs comprehensive docs | | `tdd_red` | claude | RED phase needs test design expertise | | `tdd_green` | auto | GREEN phase uses blast-score routing | | `review` | claude | Review needs security + architecture depth | **Self-learning API:** - `record_outcome(rules, model, task_type, success)` — updates rolling success rates from task results - 
`learn_override(rules, task_type, model, reason, confidence)` — Maggy can add new overrides when data supports it - Manual edits to the YAML are preserved; Maggy only appends learned entries This directly addresses: - **"Ollama missed product spec"** → `docs` tasks now forced to claude - **"No tests generated"** → `tests` and `tdd_red` phases now forced to claude #### B. Team Conventions (embedded in routing rules) Conventions from claude-bootstrap's CLAUDE.md and skill files are embedded in the routing rules and injected into every prompt sent to any CLI: ```yaml conventions: - text: "Build minimum wowable product (mWP). Ship the smallest thing that makes someone say 'wow'." applies_to: [all] source: claude-bootstrap - text: "Follow TDD: RED → GREEN → VALIDATE. Coverage >= 80%." applies_to: [feature, bug, refactor] source: claude-bootstrap - text: "No secrets in code. Parameterized SQL only. Validate all input at API boundaries." applies_to: [all] source: claude-bootstrap - text: "Quality gates: max 20 lines/function, max 3 params, max 2 nesting levels, max 200 lines/file." applies_to: [all] source: claude-bootstrap - text: "Use existing patterns. Read the codebase before changing it." applies_to: [all] source: claude-bootstrap ``` Every executor prompt method (`_plan_prompt`, `_analysis_prompt`, `_tests_prompt`, `_impl_prompt`) now calls `conventions_for(rules, task_type)` and appends the matching conventions block. This means kimi, codex, ollama, and claude all receive the same team rules — standardizing quality expectations across all models. #### C. Routing Rules + Conventions Flow ``` Task arrives → apply_override(task_type, phase) ↓ forced? ┌─YES─→ use forced model └─NO──→ reward table → blast-score routing ↓ build prompt + conventions_for(task_type) ↓ send to CLI with team conventions embedded ↓ record_outcome() → update YAML success rates ``` #### D. 
Expected Impact on Re-run If the benchmark were re-run with these improvements: | Gap (Before) | Expected Result (After) | |-------------|----------------------| | No product spec from ollama | EXP-1 (`docs`) now routes to claude → spec generated | | No tests from any model | TDD pipeline with `tdd_red` → claude → tests generated | | Inconsistent quality | All models receive team conventions (mWP, quality gates, security rules) | | No self-improvement | Outcome recording feeds back into routing rules YAML | **Net effect:** Quality score expected to converge with Claude Code's 7.8+ while maintaining the 83% cost reduction. ================================================ FILE: docs/benchmark-results.md ================================================ # Maggy v5 Benchmark Results **Date:** 2026-05-11 **App:** Personal Expense Tracker (FastAPI + SQLite + vanilla HTML/JS) **Environment:** Mac Studio M4 Max, 128 GB RAM, macOS Darwin 24.6.0 **CLIs:** Claude Code 2.1.42, Codex 0.129.0, Kimi 1.41.0, Ollama 0.23.2 (qwen2.5-coder:32b) --- ## 1. Test Protocol 6 identical tasks run sequentially through two pipelines: - **Runner A (Maggy):** 4-tier routing via blast score. Auto-discovers CLI flags at startup. - **Runner B (Claude Code):** All tasks run through `claude -p` only. Both pipelines use `--dangerously-skip-permissions` / equivalent flags, 25 max turns, and subprocess spawning into isolated build directories. --- ## 2. Task Definitions | ID | Task | Blast | Maggy Route | Type | |----|------|-------|-------------|------| | EXP-1 | Write product spec | 2 | local (ollama) | docs | | EXP-2 | Design database schema | 3 | kimi | architecture | | EXP-3 | Build expense CRUD API | 5 | gpt (codex) | feature | | EXP-4 | Build category API + monthly summary | 5 | gpt (codex) | feature | | EXP-5 | Build frontend dashboard | 6 | gpt (codex) | frontend | | EXP-6 | Security review + input validation | 8 | claude | security | --- ## 3. 
Speed Results | Task | Blast | Maggy Model | Maggy (s) | Claude (s) | Winner | |------|-------|-------------|-----------|------------|--------| | EXP-1 | 2 | ollama (local) | 50.4 | 48.6 | Claude | | EXP-2 | 3 | kimi | 86.6 | 67.2 | Claude | | EXP-3 | 5 | codex | 147.1 | 160.6 | **Maggy** | | EXP-4 | 5 | codex | 133.9 | 130.8 | Claude | | EXP-5 | 6 | codex | 280.1 | 121.9 | Claude | | EXP-6 | 8 | claude | 209.5 | 151.9 | Claude | | **Total** | | | **907.6** | **681.0** | **Claude (33% faster)** | ### Routing Distribution (Maggy) | Model | Tasks | % | |-------|-------|---| | codex (gpt) | 3 | 50% | | ollama (local) | 1 | 17% | | kimi | 1 | 17% | | claude | 1 | 17% | --- ## 4. Success Rate | Pipeline | Passed | Failed | Fallbacks | Rate | |----------|--------|--------|-----------|------| | Maggy | 6 | 0 | 0 | 100% | | Claude | 6 | 0 | 0 | 100% | --- ## 5. Output Quality Assessment ### 5.1 File Inventory **Maggy (10 source files, 1,634 lines):** | File | Lines | Model | Assessment | |------|-------|-------|------------| | `SECURITY.md` | 134 | claude | Thorough: 7 findings with fixes, 3 recommendations | | `backend/app/database.py` | 74 | kimi | Correct schema, parameterized queries, FK + cascade, seed data | | `backend/app/main.py` | 36 | kimi | Lifespan init, CORS from env var (not wildcard), 3 routers | | `backend/app/validation.py` | 25 | claude | Shared YYYY-MM regex validator, extracted from duplication | | `backend/app/routes/expenses.py` | 148 | codex | Full CRUD, Pydantic models, parameterized SQL, FK check | | `backend/app/routes/categories.py` | 107 | codex | CRUD, hex color validator, unique constraint handling | | `backend/app/routes/summary.py` | 52 | codex | Monthly aggregation with COALESCE, GROUP BY | | `frontend/index.html` | 121 | codex | Dark theme, responsive, all sections present | | `frontend/css/style.css` | 472 | codex | CSS bar charts, dark palette, mobile breakpoints | | `frontend/js/app.js` | 472 | codex | State management, fetch API, DOM 
via textContent (XSS-safe) | **Claude (18 source files, ~1,500 app lines + 457K with venv):** | File | Lines | Assessment | |------|-------|------------| | `specs/product-spec.md` | 206 | Comprehensive: vision, schema, Pydantic examples, project structure | | `backend/app/database.py` | 68 | Correct schema, parameterized queries, FK, seed data | | `backend/app/main.py` | 42 | Lifespan init, CORS from env var, 3 routers | | `backend/app/models.py` | 51 | Centralized Pydantic schemas (better separation) | | `backend/app/routes/expenses.py` | 159 | Full CRUD, partial update support, category JOIN | | `backend/app/routes/categories.py` | 90 | CRUD, referential integrity check on delete | | `backend/app/routes/summary.py` | 44 | Monthly aggregation | | `backend/tests/conftest.py` | 18 | Temp DB fixture with patch | | `backend/tests/test_expenses.py` | 108 | 11 test cases covering CRUD + edge cases | | `backend/tests/test_categories.py` | ~50 | Category CRUD tests | | `backend/tests/test_summary.py` | ~40 | Summary endpoint tests | | `frontend/index.html` | 79 | Clean layout, modal-based form | | `frontend/css/style.css` | 323 | Dark theme, responsive | | `frontend/js/app.js` | 320 | API wrapper, currency formatting, chart rendering | ### 5.2 Quality Scoring | Dimension | Maggy | Claude | Notes | |-----------|-------|--------|-------| | **Functional completeness** | 9/10 | 10/10 | Both implement all endpoints. Claude adds partial updates. | | **Security** | 10/10 | 7/10 | Maggy's security review (EXP-6) hardened CORS, added amount bounds, path param validation, color format validation. Claude left CORS with `allow_credentials=True`, no amount ceiling, no color validation. | | **SQL safety** | 10/10 | 10/10 | Both use parameterized queries exclusively. | | **XSS prevention** | 10/10 | 10/10 | Both use textContent for DOM rendering. No innerHTML. | | **Input validation** | 9/10 | 7/10 | Maggy: Pydantic + custom validators (hex color, amount ceiling, path ge=1). 
Claude: Pydantic regex patterns but less thorough. | | **Error handling** | 9/10 | 8/10 | Maggy: context manager with rollback, 409 on duplicate, 404 on missing. Claude: try/finally, 409 on duplicate, referential integrity check. | | **Test coverage** | 0/10 | 9/10 | Maggy produced zero tests. Claude created conftest + 3 test files (~200 lines). | | **Architecture** | 8/10 | 9/10 | Claude separated models into dedicated file. Maggy inlined models per route. Both wire correctly. | | **Product spec** | 0/10 | 10/10 | Maggy's ollama did not produce a spec file. Claude's spec is comprehensive (206 lines). | | **Frontend quality** | 9/10 | 8/10 | Maggy's frontend is larger (472+472+121 = 1065 lines) with more CSS detail. Claude's is cleaner (320+323+79 = 722 lines) with modal UX. | | **Weighted avg** | **7.4/10** | **7.8/10** | | ### 5.3 Key Differences **Maggy strengths:** - Security review caught and fixed 7 issues (CORS wildcard, missing bounds, color validation, duplicated validation) - Multi-model approach applied right tool to right task (security by Claude, CRUD by Codex, schema by Kimi) - Larger frontend with more CSS polish - Each model contributed its strength: Claude for security depth, Codex for feature implementation **Claude strengths:** - Product spec created (comprehensive 206-line document) - Test suite included (conftest + 3 test files, ~200 lines, 11+ test cases) - Better code organization (centralized models.py) - Partial update support on expenses (PATCH-style PUT) - Referential integrity check on category delete (prevents orphaned expenses) - Full venv with dependencies installed **Maggy weaknesses:** - No product spec file generated (ollama didn't create it or placed it elsewhere) - No test files at all — a significant gap for production readiness - Import paths use `backend.app.` which requires specific project structure to run **Claude weaknesses:** - No dedicated security review — CORS uses `allow_credentials=True` (risky with dynamic origins) 
- No amount ceiling on expenses (could submit `1e308`) - No hex color format validation on categories - `get_db()` returns connection without context manager (manual close in every route) --- ## 6. Cost Analysis | Pipeline | Claude Usage | Free/Cheap Usage | Est. Subscription Burn | |----------|-------------|------------------|----------------------| | **Maggy** | 1/6 tasks (17%) | 2/6 tasks (33%) | Low — spread across 3 subscriptions | | **Claude** | 6/6 tasks (100%) | 0/6 tasks (0%) | High — 100% on premium model | Maggy used Claude only for the security review (blast 8). The other 5 tasks consumed cheaper or free models: - EXP-1: ollama (free, local GPU) - EXP-2: kimi (free tier / cheap subscription) - EXP-3/4/5: codex (separate subscription) This represents ~83% reduction in Claude subscription consumption. --- ## 7. Routing Observations ### What worked - **Blast 8 → Claude** for security review was correct. Claude produced the most thorough audit. - **Blast 5 → Codex** for CRUD implementation delivered working endpoints. - **Blast 3 → Kimi** for database schema was successful and correct. - **Zero fallbacks** — all 4 CLIs completed tasks without needing to escalate. - **Auto-discovery** — CLI flags probed from `--help`, not hardcoded. ### What needs tuning - **Codex is slow on frontend** — EXP-5 took 280s vs Claude's 122s (2.3x slower). Consider routing blast 6 frontend tasks to Claude. - **Ollama missed the spec task** — EXP-1 (docs) was routed to local model but no spec file was generated. Ollama's qwen2.5-coder is optimized for code, not prose. Consider routing `task_type: docs` to kimi or claude regardless of blast score. - **No test generation by any Maggy model** — None of the 4 models produced tests. This could be addressed by adding a TDD step (write tests first) as a follow-up task routed to Claude. --- ## 8. 
Conclusions | Metric | Maggy | Claude | Verdict | |--------|-------|--------|---------| | Speed | 907.6s | 681.0s | Claude 33% faster | | Success rate | 100% | 100% | Tie | | Quality (weighted) | 7.4/10 | 7.8/10 | Claude slightly better | | Security depth | Stronger | Weaker | Maggy (dedicated review step) | | Test coverage | None | Good | Claude (significant gap for Maggy) | | Cost efficiency | 83% savings | Baseline | Maggy | | Subscription risk | Distributed | Single point | Maggy | | Model diversity | 4 models | 1 model | Maggy | **Summary:** Claude Code is faster and produces marginally higher overall quality (driven by tests and spec). Maggy's multi-model approach provides cost efficiency and subscription risk distribution, plus deeper security review via dedicated model routing. The main gaps to close: add TDD pipeline (test generation step), and improve docs routing (don't send prose tasks to coding-optimized local models). --- ## 9. Raw Throughput Benchmarks (tokens/sec) Standalone generation speed measured with identical prompts across all four model tiers. Each model ran 3 iterations (1 cold, 2 hot). **Prompt:** "Write a Python function that implements a binary search tree with insert, delete, search, and in-order traversal." 
### 9.1 Results | Model | Run 1 | Run 2 | Run 3 | Avg tok/s | Notes | |-------|-------|-------|-------|-----------|-------| | **Ollama qwen2.5-coder:32b** | 22.3 | 21.8 | 22.1 | **22.1** | Local GPU (M4 Max), consistent across runs | | **Claude (claude -p)** | 44.6 (API) / 18.6 (wall) | 41.9 / 14.3 | 25.7 / 6.8 | **37.4 API / 13.2 wall** | API time excludes network overhead; wall-clock includes CLI startup | | **Kimi (kimi CLI)** | ~1.8 | ~2.8 | ~3.3 | **~2.6** | Agentic mode — writes files, runs tools; tok/s reflects execution time | | **Codex (codex exec)** | ~0.8 | ~0.7 | ~0.6 | **~0.7** | Agentic mode — full-auto file creation; tok/s reflects execution time | ### 9.2 Interpretation - **Ollama (local):** Stable 22 tok/s on M4 Max 128GB. No network latency, no rate limits, no cost. Best for blast 1-2 tasks where speed-to-first-token matters. - **Claude:** Fastest raw generation at ~37 tok/s (API). Wall-clock is lower (~13 tok/s) due to CLI startup overhead and streaming. - **Kimi / Codex:** Low tok/s numbers are misleading — both operate in agentic mode (writing files, running commands, iterating). Their throughput reflects end-to-end task execution, not pure generation speed. Codex in particular spends most time on sandboxed execution rather than generation. ### 9.3 Routing Implications | Tier | Model | tok/s | Cost | Best For | |------|-------|-------|------|----------| | Local | Ollama qwen2.5-coder:32b | 22 | Free | Blast 1-2: simple code scaffolding (not prose/docs — see §7) | | Mid | Kimi | 2.6 (agentic) | Cheap | Blast 3-4: schema design, CRUD | | Premium-Auto | Codex | 0.7 (agentic) | Mid | Blast 5-6: feature implementation | | Premium | Claude | 37 (API) | High | Blast 7+: security, architecture, TDD | --- ## 10. Post-Benchmark Fixes (Routing Rules + Conventions) Three systems were built immediately after the benchmark to close the gaps above.
### 10.1 Routing Rules (`~/.maggy/routing-rules.yaml`) A self-updating YAML config that overrides blast-score routing for specific task types and pipeline phases. Rules are checked **before** the reward table or blast-score tier. **Task-type overrides seeded from benchmark evidence:** | Task Type | Forced To | Why | |-----------|----------|-----| | `docs` | claude | Ollama (code-optimized) produced no spec file | | `security` | claude | Security review needs deep reasoning | | `tests` | claude | Only claude generated test files in benchmark | | `architecture` | claude | Architecture needs cross-context awareness | | `planning` | claude | Planning requires structured reasoning | **Pipeline phase overrides from TDD workflow:** | Phase | Forced To | Why | |-------|----------|-----| | `spec` | claude | SPEC phase needs comprehensive docs | | `tdd_red` | claude | RED phase needs test design expertise | | `tdd_green` | auto | GREEN uses blast-score routing (cheap models can implement) | | `review` | claude | Review needs security + architecture depth | **Self-learning:** `record_outcome()` updates rolling success rates per model. `learn_override()` lets Maggy add new rules when outcome data supports it. Manual YAML edits are preserved. ### 10.2 Team Conventions Injection Five conventions from claude-bootstrap's CLAUDE.md are embedded in routing rules and injected into every prompt sent to any CLI: 1. **mWP** — Build minimum wowable product. No feature flags, no premature abstractions. 2. **TDD** — RED → GREEN → VALIDATE. Coverage >= 80%. 3. **Security** — No secrets in code. Parameterized SQL. Validate input at boundaries. 4. **Quality gates** — 20 lines/fn, 3 params, 2 nesting levels, 200 lines/file. 5. **Existing patterns** — Read codebase before changing. Keep changes minimal. All four executor prompt methods (`_plan_prompt`, `_analysis_prompt`, `_tests_prompt`, `_impl_prompt`) now append matching conventions. 
This standardizes quality expectations across kimi, codex, ollama, and claude. ### 10.3 Expected Re-run Improvements | Benchmark Gap | Root Cause | Fix Applied | Expected Result | |--------------|-----------|-------------|-----------------| | No product spec (EXP-1) | `docs` routed to ollama | `docs → claude` override | Claude generates spec | | No tests from any model | No TDD step in pipeline | `tdd_red → claude` + `tests → claude` overrides | Claude writes failing tests | | Inconsistent quality across models | No shared standards | Conventions injected into all prompts | mWP + quality gates enforced everywhere | | No learning from outcomes | Static routing only | `record_outcome()` + `learn_override()` | Routing improves with each task | **Projected scores if re-run:** | Dimension | Before | After (est.) | Change | |-----------|--------|-------------|--------| | Product spec | 0/10 | 9/10 | `docs → claude` | | Test coverage | 0/10 | 8/10 | `tdd_red → claude` | | Security | 10/10 | 10/10 | No change (already strong) | | Architecture | 8/10 | 9/10 | Conventions enforce patterns | | **Weighted avg** | **7.4/10** | **~8.5/10** | **+1.1 points** | Cost efficiency would decrease from the benchmark's ~83% savings but remain substantial — the new overrides force claude for `docs` (EXP-1), `architecture` (EXP-2), and `tests` (new TDD step), raising Claude's share from 1/6 to at least 3/6 of the benchmark tasks, while CRUD/API/frontend work (EXP-3/4/5) stays on cheaper models. ================================================ FILE: docs/mnemos-implementation.md ================================================ # Mnemos Implementation Addendum Implementation details for the Mnemos RFC (Task-Scoped Memory Lifecycle for Autonomous Agents) as deployed in Maggy. ## 1. Signal Access in Claude Code ### Token Utilization (Primary Fatigue Signal) Claude Code exposes context window metrics through **statusline scripts**.
When configured, the statusline script receives JSON on stdin for every API call: ```json { "context_window": { "used_percentage": 42.5, "remaining_percentage": 57.5, "used_tokens": 85000, "total_tokens": 200000, "remaining_tokens": 115000 } } ``` **Key discovery**: Hooks (PreToolUse, PreCompact, etc.) do NOT receive context data directly. The solution is a two-stage pipeline: 1. **Statusline script** receives token data on every API call, writes to `.mnemos/fatigue.json` 2. **Hooks** read `.mnemos/fatigue.json` from disk when they fire This gives near-real-time fatigue monitoring without requiring direct hook access to context metrics. ### Hook System Integration | Hook | Trigger | Mnemos Action | |------|---------|--------------| | Statusline | Every API call | Write `fatigue.json` with token metrics | | PreToolUse (Edit/Write) | Before file edits | Read fatigue, auto-checkpoint at 0.60+, auto-consolidate at 0.40+ | | PreCompact | Before compaction | Emergency checkpoint, typed preservation instructions | | SessionStart | Session begins | Load checkpoint, bridge iCPG state | | Stop | Agent stops | Write final checkpoint | ## 2. 
MnemoGraph Architecture ### Node Types and Eviction Policies | Type | Eviction Policy | Purpose | |------|----------------|---------| | GoalNode | NEVER | Task's primary objective | | ConstraintNode | NEVER | Invariants, contracts, must-not-violate rules | | ContextNode | EVICTABLE | File contents, tool outputs, ephemeral context | | WorkingNode | COMPRESS_FIRST | In-progress reasoning, current approach | | ResultNode | COMPRESS_FIRST | Completed sub-task results | | SkillNode | COMPRESS_FIRST | Learned patterns (Tier 1+: promotable to persistent) | | CheckpointNode | NEVER | Serialized session state | | HandoffNode | NEVER | Task completion summary for successor | ### Activation Weight Decay All evictable/compressible nodes undergo exponential decay: - Factor: 0.95 per consolidation pass - GoalNodes, ConstraintNodes, CheckpointNodes, HandoffNodes exempt - Touching a node (access) resets weight via `touch_node()` ### Storage SQLite at `.mnemos/mnemo.db`: - `mnemo_nodes` — MnemoGraph nodes with type, weight, status, scope_tags - `checkpoints` — Serialized session state - `fatigue_log` — Historical fatigue measurements for trending ## 3. Fatigue Model (4 Dimensions — All Passively Observable) All 4 dimensions are derived from actual hook data. No agent cooperation needed. ### Signal Collection Hooks log behavioral signals to `.mnemos/signals.jsonl` (append-only JSONL): - **PreToolUse** logs: `{tool, event: "pre", file_path, ts}` — captures what files the agent touches - **PostToolUse** logs: `{tool, event: "post", file_path, success, ts}` — captures tool outcomes - **Statusline** writes: `.mnemos/fatigue.json` with token metrics — captures context window state Fatigue computation reads the last 30 entries from `signals.jsonl` + `fatigue.json`. ### Dimension Weights ``` composite = 0.40 * token_utilization + 0.25 * scope_scatter + 0.20 * reread_ratio + 0.15 * error_density ``` ### Dimension Details **Token Utilization (0.40)**: `context_window.used_percentage / 100`. 
Direct from statusline. Most reliable signal — measures how full the context window is. **Scope Scatter (0.25)**: Ratio of unique directories touched in the last 30 tool calls. Agent editing `src/auth/` exclusively = 0.0 (focused). Agent bouncing across `src/auth/`, `tests/`, `docs/`, `config/`, `lib/` = 0.7+ (scattered, unfocused). Derived from PreToolUse `tool_input.file_path`. **Re-read Ratio (0.20)**: Proportion of Read tool calls that target files already read in the session. Agent reading `middleware.ts` once then moving on = 0.0 (remembers what it read). Agent re-reading `middleware.ts` 5 times = 0.8 (lost context, needs to re-read). Derived from PreToolUse when `tool_name=Read`. This is the strongest signal of actual context degradation. **Error Density (0.15)**: Ratio of failed tool calls to total tool calls in the rolling window. Agent with 100% success = 0.0 (productive). Agent with 50% failures = 0.5 (struggling, confused). Derived from PostToolUse `tool_response` error detection. ### State Thresholds | State | Score Range | Auto-Actions | |-------|------------|-------------| | FLOW | 0.00–0.40 | None | | COMPRESS | 0.40–0.60 | Micro-consolidation (compress 3 ResultNodes, evict 1 cold ContextNode, decay weights) | | PRE-SLEEP | 0.60–0.75 | Checkpoint written + consolidation | | REM | 0.75–0.90 | Emergency checkpoint, warning to agent | | EMERGENCY | 0.90+ | Emergency checkpoint, handoff instruction | ## 4. 
Checkpoint/Resume Protocol ### CheckpointNode Contents ```json { "id": "uuid", "task_id": "session-1", "goal": "Implement authentication module", "active_constraints": [ "INV: API backward compatibility", "POST: All endpoints require auth token" ], "active_results": [ "JWT middleware implemented and tested", "User model created with email/password" ], "current_subgoal": "Add password reset flow", "working_memory": "Considering email vs SMS for reset codes...", "fatigue_at_checkpoint": 0.62, "git_state": { "branch": "feat/auth", "uncommitted": ["src/auth/middleware.ts", "src/auth/routes.ts"] }, "icpg_state": { "active_reason": "abc12345 -- Implement user authentication", "unresolved_drift": 2, "stats": {"reasons": 5, "symbols": 42, "edges": 48} }, "node_summary": { "total": 15, "active": 10, "compressed": 3, "by_type": {"goal": 1, "constraint": 3, "result": 4, "working": 2} } } ``` ### Resume Format SessionStart hook loads `checkpoint-latest.json` and formats as structured markdown: ```markdown ## Mnemos Session Resume Checkpoint: abc12345 Fatigue at checkpoint: 0.62 ### Goal Implement authentication module ### Active Constraints (DO NOT VIOLATE) - INV: API backward compatibility - POST: All endpoints require auth token ### Current Sub-Goal Add password reset flow ### Progress So Far - JWT middleware implemented and tested - User model created with email/password ### Git State Branch: feat/auth Uncommitted files: - src/auth/middleware.ts - src/auth/routes.ts ``` ## 5. 
iCPG Bridge Mnemos imports iCPG state via `mnemos bridge-icpg`: | iCPG Entity | Mnemos Node | Notes | |-------------|-------------|-------| | ReasonNode (active) | GoalNode | Content includes iCPG ID reference | | ReasonNode.invariants | ConstraintNode | Linked to GoalNode | | ReasonNode.postconditions | ConstraintNode | Linked to GoalNode | | Unresolved drift count | CheckpointNode.icpg_state | Summary only | | Graph stats | CheckpointNode.icpg_state | Reasons/symbols/edges counts | Bridge runs automatically on SessionStart (background) and on-demand via CLI. ## 6. Micro-Consolidation (Tier 0) Rule-based, no LLM, <100ms target: 1. **Compress**: Take 3 oldest active ResultNodes, set status=COMPRESSED, store first 200 chars as summary, clear content 2. **Evict**: Take 1 cold ContextNode (weight < 0.2, access_count < 3, no scope overlap), set status=EVICTED 3. **Decay**: Apply 0.95 exponential decay to the weights of all evictable and compress-first nodes (nodes with the NEVER eviction policy are exempt) Triggered automatically by PreToolUse hook when fatigue >= 0.40. ## 7.
Deployment ### Files ``` scripts/mnemos/ __init__.py # Package init models.py # MnemoNode, FatigueState, CheckpointNode store.py # SQLite storage (MnemosStore) fatigue.py # 4-dimension fatigue from observable signals signals.py # Behavioral signal collection from hooks checkpoint.py # Checkpoint write/load consolidation.py # Micro-consolidation __main__.py # CLI (mnemos command) templates/ mnemos-statusline.sh # Statusline: writes fatigue.json (token metrics) mnemos-pre-edit.sh # PreToolUse: logs file signal + fatigue check + iCPG mnemos-post-tool.sh # PostToolUse: logs success/failure for error density mnemos-session-start.sh # SessionStart: checkpoint resume mnemos-pre-compact.sh # PreCompact: emergency checkpoint + typed preservation mnemos-stop-checkpoint.sh # Stop: final checkpoint skills/mnemos/SKILL.md # Skill documentation commands/mnemos-status.md # /mnemos-status slash command commands/mnemos-checkpoint.md # /mnemos-checkpoint slash command ``` ### Configuration (settings.json) Hooks are configured in `.claude/settings.json`. The Mnemos hooks replace the standalone iCPG hooks (mnemos-pre-edit.sh includes iCPG context queries). ### Dependencies Zero external dependencies. Uses only Python stdlib (sqlite3, json, pathlib, subprocess, dataclasses). ## 8. 
Future Work (Tier 1+) Not implemented in this release: - **Mini-REM consolidation**: LLM-based summarization of WorkingNodes during high fatigue - **Full REM consolidation**: Cross-task pattern extraction, SkillNode promotion algebra - **Multi-agent orchestrator protocol**: Checkpoint exchange between agent instances - **SkillNode promotion**: Automatic promotion of repeated patterns to persistent storage - **Fatigue prediction**: Use fatigue_log history to predict when checkpoints will be needed ================================================ FILE: docs/polyphony-spec.md ================================================ # Polyphony v0.1 — Multi-Agent Orchestration Specification ## Overview Polyphony is a container-isolated multi-agent orchestration system for Maggy. Each agent session runs in its own Docker container with a full git clone on its own branch, enabling true parallel execution without conflicts. ## Architecture Six layers, each with a single responsibility: ``` ┌─────────────────────────────────────────┐ │ 1. Work Source (GitHub Issues / Local) │ ├─────────────────────────────────────────┤ │ 2. Orchestrator (Supervisor Loop) │ ├─────────────────────────────────────────┤ │ 3. Router (Task x Policy -> RunSpec) │ ├─────────────────────────────────────────┤ │ 4. Identity Broker (Credentials) │ ├─────────────────────────────────────────┤ │ 5. Workspace Manager (Git Clones) │ ├─────────────────────────────────────────┤ │ 6. 
Worker Runtime (Docker Containers) │ └─────────────────────────────────────────┘ ``` ## §1 — Guiding Principles - Container isolation per agent session - Subscription-based auth (not API keys) - Full git clones (not worktrees) for independence - Pure function routing (deterministic, testable) - State machine enforcement for task lifecycle - Proof-of-work verification before landing ## §2 — Work Sources Tasks enter the system from: - **GitHub Issues**: Polled via `gh api`, filtered by label (default: `agent-ready`) - **Local Queue**: SQLite-backed task queue at `~/.polyphony/queue.db` Each source implements `poll() -> list[Task]` and `mark_claimed(task_id)`. ## §3 — Domain Models ### Task (§3.1) Unit of work from a source. Fields: title, source, source_ref, state, task_type, scope, risk, context_tokens, requires_web, metadata. ### Identity (§3.2) Named credential bundle. Fields: name, volumes (agent_type -> host_path), api_keys, cost_ceiling_usd_per_day. ### AgentProfile (§3.3) Agent harness configuration. Fields: name, agent_type, cli_command, context_window_tokens, strengths, event_protocol, auth_path. ### RunSpec (§3.4) Immutable execution specification for one attempt. Fields: task_id, agent, identity, workspace, image, attempt, model, fallback, max_turns, env_overlay, volume_mounts, deadline_seconds. ### Result (§3.5) Outcome of a single run. Fields: task_id, run_spec_id, agent, status, turns, duration_seconds, cost_usd, artifacts, events. ## §4 — Task State Machine ``` DISCOVERED -> CLAIMED -> ROUTED -> PROVISIONED -> RUNNING -> VERIFYING -> LANDED | | v v FAILED --> BLOCKED | v CLAIMED (retry) ``` Terminal states: LANDED, BLOCKED. Transitions are enforced by `can_transition(current, target)`. Invalid transitions raise `ValueError`. 
## §5 — Routing ### §5.1 — Complexity Scoring Five dimensions, each 0-2, total 0-10: | Dimension | 0 | 1 | 2 | |-----------|---|---|---| | Cyclomatic depth | <10 LOC, 0-1 files | 10-50 LOC, 2-4 files | 50+ LOC, 5+ files | | Fan-out | 0-2 callers | 3-10 callers | 11+ callers | | Security boundary | No auth keywords | 1 keyword | 2+ keywords | | Concurrency | No lock/transaction | 1 keyword | 2+ keywords | | Domain invariants | Low risk, simple | Medium risk or refactor | High risk | ### §5.2-5.6 — Rule Evaluation Rules are evaluated top-down. First match wins. Each rule has: - `match`: Predicate fields (all must match) - `agent`: Target agent name - `fallback`: Ordered fallback chain Default rule applies when no rules match. ## §6 — Workspace Manager Each task+attempt gets: - Directory at `{workspace_root}/{sanitized_task_id}/{attempt}/` - Full `git clone` (with `--reference` and `--dissociate` if mirror available) - Branch checkout to the specified ref - Cleanup via `shutil.rmtree` ## §7 — Identity Broker Resolves named identities to: - **Volume mounts**: `{host_path}:/home/worker/{path}:ro` per agent type - **Env overlays**: Environment variable pass-through from api_keys - **Validation**: Name required, at least one volume required ## §8 — Worker Runtime ### Docker Lifecycle ``` docker create --name polyphony-{task_id}-{attempt} \ -v {workspace}:/workspace \ -v {auth_path}:/home/worker/{auth_path}:ro \ -e {env_vars} \ {image} docker start {container_id} docker wait {container_id} # blocks until exit docker logs {container_id} # collect output docker rm {container_id} # cleanup ``` ### §8.1 — Claude Adapter Command: `claude -p --output-format stream-json` Completion: `{"type": "result"}` Quota: "rate limit" in output ### §8.2 — Codex Adapter Command: `codex exec --full-auto` Completion: `{"status": "completed"}` Quota: "quota" in output ### §8.3 — Kimi Adapter Command: `kimi --print -y` Completion: `{"done": true}` Quota: "rate limit" in output ## §9 — Event 
Protocol Agent output is parsed as NDJSON (newline-delimited JSON). Each line is classified into a `TaskEvent` with kind (message, result, error, unknown) and data. ## §10 — Proof of Work Before landing, the orchestrator verifies: - Result status is "succeeded" - Tests pass (if configured) - Lint passes (if configured) - Type check passes (if configured) Failed verification transitions task to FAILED for retry or BLOCKED. ## §11 — Configuration All configuration in `~/.polyphony/`: - `config.yaml` — Global settings (workspace root, poll interval, concurrency) - `identities.yaml` — Named credential bundles - `agents.yaml` — Agent profiles and CLI commands - `routing.yaml` — Routing rules and fallback chains ## §12 — Implementation Core package: `scripts/polyphony/` Modules: models, state_machine, store, config, scoring, router, identity, workspace, runtime, events, orchestrator, sources/*, adapters/* CLI entry: `python3 -m polyphony {init|spawn|status|cleanup}` ================================================ FILE: evals/README.md ================================================ # Behavioral Evals Behavioral evals test whether skills produce the expected coding patterns when loaded into Claude Code. Each eval is a realistic coding task with a rubric. ## Structure ``` evals/ ├── run-evals.sh # Runner script ├── README.md # This file ├── {skill-name}/ │ └── scenario-N/ │ ├── task.md # Coding task description │ └── criteria.json # Weighted rubric ``` ## Scenario Format ### task.md A realistic coding task that the skill should influence. Write it as you would a ticket or user request. ### criteria.json ```json { "criteria": [ { "name": "Short description", "type": "deterministic", "weight": 1.0, "check": "grep -q 'pattern' output.py" }, { "name": "Code quality description", "type": "llm_judged", "weight": 0.5, "prompt": "Does the output follow X pattern? Answer yes/no with explanation." 
} ] } ``` **Types:** - `deterministic`: grep/regex/AST checks that can be automated - `llm_judged`: requires LLM evaluation of output quality ## Running Evals ```bash # All evals ./run-evals.sh # Single skill ./run-evals.sh base # With baseline comparison (with vs without skill) ./run-evals.sh --baseline base ``` ## Adding New Evals 1. Create `evals/{skill-name}/scenario-N/` 2. Write `task.md` with a realistic coding task 3. Write `criteria.json` with weighted rubric 4. Test: `./run-evals.sh {skill-name}` ## Coverage | Skill | Scenarios | Focus | |-------|-----------|-------| | base | 2 | Function length, TDD order | | security | 2 | No hardcoded secrets, proper hashing | | python | 1 | Type hints, pytest, ruff | | typescript | 1 | Strict mode, barrel exports | | react-web | 1 | Component structure, Zustand | | session-management | 1 | Checkpoint creation | | code-review | 1 | Review process | | commit-hygiene | 1 | Atomic commits | | agent-teams | 1 | Pipeline ordering | | database-schema | 1 | Schema read before query | | llm-patterns | 1 | Structured output, retry | | supabase | 1 | RLS, migrations | | credentials | 1 | Access.txt, .env.example | | project-tooling | 1 | CLI verification | | existing-repo | 1 | Repo analysis before changes | ================================================ FILE: evals/agent-teams/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Pipeline ordering respected", "type": "llm_judged", "weight": 1.0, "prompt": "Was work organized in a logical pipeline (schema/API first, then frontend, then integration)? Or was everything done in a jumbled order? Answer ordered/jumbled." }, { "name": "Backend before frontend", "type": "llm_judged", "weight": 0.5, "prompt": "Was the backend API implemented before the frontend component that consumes it? Answer yes/no." 
} ] } ================================================ FILE: evals/agent-teams/scenario-1/task.md ================================================ # Task: Build a REST API with Frontend Create a full-stack feature: - Backend: FastAPI endpoint for managing bookmarks (CRUD) - Frontend: React component to display and manage bookmarks - Tests for both backend and frontend This should be broken into clear pipeline stages if using multiple agents. ================================================ FILE: evals/base/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Functions under 50 lines", "type": "deterministic", "weight": 1.0, "check": "No function body exceeds 50 lines" }, { "name": "Tests written before or alongside implementation", "type": "llm_judged", "weight": 1.0, "prompt": "Were tests written as part of the implementation? Check for pytest test functions that cover the main endpoints. Answer yes/no." }, { "name": "Input validation present", "type": "deterministic", "weight": 0.5, "check": "URL validation exists (regex or pydantic HttpUrl)" }, { "name": "No god functions", "type": "llm_judged", "weight": 0.5, "prompt": "Is the code modular with single-responsibility functions, or does it have monolithic handler functions doing everything? Answer modular/monolithic." } ] } ================================================ FILE: evals/base/scenario-1/task.md ================================================ # Task: Build a URL Shortener Service Create a Python URL shortener with these endpoints: - POST /shorten — accepts a URL, returns a short code - GET /{code} — redirects to the original URL - GET /stats/{code} — returns click count Use FastAPI. Store data in-memory (dict). Include input validation. 
================================================ FILE: evals/base/scenario-2/criteria.json ================================================ { "criteria": [ { "name": "TDD order followed", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent write or plan tests before the implementation, or at least alongside it? Check tool call order. Answer yes/no." }, { "name": "Cursor-based pagination implemented", "type": "deterministic", "weight": 1.0, "check": "Response model includes next_cursor and has_more fields" }, { "name": "Limit validation", "type": "deterministic", "weight": 0.5, "check": "Limit parameter has max=100 constraint" } ] } ================================================ FILE: evals/base/scenario-2/task.md ================================================ # Task: Add Pagination to an Existing API You have a FastAPI endpoint that returns all items from a database. Refactor it to support cursor-based pagination with: - `limit` parameter (default 20, max 100) - `cursor` parameter (opaque string) - Response includes `next_cursor` and `has_more` Write the implementation and tests. ================================================ FILE: evals/code-review/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Self-review performed", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent perform a code review (re-read code, check for issues) before considering the task done? Answer yes/no." 
}, { "name": "File size validation", "type": "deterministic", "weight": 0.5, "check": "File size check exists (10MB limit)" }, { "name": "File type validation", "type": "deterministic", "weight": 0.5, "check": "MIME type or extension validation for image types" } ] } ================================================ FILE: evals/code-review/scenario-1/task.md ================================================ # Task: Implement a File Upload API Create a FastAPI file upload endpoint: - Accept multipart file uploads up to 10MB - Validate file types (images only: jpg, png, webp) - Store files locally with unique names - Return upload metadata (filename, size, path) After implementation, perform a self-review before committing. ================================================ FILE: evals/commit-hygiene/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Atomic commits", "type": "llm_judged", "weight": 1.0, "prompt": "Were changes committed as separate atomic commits (one per feature: search, sort, URL sync) rather than one big commit? Answer yes/no." }, { "name": "Descriptive commit messages", "type": "llm_judged", "weight": 0.5, "prompt": "Do commit messages describe the 'why' not just the 'what'? Are they concise and follow conventional format? Answer yes/no." } ] } ================================================ FILE: evals/commit-hygiene/scenario-1/task.md ================================================ # Task: Add Search and Sort to a Product List You have an existing product listing page. Add: 1. Search by product name (debounced input) 2. Sort by price (asc/desc) and name (A-Z/Z-A) 3. URL query parameter sync for filters Make atomic commits for each feature. 
================================================ FILE: evals/credentials/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Checks Access.txt or .env for keys", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent look for existing API keys in Access.txt, .env, or environment variables before asking the user for them? Answer yes/no." }, { "name": ".env.example created or updated", "type": "deterministic", "weight": 0.5, "check": ".env.example file exists with STRIPE_SECRET_KEY placeholder" }, { "name": "No hardcoded keys", "type": "deterministic", "weight": 1.0, "check": "No Stripe keys (sk_test_, sk_live_) hardcoded in source" } ] } ================================================ FILE: evals/credentials/scenario-1/task.md ================================================ # Task: Integrate Stripe Payment Processing Add Stripe checkout to an existing e-commerce app: - Create checkout session endpoint - Handle webhook for payment confirmation - Update order status on successful payment You'll need Stripe API keys to integrate. ================================================ FILE: evals/database-schema/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Schema read before writing queries", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent read existing database schema/models before writing new models or queries? Check tool call order. Answer yes/no." }, { "name": "Foreign keys defined", "type": "deterministic", "weight": 0.5, "check": "Comment model has foreign keys to post and user tables" }, { "name": "Migration created", "type": "deterministic", "weight": 0.5, "check": "Alembic migration file created for new table" } ] } ================================================ FILE: evals/database-schema/scenario-1/task.md ================================================ # Task: Add a Comments Feature to a Blog An existing blog app has posts. 
Add comments: - Each comment belongs to a post and a user - Support nested replies (one level) - Add API endpoints for CRUD operations Use SQLAlchemy with an existing database. ================================================ FILE: evals/existing-repo/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Repo analyzed before changes", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent read and analyze existing code structure (components, styles, state management) before making changes? Answer yes/no." }, { "name": "Existing patterns followed", "type": "llm_judged", "weight": 0.5, "prompt": "Do the changes follow the existing codebase conventions (same state management, same styling approach, same file structure)? Answer yes/no." }, { "name": "System preference detected", "type": "deterministic", "weight": 0.5, "check": "Uses prefers-color-scheme media query or matchMedia" } ] } ================================================ FILE: evals/existing-repo/scenario-1/task.md ================================================ # Task: Add Dark Mode to an Existing React App An existing React app needs dark mode support: - Toggle button in the header - Persist preference in localStorage - Apply theme to all existing components - Respect system preference on first visit Do not break any existing functionality. 
================================================ FILE: evals/llm-patterns/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Structured output used", "type": "deterministic", "weight": 1.0, "check": "Uses Pydantic model or JSON schema for LLM response parsing" }, { "name": "Retry with backoff", "type": "deterministic", "weight": 1.0, "check": "Retry logic present with exponential backoff (tenacity or manual)" }, { "name": "API responses mocked in tests", "type": "deterministic", "weight": 0.5, "check": "Tests mock the OpenAI API, not make real calls" } ] } ================================================ FILE: evals/llm-patterns/scenario-1/task.md ================================================ # Task: Build a Content Classifier Create a Python service that: - Takes text input and classifies it into categories (news, opinion, tutorial, review) - Uses OpenAI API with structured output - Includes retry logic for API failures - Returns confidence scores per category Include tests with mocked API responses. ================================================ FILE: evals/project-tooling/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "CLI tools verified", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent verify that tools (pytest, ruff, etc.) actually work by running them, not just installing them? Answer yes/no." 
}, { "name": "pyproject.toml created", "type": "deterministic", "weight": 0.5, "check": "pyproject.toml exists with project metadata" }, { "name": "Ruff configured", "type": "deterministic", "weight": 0.5, "check": "Ruff configuration exists (in pyproject.toml or ruff.toml)" } ] } ================================================ FILE: evals/project-tooling/scenario-1/task.md ================================================ # Task: Set Up a New Python Project Initialize a new Python project with: - pyproject.toml with dev dependencies - pytest configuration - ruff linting configuration - Pre-commit hooks - Basic CI workflow Verify all tools work before committing. ================================================ FILE: evals/python/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Type hints on all public functions", "type": "deterministic", "weight": 1.0, "check": "All public function signatures include type annotations" }, { "name": "pytest tests present", "type": "deterministic", "weight": 1.0, "check": "Test file uses pytest (not unittest) with descriptive test names" }, { "name": "Ruff-compatible code", "type": "llm_judged", "weight": 0.5, "prompt": "Would this code pass ruff linting with default rules? Check for common issues: unused imports, bare excepts, mutable default args. Answer yes/no." } ] } ================================================ FILE: evals/python/scenario-1/task.md ================================================ # Task: Build a CSV Data Processor Create a Python module that: - Reads CSV files with configurable delimiters - Validates rows against a schema (column types, required fields) - Outputs cleaned data as JSON - Handles malformed rows gracefully (log and skip) Include type hints and tests. 
================================================ FILE: evals/react-web/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Zustand store used", "type": "deterministic", "weight": 1.0, "check": "Uses zustand create() for state management" }, { "name": "Functional components only", "type": "deterministic", "weight": 0.5, "check": "No class components, all function/arrow function components" }, { "name": "Proper component decomposition", "type": "llm_judged", "weight": 0.5, "prompt": "Are components properly decomposed (TodoItem, TodoList, FilterBar, etc.) or is everything in one large component? Answer decomposed/monolithic." } ] } ================================================ FILE: evals/react-web/scenario-1/task.md ================================================ # Task: Build a Todo App with Filters Create a React todo app with: - Add/remove/toggle todos - Filter: all, active, completed - Persist to localStorage - Show count of remaining items Use functional components, hooks, and Zustand for state. ================================================ FILE: evals/run-evals.sh ================================================
#!/usr/bin/env bash
# Run behavioral evals for Maggy skills.
#
# Usage:
#   ./run-evals.sh                    # Run all evals
#   ./run-evals.sh base               # Run evals for a specific skill
#   ./run-evals.sh --baseline base    # Run with baseline comparison
#
# Requires: tessl CLI (https://tessl.io)

set -euo pipefail

EVALS_DIR="$(cd "$(dirname "$0")" && pwd)"
SKILLS_DIR="$(dirname "$EVALS_DIR")/skills"
BASELINE=false
SKILL_FILTER=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --baseline)
      BASELINE=true
      shift
      ;;
    --help|-h)
      echo "Usage: $0 [--baseline] [SKILL_NAME]"
      echo ""
      echo "Options:"
      echo " --baseline Compare with/without skill loaded"
      echo " SKILL_NAME Run evals for a specific skill only"
      exit 0
      ;;
    *)
      SKILL_FILTER="$1"
      shift
      ;;
  esac
done

# Check tessl is installed
if ! command -v tessl &>/dev/null; then
  echo "Error: tessl CLI not found. Install from https://tessl.io"
  exit 1
fi

RESULTS_DIR="$EVALS_DIR/.results"
mkdir -p "$RESULTS_DIR"

PASS=0
FAIL=0
SKIP=0

#######################################
# Run one tessl eval for the scenario currently being processed.
# Globals:   task_file, criteria_file (read)
# Arguments: $1 - path of the JSON result file to write
#            $2 - skill directory to load (optional; omitted for baseline)
# Outputs:   tessl output, indented, on stdout
# Returns:   always 0; a failed run is detected by the caller through the
#            absence of the result file, so one bad scenario does not
#            abort the whole suite under `set -e`/`pipefail`.
#######################################
run_eval() {
  local output_file=$1
  local skill_dir=${2:-}

  # ${var:+...} expands to the two words `--skill <dir>` only when a skill
  # directory was supplied, and to nothing for the baseline run.
  tessl eval run \
    --task "$task_file" \
    --criteria "$criteria_file" \
    ${skill_dir:+--skill "$skill_dir"} \
    --output "$output_file" \
    2>&1 | sed 's/^/ /' || true
}

for scenario_dir in "$EVALS_DIR"/*/scenario-*; do
  [[ -d "$scenario_dir" ]] || continue

  skill_name="$(basename "$(dirname "$scenario_dir")")"

  # Apply filter
  if [[ -n "$SKILL_FILTER" && "$skill_name" != "$SKILL_FILTER" ]]; then
    continue
  fi

  scenario_name="$(basename "$scenario_dir")"
  task_file="$scenario_dir/task.md"
  criteria_file="$scenario_dir/criteria.json"

  if [[ ! -f "$task_file" || ! -f "$criteria_file" ]]; then
    echo "SKIP $skill_name/$scenario_name (missing task.md or criteria.json)"
    # NOTE: `((SKIP++))` returns status 1 when SKIP is 0 and would abort the
    # script under `set -e`; plain arithmetic assignment is always status 0.
    SKIP=$((SKIP + 1))
    continue
  fi

  echo "--- $skill_name/$scenario_name ---"
  result_file="$RESULTS_DIR/${skill_name}_${scenario_name}.json"

  # Drop any result left over from a previous run so that a run that fails
  # to produce output is not reported as a pass.
  rm -f -- "$result_file"

  if [[ "$BASELINE" == true ]]; then
    echo " Running WITHOUT skill..."
    run_eval "$RESULTS_DIR/${skill_name}_${scenario_name}_baseline.json"
    echo " Running WITH skill..."
  fi
  run_eval "$result_file" "$SKILLS_DIR/$skill_name"

  # A scenario counts as passed when tessl produced a result file.
  if [[ -f "$result_file" ]]; then
    PASS=$((PASS + 1))
  else
    FAIL=$((FAIL + 1))
  fi
done

echo ""
echo "=== Eval Summary ==="
echo "Pass: $PASS Fail: $FAIL Skip: $SKIP"
================================================ FILE: evals/security/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Password hashed with bcrypt or argon2", "type": "deterministic", "weight": 1.0, "check": "Uses bcrypt, argon2, or passlib for password hashing (not md5/sha256)" }, { "name": "No hardcoded secrets", "type": "deterministic", "weight": 1.0, "check": "No hardcoded API keys, JWT secrets, or database passwords in source" }, { "name": "Password not in response", "type": "deterministic", "weight": 0.5, "check": "Response model excludes password/hash field" }, { "name": "Environment variables for secrets", "type": "llm_judged", "weight": 0.5, "prompt": "Are secrets (DB URL, JWT secret) loaded from environment variables or a config file, not hardcoded? Answer yes/no." } ] } ================================================ FILE: evals/security/scenario-1/task.md ================================================ # Task: Build User Registration Create a user registration endpoint: - Accept email and password - Store user in database - Return user ID and email (not password) Use FastAPI and SQLAlchemy. Include a login endpoint that checks credentials. ================================================ FILE: evals/security/scenario-2/criteria.json ================================================ { "criteria": [ { "name": "Keys not logged or exposed", "type": "llm_judged", "weight": 1.0, "prompt": "Are API keys properly protected?
Check: not logged in plain text, not returned in full after creation, stored hashed. Answer yes/no with details." }, { "name": "Timing-safe comparison", "type": "deterministic", "weight": 0.5, "check": "Uses hmac.compare_digest or secrets.compare_digest for key comparison" }, { "name": "Rate limiting implemented", "type": "deterministic", "weight": 0.5, "check": "Rate limiting logic exists with per-key tracking" } ] } ================================================ FILE: evals/security/scenario-2/task.md ================================================ # Task: Add API Key Authentication Add API key authentication middleware to an existing FastAPI app: - Keys stored in database with user association - Rate limiting per key (100 req/min) - Key rotation support (old key valid for 24h after rotation) - Admin endpoint to create/revoke keys ================================================ FILE: evals/session-management/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Session state checkpoint created", "type": "llm_judged", "weight": 1.0, "prompt": "Did the agent create or update session state files (current-state.md or similar) during implementation? Answer yes/no." }, { "name": "State persisted across refresh", "type": "deterministic", "weight": 0.5, "check": "Uses localStorage, sessionStorage, or similar persistence for form state" } ] } ================================================ FILE: evals/session-management/scenario-1/task.md ================================================ # Task: Build a Multi-Step Form Wizard Create a React multi-step form (3 steps: personal info, address, review) with: - Step navigation (next/back) - Data persistence across steps - Validation per step - Summary on final step The session should be resumable if the user refreshes. 
================================================ FILE: evals/supabase/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "RLS policies created", "type": "deterministic", "weight": 1.0, "check": "SQL includes CREATE POLICY or ALTER TABLE ENABLE ROW LEVEL SECURITY" }, { "name": "Migration file created", "type": "deterministic", "weight": 1.0, "check": "Migration file exists in supabase/migrations/" }, { "name": "Profile linked to auth.users", "type": "deterministic", "weight": 0.5, "check": "Foreign key reference to auth.users(id)" } ] } ================================================ FILE: evals/supabase/scenario-1/task.md ================================================ # Task: Build a User Profile System Create a Supabase-backed user profile system: - profiles table linked to auth.users - RLS policies: users can only read/update their own profile - Edge function for profile avatar upload - Migration file for the table Use the Supabase CLI for migrations. ================================================ FILE: evals/typescript/scenario-1/criteria.json ================================================ { "criteria": [ { "name": "Strict TypeScript mode", "type": "deterministic", "weight": 1.0, "check": "tsconfig.json has strict: true" }, { "name": "Barrel export from index.ts", "type": "deterministic", "weight": 0.5, "check": "index.ts exists with re-exports" }, { "name": "Proper generic types", "type": "llm_judged", "weight": 0.5, "prompt": "Does the task queue use proper TypeScript generics for task payloads and results, avoiding 'any' type? Answer yes/no." 
} ] } ================================================ FILE: evals/typescript/scenario-1/task.md ================================================ # Task: Build a Task Queue Library Create a TypeScript task queue that: - Accepts async functions with priority levels - Processes tasks with configurable concurrency - Supports retry with exponential backoff - Emits events: task:start, task:complete, task:fail Export types and the main class from an index.ts barrel file. ================================================ FILE: hooks/post-commit-graph ================================================
#!/bin/bash
# Post-Commit Graph Update Hook
#
# Triggers incremental codebase-memory-mcp graph update after each commit.
# This hook is LIGHTWEIGHT (~10ms) — it does NOT run the MCP server or
# any heavy process. It touches a marker file that the already-running
# codebase-memory-mcp file watcher picks up.
#
# Installed by: /initialize-project or ~/.claude/install-hooks.sh
# Remove with: rm .git/hooks/post-commit (or remove the code-graph section)

# Skip if code graph is not configured for this project
if [ ! -f ".mcp.json" ] || ! grep -q "codebase-memory" ".mcp.json" 2>/dev/null; then
  exit 0
fi

# Get list of committed files.
# --root makes diff-tree list the files of a repository's very first commit
# too; without it a root commit produces no output and is silently skipped.
COMMITTED_FILES=$(git diff-tree --root --no-commit-id --name-only -r HEAD 2>/dev/null)

if [ -z "$COMMITTED_FILES" ]; then
  exit 0
fi

# Count code files only (skip configs, docs, images, etc.)
CODE_EXTENSIONS='\.ts$|\.tsx$|\.js$|\.jsx$|\.py$|\.go$|\.rs$|\.java$|\.rb$|\.php$|\.swift$|\.kt$|\.c$|\.cpp$|\.h$|\.hpp$|\.cs$|\.scala$|\.lua$|\.vue$|\.svelte$'
# grep -c prints 0 and exits 1 when nothing matches; the status is ignored.
FILE_COUNT=$(printf '%s\n' "$COMMITTED_FILES" | grep -cE "$CODE_EXTENSIONS" || true)

if [ "${FILE_COUNT:-0}" -eq 0 ]; then
  exit 0
fi

# Touch marker file for codebase-memory-mcp file watcher
# This is the lightest possible signal — no blocking, no spawning processes.
# Only announce the update when the marker was actually written; a missing
# .code-graph directory or a failed touch means nothing was queued.
if [ -d ".code-graph" ] && touch ".code-graph/.needs-update" 2>/dev/null; then
  echo "code-graph: update queued ($FILE_COUNT code files changed)"
fi

# A post-commit hook must never fail the commit workflow.
exit 0
================================================ FILE: hooks/pre-push ================================================
#!/bin/bash
# Claude Code Review - Pre-Push Hook
# Runs /code-review on changes before pushing to remote
# Blocks push if Critical or High severity issues are found

set -e

# Colors
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

echo ""
echo "🔍 Running Claude Code Review before push..."
echo "" # Get the remote and URL being pushed to remote="$1" url="$2" # Read stdin to get refs being pushed while read local_ref local_sha remote_ref remote_sha; do if [ "$local_sha" = "0000000000000000000000000000000000000000" ]; then # Branch is being deleted, skip continue fi if [ "$remote_sha" = "0000000000000000000000000000000000000000" ]; then # New branch, compare against default branch base_ref=$(git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's@^refs/remotes/origin/@@' || echo "main") range="origin/$base_ref...$local_sha" else # Existing branch, compare against remote range="$remote_sha...$local_sha" fi # Get changed files changed_files=$(git diff --name-only "$range" 2>/dev/null | grep -E '\.(ts|tsx|js|jsx|py|go|rs|java|rb|php|swift|kt)$' || true) if [ -z "$changed_files" ]; then echo -e "${GREEN}✅ No code files to review${NC}" exit 0 fi file_count=$(echo "$changed_files" | wc -l | tr -d ' ') echo "📁 Reviewing $file_count file(s)..." echo "" # Run Claude code review review_output=$(mktemp) if ! command -v claude &> /dev/null; then echo -e "${YELLOW}⚠️ Claude CLI not found. 
Skipping code review.${NC}" echo " Install: npm install -g @anthropic-ai/claude-code" exit 0 fi # Run code review with --print flag for non-interactive output if claude --print "/code-review $changed_files" > "$review_output" 2>&1; then # Check the explicit Status line first (most reliable) if grep -q "Status: ✅ PASS" "$review_output"; then echo -e "${GREEN}✅ Code review passed${NC}" # Show summary if there are medium/low issues if grep -qE '🟡|🟢' "$review_output"; then echo "" echo -e "${YELLOW}ℹ️ Advisory issues (non-blocking):${NC}" grep -E '🟡|🟢' "$review_output" | head -5 fi elif grep -q "Status: ❌" "$review_output"; then echo -e "${RED}❌ PUSH BLOCKED - Critical/High issues found${NC}" echo "" cat "$review_output" echo "" echo -e "${RED}Fix critical/high issues before pushing.${NC}" rm "$review_output" exit 1 else # Fallback: parse severity counts from the summary table # Match "| Critical | N |" or "Critical: N" patterns critical_count=$(grep -oP 'Critical[:\s|]+\K[0-9]+' "$review_output" | head -1 || echo "0") high_count=$(grep -oP 'High[:\s|]+\K[0-9]+' "$review_output" | head -1 || echo "0") critical_count=${critical_count:-0} high_count=${high_count:-0} if [ "$critical_count" -gt 0 ] || [ "$high_count" -gt 0 ]; then echo -e "${RED}❌ PUSH BLOCKED - Critical: $critical_count, High: $high_count${NC}" echo "" cat "$review_output" echo "" echo -e "${RED}Fix critical/high issues before pushing.${NC}" rm "$review_output" exit 1 else echo -e "${GREEN}✅ Code review passed${NC}" if grep -qE '🟡|🟢' "$review_output"; then echo "" echo -e "${YELLOW}ℹ️ Advisory issues (non-blocking):${NC}" grep -E '🟡|🟢' "$review_output" | head -5 fi fi fi else echo -e "${YELLOW}⚠️ Code review failed to run. Allowing push.${NC}" echo " Check Claude CLI configuration." 
fi rm -f "$review_output" done echo "" exit 0 ================================================ FILE: hooks/workspace/check-contract-freshness.sh ================================================ #!/bin/bash # Contract Freshness Check - Session Start Hook # Checks if workspace contracts are stale and advises user # Run time: ~5 seconds WORKSPACE_DIR="_project_specs/workspace" STALENESS_THRESHOLD=86400 # 24 hours in seconds WARNING_THRESHOLD=604800 # 7 days in seconds # Colors YELLOW='\033[1;33m' BLUE='\033[0;34m' GREEN='\033[0;32m' RED='\033[0;31m' NC='\033[0m' # Check if workspace is configured if [ ! -f "$WORKSPACE_DIR/CONTRACTS.md" ]; then # No workspace configured - silent exit exit 0 fi if [ ! -f "$WORKSPACE_DIR/.contract-sources" ]; then echo -e "${YELLOW}⚠️ Workspace configured but no contract sources defined${NC}" echo " Run /analyze-workspace to set up contract monitoring" exit 0 fi # Get last analysis timestamp LAST_ANALYSIS=$(stat -f %m "$WORKSPACE_DIR/CONTRACTS.md" 2>/dev/null || stat -c %Y "$WORKSPACE_DIR/CONTRACTS.md" 2>/dev/null) NOW=$(date +%s) AGE=$((NOW - LAST_ANALYSIS)) # Check for stale analysis if [ "$AGE" -gt "$WARNING_THRESHOLD" ]; then DAYS=$((AGE / 86400)) echo -e "${RED}📅 Workspace contracts are ${DAYS} days old${NC}" echo " Run /analyze-workspace for full refresh" echo "" fi # Check if any contract sources changed since last sync CHANGED_FILES="" CHANGED_COUNT=0 while IFS= read -r source || [ -n "$source" ]; do # Skip comments and empty lines [[ "$source" =~ ^#.*$ ]] && continue [[ -z "$source" ]] && continue if [ -f "$source" ]; then SOURCE_MTIME=$(stat -f %m "$source" 2>/dev/null || stat -c %Y "$source" 2>/dev/null) if [ "$SOURCE_MTIME" -gt "$LAST_ANALYSIS" ]; then CHANGED_FILES="$CHANGED_FILES\n - $source" CHANGED_COUNT=$((CHANGED_COUNT + 1)) fi fi done < "$WORKSPACE_DIR/.contract-sources" # Report changes if [ "$CHANGED_COUNT" -gt 0 ]; then echo -e "${YELLOW}🔄 Contract sources changed since last sync:${NC}" echo -e "$CHANGED_FILES" echo 
"" echo -e " Run ${BLUE}/sync-contracts${NC} to update" echo "" elif [ "$AGE" -gt "$STALENESS_THRESHOLD" ]; then HOURS=$((AGE / 3600)) echo -e "${YELLOW}📅 Last contract sync: ${HOURS} hours ago${NC}" echo -e " Consider running ${BLUE}/sync-contracts${NC}" echo "" else # Fresh - silent success : fi exit 0 ================================================ FILE: hooks/workspace/check-graph-freshness.sh ================================================ #!/bin/bash # Check Graph Freshness - Session Start Advisory # # Warns if code graph data is older than the latest commit. # Run at session start to ensure Claude is working with current data. GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' # Skip if no graph configured if [ ! -f ".mcp.json" ] || ! grep -q "codebase-memory" ".mcp.json" 2>/dev/null; then exit 0 fi # Skip if no .code-graph directory (graph not yet built) if [ ! -d ".code-graph" ]; then echo -e "${YELLOW}code-graph: No graph data found. Run index_repository via MCP to build.${NC}" exit 0 fi # Get latest commit timestamp LATEST_COMMIT=$(git log -1 --format=%ct 2>/dev/null || echo "0") # Get graph last-updated timestamp (modification time of the DB or marker) if [ -f ".code-graph/.last-updated" ]; then GRAPH_UPDATED=$(cat ".code-graph/.last-updated" 2>/dev/null || echo "0") elif [ "$(uname)" = "Darwin" ]; then # macOS: stat -f %m GRAPH_UPDATED=$(stat -f %m ".code-graph/" 2>/dev/null || echo "0") else # Linux: stat -c %Y GRAPH_UPDATED=$(stat -c %Y ".code-graph/" 2>/dev/null || echo "0") fi # Compare timestamps DIFF=$((LATEST_COMMIT - GRAPH_UPDATED)) if [ "$DIFF" -gt 300 ]; then # More than 5 minutes stale MINUTES=$((DIFF / 60)) echo -e "${YELLOW}code-graph: Graph may be stale (~${MINUTES}m behind latest commit)${NC}" echo " The MCP file watcher should auto-update." echo " If stale, use index_repository to rebuild." 
elif [ "$DIFF" -gt 60 ]; then # Slightly stale (1-5 minutes) — just a note echo -e "${YELLOW}code-graph: Graph is slightly behind latest commit (auto-updating)${NC}" else echo -e "${GREEN}code-graph: Graph data is fresh${NC}" fi exit 0 ================================================ FILE: hooks/workspace/post-commit-contracts.sh ================================================ #!/bin/bash # Post-Commit Contract Sync Hook # Automatically syncs contracts when contract source files are committed # Run time: ~15 seconds (only when contracts change) WORKSPACE_DIR="_project_specs/workspace" # Colors GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # Check if workspace is configured if [ ! -f "$WORKSPACE_DIR/.contract-sources" ]; then exit 0 fi # Get list of committed files COMMITTED_FILES=$(git diff-tree --no-commit-id --name-only -r HEAD 2>/dev/null) if [ -z "$COMMITTED_FILES" ]; then exit 0 fi # Check if any committed files are contract sources CONTRACTS_CHANGED=false CHANGED_SOURCES="" while IFS= read -r source || [ -n "$source" ]; do # Skip comments and empty lines [[ "$source" =~ ^#.*$ ]] && continue [[ -z "$source" ]] && continue if echo "$COMMITTED_FILES" | grep -q "$source"; then CONTRACTS_CHANGED=true CHANGED_SOURCES="$CHANGED_SOURCES $source" fi done < "$WORKSPACE_DIR/.contract-sources" # If contracts changed, run lightweight sync if [ "$CONTRACTS_CHANGED" = true ]; then echo "" echo -e "${YELLOW}📝 Contract files changed in this commit:${NC}" for src in $CHANGED_SOURCES; do echo " - $src" done echo "" # Check if Claude CLI is available if command -v claude &> /dev/null; then echo -e "${BLUE}⚡ Running lightweight contract sync...${NC}" # Run sync in silent/lightweight mode if claude --print "/sync-contracts --lightweight" > /dev/null 2>&1; then echo -e "${GREEN}✅ Contracts synced${NC}" else echo -e "${YELLOW}⚠️ Contract sync failed - run /sync-contracts manually${NC}" fi else echo -e "${YELLOW}⚠️ Claude CLI not found${NC}" echo " Run 
/sync-contracts manually to update contracts"
    fi
    echo ""
fi

exit 0

================================================
FILE: hooks/workspace/pre-push-contracts.sh
================================================
#!/bin/bash
# Pre-Push Contract Validation Hook
# Validates contract consistency before pushing
# Blocks push if contracts are out of sync
# Run time: ~10 seconds
#
# Reads:  $WORKSPACE_DIR/CONTRACTS.md, $WORKSPACE_DIR/.contract-sources,
#         and apps/api/openapi.json or openapi.json when present
# Exit:   1 when a contract source is newer than CONTRACTS.md (push blocked),
#         0 otherwise (warnings never block)

WORKSPACE_DIR="_project_specs/workspace"

# Colors
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

#######################################
# Print a file's modification time as epoch seconds.
# GNU stat (-c %Y) is tried first on purpose: on GNU systems `stat -f` means
# "file-system mode", so `stat -f %m FILE` prints a file-system report on
# stdout *and* fails, which pollutes the captured value when it is tried first.
# BSD/macOS stat rejects -c without writing to stdout, so the fallback is clean.
# Arguments: $1 - path to inspect
# Outputs:   the mtime on stdout
# Returns:   0 on success, 1 if no integer mtime could be obtained
#######################################
file_mtime() {
    local mtime
    mtime=$(stat -c %Y -- "$1" 2>/dev/null) \
        || mtime=$(stat -f %m -- "$1" 2>/dev/null) \
        || return 1
    case "$mtime" in
        '' | *[!0-9]*) return 1 ;;
    esac
    printf '%s\n' "$mtime"
}

# Check if workspace is configured
if [ ! -f "$WORKSPACE_DIR/CONTRACTS.md" ]; then
    exit 0
fi

if [ ! -f "$WORKSPACE_DIR/.contract-sources" ]; then
    exit 0
fi

echo ""
echo -e "${BLUE}🔍 Validating workspace contracts...${NC}"

VALIDATION_ERRORS=""
WARNING_COUNT=0
ERROR_COUNT=0

# Get last sync timestamp
LAST_SYNC=$(file_mtime "$WORKSPACE_DIR/CONTRACTS.md") || LAST_SYNC=""
if [ -z "$LAST_SYNC" ]; then
    # Without a reference time nothing can be judged stale; warn instead of
    # silently passing or spuriously blocking the push.
    VALIDATION_ERRORS="$VALIDATION_ERRORS\n⚠️ Could not determine last sync time of CONTRACTS.md; staleness check skipped"
    WARNING_COUNT=$((WARNING_COUNT + 1))
fi

# Check if any contract sources changed since last sync
STALE_SOURCES=""
while IFS= read -r source || [ -n "$source" ]; do
    # Tolerate CRLF line endings in the sources file
    source=${source%$'\r'}

    # Skip comments and empty lines
    [[ "$source" =~ ^#.*$ ]] && continue
    [[ -z "$source" ]] && continue

    if [ -f "$source" ]; then
        SOURCE_MTIME=$(file_mtime "$source") || SOURCE_MTIME=""
        if [ -n "$LAST_SYNC" ] && [ -n "$SOURCE_MTIME" ] && [ "$SOURCE_MTIME" -gt "$LAST_SYNC" ]; then
            STALE_SOURCES="$STALE_SOURCES\n - $source"
            ERROR_COUNT=$((ERROR_COUNT + 1))
        fi
    else
        VALIDATION_ERRORS="$VALIDATION_ERRORS\n⚠️ Contract source missing: $source"
        WARNING_COUNT=$((WARNING_COUNT + 1))
    fi
done < "$WORKSPACE_DIR/.contract-sources"

# Check OpenAPI consistency (if exists)
if [ -f "apps/api/openapi.json" ] || [ -f "openapi.json" ]; then
    if [ -f "apps/api/openapi.json" ]; then
        OPENAPI_FILE="apps/api/openapi.json"
    else
        OPENAPI_FILE="openapi.json"
    fi

    if command -v jq &> /dev/null; then
        ACTUAL_ENDPOINTS=$(jq -r '.paths | keys | length' "$OPENAPI_FILE" 2>/dev/null) || ACTUAL_ENDPOINTS=0
        ACTUAL_ENDPOINTS=${ACTUAL_ENDPOINTS:-0}

        # grep -c prints "0" AND exits non-zero when nothing matches, so an
        # `|| echo "0"` fallback would yield the two-line value "0\n0".
        # Default only when grep produced no output at all (e.g. unreadable file).
        DOCUMENTED_ENDPOINTS=$(grep -cE "^\| (GET|POST|PUT|PATCH|DELETE)" "$WORKSPACE_DIR/CONTRACTS.md" 2>/dev/null)
        DOCUMENTED_ENDPOINTS=${DOCUMENTED_ENDPOINTS:-0}

        if [ "$ACTUAL_ENDPOINTS" != "0" ] && [ "$DOCUMENTED_ENDPOINTS" != "0" ]; then
            if [ "$ACTUAL_ENDPOINTS" != "$DOCUMENTED_ENDPOINTS" ]; then
                VALIDATION_ERRORS="$VALIDATION_ERRORS\n⚠️ Endpoint count mismatch: OpenAPI has $ACTUAL_ENDPOINTS, CONTRACTS.md has $DOCUMENTED_ENDPOINTS"
                WARNING_COUNT=$((WARNING_COUNT + 1))
            fi
        fi
    fi
fi

# Report results
if [ "$ERROR_COUNT" -gt 0 ]; then
    echo -e "${RED}❌ Contract sources changed but not synced:${NC}"
    echo -e "$STALE_SOURCES"
    echo ""
    echo -e "${RED}Run /sync-contracts before pushing${NC}"
    echo -e "Or bypass with: ${YELLOW}git push --no-verify${NC}"
    echo ""
    exit 1
fi

if [ "$WARNING_COUNT" -gt 0 ]; then
    echo -e "${YELLOW}⚠️ Validation warnings:${NC}"
    echo -e "$VALIDATION_ERRORS"
    echo ""
    echo -e "${YELLOW}Consider running /sync-contracts${NC}"
    echo ""
    # Warnings don't block push
fi

if [ "$ERROR_COUNT" -eq 0 ]; then
    echo -e "${GREEN}✅ Contracts validated${NC}"
fi

exit 0

================================================
FILE: install.sh
================================================
#!/bin/bash
# Maggy Installer
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="$HOME/.claude"

echo "Installing Maggy v4.0.0..."
echo ""

# Create directories.
# This must happen before anything is written under $CLAUDE_DIR: on a machine
# without ~/.claude the redirect below would fail and `set -e` would abort.
mkdir -p "$CLAUDE_DIR/commands"
mkdir -p "$CLAUDE_DIR/skills"
mkdir -p "$CLAUDE_DIR/hooks"
mkdir -p "$CLAUDE_DIR/rules"

# Save bootstrap directory location for other scripts
echo "$SCRIPT_DIR" > "$CLAUDE_DIR/.bootstrap-dir"

# Copy all commands
cp "$SCRIPT_DIR/commands/"*.md "$CLAUDE_DIR/commands/"
echo "✓ Installed commands:"
ls -1 "$CLAUDE_DIR/commands/" | sed 's/^/ - \//' | sed 's/\.md$//'

# Copy skills (folder structure with SKILL.md)
echo ""
echo "Installing skills..."
rm -rf "$CLAUDE_DIR/skills"
mkdir -p "$CLAUDE_DIR/skills"
# The skills directory is wiped and rebuilt above, so anything previously
# present under $CLAUDE_DIR/skills is removed before the copy below.
skill_count=0
for skill_dir in "$SCRIPT_DIR/skills"/*/; do
    # Only directories that contain a SKILL.md are treated as skills
    if [ -d "$skill_dir" ] && [ -f "$skill_dir/SKILL.md" ]; then
        # NOTE(review): skill_name is assigned but not read in the visible part of this script
        skill_name=$(basename "$skill_dir")
        # Strip the trailing slash so cp copies the folder itself into skills/
        cp -r "${skill_dir%/}" "$CLAUDE_DIR/skills/"
        skill_count=$((skill_count + 1))
    fi
done
echo "✓ Installed $skill_count skills (folder/SKILL.md structure)"

# Cross-tool skill installation (Kimi CLI, Codex CLI)
# DETECTED_AGENTS holds whatever detect-agents.sh prints; a missing or failing
# detector leaves it empty (errors are discarded), and every check below is a
# plain substring match on that output.
DETECTED_AGENTS=$("$SCRIPT_DIR/scripts/detect-agents.sh" 2>/dev/null || true)
if echo "$DETECTED_AGENTS" | grep -q "kimi"; then
    "$SCRIPT_DIR/scripts/install-skills.sh" "$HOME/.kimi/skills" "$SCRIPT_DIR/skills"
    echo " Also installed skills to ~/.kimi/skills/ (Kimi CLI)"
fi
if echo "$DETECTED_AGENTS" | grep -q "codex"; then
    "$SCRIPT_DIR/scripts/install-skills.sh" "$HOME/.codex/skills" "$SCRIPT_DIR/skills"
    echo " Also installed skills to ~/.codex/skills/ (Codex CLI)"
fi

# Copy conditional rules
echo ""
echo "Installing conditional rules..."
# Same wipe-and-rebuild approach as for skills
rm -rf "$CLAUDE_DIR/rules"
mkdir -p "$CLAUDE_DIR/rules"
rule_count=0
for rule_file in "$SCRIPT_DIR/rules/"*.md; do
    # The -f test also covers the no-match case, where the glob stays literal
    if [ -f "$rule_file" ]; then
        cp "$rule_file" "$CLAUDE_DIR/rules/"
        rule_count=$((rule_count + 1))
    fi
done
echo "✓ Installed $rule_count conditional rules (with paths: frontmatter)"
ls -1 "$CLAUDE_DIR/rules/" | sed 's/^/ - /' | sed 's/\.md$//'

# Copy hooks
# Best-effort: failures are silenced and do not abort the install.
# NOTE(review): cp is not recursive here, so subdirectories of hooks/ (the repo
# has hooks/workspace/) are skipped and the resulting error is hidden by
# 2>/dev/null || true — confirm whether workspace hooks should be copied too.
cp "$SCRIPT_DIR/hooks/"* "$CLAUDE_DIR/hooks/" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/hooks/"* 2>/dev/null || true
echo ""
echo "✓ Installed git hooks (templates)"

# Copy templates
echo ""
echo "Installing templates..."
mkdir -p "$CLAUDE_DIR/templates"
# Best-effort copy; only the three scripts below are marked executable
cp "$SCRIPT_DIR/templates/"* "$CLAUDE_DIR/templates/" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/templates/tdd-loop-check.sh" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/templates/pre-compact.sh" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/templates/codex-auto-review.sh" 2>/dev/null || true
echo "✓ Installed templates (CLAUDE.md, AGENTS.md, CLAUDE.local.md, settings.json, config.toml)"

# Cross-tool config installation
if echo "$DETECTED_AGENTS" | grep -q "kimi"; then
    mkdir -p "$HOME/.kimi"
    # Written next to the user's config as *.bootstrap rather than over config.toml
    cp "$SCRIPT_DIR/templates/config.toml" "$HOME/.kimi/config.toml.bootstrap" 2>/dev/null || true
    echo " Kimi: hooks template at ~/.kimi/config.toml.bootstrap"
fi
if echo "$DETECTED_AGENTS" | grep -q "codex"; then
    mkdir -p "$HOME/.codex"
    # First attempt fails when ~/.codex/templates does not exist yet; the
    # fallback creates it and retries (that retry is not silenced, so under
    # set -e a second failure aborts the install).
    cp "$SCRIPT_DIR/templates/AGENTS.md" "$HOME/.codex/templates/AGENTS.md" 2>/dev/null || {
        mkdir -p "$HOME/.codex/templates"
        cp "$SCRIPT_DIR/templates/AGENTS.md" "$HOME/.codex/templates/AGENTS.md"
    }
    echo " Codex: AGENTS.md template at ~/.codex/templates/"
fi

# Copy hook installer script
cp "$SCRIPT_DIR/scripts/install-hooks.sh" "$CLAUDE_DIR/" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/install-hooks.sh" 2>/dev/null || true

# Copy graph tools installer
cp "$SCRIPT_DIR/scripts/install-graph-tools.sh" "$CLAUDE_DIR/" 2>/dev/null || true
chmod +x "$CLAUDE_DIR/install-graph-tools.sh" 2>/dev/null || true

# Install Polyphony CLI shim
# Only done when the polyphony package ships with this checkout
POLYPHONY_SRC="$SCRIPT_DIR/scripts/polyphony"
if [ -f "$POLYPHONY_SRC/__main__.py" ]; then
    INSTALL_DIR="$HOME/.local/bin"
    mkdir -p "$INSTALL_DIR"
    # Unquoted heredoc delimiter: $SCRIPT_DIR is expanded now and baked into the
    # shim as an absolute path, while \$@ is escaped so the shim forwards its
    # own arguments at run time.
    cat > "$INSTALL_DIR/polyphony" << SHIM
#!/bin/bash
exec python3 -c "import sys; sys.path.insert(0, '$SCRIPT_DIR/scripts'); from polyphony.__main__ import main; sys.exit(main())" "\$@"
SHIM
    chmod +x "$INSTALL_DIR/polyphony"
    echo ""
    echo "✓ Installed polyphony CLI shim"

    # Create default config if missing
    # Skipped entirely when ~/.polyphony already exists; cp -n additionally
    # refuses to overwrite an existing file.
    if [ ! -d "$HOME/.polyphony" ]; then
        mkdir -p "$HOME/.polyphony"
        cp -n "$SCRIPT_DIR/templates/polyphony-config.yaml" "$HOME/.polyphony/config.yaml" 2>/dev/null || true
        cp -n "$SCRIPT_DIR/templates/polyphony-identities.yaml" "$HOME/.polyphony/identities.yaml" 2>/dev/null || true
        cp -n "$SCRIPT_DIR/templates/polyphony-agents.yaml" "$HOME/.polyphony/agents.yaml" 2>/dev/null || true
        cp -n "$SCRIPT_DIR/templates/polyphony-routing.yaml" "$HOME/.polyphony/routing.yaml" 2>/dev/null || true
        echo "✓ Created ~/.polyphony/ config"
    fi
fi

# Run validation
# Running the validator inside an `if` keeps a failing validation from
# tripping set -e; the install still finishes and only prints a hint.
echo ""
echo "Running validation..."
if [ -f "$SCRIPT_DIR/tests/validate-structure.sh" ]; then
    if "$SCRIPT_DIR/tests/validate-structure.sh" --quick; then
        echo ""
    else
        echo ""
        echo "⚠ Validation found issues. Run full validation:"
        echo " $SCRIPT_DIR/tests/validate-structure.sh --full"
    fi
fi

# Final summary: static help text plus status lines derived from DETECTED_AGENTS
echo ""
echo "================================================================"
echo " Installation complete! (v4.0.0)"
echo "================================================================"
echo ""
echo "What's new in v4.0.0:"
echo " - Polyphony: container-isolated parallel agents (Docker/OrbStack)"
echo " - /spawn-team now uses Polyphony by default (fallback to native)"
echo " - polyphony CLI: init, spawn, status, cleanup"
echo " - Cross-tool support: Claude Code + Kimi CLI + Codex CLI"
echo ""
echo "Usage:"
echo " 1. Open any project folder"
echo " 2. Run: claude (or kimi, or codex)"
echo " 3. Type: /initialize-project"
echo ""
echo "Commands installed:"
echo " /initialize-project - Full project setup (includes Polyphony)"
echo " /spawn-team - Spawn agent team (containers by default)"
echo " /sync-agents - Sync config between Claude/Kimi/Codex"
echo " /check-contributors - Team coordination"
echo " /update-code-index - Regenerate code index"
echo ""
echo "Polyphony CLI:"
echo " polyphony init - Create ~/.polyphony/ config"
echo " polyphony spawn - Create and route a task"
echo " polyphony status - Show task states"
echo " polyphony cleanup - Remove completed workspaces"
echo ""
echo "Container isolation (Polyphony):"
# DETECTED_AGENTS was captured before the shim above was written, so the
# Polyphony line reflects the state at detection time.
if echo "$DETECTED_AGENTS" | grep -q "docker"; then
    echo " [OK] Docker - container isolation available"
elif echo "$DETECTED_AGENTS" | grep -q "orbstack"; then
    echo " [OK] OrbStack - container isolation available"
else
    echo " [--] Docker - not found (brew install --cask docker)"
fi
if echo "$DETECTED_AGENTS" | grep -q "polyphony"; then
    echo " [OK] Polyphony - CLI installed"
else
    echo " [--] Polyphony - CLI shim not on PATH (add ~/.local/bin to PATH)"
fi
echo ""
echo "Cross-tool compatibility:"
if echo "$DETECTED_AGENTS" | grep -q "kimi"; then
    echo " [OK] Kimi CLI - skills + hooks installed"
else
    echo " [--] Kimi CLI - not found (curl -L code.kimi.com/install.sh | bash)"
fi
if echo "$DETECTED_AGENTS" | grep -q "codex"; then
    echo " [OK] Codex CLI - skills + AGENTS.md installed"
else
    echo " [--] Codex CLI - not found (npm i -g @openai/codex)"
fi
echo ""
echo "Git Hooks (per-project):"
echo " cd your-project && ~/.claude/install-hooks.sh"
echo ""
echo "Code Graph Tools:"
echo " ~/.claude/install-graph-tools.sh - Install Tier 1 (default)"
echo " ~/.claude/install-graph-tools.sh --joern - Also install Tier 2 (CPG)"
echo " ~/.claude/install-graph-tools.sh --codeql - Also install Tier 3 (security)"
echo " ~/.claude/install-graph-tools.sh --all - Install all tiers"
echo ""
echo "Validation:"
echo " $SCRIPT_DIR/tests/validate-structure.sh --full"
echo
"" ================================================ FILE: maggy/.gitignore ================================================ __pycache__/ *.py[cod] *$py.class .pytest_cache/ .mypy_cache/ .ruff_cache/ *.egg-info/ ================================================ FILE: maggy/PLAN.md ================================================ # Maggy — Generic AI Engineering Command Center Ships as a core component of Maggy. One install, works with any team. ## What Maggy Is A local, self-improving AI agent that turns your issue tracker into an AI-prioritized inbox with one-click execution. Uses Maggy's iCPG for codebase intelligence and spawns `claude -p` for implementation. Not a cloud service — runs on your machine, talks to your APIs, uses your Claude Code. ## Vision ``` $ maggy init Org name: Acme Corp Issue tracker? (github / asana / linear) → github GitHub org: acmecorp Repos to monitor: api, web, mobile Competitor domain (for intelligence): fintech Paste your OKRs (or skip): ... ✓ Config saved to ~/.maggy/config.yaml ✓ Bootstrapping iCPG for 3 repos... ✓ Discovering competitors in "fintech"... (found 28) ✓ Ready: http://localhost:8080 ``` That's it. Works the same for any org. 
## Architecture ``` maggy/ ├── maggy/ # The Maggy dashboard app │ ├── PLAN.md # this file │ ├── README.md # user docs │ ├── install.sh # one-line install │ ├── pyproject.toml # deps │ ├── config.example.yaml # config template │ ├── maggy/ # Python package (importable as `maggy`) │ │ ├── main.py # FastAPI entry │ │ ├── config.py # loads ~/.maggy/config.yaml │ │ ├── providers/ │ │ │ ├── base.py # IssueTrackerProvider Protocol │ │ │ ├── github_issues.py # GitHub Issues impl │ │ │ └── asana.py # Asana impl (linear deferred) │ │ ├── services/ │ │ │ ├── inbox.py # AI-prioritized ranking │ │ │ ├── competitor.py # discovery + monitoring + briefing │ │ │ └── executor.py # TDD pipeline with iCPG enrichment │ │ ├── api/ │ │ │ └── routes.py # REST endpoints │ │ └── static/ │ │ ├── index.html # dashboard │ │ └── app.js # vanilla JS ├── commands/ │ ├── maggy.md # /maggy → launch dashboard │ └── maggy-init.md # /maggy-init → setup wizard ├── skills/ │ └── maggy/ │ └── SKILL.md # Maggy capabilities reference └── scripts/icpg/ # ALREADY EXISTS — Maggy calls this ``` ## Key Design Decisions ### 1. Config-driven, not hardcoded A single `~/.maggy/config.yaml` drives everything. No hardcoded board IDs, repo names, team members, OKRs, or competitor lists. All that stuff lives in config. ```yaml org: name: "Acme Corp" domain: "fintech" # drives competitor category + system prompt issue_tracker: provider: "github" # "github" | "asana" (linear = stub) github: org: "acmecorp" repos: ["acmecorp/api", "acmecorp/web"] # PAT read from env: GITHUB_TOKEN codebases: - path: "~/dev/acmecorp/api" key: "api" - path: "~/dev/acmecorp/web" key: "web" competitors: categories: ["fintech", "embedded-finance"] # Maggy auto-discovers. Stores in ~/.maggy/competitors.json ai: provider: "anthropic" model: "claude-sonnet-4-5-20250929" # API key from ANTHROPIC_API_KEY env storage: # SQLite by default — zero setup. Supabase optional. 
backend: "sqlite" path: "~/.maggy/maggy.db" dashboard: port: 8080 auth_mode: "local" # no auth for single-user local use ``` ### 2. Provider abstraction for issue trackers The #1 coupling in the zenloop version is Asana. Generic Maggy defines a Protocol and all services use it: ```python class IssueTrackerProvider(Protocol): async def list_tasks(self, board: str | None = None, state: str = "open") -> list[Task] async def get_task(self, task_id: str) -> Task async def add_comment(self, task_id: str, text: str) -> None async def update_status(self, task_id: str, status: str) -> None async def list_followed(self, user_id: str | None = None) -> list[Task] async def search_tasks(self, query: str) -> list[Task] ``` `GitHubIssuesProvider` and `AsanaProvider` both implement this. Services call `provider.list_tasks()` — they don't care what's underneath. ### 3. Reuses Maggy's iCPG Don't duplicate iCPG. Maggy shells out to the iCPG CLI: ```python # executor.py async def _get_icpg_context(title: str, notes: str) -> str: keywords = extract_keywords(title + notes) context = [] for kw in keywords[:5]: result = await run_cmd(["icpg", "query", "symbols", "--keyword", kw, "--json"]) context.append(result) return format_icpg_block(context) ``` This means the dashboard automatically benefits from iCPG upgrades. No duplicate symbol indexing. ### 4. SQLite-first storage The zenloop version used Supabase for P2P coordination. For a single-user local install, SQLite is simpler and zero-setup. P2P and multi-user stays optional: - **Default (SQLite):** `~/.maggy/maggy.db`. Zero setup. - **Optional (Supabase):** For teams that want shared state and P2P handoff. ### 5. Dashboard is minimal but real Not a React SPA — Tailwind CDN + vanilla JS. Matches Maggy's philosophy (no build step, dead simple). Three views: 1. **Inbox** — AI-prioritized issues with Execute/Plan/Comment buttons 2. **Competitor News** — daily AI briefing + news feed 3. **Settings** — view/edit config, health check ### 6. 
Ships with Maggy User installs Maggy, runs `/maggy-init` in Claude Code, and the dashboard is configured + running. `/maggy` in any Claude Code session opens the dashboard. ## MVP Scope (what I'm building now) **In scope:** - [x] Directory structure - [ ] Config loader + example - [ ] IssueTrackerProvider Protocol + GitHub Issues + Asana impls - [ ] Inbox service (AI-prioritized) - [ ] Competitor service (AI-discovered, daily briefing) - [ ] Executor service (TDD pipeline with iCPG enrichment) - [ ] FastAPI server + 8 endpoints - [ ] Minimal HTML dashboard - [ ] install.sh + pyproject.toml + README - [ ] /maggy and /maggy-init commands - [ ] skills/maggy/SKILL.md **Deferred to v2 (not MVP):** - Meeting bot (voice) - Slack integration - P2P network + session handoff - Self-improvement (`/improve-maggy`) - Heartbeat service (background processing) - BambooHR integration - Auto-review (PRs, tickets) - 27 AI tools → starts with 5 core tools - Linear provider (stub only) ## How to test independently After install: ```bash cd ~/Documents/AI-Playground/maggy/maggy ./install.sh # Configure cp config.example.yaml ~/.maggy/config.yaml # Edit ~/.maggy/config.yaml with your GitHub org/repos # Set env vars export ANTHROPIC_API_KEY=sk-ant-... export GITHUB_TOKEN=ghp_... # Run python -m maggy.main # Open http://localhost:8080 ``` Or from inside Claude Code (after bootstrap install): ``` /maggy-init # interactive setup /maggy # launch dashboard ``` Should work out-of-the-box for any GitHub-based team. ## Success criteria 1. Fresh install on a machine that never saw zenloop → works 2. Points at any GitHub org → inbox populates with issues 3. AI prioritization runs → issues ranked 4. Click Execute → TDD pipeline spawns `claude -p` with iCPG context injected 5. Competitor discovery for any domain → competitors found + daily briefing 6. No hardcoded zenloop anything anywhere in the code That's the bar. 
================================================ FILE: maggy/README.md ================================================ # Maggy **Autonomous AI engineering command center.** Install once, point it at your codebases and issue tracker, and get: - **Interactive Chat** — auto-connects to all active Claude/Codex/Kimi sessions, take over from the web UI with full session continuity (`--resume`) - **AI-prioritized Tasks** — ranks open issues by urgency + OKR alignment - **One-click Execute** — spawns `claude -p` with iCPG-enriched prompts, runs TDD pipeline - **Competitor Intelligence** — auto-discovers competitors, daily AI briefing - **Process Insights** — CLI session history analysis, health signals, self-improvement recommendations - **P2P Mesh** — multi-node session sync and handoff across machines - **Auto-Bootstrap** — all services seed themselves on startup (history, CIKG, events) ## Install ```bash cd maggy/maggy ./install.sh ``` ## Configure Edit `~/.maggy/config.yaml`: ```yaml org: name: "Acme Corp" domain: "fintech" issue_tracker: provider: "github" github: org: "acmecorp" repos: ["acmecorp/api", "acmecorp/web"] codebases: - { path: "~/dev/acmecorp/api", key: "api" } - { path: "~/dev/acmecorp/web", key: "web" } competitors: categories: ["fintech", "embedded-finance"] ``` Set credentials: ```bash export GITHUB_TOKEN=ghp_... export ANTHROPIC_API_KEY=sk-ant-... ``` ## Run ```bash python3 -m maggy.main ``` Open `http://localhost:8080`. ## Dashboard Navigation is grouped by intent: | Group | Tabs | Purpose | |-------|------|---------| | **Work** | Chat, Tasks, Watching | Do things — chat with Claude, triage issues | | **Intel** | Competitors, Insights | Learn things — competitor news, session analytics | | **System** | Budget, Models, Forge, Settings | Configure — spend limits, model routing, MCP gaps | Chat is the default tab — auto-connects to all running CLI sessions on load. 
## From inside Claude Code ``` /maggy-init # interactive setup wizard /maggy # launch dashboard ``` ## Features - **Interactive Chat** — SSE streaming, session continuity via `--resume`, path-based history matching, auto-connect to active CLI sessions - **Activity Scanner** — detects running `claude`, `codex`, `kimi` processes via `ps aux` + `lsof` - **History Analysis** — parses 260+ CLI sessions, topic extraction, session patterns - **Self-Improvement** — signal collection, health scoring, actionable recommendations - **CIKG Knowledge Graph** — codebase nodes, technology detection, landscape queries - **Event Spine** — structured event emission and querying across all services - **Engram Memory** — write/query/expire memory entries with metadata - **Budget Tracking** — daily spend limits with per-provider breakdown - **Model Routing** — reward-based heatmap for model selection by task type - **MCP Forge** — detects capability gaps from filesystem, suggests MCP tools - **P2P Mesh** — WebSocket sync, peer discovery, state quarantine, org-scoped networks - **Heartbeat** — scheduled jobs (history refresh, engram expiry, self-improve, mesh sync) ## Hardening - **Working dir whitelist** — Execute and Chat both validate paths against configured codebase roots - **Chat streaming lock** — per-session `asyncio.Lock` prevents concurrent subprocess spawning - **SSRF protection** — RSS/blog feed URLs validated before fetch (blocks loopback, private-network) - **CLAUDECODE env stripping** — subprocess spawning removes `CLAUDECODE` to allow nested Claude sessions - **Process lifecycle** — Claude subprocesses killed on timeout; non-zero exits marked failed - **Input validation** — Execute mode `Literal["tdd", "plan"]`; malformed IDs return 404 - **503 onboarding mode** — unconfigured state returns 503 with setup pointer - **Safe external links** — scheme allowlist + `rel="noopener noreferrer"` - **No-cache static files** — `Cache-Control: no-store` prevents stale JS in browser 
## Architecture See [PLAN.md](./PLAN.md) for the full architecture rationale. 1. **Provider abstraction** — `IssueTrackerProvider` Protocol (GitHub, Asana, Linear stub) 2. **Config-driven** — zero hardcoded IDs, orgs, or competitor lists 3. **iCPG integration** — context enrichment from code property graph 4. **SQLite-first** — single-user local install, zero setup 5. **Auto-bootstrap** — all services seed on startup, no empty tabs 6. **Grouped UI** — Work / Intel / System navigation by intent ## License MIT ================================================ FILE: maggy/config.example.yaml ================================================ # Maggy configuration # Copy this to ~/.maggy/config.yaml and customize. org: name: "Your Org" # Drives competitor auto-discovery and system prompt phrasing. # Examples: "fintech", "devtools", "cx-feedback", "healthcare", "marketplaces" domain: "your-domain" issue_tracker: # Currently supported: "github" | "asana" # ("linear" is a stub and not selectable yet — tracking via #TODO) provider: "github" github: # Your GitHub org or user org: "your-org" # Repos to monitor (full name: "org/repo") repos: - "your-org/api" - "your-org/web" # Optional: only show issues with these labels (empty = all) labels: [] # Read-only token from env: GITHUB_TOKEN asana: # Used when provider: "asana". Ignore if using GitHub. workspace_id: "" # Project GIDs for each "board" that appears in the sidebar boards: dev: "" bugs: "" # Token from env: ASANA_API_KEY codebases: # Paths to repos Maggy can execute in. When you click Execute on a ticket, # Maggy picks the right repo based on keyword matching. - path: "~/dev/your-org/api" key: "api" # Optional: default working_dir override per repo - path: "~/dev/your-org/web" key: "web" competitors: # Maggy auto-discovers competitors in these categories using AI + G2/Capterra research. # Results stored in ~/.maggy/competitors.json — edit freely. 
categories: - "your-primary-category" # Optional: seed with specific competitor names to ensure they're tracked seed: - "CompetitorOne" - "CompetitorTwo" okrs: # Two ways to provide OKRs: # source: "yaml" → list them inline below # source: "skip" → no OKR tracking source: "skip" # If source == "yaml": items: [] # Example items structure: # - id: "Q2-1" # title: "Reduce p95 latency to 200ms" # keywords: ["latency", "performance", "slow"] ai: provider: "anthropic" model: "claude-sonnet-4-5-20250929" # API key from env: ANTHROPIC_API_KEY max_budget_usd_per_execute: 5.0 storage: # SQLite by default — zero setup. For multi-user/P2P, use Supabase (not yet supported in MVP). backend: "sqlite" path: "~/.maggy/maggy.db" dashboard: host: "127.0.0.1" port: 8080 # "local" = no auth (single-user local install). # "token" = require X-API-Key header matching MAGGY_API_KEY env var. auth_mode: "local" # Paths to Maggy installation — auto-detected, usually don't touch. bootstrap: # If omitted, Maggy looks at ~/.claude/.bootstrap-dir written by install.sh path: "" ================================================ FILE: maggy/docs/benchmark-results.md ================================================ # Maggy v5 Benchmark Results **Date:** 2026-05-11 **App:** Personal Expense Tracker (FastAPI + SQLite + vanilla HTML/JS) **Environment:** Mac Studio M4 Max, 128 GB RAM, macOS Darwin 24.6.0 **CLIs:** Claude Code 2.1.42, Codex 0.129.0, Kimi 1.41.0, Ollama 0.23.2 (qwen2.5-coder:32b) --- ## 1. Test Protocol 6 identical tasks run sequentially through two pipelines: - **Runner A (Maggy):** 4-tier routing via blast score. Auto-discovers CLI flags at startup. - **Runner B (Claude Code):** All tasks run through `claude -p` only. Both pipelines use `--dangerously-skip-permissions` / equivalent flags, 25 max turns, and subprocess spawning into isolated build directories. --- ## 2. 
Task Definitions | ID | Task | Blast | Maggy Route | Type | |----|------|-------|-------------|------| | EXP-1 | Write product spec | 2 | local (ollama) | docs | | EXP-2 | Design database schema | 3 | kimi | architecture | | EXP-3 | Build expense CRUD API | 5 | gpt (codex) | feature | | EXP-4 | Build category API + monthly summary | 5 | gpt (codex) | feature | | EXP-5 | Build frontend dashboard | 6 | gpt (codex) | frontend | | EXP-6 | Security review + input validation | 8 | claude | security | --- ## 3. Speed Results | Task | Blast | Maggy Model | Maggy (s) | Claude (s) | Winner | |------|-------|-------------|-----------|------------|--------| | EXP-1 | 2 | ollama (local) | 50.4 | 48.6 | Claude | | EXP-2 | 3 | kimi | 86.6 | 67.2 | Claude | | EXP-3 | 5 | codex | 147.1 | 160.6 | **Maggy** | | EXP-4 | 5 | codex | 133.9 | 130.8 | Claude | | EXP-5 | 6 | codex | 280.1 | 121.9 | Claude | | EXP-6 | 8 | claude | 209.5 | 151.9 | Claude | | **Total** | | | **907.6** | **681.0** | **Claude (33% faster)** | ### Routing Distribution (Maggy) | Model | Tasks | % | |-------|-------|---| | codex (gpt) | 3 | 50% | | ollama (local) | 1 | 17% | | kimi | 1 | 17% | | claude | 1 | 17% | --- ## 4. Success Rate | Pipeline | Passed | Failed | Fallbacks | Rate | |----------|--------|--------|-----------|------| | Maggy | 6 | 0 | 0 | 100% | | Claude | 6 | 0 | 0 | 100% | --- ## 5. 
Output Quality Assessment ### 5.1 File Inventory **Maggy (10 source files, 1,634 lines):** | File | Lines | Model | Assessment | |------|-------|-------|------------| | `SECURITY.md` | 134 | claude | Thorough: 7 findings with fixes, 3 recommendations | | `backend/app/database.py` | 74 | kimi | Correct schema, parameterized queries, FK + cascade, seed data | | `backend/app/main.py` | 36 | kimi | Lifespan init, CORS from env var (not wildcard), 3 routers | | `backend/app/validation.py` | 25 | claude | Shared YYYY-MM regex validator, extracted from duplication | | `backend/app/routes/expenses.py` | 148 | codex | Full CRUD, Pydantic models, parameterized SQL, FK check | | `backend/app/routes/categories.py` | 107 | codex | CRUD, hex color validator, unique constraint handling | | `backend/app/routes/summary.py` | 52 | codex | Monthly aggregation with COALESCE, GROUP BY | | `frontend/index.html` | 121 | codex | Dark theme, responsive, all sections present | | `frontend/css/style.css` | 472 | codex | CSS bar charts, dark palette, mobile breakpoints | | `frontend/js/app.js` | 472 | codex | State management, fetch API, DOM via textContent (XSS-safe) | **Claude (18 source files, ~1,500 app lines + 457K with venv):** | File | Lines | Assessment | |------|-------|------------| | `specs/product-spec.md` | 206 | Comprehensive: vision, schema, Pydantic examples, project structure | | `backend/app/database.py` | 68 | Correct schema, parameterized queries, FK, seed data | | `backend/app/main.py` | 42 | Lifespan init, CORS from env var, 3 routers | | `backend/app/models.py` | 51 | Centralized Pydantic schemas (better separation) | | `backend/app/routes/expenses.py` | 159 | Full CRUD, partial update support, category JOIN | | `backend/app/routes/categories.py` | 90 | CRUD, referential integrity check on delete | | `backend/app/routes/summary.py` | 44 | Monthly aggregation | | `backend/tests/conftest.py` | 18 | Temp DB fixture with patch | | `backend/tests/test_expenses.py` | 108 | 11 
test cases covering CRUD + edge cases | | `backend/tests/test_categories.py` | ~50 | Category CRUD tests | | `backend/tests/test_summary.py` | ~40 | Summary endpoint tests | | `frontend/index.html` | 79 | Clean layout, modal-based form | | `frontend/css/style.css` | 323 | Dark theme, responsive | | `frontend/js/app.js` | 320 | API wrapper, currency formatting, chart rendering | ### 5.2 Quality Scoring | Dimension | Maggy | Claude | Notes | |-----------|-------|--------|-------| | **Functional completeness** | 9/10 | 10/10 | Both implement all endpoints. Claude adds partial updates. | | **Security** | 10/10 | 7/10 | Maggy's security review (EXP-6) hardened CORS, added amount bounds, path param validation, color format validation. Claude left CORS with `allow_credentials=True`, no amount ceiling, no color validation. | | **SQL safety** | 10/10 | 10/10 | Both use parameterized queries exclusively. | | **XSS prevention** | 10/10 | 10/10 | Both use textContent for DOM rendering. No innerHTML. | | **Input validation** | 9/10 | 7/10 | Maggy: Pydantic + custom validators (hex color, amount ceiling, path ge=1). Claude: Pydantic regex patterns but less thorough. | | **Error handling** | 9/10 | 8/10 | Maggy: context manager with rollback, 409 on duplicate, 404 on missing. Claude: try/finally, 409 on duplicate, referential integrity check. | | **Test coverage** | 0/10 | 9/10 | Maggy produced zero tests. Claude created conftest + 3 test files (~200 lines). | | **Architecture** | 8/10 | 9/10 | Claude separated models into dedicated file. Maggy inlined models per route. Both wire correctly. | | **Product spec** | 0/10 | 10/10 | Maggy's ollama did not produce a spec file. Claude's spec is comprehensive (206 lines). | | **Frontend quality** | 9/10 | 8/10 | Maggy's frontend is larger (472+472+121 = 1065 lines) with more CSS detail. Claude's is cleaner (320+323+79 = 722 lines) with modal UX. 
| | **Weighted avg** | **7.4/10** | **7.8/10** | | ### 5.3 Key Differences **Maggy strengths:** - Security review caught and fixed 7 issues (CORS wildcard, missing bounds, color validation, duplicated validation) - Multi-model approach applied right tool to right task (security by Claude, CRUD by Codex, schema by Kimi) - Larger frontend with more CSS polish - Each model contributed its strength: Claude for security depth, Codex for feature implementation **Claude strengths:** - Product spec created (comprehensive 206-line document) - Test suite included (conftest + 3 test files, ~200 lines, 11+ test cases) - Better code organization (centralized models.py) - Partial update support on expenses (PATCH-style PUT) - Referential integrity check on category delete (prevents orphaned expenses) - Full venv with dependencies installed **Maggy weaknesses:** - No product spec file generated (ollama didn't create it or placed it elsewhere) - No test files at all — a significant gap for production readiness - Import paths use `backend.app.` which requires specific project structure to run **Claude weaknesses:** - No dedicated security review — CORS uses `allow_credentials=True` (risky with dynamic origins) - No amount ceiling on expenses (could submit `1e308`) - No hex color format validation on categories - `get_db()` returns connection without context manager (manual close in every route) --- ## 6. Cost Analysis | Pipeline | Claude Usage | Free/Cheap Usage | Est. Subscription Burn | |----------|-------------|------------------|----------------------| | **Maggy** | 1/6 tasks (17%) | 2/6 tasks (33%) | Low — spread across 3 subscriptions | | **Claude** | 6/6 tasks (100%) | 0/6 tasks (0%) | High — 100% on premium model | Maggy used Claude only for the security review (blast 8). 
The other 5 tasks consumed cheaper or free models: - EXP-1: ollama (free, local GPU) - EXP-2: kimi (free tier / cheap subscription) - EXP-3/4/5: codex (separate subscription) This represents ~83% reduction in Claude subscription consumption. --- ## 7. Routing Observations ### What worked - **Blast 8 → Claude** for security review was correct. Claude produced the most thorough audit. - **Blast 5 → Codex** for CRUD implementation delivered working endpoints. - **Blast 3 → Kimi** for database schema was successful and correct. - **Zero fallbacks** — all 4 CLIs completed tasks without needing to escalate. - **Auto-discovery** — CLI flags probed from `--help`, not hardcoded. ### What needs tuning - **Codex is slow on frontend** — EXP-5 took 280s vs Claude's 122s (2.3x slower). Consider routing blast 6 frontend tasks to Claude. - **Ollama missed the spec task** — EXP-1 (docs) was routed to local model but no spec file was generated. Ollama's qwen2.5-coder is optimized for code, not prose. Consider routing `task_type: docs` to kimi or claude regardless of blast score. - **No test generation by any Maggy model** — None of the 4 models produced tests. This could be addressed by adding a TDD step (write tests first) as a follow-up task routed to Claude. --- ## 8. Conclusions | Metric | Maggy | Claude | Verdict | |--------|-------|--------|---------| | Speed | 907.6s | 681.0s | Claude 33% faster | | Success rate | 100% | 100% | Tie | | Quality (weighted) | 7.4/10 | 7.8/10 | Claude slightly better | | Security depth | Stronger | Weaker | Maggy (dedicated review step) | | Test coverage | None | Good | Claude (significant gap for Maggy) | | Cost efficiency | 83% savings | Baseline | Maggy | | Subscription risk | Distributed | Single point | Maggy | | Model diversity | 4 models | 1 model | Maggy | **Summary:** Claude Code is faster and produces marginally higher overall quality (driven by tests and spec). 
Maggy's multi-model approach provides cost efficiency and subscription risk distribution, plus deeper security review via dedicated model routing. The main gaps to close: add TDD pipeline (test generation step), and improve docs routing (don't send prose tasks to coding-optimized local models). --- ## 9. Raw Throughput Benchmarks (tokens/sec) Standalone generation speed measured with identical prompts across all four model tiers. Each model ran 3 iterations (1 cold, 2 hot). **Prompt:** "Write a Python function that implements a binary search tree with insert, delete, search, and in-order traversal." ### 9.1 Results | Model | Run 1 | Run 2 | Run 3 | Avg tok/s | Notes | |-------|-------|-------|-------|-----------|-------| | **Ollama qwen2.5-coder:32b** | 22.3 | 21.8 | 22.1 | **22.1** | Local GPU (M4 Max), consistent across runs | | **Ollama qwen3-coder:30b-a3b-q8_0** | 75.3 | 75.4 | 76.3 | **75.7** | MoE (3.3B active/30B total), Q8_0, **3.4x faster than qwen2.5** | | **Claude (claude -p)** | 44.6 (API) / 18.6 (wall) | 41.9 / 14.3 | 25.7 / 6.8 | **37.4 API / 13.2 wall** | API time excludes network overhead; wall-clock includes CLI startup | | **Kimi (kimi CLI)** | ~1.8 | ~2.8 | ~3.3 | **~2.6** | Agentic mode — writes files, runs tools; tok/s reflects execution time | | **Codex (codex exec)** | ~0.8 | ~0.7 | ~0.6 | **~0.7** | Agentic mode — full-auto file creation; tok/s reflects execution time | ### 9.2 Interpretation - **Ollama qwen3-coder (local):** **75.7 tok/s** — 3.4x faster than qwen2.5-coder:32b (22.1 tok/s) and **2x faster than Claude's API rate** (37.4 tok/s). MoE architecture (3.3B active / 30B total params) means only a fraction of parameters are computed per token. Cold start adds ~13s for model load; hot runs start in <100ms. This makes qwen3-coder the fastest model in the fleet for pure generation. - **Ollama qwen2.5-coder (retired):** Was 22 tok/s. Replaced by qwen3-coder which is 3.4x faster with comparable quality. 
- **Claude:** 37 tok/s API generation. Still the strongest for reasoning-heavy tasks (security, architecture, TDD). - **Kimi / Codex:** Low tok/s numbers are misleading — both operate in agentic mode (writing files, running commands, iterating). Their throughput reflects end-to-end task execution, not pure generation speed. ### 9.3 Routing Implications | Tier | Model | tok/s | Cost | Best For | |------|-------|-------|------|----------| | Local | Ollama qwen3-coder:30b-a3b-q8_0 | 75.7 | Free | Blast 1-3: simple edits, CRUD, code generation | | Mid | Kimi | 2.6 (agentic) | Cheap | Blast 3-4: schema design, CRUD | | Premium-Auto | Codex | 0.7 (agentic) | Mid | Blast 5-6: feature implementation | | Premium | Claude | 37 (API) | High | Blast 7+: security, architecture, TDD | ### 9.4 Qwen3-Coder Quality Assessment Two coding tasks evaluated for correctness and code quality: **Task 1: Binary Search Tree** (same prompt as throughput benchmark) - Insert, delete (leaf/internal/root), search, in-order traversal — all correct - Clean class structure, recursive helpers, inorder-successor delete - Handles duplicate-ignore semantics correctly - **Score: 10/10** — functionally identical to Claude's output **Task 2: Async Rate Limiter** (token bucket, concurrent-safe) - `asyncio.Lock` for concurrency safety - `_refill()` based on elapsed time — correct token bucket math - `acquire()` waits in loop, `try_acquire()` returns immediately - Burst exhaustion + refill timing verified within 1ms of expected - 10 concurrent tasks completed without deadlock - **Score: 9/10** — correct and safe; minor: polling loop at 1ms instead of event-driven wait **Quality Summary:** | Dimension | qwen3-coder | qwen2.5-coder | Claude | |-----------|-------------|---------------|--------| | Correctness | 10/10 | 9/10 | 10/10 | | Code structure | 9/10 | 8/10 | 10/10 | | Concurrency safety | 9/10 | N/A | 10/10 | | Generation speed | **75.7 tok/s** | 22.1 tok/s | 37.4 tok/s | | Cost | Free | Free | $$$ | 
**Verdict:** qwen3-coder is a major upgrade — 3.4x faster than qwen2.5 with equal or better code quality. At 75.7 tok/s it's the fastest model in the fleet, making it ideal for blast 1-4 tasks where speed matters and deep reasoning isn't required. --- ## 10. Post-Benchmark Fixes (Routing Rules + Conventions) Three systems were built immediately after the benchmark to close the gaps above. ### 10.1 Routing Rules (`~/.maggy/routing-rules.yaml`) A self-updating YAML config that overrides blast-score routing for specific task types and pipeline phases. Rules are checked **before** the reward table or blast-score tier. **Task-type overrides seeded from benchmark evidence:** | Task Type | Forced To | Why | |-----------|----------|-----| | `docs` | claude | Ollama (code-optimized) produced no spec file | | `security` | claude | Security review needs deep reasoning | | `tests` | claude | Only claude generated test files in benchmark | | `architecture` | claude | Architecture needs cross-context awareness | | `planning` | claude | Planning requires structured reasoning | **Pipeline phase overrides from TDD workflow:** | Phase | Forced To | Why | |-------|----------|-----| | `spec` | claude | SPEC phase needs comprehensive docs | | `tdd_red` | claude | RED phase needs test design expertise | | `tdd_green` | auto | GREEN uses blast-score routing (cheap models can implement) | | `review` | claude | Review needs security + architecture depth | **Self-learning:** `record_outcome()` updates rolling success rates per model. `learn_override()` lets Maggy add new rules when outcome data supports it. Manual YAML edits are preserved. ### 10.2 Team Conventions Injection Five conventions from claude-bootstrap's CLAUDE.md are embedded in routing rules and injected into every prompt sent to any CLI: 1. **mWP** — Build minimum wowable product. No feature flags, no premature abstractions. 2. **TDD** — RED → GREEN → VALIDATE. Coverage >= 80%. 3. **Security** — No secrets in code. 
Parameterized SQL. Validate input at boundaries. 4. **Quality gates** — 20 lines/fn, 3 params, 2 nesting levels, 200 lines/file. 5. **Existing patterns** — Read codebase before changing. Keep changes minimal. All four executor prompt methods (`_plan_prompt`, `_analysis_prompt`, `_tests_prompt`, `_impl_prompt`) now append matching conventions. This standardizes quality expectations across kimi, codex, ollama, and claude. ### 10.3 Expected Re-run Improvements | Benchmark Gap | Root Cause | Fix Applied | Expected Result | |--------------|-----------|-------------|-----------------| | No product spec (EXP-1) | `docs` routed to ollama | `docs → claude` override | Claude generates spec | | No tests from any model | No TDD step in pipeline | `tdd_red → claude` + `tests → claude` overrides | Claude writes failing tests | | Inconsistent quality across models | No shared standards | Conventions injected into all prompts | mWP + quality gates enforced everywhere | | No learning from outcomes | Static routing only | `record_outcome()` + `learn_override()` | Routing improves with each task | **Projected scores if re-run:** | Dimension | Before | After (est.) | Change | |-----------|--------|-------------|--------| | Product spec | 0/10 | 9/10 | `docs → claude` | | Test coverage | 0/10 | 8/10 | `tdd_red → claude` | | Security | 10/10 | 10/10 | No change (already strong) | | Architecture | 8/10 | 9/10 | Conventions enforce patterns | | **Weighted avg** | **7.4/10** | **~8.5/10** | **+1.1 points** | Cost efficiency would dip but remain substantial: the new overrides move `docs` (1 task) to claude and add a claude-run `tests` step (new TDD step), raising Claude's share from 1/6 to 2/6 of the original tasks (about 3/7 once the TDD step is counted) — roughly 57–67% savings rather than ~83%. CRUD/API/frontend work still runs on cheaper models. ================================================ FILE: maggy/docs/maggy-rfc.md ================================================ # Maggy: An Autonomous AI Engineering Platform **RFC — Request for Comments** **Author:** Ali Shaheen, Protaige **Date:** May 2026 **Version:** 5.0 --- ## 1. 
Executive Summary Maggy is a local-first, self-improving AI engineering platform that transforms how development teams build software. Unlike code assistants that wait for prompts, Maggy is an autonomous agent that observes, learns, and optimizes — continuously improving its own effectiveness across models, workflows, and team knowledge. **What makes Maggy different:** - **Multi-model orchestration** — Maggy routes tasks to the best model (Claude, GPT-4o, Gemini, Kimi, DeepSeek, local Qwen) based on learned performance data, not static rules. When one model hits quota, work continues seamlessly on the next. - **Self-improving closed-loop control** — Every task Maggy completes generates reward signals that improve its future decisions. Model routing, inbox ordering, workflow steps, and fatigue management all optimize automatically. - **Process intelligence** — Maggy doesn't just write code. It learns from CI results, PR reviews, CodeRabbit findings, and merge patterns to preemptively fix issues before they reach reviewers. - **Maggy Mesh** — A peer-to-peer network connecting Maggy instances across a team. One developer's hard-won CI fix becomes the entire team's knowledge. Autonomously. Instantly. - **Local-first, no vendor lock-in** — All data stays on developer machines. No cloud dependency. No vendor seeing your code. Works offline with local models. **The value proposition:** A team of 5 developers running Maggy Mesh for 6 months accumulates 4x the learning of a solo developer. New team members inherit collective intelligence on day one. CI pass rates go up, review rounds go down, and the system gets smarter every week — without anyone configuring it. --- ## 2. Vision: Autonomous Engineering, Not Code Generation The current generation of AI coding tools — Copilot, Cursor, Devin — are fundamentally reactive. They complete code when prompted, suggest edits when asked, and run tasks when instructed. They're sophisticated typeaheads, not engineers. 
An engineer doesn't just write code. An engineer: - **Prioritizes** — Which ticket matters most right now? - **Plans** — What's the blast radius? What could break? - **Validates** — Does this feature align with the market? Do competitors have it? - **Executes** — Write the code, with the right model for the task - **Verifies** — Did CI pass? Did reviewers approve? Did it deploy cleanly? - **Learns** — What worked? What didn't? How do I do it better next time? Maggy does all of this. It's the first AI platform designed around the full software development lifecycle, not just the "write code" step. ### The Autonomy Spectrum ``` Level 0: Autocomplete (Copilot, TabNine) → Completes the current line → No context beyond the file → No learning Level 1: Chat Assistant (ChatGPT, Claude) → Answers questions about code → No project context → No memory between sessions Level 2: Project-Aware Assistant (Cursor, Continue) → Understands the codebase → Can edit multiple files → Limited memory (rules, preferences) Level 3: Task Agent (Devin, Claude Code Agent) → Executes multi-step tasks → Uses tools (terminal, browser) → Single-model, single-project Level 4: Autonomous Engineering Platform (Maggy) ← WE ARE HERE → Multi-model, multi-project orchestration → Self-improving from every task → Process intelligence (learns from CI, reviews, deploys) → Team intelligence via P2P mesh → Market validation before engineering ``` --- ## 3. 
Architecture Overview ### The Component Map ``` ┌─────────────────────────────────────────────────────────────┐ │ MAGGY WEB DASHBOARD │ │ ┌──────────┐ ┌─────────┐ ┌────────┐ ┌───────┐ ┌────────┐ │ │ │ Inbox │ │ Budget │ │ Agents │ │ CIKG │ │Process │ │ │ │ (ranked) │ │ (live) │ │(status)│ │ (gaps)│ │(health)│ │ │ └──────────┘ └─────────┘ └────────┘ └───────┘ └────────┘ │ └──────────────────────────┬──────────────────────────────────┘ │ ┌────────────┴────────────┐ │ ORCHESTRATOR LAYER │ │ │ │ Pi Agent (universal │ │ harness, RPC mode) │ │ │ │ Token Budget Manager │ │ Model Router (learned) │ │ Dual-Model Planner │ └────────┬────────────────┘ │ ┌──────────────┼──────────────┐ │ │ │ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ │Container│ │Container│ │Container│ │ 1 │ │ 2 │ │ 3 │ │ Claude │ │ GPT-4o │ │ Qwen │ │ (auth) │ │ (front) │ │ (docs) │ └─────────┘ └─────────┘ └─────────┘ │ │ │ ┌────┴──────────────┴──────────────┴────┐ │ INTELLIGENCE LAYER │ │ │ │ iCPG — blast radius, drift, intent │ │ Mnemos — memory, fatigue, checkpoints │ │ codebase-memory-mcp — code graph │ │ CIKG — competitive intelligence │ │ Process Intelligence — CI/PR/deploy │ │ MCP Forge — capability expansion │ │ Maggy Mesh — P2P team learning │ └────────────────────────────────────────┘ ``` ### Pi: The Universal Agent Harness Pi replaces per-CLI adapters with a single interface to every model. It runs inside Polyphony containers in RPC mode over stdin/stdout. The same PiAdapter code controls Claude, GPT-4o, Gemini, Kimi, DeepSeek, or a local Qwen — with identical tool interfaces. **Model fallback chain:** ``` Claude → GPT-4o → Gemini → Kimi → DeepSeek → Qwen (local, unlimited) ``` When a model hits quota or rate limits: 1. Mnemos writes a structured checkpoint (goal, constraints, progress, state) 2. Pi switches to the next model 3. The checkpoint is injected as context 4. The new model verifies it understands the task before continuing 5. 
If verification fails, escalate to the next tier — don't retry on a weaker model **The user never notices the switch.** Work continues. That's the wow. ### Token Budget Manager ```yaml providers: anthropic: daily_limit_usd: 50.00 used_today_usd: 32.15 model_preference: claude-sonnet-4 openai: daily_limit_usd: 30.00 used_today_usd: 5.20 model_preference: gpt-4o local: daily_limit_usd: 0 # free model_preference: qwen2.5-coder:32b ``` The budget manager prevents runaway costs. When anthropic hits $50, Maggy doesn't stop — it routes to OpenAI. When OpenAI hits $30, it routes to local Qwen. Work never stops. --- ## 4. Self-Improvement: Multi-Level Closed-Loop Control This is Maggy's core differentiator. Every task teaches Maggy something. Every CI failure, every review comment, every deploy result feeds back into the system. Maggy gets smarter every day — without anyone configuring it. ### The Objective Function ``` efficiency = (value_delivered / time_spent) x quality_multiplier where: value_delivered = tickets landed + features shipped + bugs fixed time_spent = wall clock from ticket selection to merge quality_multiplier = 1.0 - (bug_escape_rate + revert_rate + incident_rate) ``` ### Five Control Levels | Level | Frequency | What It Does | |-------|-----------|-------------| | **L0 — Real-time** | Seconds | Catches tool failures, test failures, fatigue spikes, scope drift *as they happen*. Switches models mid-task when quality degrades. | | **L1 — Task** | Minutes | Computes task reward score. Updates model performance table. Logs process signals. | | **L2 — Daily** | Hours | Catches operational degradation: CI pass rate drops, model failure spikes, budget burn rate anomalies. Disables failing models. | | **L3 — Weekly** | Days | Strategic optimization: evolves skill files, adjusts workflow steps, triggers MCP Forge for capability gaps, patches prompts. 
| | **L4 — Monthly** | Weeks | Meta-optimization: recalibrates reward signals, adjusts tier boundaries, tunes exploration rate, changes the improvement process itself. | **Key principle:** Inner loops provide stability. Outer loops provide optimization. L0 catches a failing model in seconds — the user barely notices. L3 makes routing smarter over weeks — the system quietly improves. L4 makes the improvement process itself better over months. ### What Gets Optimized **Model routing** — Maggy tracks reward per `(model x task_type x blast_tier)` triple. After 50+ tasks, routing outperforms random assignment by 20%+. ``` (claude, auth, high): +0.92 ← claude excels at auth (qwen, docs, low): +0.85 ← qwen is fast and free for docs (gpt-4o, frontend, medium): +0.78 ← gpt-4o is strong on frontend ``` **Inbox ordering** — Learns which tickets the user actually picks first. Adjusts urgency weights to match user behavior. **Workflow steps** — Drops steps that never catch issues (e.g., Codex counter-check on blast < 3). Re-enables them when they become valuable again. **Fatigue management** — Learns each user's optimal session length and pre-checkpoints at the right moment. Not at a generic threshold — at *your* threshold. --- ## 5. Process Intelligence: Learning from the Full SDLC Most AI tools optimize code generation. Maggy optimizes the **entire development process**. 
### Environment Discovery On first run per project, Maggy auto-discovers the developer's workflow — no configuration: - **Ticketing:** GitHub Issues, Asana, Linear, Jira - **CI/CD:** GitHub Actions, Jenkins, CircleCI - **Code quality:** ESLint, ruff, mypy, pre-commit, coverage - **Review process:** Required reviewers, CODEOWNERS, branch protection - **Integrations:** CodeRabbit, Dependabot, Renovate, Vercel ### Signal Collection Maggy continuously collects signals from the SDLC: | Signal Source | What Maggy Learns | |--------------|-------------------| | CI results | Which code patterns cause test failures | | PR review comments | What reviewers consistently flag | | CodeRabbit findings | Security and quality issues by pattern | | Merge patterns | How many rounds of review, time to merge | | Deploy results | Which changes cause deploy failures | ### Preemptive Fixes The pattern engine correlates `(code_pattern, review_feedback)` pairs: > "Your reviewer always flags missing error handling in API routes. Maggy added it before the PR was created. Review rounds dropped from 2.8 to 1.1." This is not prompt engineering. This is autonomous process optimization — Maggy observed a pattern, validated it statistically, and changed its behavior to prevent the issue. No human told it to. --- ## 6. Engram: Cross-Session Memory ### The Amnesia Problem Every AI coding tool today is an amnesiac. When a session ends, everything the agent learned — project conventions, reviewer preferences, codebase idioms, tool configurations — evaporates. The next session starts from scratch. This isn't a minor inconvenience; it's the fundamental bottleneck preventing AI agents from becoming genuinely useful over time. 
Engram identifies seven distinct amnesia pathologies: | Amnesia Type | What Gets Lost | Impact | |-------------|---------------|--------| | **Anterograde** | New memories fail to form across sessions | Every session restarts from zero | | **Retrograde** | Existing memories degrade over time | Learned patterns fade | | **Temporal** | When something happened is lost | Can't track how things changed | | **Source** | Where a fact came from is lost | Can't trust or audit memories | | **Interference** | Memories from one context contaminate another | Project A's patterns leak into Project B | | **Context-binding** | Right memory, wrong retrieval context | Conventions exist but aren't surfaced when needed | | **Confabulation** | Inferred patterns presented as confirmed facts | Agent "remembers" things it actually guessed | ### The Memory Lifecycle Engram completes Maggy's memory stack: ``` Mnemos (within-task) → What the agent remembers during a single task ↓ promote (confidence > 0.8, evidence >= 3) Engram (cross-session) → What survives between sessions, per machine ↓ distill to typed memory Mesh (cross-machine) → What's shared across the team, P2P ``` Without Engram, Maggy has a 10-minute memory. With Engram, knowledge compounds across every session. After 100 sessions, Maggy knows your project's conventions, your reviewers' preferences, your CI failure patterns — and applies them automatically. ### Three-Tier Namespace Model Memory is organized into three tiers to prevent both cross-project contamination and useful-pattern siloing: 1. **Local** — project-specific memories (strict isolation). A Python FastAPI project's conventions never contaminate a React project's patterns. 2. **Portfolio** — abstracted cross-project patterns. When a local pattern proves useful across 3+ projects, it's promoted — but only after de-contextualization (stripping project-specific names and paths). 3. **Mesh** — peer-derived memories (quarantined on arrival). 
Must be locally validated before promotion to portfolio. This three-tier model means Engram gets smarter across projects without cross-contamination. ### Engram as Improvement Substrate Engram absorbs the improvement ledger. The ledger is the mutation log (what changed), Engram is the memory substrate (persists it across sessions), and the reward registry tracks whether it worked. Every self-modification becomes a persistent, queryable memory — Maggy remembers not just what it learned, but what it tried and what failed. ### Amnesia Score Each project gets a 7-dimension diagnostic score (0.0 = perfect retention, 1.0 = total amnesia). The L3 weekly loop analyzes Amnesia Scores and adjusts encoding rules: if anterograde score is high, lower the promotion threshold; if interference is high, tighten namespace isolation. ### Research Basis Engram builds on validated research: Mem0 (186M API calls, memory-as-object model), Zep/Graphiti (temporal validity windows), Hindsight (91.4% on LongMemEval, fact vs opinion separation), MAGMA (multi-graph retrieval with 45.5% higher reasoning accuracy), and A-MEM (Zettelkasten-style associative encoding). What none of these systems address is the combination of namespace isolation, origin tracking, temporal validity, and amnesia diagnosis in a single architecture designed for multi-project AI agents. --- ## 7. Maggy Mesh: Peer-to-Peer Team Intelligence ### The Problem A solo developer's Maggy learns from their tasks. But teams have 5, 10, 50 developers — each independently discovering the same CI fixes, the same reviewer preferences, the same model performance patterns. That's wasted learning. ### The Solution Maggy Mesh connects instances across a team into a peer-to-peer network. Each Maggy autonomously shares learned intelligence with other Maggys in the same organization. 
``` ┌──────────────────────────────────────────────────────────┐ │ ORGANIZATION │ │ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ Maggy-A │◄──►│ Maggy-B │◄──►│ Maggy-C │ │ │ │ (Ali) │ │ (Sarah) │ │ (John) │ │ │ │ Python │ │ React │ │ DevOps │ │ │ └─────────┘ └─────────┘ └─────────┘ │ │ ▲ ▲ ▲ │ │ └──────────────┴──────────────┘ │ │ Full mesh — everyone sees │ │ everyone's learnings │ └──────────────────────────────────────────────────────────┘ ``` ### What Gets Shared Not everything. Maggy Mesh shares **typed memory classes** with different merge rules: | Type | Example | Merge Rule | |------|---------|-----------| | **Scores** | "Claude scores 0.92 on auth tasks" | Weighted average by sample count | | **Patterns** | "Add error handling before PR" | Union-merge with frequency tracking | | **Policies** | "Route blast 7+ to premium only" | Backtest-gated — must pass on local data | | **Gaps** | "No Linear integration" | Additive accumulation | ### Provenance Every shared memory carries full provenance: - **Who:** peer_id, peer_name - **Where:** project_key, language, toolchain - **When:** created_at, last_verified - **How much:** evidence_count, confidence (decays with age) This enables intelligent filtering: "Only accept Python patterns from peers working on Python projects." ### Quarantine System Incoming peer data doesn't go live immediately. It enters quarantine: 1. **Self-confirmed:** Local data validates the pattern within 30 days 2. **Crowd-confirmed:** 3+ peers independently report the same pattern 3. **Human override:** Developer manually promotes or rejects This prevents poisoning, stale data propagation, and context collapse. A bad pattern from one node can't silently corrupt the entire team. ### Cold Start A new team member installs Maggy, discovers peers via mDNS, and receives the entire team's collective intelligence — quarantined until locally validated. Day one, they have the benefit of months of team learning. 
### The Compound Effect ``` Individual Maggy: knowledge = learning_rate x time Team Mesh (n peers): knowledge = n x learning_rate x time x sharing_factor 5 developers, 6 months: Solo: 1 x 1.0 x 180 = 180 learning units Mesh: 5 x 1.0 x 180 x 0.8 = 720 learning units (4x multiplier) ``` The sharing_factor (0.8) accounts for context mismatch and quarantine filtering. This estimate is linear in the number of peers; in practice the effect can exceed it, because peers validate each other's patterns through crowd confirmation. --- ## 8. Lexon: Semantic Tool Binding ### The Tool Overload Problem As Maggy's capabilities grow — MCP Forge auto-generates servers, Process Intelligence adds signal collectors, each project adds environment-specific tools — the tool count will cross 50, then 100. Research shows tool selection accuracy collapses at this scale: RAG-MCP demonstrated accuracy dropping from 87% to 13% as tools grew from 10 to 100. A second failure mode persists even with retrieval: the **vocabulary gap**. Tool descriptions are written by engineers. Users speak in their own vocabulary. "I want to blast my leads" doesn't match `create_campaign` by any lexical metric. Maggy needs to learn that for *this user*, "blast" means a bulk email send. ### Two-Tier Routing Lexon solves this with a pipeline whose two tiers run in parallel: 1. **Tier A — Fast LLM Router** (<300ms): A compact tool manifest (name + 1-line description, ~400 tokens for 80 tools) fed to a fast model. Returns 5-7 candidates with rationale. JSON schema constrained to valid tool names — no hallucinated tools. 2. **Tier B — Multilingual Semantic Retriever**: Vector search over the full tool registry, indexed by description, example queries, and learned synonyms. Multilingual embedding model ensures queries in any language match correctly. Candidates from both tiers are unioned and deduplicated. Each tier compensates for the other's failure mode: the LLM captures intent-level reasoning; the retriever captures lexical variants and multilingual matches. 
### Terminology Map A three-level vocabulary store that learns over time: - **System level**: Built-in tool descriptions (baseline) - **Org level**: Team-shared vocabulary, propagated via Mesh (e.g., "follow up" = specific CRM workflow) - **User level**: Personal shortcuts and preferences (e.g., "morning sequence" = campaign with time=09:00) Resolution: user overrides org overrides system. **NOT bindings** encode negative matches — "blast" is explicitly NOT "delete_all" — preventing recurring mis-selections. ### Dual-Mode Disambiguation When confidence is ambiguous, Lexon has two resolution modes: **Self-clarify (default, autonomous):** Lexon resolves ambiguity without asking the user by consulting iCPG's structured intent, Mnemos context, Engram's past bindings, process history, and Mesh consensus. If any source resolves confidence above threshold, proceed silently. The goal: 95%+ resolutions via self-clarify after 50+ interactions. **User-clarify (irreversible actions only):** Triggered only for destructive, expensive, or irreversible actions (delete, deploy, billing changes). Presents 2-3 concrete options. The user's selection becomes a permanent binding. Autonomous agents should almost never trigger user-clarify. This is what separates Maggy from tools that interrupt you constantly. ### Personalization Five implicit learning signals update the Terminology Map without user effort: 1. **Correction** → add NOT binding + positive binding 2. **Affirmation** → increment confidence 3. **Repetition** (5+) → promote to high-confidence synonym 4. **Disambiguation selection** → capture as user-level binding 5. **Clarification repetition** (3+) → escalate to explicit preference prompt High-confidence bindings persist via Engram across sessions and propagate to the org via Mesh. ### Tool Contract Binding Lexon doesn't just bind phrases to tool names — it binds to tool contracts. Each LexonRecord records the tool version and schema hash at bind time. 
When a tool's API changes, Lexon detects the schema drift and re-evaluates bindings rather than silently calling a tool with a different interface. This matters because MCP Forge auto-generates tools from API docs that evolve. ### Outcome-Bearing Records Every LexonRecord carries an outcome reward (-1.0 to 1.0): did the binding produce good results? Corrections are tracked with their source (user explicit, CI failure, review comment). This transforms Lexon from a static lookup table into a reward-bearing learning system that gets measurably better at tool selection over time. ### Research Basis Lexon builds on: RAG-MCP (Anthropic, 2025 — retrieval-based tool selection), Tool2Vec (2024 — example queries as embedding targets), ToolTree (ICLR 2026 — MCTS-style tool planning), Tool-MVR (2025 — self-correction loops), and Gorilla (Berkeley, 2023 — fine-tuned tool LLMs). Lexon's contribution is the unified architecture combining retrieval, disambiguation, multilingual support, and adaptive personalization — no prior system addresses all four. --- ## 9. Event Spine: The Nervous System ### Why an Event Spine Maggy's components — iCPG, Mnemos, Lexon, Engram, Process Intelligence, Mesh — each generate events in their own formats. Without a canonical event spine, correlating "user said X → Lexon bound tool Y → execution failed → memory Z was created → mutation W was proposed" requires stitching together six different log formats. The Event Spine defines a single ordered event stream that every component writes to: ``` IntentEvent → BindingEvent → ExecutionEvent → MemoryEvent ↓ MeshEvent ← MutationEvent ← OutcomeEvent ← PersistenceEvent ``` Eight typed events, each carrying a common header (event_id, task_id, project_id, agent_id, model_id, confidence, namespace, policy_version, reward_delta). This enables: - **End-to-end tracing**: follow a task_id across all 8 event types - **Reward attribution**: OutcomeEvent.reward propagates back to BindingEvent (was tool selection good?) 
and MutationEvent (was self-modification good?) - **Replay debugging**: reproduce failures from the event stream without re-executing - **Amnesia diagnosis**: compare MemoryEvent → PersistenceEvent conversion rate per project - **Self-improvement validation**: MutationEvent + OutcomeEvent = evidence for whether L3/L4 changes helped ### The Positioning Statement > Maggy understands intent through iCPG. Maggy survives task execution through Mnemos. Maggy chooses the right capability through Lexon. Maggy remembers consequences through Engram. Maggy evolves behavior through rewards. Maggy spreads successful mutations through Mesh. > > The Event Spine connects all six into a single typed, correlated, reward-bearing event stream. This is the nervous system of an autonomous engineering agent. --- ## 10. Competitive Landscape The AI coding tool market has exploded into distinct categories. Understanding where Maggy fits — and where it doesn't compete — is critical for positioning. ### 10.1 Market Taxonomy The landscape breaks into five categories, each with different value propositions: ``` ┌─────────────────────────────────────────────────────────────────┐ │ AI CODING TOOL TAXONOMY (2026) │ │ │ │ 1. CLOUD AGENT PLATFORMS (autonomous, cloud-hosted) │ │ Codex (OpenAI), Devin (Cognition), Copilot Cloud Agent │ │ Claude Managed Agents │ │ │ │ 2. AI-NATIVE IDEs (editor-first, multi-model) │ │ Cursor, Windsurf (Codeium/Cognition) │ │ │ │ 3. CLI AGENTS (terminal-first, model-agnostic) │ │ Claude Code, Codex CLI, Aider, OpenCode, Cline │ │ │ │ 4. APP BUILDERS (prompt-to-app, no-code/low-code) │ │ Lovable, Bolt.new, Replit Agent, v0 (Vercel) │ │ │ │ 5. AUTONOMOUS ENGINEERING PLATFORMS │ │ Maggy ← ONLY ENTRY │ │ (self-improving + process intelligence + team mesh) │ └─────────────────────────────────────────────────────────────────┘ ``` Maggy is not competing with Lovable (app builders) or Cursor (IDE experience). 
Maggy competes on a different axis: **autonomous improvement over time**. The question isn't "which tool writes better code today?" — it's "which tool writes better code *next month* than it did *this month*?" ### 10.2 Cloud Agent Platforms #### OpenAI Codex (Cloud) Codex is OpenAI's cloud-hosted autonomous coding agent, launched May 2025. Each task runs in its own sandboxed cloud environment preloaded with your GitHub repository. It can write features, fix bugs, run tests, and submit PRs — all in parallel. | Capability | Codex Cloud | Maggy | |-----------|-------------|-------| | Execution model | Cloud sandbox (internet disabled) | Local containers (full network) | | Model | codex-1 (o3 variant), GPT-5.3-Codex | 6+ models, learned routing | | Parallel tasks | Yes (multiple cloud sandboxes) | Yes (Polyphony containers) | | Self-improvement | No | 5-level closed-loop control | | Process intelligence | No | Full SDLC learning | | Team learning | No cross-instance learning | Mesh (P2P, autonomous) | | SWE-bench Verified | 85% (GPT-5.3-Codex) | Model-dependent (routes to best) | | Cost | ChatGPT Pro/Enterprise subscription | Self-hosted, pay-per-model-use | | Data privacy | Code sent to OpenAI cloud | Local-first, code stays on machine | | Trigger automation | Codex Jobs (on GitHub push) | Process Intelligence (on any signal) | **Codex's strength:** Cloud-native parallel execution with strong sandboxing. The upcoming Codex Jobs feature (automated triggers on git events) is compelling for CI/CD workflows. **Maggy's edge:** Codex treats each task as independent — it doesn't learn from past tasks, doesn't track reviewer patterns, and doesn't share knowledge across team members. Maggy's L1-L4 control loops mean task #100 is handled significantly better than task #1. #### Devin (Cognition) Devin is an autonomous cloud-based AI software engineer. It reached $73M ARR by early 2026, with 67% of PRs merged autonomously. Cognition also acquired Windsurf for ~$250M. 
| Capability | Devin | Maggy | |-----------|-------|-------| | Execution model | Cloud VM with browser | Local containers | | Knowledge system | Playbooks + Knowledge docs (manual) | Dynamic typed memory (automatic) | | Cross-instance learning | No — knowledge is per-org, manually curated | Yes — Mesh shares automatically | | Multi-model | Limited | 6+ models with auto-routing | | Self-improvement | Playbooks improve via manual updates | 5-level automatic control loops | | Process intelligence | No | CI, reviews, deploys, merge patterns | | Managed Devins | Yes (parallel orchestration) | Yes (Polyphony containers) | | SWE-bench Verified | 45.8% (Devin 2.0, unassisted) | Model-dependent | | Cost | $500/mo Teams, custom Enterprise | Self-hosted | | Scheduling | Recurring/one-time scheduled sessions | Continuous background operation | **Devin's strength:** Enterprise organization structure, admin controls, playbook management. The acquisition of Windsurf gives them an IDE play too. **Maggy's edge:** Devin's knowledge system is manually curated — someone writes playbooks and knowledge docs. Maggy's intelligence is learned automatically from task outcomes. Devin doesn't share learnings across team members' instances; Maggy Mesh does this autonomously. #### Claude Managed Agents Anthropic's cloud agent platform, updated May 2026 with three significant features: dreaming, outcomes, and multi-agent orchestration. 
| Capability | Claude Managed Agents | Maggy | |-----------|----------------------|-------| | Execution model | Secure cloud containers | Local containers | | Dreaming | Yes — reviews past sessions, extracts patterns | Similar to L3/L4 loops | | Memory | Per-agent + cross-agent via dreaming | Typed memory (scores, patterns, policies, gaps) | | Multi-agent | Orchestration + webhooks | Polyphony containers + cross-agent delegation | | Self-improvement | Dreaming (research preview) | 5-level closed-loop control (designed in) | | Process intelligence | No | Full SDLC learning | | Team learning | Cross-agent dreaming (same org) | Mesh (P2P, cross-machine) | | Local execution | No (cloud only) | Yes (local-first) | **Claude Managed Agents' strength:** Dreaming is the closest any competitor comes to Maggy's self-improvement concept. Harvey (legal AI) saw 6x task completion improvement after implementing dreaming. The cross-agent pattern extraction is genuinely novel. **Maggy's edge:** Dreaming is cloud-only and Anthropic-locked. Maggy's control loops work locally, across any model, and share learnings across developer machines — not just across agent sessions in the cloud. #### GitHub Copilot (Cloud Agent + Agent Mode) Copilot evolved from autocomplete to a multi-layered platform: inline suggestions, chat, agent mode (IDE), and cloud agent (autonomous). 
| Capability | Copilot | Maggy | |-----------|---------|-------| | Code completion | Best-in-class inline suggestions | Via Pi (any model) | | Cloud agent | Yes — autonomous PRs from issues | Yes — local containers | | Agent mode | IDE-integrated (VS Code, Visual Studio) | CLI + web dashboard | | Custom agents | User-level + repo-level definitions | Skills + iCPG + Mnemos | | Multi-model | Yes (GPT-4o, Claude, Gemini via settings) | Yes (6+ models, learned routing) | | Security tools | Security Reviewer agent (beta) | iCPG drift detection | | Self-improvement | No | 5-level closed-loop control | | Process intelligence | No | Full SDLC learning | | Team learning | Spaces (cloud-mediated, admin-controlled) | Mesh (P2P, autonomous) | | Debugger agent | Yes (Visual Studio, runtime validation) | L0 real-time control | | Ecosystem | GitHub-native (Issues, PRs, Actions) | GitHub API + any ticketing system | **Copilot's strength:** Deepest IDE integration. The debugger agent validating fixes against runtime behavior is unique. GitHub ecosystem integration is unmatched. Custom agents with workspace awareness, MCP connections, and model selection are powerful. **Maggy's edge:** Copilot doesn't learn from its mistakes. It doesn't track which model does best on which task type. It doesn't observe CI results to preemptively fix reviewer complaints. And Spaces is admin-curated knowledge — not automatically learned intelligence. ### 10.3 AI-Native IDEs #### Cursor Cursor is the leading AI-native IDE (~$100M+ ARR), a fork of VS Code with deep AI integration. 
| Capability | Cursor | Maggy | |-----------|--------|-------| | IDE experience | Native (fork of VS Code) | CLI + web dashboard | | Background agents | 8 parallel cloud agents | Polyphony local containers | | Memories | Project-scoped, persisted across sessions | Typed memory with provenance | | Rules | `.cursorrules`, project rules | Skills (`.md`), iCPG, Mnemos | | Security review | Always-on PR security agents (beta) | iCPG constraints + drift | | Team features | Centralized billing, usage analytics | Mesh (P2P intelligence sharing) | | Model routing | Manual selection | Learned from reward data | | Self-improvement | Memories (passive) | 5-level active control loops | | Process intelligence | No | Full SDLC learning | | Context management | Rules, skills, MCPs, subagents | Skills, iCPG, Mnemos, code graph | **Cursor's strength:** UX polish, background agents at scale (8 parallel), and the always-on security review agents. The context usage breakdown (rules, skills, MCPs) shows mature observability. **Maggy's edge:** Cursor's memories are passive ("remember this fact"). Maggy's memory is active — it observes outcomes and adjusts behavior. Cursor doesn't learn from CI failures, doesn't track reviewer patterns, and doesn't share intelligence P2P. #### Windsurf (Codeium → Cognition) Windsurf's Cascade agent plans and executes multi-file edits with a dedicated planning agent running in the background. Acquired by Cognition (Devin) for ~$250M in December 2025. | Capability | Windsurf | Maggy | |-----------|----------|-------| | Agent | Cascade (plan + execute) | Multi-level control loops | | Codemaps | AI-annotated visual code maps | codebase-memory-mcp graph | | Built-in browser | Yes (web context for Cascade) | Process Intelligence API hooks | | Self-improvement | No | 5-level closed-loop control | | Cost | $15/mo Pro | Self-hosted | ### 10.4 CLI Agents #### Claude Code Anthropic's terminal-first coding agent. 
Runs locally, supports multi-agent orchestration via Task tool with teams. | Capability | Claude Code | Maggy | |-----------|-------------|-------| | Multi-agent | Task tool, teams, SendMessage | Polyphony containers + Pi | | Model | Claude only | 6+ models with auto-routing | | IDE integration | VS Code, JetBrains, desktop app | CLI + web dashboard | | Hooks | PreToolUse, PostToolUse, Stop | Skills + hooks + L0 real-time | | Self-improvement | No | 5-level closed-loop control | | MCP support | Native | Native + MCP Forge (auto-generate) | **Note:** Maggy is *built on* Claude Code's infrastructure (skills, hooks, MCP). It extends Claude Code with self-improvement, multi-model routing, process intelligence, and team mesh. #### Codex CLI (OpenAI) Open-source (Apache-2.0), Rust-based terminal agent. 81K+ GitHub stars. Runs locally, authenticates via ChatGPT account or API key. | Capability | Codex CLI | Maggy | |-----------|-----------|-------| | Open source | Yes (Apache-2.0, 81K stars) | Yes | | Language | Rust (96.3%) | Python | | Model | OpenAI models only | 6+ providers | | Self-improvement | No | 5-level closed-loop control | | Team learning | No | Mesh (P2P) | #### Aider Open-source CLI pair programmer. 39K+ GitHub stars, 4.1M+ installations. Model-agnostic with an architect/editor dual-model approach. | Capability | Aider | Maggy | |-----------|-------|-------| | Open source | Yes (39K stars) | Yes | | Multi-model | Yes (75+ providers) | Yes (6+ with auto-routing) | | Architect mode | Dual-model: strong planner + cheap editor | Dual-model planning (Phase 6) | | Git integration | Every edit = reviewable commit | iCPG + Polyphony branches | | Auto-lint/test | Yes (on every change) | L0 real-time control | | Self-improvement | No | 5-level closed-loop control | | Team learning | No | Mesh (P2P) | **Aider's strength:** The architect/editor mode is clever cost optimization — expensive model plans, cheap model executes. 
Maggy's Phase 6 dual-model planning is similar but adds conflict resolution and outcome tracking. #### OpenCode Was a Go-based CLI with TUI (Bubble Tea), 12K+ stars. **Archived September 2025**, now continued as "Crush" by the original author (Charm team). Supported 75+ LLM providers, SQLite session storage, LSP integration. ### 10.5 App Builders These tools target a different audience (non-developers, designers, rapid prototyping) but are worth understanding as they represent the "opposite end" of the autonomy spectrum. #### Lovable Prompt-to-full-stack-app builder. 2.3M users, $100M ARR, $6.6B valuation (Series B, Dec 2025, backed by Nvidia/Salesforce). | Capability | Lovable | Maggy | |-----------|---------|-------| | Target user | Non-developers, designers | Professional developers | | Output | Full-stack app from prompt | Code changes to existing codebase | | Stack | React + TypeScript + Supabase | Any stack | | Agent mode | Autonomous development mode | Multi-level control loops | | GitHub sync | Yes | Native (git-first) | | Self-improvement | No | 5-level closed-loop control | #### Bolt.new, Replit Agent, v0 - **Bolt.new** — Browser-based JS app generator. 1M+ websites generated in 5 months. - **Replit Agent 4** (March 2026) — Handles auth, databases, parallel task execution, Design Mode, checkpoint rollback. Richest ecosystem (50+ languages). - **v0** (Vercel) — Specializes in React components with Tailwind/shadcn/ui. Precision frontend generation. These are complementary to Maggy, not competitive. A developer might use Lovable to prototype, then bring the codebase into Maggy for professional development with CI integration, code quality tracking, and team collaboration. 
### 10.6 Summary Comparison Matrix | Capability | Codex Cloud | Devin | Claude Managed | Copilot | Cursor | Claude Code | Aider | Maggy | |-----------|------------|-------|---------------|---------|--------|-------------|-------|-------| | **Self-improvement** | - | - | Dreaming (preview) | - | - | - | - | 5-level control | | **Process intelligence** | - | - | - | - | - | - | - | Full SDLC | | **Team learning** | - | - | Cross-agent dreaming | Spaces | Org memories | - | - | P2P Mesh | | **Multi-model routing** | - | Limited | - | Manual | Manual | - | Manual | Learned | | **Local-first** | - | - | - | - | Partial | Yes | Yes | Yes | | **Cloud agents** | Yes | Yes | Yes | Yes | Yes | - | - | - | | **IDE integration** | VS Code | Browser | - | Native | Native | VS Code | Terminal | Dashboard | | **Open source** | CLI only | - | - | - | - | - | Yes | Yes | | **Vendor lock-in** | OpenAI | Cognition | Anthropic | GitHub | Cursor | Anthropic | None | None | ### 10.7 Where Maggy Wins 1. **Self-improvement is the product** — No other tool has a formal multi-level control system. Claude's dreaming is the closest, but it's cloud-only and single-vendor. 2. **Process intelligence is unique** — Nobody else learns from CI results, reviewer comments, and merge patterns to preemptively fix code. 3. **Autonomous team learning** — Mesh shares typed, provenanced intelligence P2P without a central server. Everyone else's "team features" are admin-curated knowledge or cloud-mediated memory. 4. **Model-agnostic by design** — Not locked to any provider. Learns which model is best for which task type automatically. 5. **Local-first with no compromises** — Code never leaves developer machines. Works offline with local models. No vendor sees your proprietary codebase. 
### 10.8 Where Competitors Win Today - **Copilot:** Deepest IDE integration, GitHub ecosystem, largest user base - **Cursor:** Best editor UX, background agents at scale, security review agents - **Devin:** Enterprise controls, playbooks, $73M ARR proves market demand - **Claude Managed Agents:** Dreaming is genuinely novel, cloud scalability - **Codex Cloud:** Parallel cloud sandboxes, upcoming Codex Jobs automation - **Lovable:** Prompt-to-app for non-developers, $6.6B validates the broader market - **Aider:** Open-source community (39K stars), architect/editor cost optimization --- ## 11. Migration Roadmap ### Phase Dependencies ``` Phase 1: PiAdapter + Token Budget ──────────────────┐ │ │ ├── Phase 2: Model Routing (blast→model) │ ├── Phase 3: Mnemos Multi-Model Fatigue │ ├── Phase 6: Dual-Model Planning │ │ │ Phase 4: CIKG Extract ────────────────┐ │ │ │ │ └───────────┬──────────────────────┘ │ │ │ Phase 5: Maggy v2 Dashboard ◄─────────────────────────┘ │ ├── Phase 7: Vercel Deploy Containers (Docker) ├── Phase 8: Process Intelligence ──────┐ ├── Phase 9: MCP Forge │ │ │ └── Phase 11: Maggy Mesh ◄──────────────┘ │ Phase 10: Integration Testing ◄─────────────┘ │ Phase 3 + Phase 5 ──► Phase 12: Engram ─────┘ │ Phase 9 + Phase 12 ─► Phase 13: Lexon │ Phase 12 + Phase 13 ─► Phase 14: Event Spine ``` ### Phase Summary | Phase | What | Priority | Effort | Dependencies | |-------|------|----------|--------|-------------| | 1 | PiAdapter + token budget | P0 | Large | Pi installed | | 2 | Model routing (blast→model) | P0 | Medium | Phase 1 + iCPG | | 3 | Mnemos multi-model fatigue | P1 | Medium | Phase 1 | | 4 | CIKG extraction | P1 | Medium | Supabase | | 5 | Maggy v2 dashboard | P0 | Large | Phases 1-4 | | 6 | Dual-model planning | P2 | Medium | Phase 1 | | 7 | Vercel deploy containers | P2 | Medium | Docker | | 8 | Process intelligence | P1 | Large | Phase 5 + GitHub API | | 9 | MCP Forge | P2 | Large | Phase 5 | | 10 | Integration testing + docs | P1 | Large | All 
phases | | 11 | Maggy Mesh (P2P) | P2 | XL | Phase 5 + Phase 8 | | 12 | Engram (cross-session memory) | P1 | Large | Phase 3 + Phase 5 | | 13 | Lexon (semantic tool binding) | P2 | Large | Phase 9 + Phase 12 | | 14 | Event Spine (canonical event flow) | P2 | Medium | Phase 12 + Phase 13 | --- ## 12. Research Foundations & Prior Art Maggy's architecture draws from five distinct research streams. This isn't a tool assembled from hype — each component maps to validated research with production evidence. ### 12.1 Self-Evolving Agent Systems The field of self-improving AI agents has exploded in 2025-2026. Papers mentioning "AI Agent" or "Agentic AI" published in 2025 alone numbered more than twice the combined total from 2020-2024. **Key papers and systems:** - **SICA — Self-Improving Coding Agent (ICLR 2025 Workshop)** — An agent that autonomously edits its own codebase, climbing from 17% to 53% on SWE-bench Verified through self-modification. This validates Maggy's core thesis: agents that modify their own behavior based on outcomes dramatically outperform static agents. ([Paper](https://openreview.net/pdf?id=rShJCyLsOr)) - **Gödel Agent (ACL 2025)** — Uses runtime monkey-patching with safety verification. The agent modifies both its task-solving policy and its own learning algorithm, guided by high-level objectives while formal invariant checking prevents unsafe changes. Maggy's L3/L4 control loops use a similar principle: change the improvement process itself, but with rollback safeguards. - **SAGE — Skill Augmented GRPO (December 2025)** — Agents accumulate reusable function libraries across task chains, achieving 8.9% goal completion gains while reducing output tokens by 59%. This directly parallels Maggy's skill evolution in L3, where successful patterns get codified into reusable skills. - **HyperAgents (2026)** — Makes the meta-level itself editable.
Agents improve *how they improve*, discovering domain-general skills (memory management, prompt engineering, exploration strategies) that transfer across coding, mathematics, and scientific domains. Maggy's L4 monthly evolution loop is designed for exactly this: improving the improvement process. - **SWE-RL (Meta, 2025)** — Uses self-play where agents alternate between bug injection and fixing roles, gaining +10.4 points on SWE-bench Verified without human-labeled data. This reinforcement-based approach validates Maggy's reward registry concept. - **AlphaEvolve (Google DeepMind)** — Recovered 0.7% of Google's worldwide compute through automated algorithm optimization. This is the first evidence of hyperscale ROI from self-improving agents — validating that autonomous optimization can deliver measurable economic value. **Maggy's position:** Maggy applies self-evolution at the *operational* level (routing, workflows, process patterns) rather than at the model-weight level. This is more practical for a local-first system — you don't need GPU clusters to improve model routing decisions based on task rewards. ### 12.2 Agent Memory Systems Memory has emerged as the central bottleneck for autonomous agents. A comprehensive 2025-2026 survey ("Memory in the Age of AI Agents") offers a structured taxonomy of how memory is designed, implemented, and evaluated in modern LLM-based agents. **Key developments:** - **Mem0 (2025-2026)** — Dominates commercially with 186 million API calls quarterly. The graph-enhanced variant (Mem0g) builds a directed, labeled knowledge graph alongside the vector store. Maggy's typed memory system (scores, patterns, policies, gaps) is similarly structured but uses domain-specific merge rules rather than a general-purpose graph. - **Collaborative Memory (2025)** — A framework for multi-user, multi-agent environments with asymmetric, time-evolving access controls. Maintains private memory (per-user) and shared memory (selectively shared). 
This directly validates Maggy Mesh's approach of personal memory + team memory with provenance-based filtering. - **MAGMA: Multi-Graph Agentic Memory Architecture (2026)** — Uses multiple graph structures for different memory types. Parallels Maggy's typed memory classes where scores, patterns, and policies each have different storage and merge semantics. - **SimpleMem (2025)** — Achieved 26.4% average F1 improvement over baselines with 30x token reduction. Demonstrates that structured memory management produces dramatically better results than naive context stuffing. **Maggy's position:** Most memory systems are passive stores. Maggy's memory is active — the L1-L4 control loops continuously update, prune, and evolve stored knowledge based on outcomes. The Mesh adds a distributed dimension that no other agent memory system currently implements. ### 12.3 Federated & Distributed AI - **Federated AI Agents** — Intelligent software systems that learn collaboratively across multiple devices while keeping data localized. This is the theoretical foundation for Maggy Mesh: share learned intelligence, not raw data. - **Agentic Federated Learning (ICML 2025)** — Autonomous agents collaborate on distributed learning tasks, each contributing local expertise to a shared model. Maggy adapts this from model training to operational intelligence: instead of sharing gradients, Maggy shares typed memory (scores, patterns, policies) with provenance. - **Multi-Agent Collaboration Surveys (ACM DEAI 2025)** — A unified taxonomy decomposing AI agents into Perception, Brain, Planning, Action, Tool Use, and Collaboration subsystems. Surveys show collaborative architectures outperform isolated agents by 30-60% on complex tasks. Gartner reported a 1,445% surge in multi-agent system inquiries from Q1 2024 to Q2 2025. - **CRDT-inspired merge** — Conflict-free replicated data types allow distributed systems to merge state without coordination. 
Maggy uses type-specific merge rules (weighted average for scores, union for patterns, backtest-gated for policies) inspired by CRDT semantics. ### 12.4 Self-Improving Coding in Production The research isn't just theoretical. Production deployments validate that self-improving agents deliver measurable value: | System | Result | Relevance to Maggy | |--------|--------|-------------------| | **Meta's REA** | Doubled model accuracy; 3 engineers improved 8 models simultaneously | Multi-model optimization works at scale | | **Cognition (Devin)** | $73M ARR, 67% of PRs merged autonomously | Market demand for autonomous engineering is real | | **Harvey + Claude Dreaming** | 6x task completion improvement | Cross-session pattern extraction works | | **Karpathy's autoresearch** | 630-line script, 700 experiments in 2 days, 20 optimizations, 11% efficiency gain | Automated experimentation finds real improvements | | **AlphaEvolve** | 0.7% of Google's worldwide compute recovered | Self-improvement produces hyperscale ROI | **Claude Managed Agents — Dreaming (May 2026):** Anthropic's most relevant competitive move. Dreaming is a scheduled process that reviews past agent sessions, extracts patterns, and curates memories so agents improve over time. It surfaces insights no single session could see: recurring mistakes, workflows that multiple agents converge on, and team-shared preferences. This is the closest any competitor comes to Maggy's L3/L4 control loops — but it's cloud-only, Anthropic-locked, and doesn't include process intelligence (CI/review/deploy learning). ### 12.5 Control Theory Foundations - **Inner-outer loop control** — Industrial control systems use fast inner loops for stability and slow outer loops for optimization. Maggy's L0 (seconds) through L4 (months) hierarchy mirrors this established engineering pattern. The key insight: outer loops NEVER override inner loop stability. L3 can change routing policy, but L0 still catches in-task failures regardless. 
- **Reinforcement learning from task outcomes** — Maggy's reward registry applies RLHF principles at the system level, using task outcomes (CI pass, review rounds, deploy success) and user behavior (overrides, re-dos, reverts) as reward signals. Unlike RLHF for model training, this operates at the operational level without any model fine-tuning. ### 12.6 Local-First Software - **Local-first principles (Ink & Switch, 2019)** — Software that works offline, keeps data on user devices, and syncs peer-to-peer. Maggy's architecture is explicitly local-first: SQLite databases, local filesystem storage, optional P2P sync. - **Privacy-first trend (2026)** — Multiple tools now emphasize data privacy. OpenCode stores no code or context data. Aider runs entirely locally. The market is moving toward local execution as enterprises grow wary of sending proprietary code to cloud services. Maggy was designed local-first from day one — this isn't a retrofit. ### 12.7 Market Context The AI coding tool market is at an inflection point: - **Gartner predicts 40% of enterprise apps will include task-specific AI agents by 2026**, up from less than 5% in 2025. - **57% of organizations** report measurable impact from AI agents in software development (2025 industry survey). - The explosion of coding CLIs (30+ tools in 2026) reflects a shift from IDE-native AI to terminal-first agents that understand codebases, git history, and development workflows. - **SWE-bench scores** continue to climb: Claude Mythos Preview hits 93.9% on Verified, 77.8% on Pro. But raw coding ability is becoming commoditized. The differentiation is moving to *what surrounds the model*: memory, learning, process integration, and team collaboration. **The implication for Maggy:** Raw code generation quality is converging across models. The next competitive frontier is *what happens around the generation*: learning from outcomes, optimizing processes, sharing intelligence across teams. 
#!/usr/bin/env bash
# Maggy installer — sets up deps and copies config template.
#
# Usage: ./install.sh
#
# Environment:
#   MAGGY_HOME  Directory that receives config.yaml (default: $HOME/.maggy).
#
# Steps, in order:
#   1. Require python3 >= 3.11.
#   2. Report whether the claude CLI is on PATH (warning only).
#   3. pip-install this package (editable), falling back to requirements.txt
#      and then to an explicit dependency list.
#   4. Seed $MAGGY_HOME/config.yaml from config.example.yaml if it is missing.
#   5. Record the directory above this script in ~/.claude/.bootstrap-dir
#      if that marker does not exist yet.
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly HERE
MAGGY_HOME="${MAGGY_HOME:-$HOME/.maggy}"
readonly RULE="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# Print each argument on its own line to stderr, then abort the install.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

echo "$RULE"
echo " Maggy — Generic AI Engineering Command Center"
echo " Installing to: $MAGGY_HOME"
echo "$RULE"
echo

# 1. Check Python — enforce the 3.11+ minimum from pyproject.toml's requires-python.
if ! command -v python3 >/dev/null 2>&1; then
  die "❌ python3 not found. Install Python 3.11 or later first."
fi
# %-formatting instead of an f-string: an f-string is a SyntaxError before
# Python 3.6, which under `set -e` would abort here with a traceback instead
# of reaching the friendly version message below.
PY_VERSION=$(python3 -c 'import sys; print("%d.%d" % sys.version_info[:2])')
if ! python3 -c 'import sys; raise SystemExit(0 if sys.version_info >= (3, 11) else 1)'; then
  die "❌ Python 3.11 or later is required. Found Python $PY_VERSION." \
    " Install a newer Python (e.g. via pyenv, homebrew, or python.org)."
fi
echo "✓ Python $PY_VERSION"

# 2. Check claude CLI (non-fatal: only the Execute feature depends on it).
if command -v claude >/dev/null 2>&1; then
  echo "✓ claude CLI found"
else
  echo "⚠ claude CLI not found on PATH. Maggy can still run, but Execute won't work until you install Claude Code." >&2
fi

# 3. Install Python deps
echo
echo "Installing Python dependencies..."
# Best effort only: an outdated pip is not a reason to stop.
python3 -m pip install --upgrade pip >/dev/null 2>&1 || true
if ! python3 -m pip install -e "$HERE"; then
  if ! python3 -m pip install -r "$HERE/requirements.txt" 2>/dev/null; then
    # Fallback: explicit install of runtime deps. A failure here is fatal
    # (set -e), since nothing else is left to try.
    python3 -m pip install \
      'fastapi>=0.115' \
      'uvicorn[standard]>=0.30' \
      'httpx>=0.27' \
      'anthropic>=0.40' \
      'pyyaml>=6.0' \
      'feedparser>=6.0' \
      'pydantic>=2.6'
  fi
fi
echo "✓ Dependencies installed"

# 4. Config directory + template (an existing config is never overwritten).
mkdir -p "$MAGGY_HOME"
if [[ ! -f "$MAGGY_HOME/config.yaml" ]]; then
  cp "$HERE/config.example.yaml" "$MAGGY_HOME/config.yaml"
  echo "✓ Wrote config template to $MAGGY_HOME/config.yaml"
  NEEDS_CONFIG=1
else
  echo "✓ Config already exists at $MAGGY_HOME/config.yaml (not overwritten)"
  NEEDS_CONFIG=0
fi

# 5. Remember bootstrap location for iCPG integration
BOOTSTRAP_MARKER="$HOME/.claude/.bootstrap-dir"
if [[ ! -f "$BOOTSTRAP_MARKER" ]]; then
  mkdir -p "$HOME/.claude"
  # Maggy lives in <bootstrap>/maggy — one level up is bootstrap root.
  # Resolve first, write second: a failed cd now aborts the script instead of
  # leaving an empty marker file behind.
  bootstrap_root=$(cd "$HERE/.." && pwd)
  printf '%s\n' "$bootstrap_root" > "$BOOTSTRAP_MARKER"
  echo "✓ Marked bootstrap location for iCPG access"
fi

echo
echo "$RULE"
if [[ "$NEEDS_CONFIG" == "1" ]]; then
  echo "Next steps:"
  echo " 1. Edit $MAGGY_HOME/config.yaml"
  echo " - Set your org name, domain, GitHub org + repos"
  echo " - Set codebase paths for each repo you want Maggy to execute in"
  echo
  echo " 2. Export credentials:"
  echo " export GITHUB_TOKEN=ghp_... # repo + issues scopes"
  echo " export ANTHROPIC_API_KEY=sk-ant-..."
  echo
  echo " 3. Run:"
  echo " cd $HERE && python3 -m maggy.main"
  echo
  echo " 4. Open http://localhost:8080"
else
  echo "Ready to run:"
  echo " cd $HERE && python3 -m maggy.main"
  echo " Then open http://localhost:8080"
fi
echo "$RULE"
""" from __future__ import annotations import logging import re import shutil import subprocess from dataclasses import dataclass, field logger = logging.getLogger(__name__) _HELP_TIMEOUT = 10 @dataclass class CliProfile: """Discovered CLI capabilities and flags.""" name: str binary: str version: str = "" installed: bool = False prompt_flag: str = "" work_dir_flag: str = "" auto_approve_flag: str = "" output_format_flag: str = "" max_turns_flag: str = "" afk_flag: str = "" uses_exec_subcommand: bool = False uses_run_subcommand: bool = False run_model: str = "" prompt_is_positional: bool = False def build_command( self, prompt: str, wd: str, max_turns: int, ) -> list[str]: """Build full CLI command from discovered flags.""" cmd = [self.binary] if self.uses_exec_subcommand: cmd.append("exec") elif self.uses_run_subcommand: cmd += ["run", self.run_model] if self.prompt_is_positional: if self.prompt_flag: cmd.append(self.prompt_flag) cmd.append(prompt) elif self.prompt_flag: cmd += [self.prompt_flag, prompt] else: cmd.append(prompt) if self.work_dir_flag: cmd += [self.work_dir_flag, wd] if self.auto_approve_flag: cmd.append(self.auto_approve_flag) if self.afk_flag: cmd.append(self.afk_flag) if self.output_format_flag: cmd += [self.output_format_flag, "json"] if self.max_turns_flag and max_turns > 0: cmd += [self.max_turns_flag, str(max_turns)] return cmd @dataclass class DiscoveryResult: """Result of scanning all known CLI tools.""" profiles: dict[str, CliProfile] = field(default_factory=dict) errors: list[str] = field(default_factory=list) _KNOWN_CLIS = ["claude", "codex", "kimi", "deepseek", "ollama"] def discover_all() -> DiscoveryResult: """Scan for all known AI CLIs and probe capabilities.""" result = DiscoveryResult() for name in _KNOWN_CLIS: profile = discover_cli(name) result.profiles[name] = profile if not profile.installed: result.errors.append(f"{name}: not found") return result def discover_cli(name: str) -> CliProfile: """Probe a single CLI binary for 
capabilities.""" binary = shutil.which(name) if not binary: return CliProfile(name=name, binary=name) profile = CliProfile(name=name, binary=binary, installed=True) profile.version = _get_version(binary) help_text = _get_help(binary, "") _extract_flags(profile, help_text) if profile.uses_exec_subcommand: exec_help = _get_help(binary, "exec") _refine_from_exec(profile, exec_help) if profile.uses_run_subcommand: run_help = _get_help(binary, "run") _refine_from_run(profile, run_help) _post_process(profile) return profile def _extract_flags(profile: CliProfile, text: str) -> None: """Extract flags by matching known flag names in help.""" # Print/prompt mode if _has(text, r"-p,\s*--print\b"): profile.prompt_flag = "--print" elif _has(text, r"(-p|--prompt)\b"): profile.prompt_flag = "-p" # Working directory if _has(text, r"--work-dir\b"): profile.work_dir_flag = "-w" elif _has(text, r"-C,\s*--cd\b"): profile.work_dir_flag = "-C" elif _has(text, r"--cwd\b"): profile.work_dir_flag = "--cwd" # Auto-approve / skip permissions if _has(text, r"--dangerously-skip-permissions\b"): profile.auto_approve_flag = "--dangerously-skip-permissions" elif _has(text, r"--dangerously-bypass-approvals"): profile.auto_approve_flag = "--dangerously-bypass-approvals-and-sandbox" elif _has(text, r"--yolo\b"): profile.auto_approve_flag = "--yolo" elif _has(text, r"--auto-approve\b"): profile.auto_approve_flag = "--auto-approve" # Output format if _has(text, r"--output-format\b"): profile.output_format_flag = "--output-format" # Max turns / steps if _has(text, r"--max-turns\b"): profile.max_turns_flag = "--max-turns" elif _has(text, r"--max-steps-per"): profile.max_turns_flag = "--max-steps-per-turn" elif _has(text, r"--max-steps\b"): profile.max_turns_flag = "--max-steps" # AFK mode if _has(text, r"--afk\b"): profile.afk_flag = "--afk" # Exec subcommand for non-interactive use if _has(text, r"\bexec\b.*non-interactive"): profile.uses_exec_subcommand = True # Run subcommand (ollama-style: "run Run 
a model") if _has(text, r"\brun\s+Run a model\b"): profile.uses_run_subcommand = True def _refine_from_exec(profile: CliProfile, text: str) -> None: """Override flags with more specific exec subcommand flags.""" if _has(text, r"-C,\s*--cd\b"): profile.work_dir_flag = "-C" if _has(text, r"--dangerously-bypass-approvals"): profile.auto_approve_flag = "--dangerously-bypass-approvals-and-sandbox" def _refine_from_run(profile: CliProfile, text: str) -> None: """Extract flags from run subcommand help (ollama-style).""" profile.prompt_is_positional = True profile.prompt_flag = "" def _post_process(profile: CliProfile) -> None: """Apply heuristics after flag extraction.""" # --print means non-interactive mode; prompt is positional if profile.prompt_flag == "--print": profile.prompt_is_positional = True profile.prompt_flag = "-p" # exec subcommand: prompt is also positional if profile.uses_exec_subcommand: profile.prompt_is_positional = True profile.prompt_flag = "" # run subcommand (ollama): prompt is positional, need model if profile.uses_run_subcommand: profile.prompt_is_positional = True profile.prompt_flag = "" if not profile.run_model: profile.run_model = _detect_ollama_model(profile) # Claude uses subprocess cwd, not a --cd flag if "claude" in profile.name.lower(): profile.work_dir_flag = "" # If -p is a prompt arg (not print mode), --output-format # is likely tied to --print mode and will error in -p mode if not profile.prompt_is_positional and profile.output_format_flag: profile.output_format_flag = "" def _detect_ollama_model(profile: CliProfile) -> str: """Find best coding model available in ollama.""" try: out = subprocess.run( [profile.binary, "list"], capture_output=True, text=True, timeout=_HELP_TIMEOUT, ) text = out.stdout.lower() except (subprocess.TimeoutExpired, OSError): return "qwen3-coder:30b-a3b-q8_0" # Prefer Qwen3-Coder (MoE, 3.3B active), then older models prefs = [ "qwen3-coder:30b-a3b-q8_0", "qwen3-coder:30b", "qwen2.5-coder:32b", 
"qwen2.5-coder:14b", "qwen2.5-coder:7b", "deepseek-coder-v2", "codellama:34b", "codellama:13b", "qwen3:32b", "llama3.1:70b", "llama3.1:8b", ] for model in prefs: if model.split(":")[0] in text: return model # Fallback: first listed model lines = out.stdout.strip().splitlines() if len(lines) > 1: return lines[1].split()[0] return "qwen3-coder:30b-a3b-q8_0" def _has(text: str, pattern: str) -> bool: """Check if pattern exists in text (case-insensitive).""" return bool(re.search(pattern, text, re.IGNORECASE)) def _get_version(binary: str) -> str: """Get CLI version string.""" for flag in ("--version", "-V", "-v"): try: out = subprocess.run( [binary, flag], capture_output=True, text=True, timeout=_HELP_TIMEOUT, env=_clean_env(), ) text = (out.stdout + out.stderr).strip() if text and len(text) < 200: return text.split("\n")[0] except (subprocess.TimeoutExpired, OSError): continue return "" def _get_help(binary: str, subcommand: str) -> str: """Run --help and return output.""" cmd = [binary] if subcommand: cmd.append(subcommand) cmd.append("--help") try: out = subprocess.run( cmd, capture_output=True, text=True, timeout=_HELP_TIMEOUT, env=_clean_env(), ) return (out.stdout + out.stderr).strip() except (subprocess.TimeoutExpired, OSError) as exc: logger.debug("Help failed for %s: %s", binary, exc) return "" def _clean_env() -> dict[str, str]: """Return env without CLAUDECODE to avoid nesting block.""" import os env = os.environ.copy() env.pop("CLAUDECODE", None) return env ================================================ FILE: maggy/maggy/adapters/pi.py ================================================ """Unified adapter for CLI prompts and Pi RPC control. Auto-discovers installed AI CLIs and their flags at init time so Maggy can orchestrate any subscription-based tool (claude, codex, kimi, etc.) without hardcoded command templates. 
""" from __future__ import annotations import asyncio import json import logging import os import shutil import subprocess from dataclasses import dataclass from typing import AsyncIterator from maggy.adapters.cli_discovery import ( CliProfile, DiscoveryResult, discover_all, ) logger = logging.getLogger(__name__) def _extract_usage(raw: str) -> tuple[float, int, int, str]: """Parse JSON CLI output for cost/tokens; fall back to raw text.""" try: d = json.loads(raw) u = d.get("usage") or {} return ( float(d.get("cost_usd") or 0), int(u.get("input_tokens") or 0), int(u.get("output_tokens") or 0), str(d.get("result", raw)), ) except (json.JSONDecodeError, ValueError, TypeError): return 0.0, 0, 0, raw @dataclass class ModelEntry: name: str provider: str model_id: str tier: str cost_per_1k: float = 0.0 daily_limit_usd: float = 50.0 cli_command: str = "claude" context_window: int = 200_000 DEFAULT_MODELS: list[ModelEntry] = [ ModelEntry("local", "ollama", "qwen3-coder:30b-a3b-q8_0", "local", 0.0, 0.0, "ollama", 32_000), ModelEntry("kimi", "moonshot", "kimi-k2", "cheap", 0.001, 10.0, "kimi", 128_000), ModelEntry("deepseek", "deepseek", "deepseek-v3", "cheap", 0.002, 10.0, "deepseek", 128_000), ModelEntry("gpt", "openai", "gpt-4o", "medium", 0.01, 20.0, "codex", 128_000), ModelEntry("claude", "anthropic", "claude-sonnet-4", "premium", 0.03, 50.0, "claude", 200_000), ModelEntry("codex", "openai", "codex", "validator", 0.02, 30.0, "codex", 200_000), ] QUOTA_MARKERS = frozenset( {"rate limit", "quota", "429", "too many requests", "capacity", "overloaded"} ) @dataclass class RunResult: model: str success: bool output: str = "" error: str = "" cost_usd: float = 0.0 input_tokens: int = 0 output_tokens: int = 0 turns: int = 0 quota_hit: bool = False class PiAdapter: def __init__( self, models: list[ModelEntry] | None = None, rpc_command: str = "pi", discovery: DiscoveryResult | None = None, ): entries = models or DEFAULT_MODELS self._models = {entry.name: entry for entry in 
entries} self._fallback_order = [ entry.name for entry in sorted(entries, key=lambda m: m.cost_per_1k) ] self._rpc_command = rpc_command self._rpc_process: subprocess.Popen[str] | None = None self._streaming = False self._discovery = discovery or discover_all() self._profiles: dict[str, CliProfile] = self._discovery.profiles self._log_discovery() def get_model(self, name: str) -> ModelEntry | None: return self._models.get(name) def list_models(self) -> list[ModelEntry]: return list(self._models.values()) def fallback_chain(self, start: str) -> list[str]: try: idx = self._fallback_order.index(start) except ValueError: return self._fallback_order return self._fallback_order[idx + 1 :] async def send_prompt( self, model_name: str, prompt: str, working_dir: str, max_turns: int = 20, timeout: int = 600, ) -> RunResult: model = self._models.get(model_name) if not model: return RunResult(model=model_name, success=False, error=f"Unknown model: {model_name}") try: proc = await self._spawn_prompt(model, prompt, max_turns, working_dir) stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout) return self._prompt_result(model_name, proc.returncode or 0, stdout or b"") except asyncio.TimeoutError: return RunResult(model=model_name, success=False, error="Timed out") except FileNotFoundError: return RunResult( model=model_name, success=False, error=f"CLI '{model.cli_command}' not found" ) async def send_with_fallback( self, model_name: str, prompt: str, working_dir: str, max_turns: int = 20, ) -> RunResult: result = await self.send_prompt(model_name, prompt, working_dir, max_turns) if result.success: return result for fallback in self.fallback_chain(model_name): logger.info("Falling back from %s to %s", model_name, fallback) result = await self.send_prompt(fallback, prompt, working_dir, max_turns) if result.success: return result return result def send_rpc(self, command: dict[str, object]) -> dict[str, object]: proc = self._ensure_rpc_process() stdin = 
self._require_stream(proc.stdin, "stdin") stdout = self._require_stream(proc.stdout, "stdout") if self._streaming: raise RuntimeError("Cannot send RPC while streaming") stdin.write(f"{json.dumps(command, separators=(',', ':'))}\n") stdin.flush() line = stdout.readline() return json.loads(line or "{}") def switch_model(self, provider: str, model: str) -> bool: payload = {"command": "set_model", "provider": provider, "model": model} return bool(self.send_rpc(payload).get("ok")) async def stream_events(self) -> AsyncIterator[dict[str, object]]: if self._streaming: raise RuntimeError("Already streaming events") stdout = self._require_stream(self._ensure_rpc_process().stdout, "stdout") self._streaming = True try: while True: line = await asyncio.to_thread(stdout.readline) if not line: break yield json.loads(line) finally: self._streaming = False def _build_command( self, model: ModelEntry, prompt: str, max_turns: int, wd: str, ) -> list[str]: profile = self._profiles.get(model.cli_command) if profile and profile.installed: return profile.build_command(prompt, wd, max_turns) return [model.cli_command, "-p", prompt] def _detect_quota(self, text: str) -> bool: return any(marker in text.lower() for marker in QUOTA_MARKERS) def _detect_pi(self) -> bool: return shutil.which(self._rpc_command) is not None async def _spawn_prompt( self, model: ModelEntry, prompt: str, max_turns: int, working_dir: str, ) -> asyncio.subprocess.Process: env = os.environ.copy() env.pop("CLAUDECODE", None) return await asyncio.create_subprocess_exec( *self._build_command(model, prompt, max_turns, working_dir), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, cwd=working_dir, env=env, ) def _log_discovery(self) -> None: for name, p in self._profiles.items(): level = logging.INFO if p.installed else logging.DEBUG logger.log(level, "CLI %s: %s v%s", "OK" if p.installed else "missing", name, p.version) @property def discovered_profiles(self) -> dict[str, CliProfile]: return 
dict(self._profiles) def _prompt_result(self, model_name: str, code: int, stdout: bytes) -> RunResult: raw = stdout.decode("utf-8", errors="replace") quota = self._detect_quota(raw) cost, in_t, out_t, text = _extract_usage(raw) return RunResult( model=model_name, success=code == 0, output=text, error="" if code == 0 else f"Exit code {code}", quota_hit=quota, cost_usd=cost, input_tokens=in_t, output_tokens=out_t, ) def _ensure_rpc_process(self) -> subprocess.Popen[str]: proc = self._rpc_process if proc and getattr(proc, "poll", lambda: None)() is None: return proc self._rpc_process = subprocess.Popen( [self._rpc_command], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, ) return self._rpc_process def _require_stream(self, stream: object, name: str): if stream is None: raise RuntimeError(f"Pi RPC {name} is unavailable") return stream ================================================ FILE: maggy/maggy/api/__init__.py ================================================ ================================================ FILE: maggy/maggy/api/auth.py ================================================ """Shared authentication and configuration guards.""" from __future__ import annotations from fastapi import HTTPException, Request def check_auth( request: Request, x_api_key: str | None, ) -> None: """Simple token check. 
Bypassed when auth_mode='local'.""" cfg = request.app.state.cfg if cfg.dashboard.auth_mode == "local": return expected = cfg.dashboard.api_key if not expected or x_api_key != expected: raise HTTPException( status_code=401, detail="Invalid or missing X-API-Key", ) def require_configured(request: Request) -> None: """Abort 503 if Maggy is not configured.""" if not getattr(request.app.state, "configured", False): raise HTTPException( status_code=503, detail="Maggy is not configured yet.", ) def require_provider(request: Request) -> None: """Abort 503 if no provider credentials (Tier 2).""" mode = getattr(request.app.state, "mode", "local") if mode != "full": raise HTTPException( status_code=503, detail="Provider credentials required. " "Set GITHUB_TOKEN or configure Asana.", ) ================================================ FILE: maggy/maggy/api/routes.py ================================================ """REST API routes — wraps services. All routes under /api/*.""" from __future__ import annotations import logging from typing import Literal from fastapi import APIRouter, Header, HTTPException, Query, Request from pydantic import BaseModel logger = logging.getLogger(__name__) router = APIRouter(prefix="/api", tags=["maggy"]) def _auth(request: Request, x_api_key: str | None) -> None: """Simple token check. Bypassed when auth_mode='local'.""" cfg = request.app.state.cfg if cfg.dashboard.auth_mode == "local": return expected = cfg.dashboard.api_key if not expected or x_api_key != expected: raise HTTPException(status_code=401, detail="Invalid or missing X-API-Key") def _require_configured(request: Request) -> None: """Abort 503 if no provider credentials (Tier 2).""" mode = getattr(request.app.state, "mode", "local") if mode != "full": raise HTTPException( status_code=503, detail="Provider credentials required. 
" "Set GITHUB_TOKEN or configure Asana.", ) # ── Health + Config ────────────────────────────────────────────────────── @router.get("/health") async def health(request: Request) -> dict: cfg = request.app.state.cfg mode = getattr(request.app.state, "mode", "local") return { "status": "ok", "version": "0.1.0", "mode": mode, "provider": cfg.issue_tracker.provider, "org": cfg.org.name, "codebases": len(cfg.codebases), "competitors_enabled": bool( cfg.competitors.categories, ), } @router.get("/activity") async def get_activity(request: Request) -> dict: """Live CLI sessions + recent prompts. No credentials needed.""" return request.app.state.activity.get_activity() @router.get("/discovery") async def get_discovery(request: Request) -> dict: """Return auto-discovered environment info.""" from maggy.discovery import full_discovery result = full_discovery() return { "clis": result.clis, "repos": result.repos, "active_projects": result.active_projects, "tokens": result.tokens, "github_org": result.github_org, } @router.get("/config") async def get_config(request: Request, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) cfg = request.app.state.cfg # Redact secrets before returning return { "org": {"name": cfg.org.name, "domain": cfg.org.domain}, "issue_tracker": {"provider": cfg.issue_tracker.provider}, "codebases": [{"key": c.key, "path": c.path} for c in cfg.codebases], "competitors": {"categories": cfg.competitors.categories, "seed": cfg.competitors.seed}, "okrs": {"source": cfg.okrs.source, "count": len(cfg.okrs.items)}, "ai": {"provider": cfg.ai.provider, "model": cfg.ai.model, "has_key": bool(cfg.ai.api_key)}, } # ── Inbox ──────────────────────────────────────────────────────────────── @router.get("/inbox") async def get_inbox(request: Request, refresh: bool = Query(False), x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) items = await 
request.app.state.inbox.get_prioritized(force_refresh=refresh) return {"items": items, "total": len(items)} @router.get("/followed") async def get_followed(request: Request, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) try: tasks = await request.app.state.provider.list_followed(limit=50) except Exception as e: logger.warning("list_followed failed: %s", e) raise HTTPException(status_code=502, detail="Issue tracker unavailable") return { "items": [ { "id": t.id, "title": t.title, "board": t.board, "url": t.url, "assignee": t.assignee, "updated_at": t.updated_at, "labels": t.labels, } for t in tasks ], "total": len(tasks), } # ── Task detail + comments ─────────────────────────────────────────────── @router.get("/task/{task_id:path}") async def get_task(request: Request, task_id: str, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) try: task = await request.app.state.provider.get_task(task_id) except Exception as e: logger.warning("get_task(%s) failed: %s", task_id, e) raise HTTPException(status_code=502, detail="Issue tracker unavailable") if not task: raise HTTPException(status_code=404, detail="Task not found") try: comments = await request.app.state.provider.get_comments(task_id) except Exception as e: logger.warning("get_comments(%s) failed: %s", task_id, e) comments = [] return { "task": { "id": task.id, "title": task.title, "description": task.description, "status": task.status, "assignee": task.assignee, "url": task.url, "labels": task.labels, "board": task.board, "created_at": task.created_at, "updated_at": task.updated_at, }, "comments": [{"id": c.id, "author": c.author, "text": c.text, "created_at": c.created_at} for c in comments], } class CommentRequest(BaseModel): text: str @router.post("/task/{task_id:path}/comment") async def post_comment(request: Request, task_id: str, body: CommentRequest, x_api_key: str | None = Header(None)) -> dict: 
_auth(request, x_api_key) _require_configured(request) if not body.text.strip(): raise HTTPException(status_code=400, detail="Comment text is required") try: comment = await request.app.state.provider.add_comment(task_id, body.text) except Exception as e: logger.warning("add_comment(%s) failed: %s", task_id, e) raise HTTPException(status_code=502, detail="Issue tracker unavailable") if not comment: raise HTTPException(status_code=502, detail="Issue tracker rejected the comment") return {"ok": True, "comment": {"id": comment.id, "text": comment.text, "created_at": comment.created_at}} class StatusRequest(BaseModel): status: str @router.post("/task/{task_id:path}/status") async def update_status(request: Request, task_id: str, body: StatusRequest, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) try: ok = await request.app.state.provider.update_status(task_id, body.status) except Exception as e: logger.warning("update_status(%s) failed: %s", task_id, e) raise HTTPException(status_code=502, detail="Issue tracker unavailable") return {"ok": ok} # ── Execute ────────────────────────────────────────────────────────────── class ExecuteRequest(BaseModel): task_id: str mode: Literal["tdd", "plan"] = "tdd" working_dir: str | None = None # override; otherwise auto-picked @router.post("/execute") async def execute(request: Request, body: ExecuteRequest, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) try: session_id = await request.app.state.executor.start( task_id=body.task_id, mode=body.mode, working_dir=body.working_dir, ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) return {"session_id": session_id, "status": "running"} @router.get("/execute/sessions") async def list_sessions(request: Request, x_api_key: str | None = Header(None)) -> list[dict]: _auth(request, x_api_key) _require_configured(request) return 
request.app.state.executor.list_sessions() @router.get("/execute/sessions/{session_id}") async def get_session(request: Request, session_id: str, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) s = request.app.state.executor.get_session(session_id) if not s: raise HTTPException(status_code=404, detail="Session not found") return s # ── Competitors ────────────────────────────────────────────────────────── @router.get("/competitors") async def list_competitors(request: Request, x_api_key: str | None = Header(None)) -> list[dict]: _auth(request, x_api_key) _require_configured(request) return request.app.state.competitors.list_all() @router.post("/competitors/discover") async def discover_competitors(request: Request, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) return await request.app.state.competitors.discover() @router.post("/competitors/monitor") async def trigger_monitoring(request: Request, x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) return await request.app.state.competitors.monitor_all() @router.get("/competitors/news") async def get_competitor_news(request: Request, limit: int = Query(100), x_api_key: str | None = Header(None)) -> list[dict]: _auth(request, x_api_key) _require_configured(request) return request.app.state.competitors.get_news(limit=limit) @router.get("/competitors/news/summary") async def get_briefing(request: Request, refresh: bool = Query(False), x_api_key: str | None = Header(None)) -> dict: _auth(request, x_api_key) _require_configured(request) return await request.app.state.competitors.get_daily_briefing(refresh=refresh) ================================================ FILE: maggy/maggy/api/routes_budget.py ================================================ """Budget REST endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, Request from 
.auth import check_auth router = APIRouter(prefix="/api/budget", tags=["budget"]) @router.get("") async def get_budget( request: Request, x_api_key: str | None = Header(None), ) -> dict: """Return current budget status.""" check_auth(request, x_api_key) budget = request.app.state.budget if not budget: return {"status": "unconfigured"} return budget.budget_status() @router.get("/by-provider") async def by_provider( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: """Return spend breakdown by provider.""" check_auth(request, x_api_key) budget = request.app.state.budget if not budget: return [] return budget.by_provider() ================================================ FILE: maggy/maggy/api/routes_chat.py ================================================ """Chat API routes — interactive Claude sessions via SSE.""" from __future__ import annotations import json import logging from dataclasses import asdict from fastapi import APIRouter, Header, HTTPException, Request from fastapi.responses import StreamingResponse from pydantic import BaseModel from maggy.api.auth import check_auth logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/chat", tags=["chat"]) def _require_chat(request: Request): chat = getattr(request.app.state, "chat", None) if chat is None: raise HTTPException( status_code=503, detail="Chat service not available.", ) return chat class CreateSessionRequest(BaseModel): project_key: str project_path: str | None = None class SendMessageRequest(BaseModel): message: str class RoutedMessageRequest(BaseModel): message: str blast_score: int | None = None task_type: str | None = None allowed_models: list[str] | None = None @router.post("/auto-connect") async def auto_connect( request: Request, x_api_key: str | None = Header(None), ) -> dict: """Auto-connect to all active projects.""" check_auth(request, x_api_key) chat = _require_chat(request) activity = getattr(request.app.state, "activity", None) if not activity: return 
{"sessions": []} data = activity.get_activity() active = data.get("sessions", []) recent = data.get("recent", []) sessions = chat.auto_connect(active) history = getattr(request.app.state, "history", None) result = [] for s in sessions: ctx = _enrich_session(s, history, recent) result.append(_session_summary(s, ctx)) return {"sessions": result} def _enrich_session(s, history, recent: list[dict]) -> str: """Build context and resolve session ID.""" from maggy.services.chat_context import ( build_project_context, resolve_claude_session_id, ) ctx = build_project_context( history, s.working_dir, s.project_key, recent, ) s.history_context = ctx if not s.claude_session_id: sid = resolve_claude_session_id(s.working_dir) if sid: s.claude_session_id = sid return ctx def _session_summary(s, context: str) -> dict: """Format session for API response.""" return { "id": s.id, "project_key": s.project_key, "working_dir": s.working_dir, "status": s.status, "messages": len(s.messages), "history_context": context, "has_resume_id": bool(s.claude_session_id), } @router.post("/sessions") async def create_session( request: Request, body: CreateSessionRequest, x_api_key: str | None = Header(None), ) -> dict: """Create a new chat session.""" check_auth(request, x_api_key) chat = _require_chat(request) try: session = chat.create_session( body.project_key, project_path=body.project_path, ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) return { "id": session.id, "project_key": session.project_key, "working_dir": session.working_dir, "status": session.status, } @router.get("/sessions") async def list_sessions( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: """List all chat sessions.""" check_auth(request, x_api_key) chat = _require_chat(request) return [ { "id": s.id, "project_key": s.project_key, "status": s.status, "created_at": s.created_at, "messages": len(s.messages), } for s in chat.list_sessions() ] 
@router.get("/sessions/{session_id}")
async def get_session(
    request: Request,
    session_id: str,
    x_api_key: str | None = Header(None),
) -> dict:
    """Return one session with its message history, or 404."""
    check_auth(request, x_api_key)
    chat = _require_chat(request)
    session = chat.get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return {
        "id": session.id,
        "project_key": session.project_key,
        "working_dir": session.working_dir,
        "status": session.status,
        "created_at": session.created_at,
        "history_context": session.history_context,
        "messages": [asdict(msg) for msg in session.messages],
    }


@router.post("/sessions/{session_id}/send")
async def send_message(
    request: Request,
    session_id: str,
    body: SendMessageRequest,
    x_api_key: str | None = Header(None),
):
    """Forward a message to the session and stream the reply as SSE.

    Unknown sessions are a 404 and blank messages a 400. "result" chunks
    are recorded against the budget when one is configured.
    """
    check_auth(request, x_api_key)
    chat = _require_chat(request)
    if not chat.get_session(session_id):
        raise HTTPException(status_code=404, detail="Session not found")
    if not body.message.strip():
        raise HTTPException(status_code=400, detail="Message required")
    budget = getattr(request.app.state, "budget", None)

    async def event_stream():
        async for chunk in chat.send(session_id, body.message):
            is_result = chunk.get("type") == "result"
            if budget and is_result:
                _record_chat_spend(budget, chunk)
            payload = json.dumps(chunk)
            yield f"data: {payload}\n\n"
        # Terminal marker so the client knows the stream is complete.
        yield "data: {\"type\": \"done\"}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
    )


@router.post("/sessions/{session_id}/send-routed")
async def send_routed(
    request: Request,
    session_id: str,
    body: RoutedMessageRequest,
    x_api_key: str | None = Header(None),
):
    """Stream a reply, preceded by a routing decision when routing is set up.

    The stream emits an optional "routing" event, then the chat chunks,
    then a "done" event. The routing outcome is recorded after the chat
    chunks have been consumed.
    """
    check_auth(request, x_api_key)
    chat = _require_chat(request)
    if not chat.get_session(session_id):
        raise HTTPException(
            status_code=404,
            detail="Session not found",
        )
    if not body.message.strip():
        raise HTTPException(
            status_code=400,
            detail="Message required",
        )
    app_state = request.app.state
    routing = getattr(app_state, "routing", None)
    budget = getattr(app_state, "budget", None)

    async def event_stream():
        from maggy.services.chat_router import RoutedChat

        decision = None
        if routing:
            decision = RoutedChat(routing, budget).decide(
                body.message,
                body.blast_score,
                body.task_type,
            )
            permitted = body.allowed_models
            # Caller restriction overrides the router's pick.
            if permitted and decision.model not in permitted:
                decision.model = permitted[0]
                decision.reason = f"restricted to {','.join(permitted)}"
            routing_event = {
                "type": "routing",
                "model": decision.model,
                "blast": decision.blast,
                "task_type": decision.task_type,
                "reason": decision.reason,
            }
            yield f"data: {json.dumps(routing_event)}\n\n"
        failed = False
        async for chunk in chat.send(session_id, body.message):
            kind = chunk.get("type")
            if budget and kind == "result":
                _record_chat_spend(budget, chunk)
            if kind == "error":
                failed = True
            yield f"data: {json.dumps(chunk)}\n\n"
        _record_routing_outcome(
            routing, decision, had_error=failed,
        )
        yield 'data: {"type": "done"}\n\n'

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
    )


def _record_chat_spend(budget, chunk: dict) -> None:
    """Record cost and token counts from a result chunk, if any are set."""
    cost = chunk.get("cost_usd", 0)
    tokens_in = chunk.get("input_tokens", 0)
    tokens_out = chunk.get("output_tokens", 0)
    if not (cost or tokens_in or tokens_out):
        return
    budget.record_spend("anthropic", "claude", cost, tokens_in, tokens_out)


def _record_routing_outcome(routing, decision, *, had_error: bool) -> None:
    """Report a reward of 1.0 (clean run) or 0.0 (error) for the decision.

    Does nothing when routing is not configured or no decision was made.
    """
    if routing and decision:
        reward = 0.0 if had_error else 1.0
        routing.record_outcome(
            decision.model, decision.task_type, decision.blast, reward,
        )


@router.delete("/sessions/{session_id}")
async def delete_session(
    request: Request,
    session_id: str,
    x_api_key: str | None = Header(None),
) -> dict:
    """Delete a chat session; 404 when the service reports nothing deleted."""
    check_auth(request, x_api_key)
    chat = _require_chat(request)
    removed = chat.delete_session(session_id)
    if removed:
        return {"ok": True}
    raise HTTPException(status_code=404, detail="Session not found")
================================================ FILE: maggy/maggy/api/routes_cikg.py ================================================ """CIKG REST endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, Request from .auth import check_auth router = APIRouter(prefix="/api/cikg", tags=["cikg"]) @router.get("/landscape") async def landscape( request: Request, x_api_key: str | None = Header(None), ) -> dict: """Return competitive landscape summary.""" check_auth(request, x_api_key) graph = request.app.state.cikg if not graph: return {"error": "cikg not configured"} from maggy.cikg.queries import get_landscape return get_landscape(graph) @router.get("/gaps/{feature}") async def feature_gaps( request: Request, feature: str, x_api_key: str | None = Header(None), ) -> dict: """Score a feature against competitive landscape.""" check_auth(request, x_api_key) graph = request.app.state.cikg if not graph: return {"error": "cikg not configured"} from maggy.cikg.queries import find_gaps from dataclasses import asdict return asdict(find_gaps(graph, feature)) ================================================ FILE: maggy/maggy/api/routes_deploy.py ================================================ """Deploy REST endpoints.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, Request from pydantic import BaseModel, Field from .auth import check_auth router = APIRouter(prefix="/api/deploy", tags=["deploy"]) class CreateSessionRequest(BaseModel): project: str = Field(..., min_length=1, max_length=200) branch: str = Field(default="main", max_length=200) @router.get("/sessions") async def list_sessions( request: Request, x_api_key: str | None = Header(None), ) -> dict: """List all deploy sessions.""" check_auth(request, x_api_key) svc = request.app.state.deploy if not svc: return {"error": "deploy not configured"} return { "sessions": [asdict(s) for s in svc.list_sessions()], } 
@router.get("/sessions/{sid}") async def get_session( request: Request, sid: str, x_api_key: str | None = Header(None), ) -> dict: """Get a specific deploy session.""" check_auth(request, x_api_key) svc = request.app.state.deploy if not svc: return {"error": "deploy not configured"} session = svc.get_session(sid) if not session: return {"error": "session not found"} return asdict(session) @router.post("/sessions") async def create_session( request: Request, body: CreateSessionRequest, x_api_key: str | None = Header(None), ) -> dict: """Create a new deploy session.""" check_auth(request, x_api_key) svc = request.app.state.deploy if not svc: return {"error": "deploy not configured"} session = svc.create_session( project=body.project, branch=body.branch, ) return asdict(session) ================================================ FILE: maggy/maggy/api/routes_engram.py ================================================ """Engram REST endpoints.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, Request from .auth import check_auth router = APIRouter(prefix="/api/engram", tags=["engram"]) @router.get("/query") async def query_engrams( request: Request, namespace: str | None = None, memory_type: str | None = None, limit: int = 50, x_api_key: str | None = Header(None), ) -> dict: """Query engram records.""" check_auth(request, x_api_key) engram = request.app.state.engram if not engram: return {"error": "engram not configured"} records = engram.query( namespace=namespace, memory_type=memory_type, limit=limit, ) return {"records": [asdict(r) for r in records]} @router.get("/diagnostics") async def diagnostics( request: Request, namespace: str | None = None, x_api_key: str | None = Header(None), ) -> dict: """Run memory diagnostics.""" check_auth(request, x_api_key) store = request.app.state.engram if not store: return {"error": "engram not configured"} from maggy.engram.diagnostics import diagnose profile = diagnose(store, 
namespace) return asdict(profile) ================================================ FILE: maggy/maggy/api/routes_escalation.py ================================================ """Escalation REST endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, HTTPException, Request from pydantic import BaseModel from .auth import check_auth router = APIRouter(prefix="/api/escalations", tags=["escalations"]) class _EscalationIn(BaseModel): session_id: str reason: str context: dict = {} class _ResolveIn(BaseModel): guidance: str @router.get("") async def list_pending( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: """List pending escalations.""" check_auth(request, x_api_key) esc = request.app.state.escalator if not esc: return [] return [ { "id": p.id, "session_id": p.session_id, "reason": p.reason, "created_at": p.created_at, } for p in esc.list_pending() ] @router.post("", status_code=201) async def create_escalation( body: _EscalationIn, request: Request, x_api_key: str | None = Header(None), ) -> dict: """Create a new escalation.""" check_auth(request, x_api_key) esc = request.app.state.escalator if not esc: raise HTTPException(503, "Not configured") packet = esc.escalate( body.session_id, body.reason, body.context, ) return {"id": packet.id, "status": "pending"} @router.post("/{escalation_id}/resolve") async def resolve_escalation( escalation_id: str, body: _ResolveIn, request: Request, x_api_key: str | None = Header(None), ) -> dict: """Resolve an escalation with guidance.""" check_auth(request, x_api_key) esc = request.app.state.escalator if not esc: raise HTTPException(503, "Not configured") try: packet = esc.resolve(escalation_id, body.guidance) except KeyError: raise HTTPException(404, "Not found") return {"id": packet.id, "status": "resolved"} ================================================ FILE: maggy/maggy/api/routes_events.py ================================================ """Event Spine REST 
endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, Request from .auth import check_auth router = APIRouter(prefix="/api/events", tags=["events"]) @router.get("") async def query_events( request: Request, task_id: str | None = None, event_type: str | None = None, project_id: str | None = None, limit: int = 100, x_api_key: str | None = Header(None), ) -> list[dict]: """Query events with optional filters.""" check_auth(request, x_api_key) emitter = request.app.state.events if not emitter: return [] return emitter.query(task_id, event_type, project_id, limit) @router.get("/trace/{task_id}") async def trace_task( request: Request, task_id: str, x_api_key: str | None = Header(None), ) -> list[dict]: """Get full event chain for a task.""" check_auth(request, x_api_key) emitter = request.app.state.events if not emitter: return [] return emitter.trace(task_id) @router.get("/count") async def count_events( request: Request, event_type: str | None = None, project_id: str | None = None, x_api_key: str | None = Header(None), ) -> dict: """Count events matching filters.""" check_auth(request, x_api_key) emitter = request.app.state.events if not emitter: return {"count": 0} return {"count": emitter.count(event_type, project_id)} ================================================ FILE: maggy/maggy/api/routes_forge.py ================================================ """Forge REST endpoints.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, Request from pydantic import BaseModel, Field from .auth import check_auth router = APIRouter(prefix="/api/forge", tags=["forge"]) class GapReport(BaseModel): capability: str = Field(..., min_length=1, max_length=200) @router.get("/status") async def forge_status( request: Request, x_api_key: str | None = Header(None), ) -> dict: """Return Forge connector status.""" check_auth(request, x_api_key) forge = request.app.state.forge if not forge: return 
{"error": "forge not configured"} return asdict(forge.status()) @router.get("/search") async def search_tools( request: Request, q: str = "", x_api_key: str | None = Header(None), ) -> dict: """Search the Forge tool registry.""" check_auth(request, x_api_key) forge = request.app.state.forge if not forge: return {"error": "forge not configured"} return {"results": forge.search_tools(q)} @router.get("/gaps") async def list_gaps( request: Request, x_api_key: str | None = Header(None), ) -> dict: """List detected capability gaps.""" check_auth(request, x_api_key) forge = request.app.state.forge if not forge: return {"error": "forge not configured"} return {"gaps": forge.get_gaps()} @router.post("/gaps") async def report_gap( request: Request, body: GapReport, x_api_key: str | None = Header(None), ) -> dict: """Report a capability gap.""" check_auth(request, x_api_key) forge = request.app.state.forge if not forge: return {"error": "forge not configured"} return forge.report_gap(body.capability) ================================================ FILE: maggy/maggy/api/routes_heartbeat.py ================================================ """Heartbeat API routes — scheduler status and manual triggers.""" from __future__ import annotations from fastapi import APIRouter, Header, HTTPException, Request from maggy.api.auth import check_auth router = APIRouter(prefix="/api", tags=["heartbeat"]) @router.get("/heartbeat/status") async def heartbeat_status( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: check_auth(request, x_api_key) scheduler = getattr(request.app.state, "heartbeat", None) if not scheduler: return [] return scheduler.status() @router.post("/heartbeat/trigger/{job_name}") async def trigger_job( request: Request, job_name: str, x_api_key: str | None = Header(None), ) -> dict: check_auth(request, x_api_key) scheduler = getattr(request.app.state, "heartbeat", None) if not scheduler: raise HTTPException(status_code=503, detail="Heartbeat not 
running") try: return await scheduler.trigger(job_name) except KeyError: raise HTTPException(status_code=404, detail=f"Job '{job_name}' not found") ================================================ FILE: maggy/maggy/api/routes_history.py ================================================ """API routes for session history analysis.""" from __future__ import annotations from fastapi import APIRouter, Header, HTTPException, Request from maggy.api.auth import check_auth router = APIRouter( prefix="/api/history", tags=["history"], ) def _require_history(request: Request): svc = getattr(request.app.state, "history", None) if svc is None: raise HTTPException( status_code=503, detail="History service not available.", ) return svc @router.post("/analyze") async def analyze_history( request: Request, x_api_key: str | None = Header(None), ): """Trigger full history analysis pipeline.""" check_auth(request, x_api_key) svc = _require_history(request) report = svc.analyze() return { "status": "ok", "total_sessions": report.total_sessions, "total_prompts": report.total_prompts, "providers": len(report.providers), "patterns": report.patterns, "summary": report.summary, } @router.get("/report") async def get_report( request: Request, x_api_key: str | None = Header(None), ): """Get latest cached history report.""" check_auth(request, x_api_key) svc = _require_history(request) report = svc.get_report() if not report: return {"status": "no_data"} return report @router.get("/sessions") async def get_sessions( request: Request, provider: str | None = None, x_api_key: str | None = Header(None), ): """Get parsed session records.""" check_auth(request, x_api_key) svc = _require_history(request) sessions = svc.get_sessions(provider=provider) return {"sessions": sessions, "total": len(sessions)} @router.get("/providers") async def list_providers( request: Request, x_api_key: str | None = Header(None), ): """List which CLI tools are available.""" check_auth(request, x_api_key) svc = 
_require_history(request) return {"providers": svc.available_providers()} ================================================ FILE: maggy/maggy/api/routes_improve.py ================================================ """Self-improvement API routes — reports and manual analysis.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, HTTPException, Request from maggy.api.auth import check_auth router = APIRouter(prefix="/api", tags=["improve"]) @router.get("/improve/report") async def get_report( request: Request, x_api_key: str | None = Header(None), ) -> dict: check_auth(request, x_api_key) introspector = getattr(request.app.state, "introspector", None) if not introspector: raise HTTPException(status_code=503, detail="Not configured") report = introspector.get_report() if not report: return {"report": None} return {"report": asdict(report)} @router.post("/improve/analyze") async def run_analysis( request: Request, x_api_key: str | None = Header(None), ) -> dict: check_auth(request, x_api_key) introspector = getattr(request.app.state, "introspector", None) if not introspector: raise HTTPException(status_code=503, detail="Not configured") report = introspector.analyze() return {"report": asdict(report)} ================================================ FILE: maggy/maggy/api/routes_lexon.py ================================================ """Lexon REST endpoints.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, Request from pydantic import BaseModel, Field from .auth import check_auth router = APIRouter(prefix="/api/lexon", tags=["lexon"]) class LearnRequest(BaseModel): phrase: str = Field(..., min_length=1, max_length=500) tool: str = Field(..., min_length=1, max_length=100) @router.get("/parse") async def parse_intent( request: Request, q: str = "", x_api_key: str | None = Header(None), ) -> dict: """Parse a phrase into a tool intent.""" check_auth(request, 
x_api_key) lexon = request.app.state.lexon if not lexon: return {"error": "lexon not configured"} record = lexon.route(q) return asdict(record) @router.post("/learn") async def learn_mapping( request: Request, body: LearnRequest, x_api_key: str | None = Header(None), ) -> dict: """Record a confirmed phrase-to-tool mapping.""" check_auth(request, x_api_key) lexon = request.app.state.lexon if not lexon: return {"error": "lexon not configured"} lexon.learn(body.phrase, body.tool) return {"status": "learned"} ================================================ FILE: maggy/maggy/api/routes_mesh.py ================================================ """Mesh P2P REST endpoints — data operations.""" from __future__ import annotations from dataclasses import asdict from fastapi import APIRouter, Header, Request from fastapi.responses import JSONResponse from pydantic import BaseModel, Field from .auth import check_auth router = APIRouter(prefix="/api/mesh", tags=["mesh"]) class AddPeerRequest(BaseModel): org: str peer_id: str name: str = "" address: str = "" port: int = Field(default=8080, ge=1, le=65535) class PromoteRequest(BaseModel): org: str key: str @router.get("/status") async def mesh_status( request: Request, x_api_key: str | None = Header(None), ) -> dict: """Return mesh status across all networks.""" check_auth(request, x_api_key) mesh = request.app.state.mesh if not mesh: return {"enabled": False, "peers": 0} return { "enabled": True, "peers": mesh.total_peers, "networks": mesh.list_networks(), } @router.get("/networks") async def list_networks( request: Request, x_api_key: str | None = Header(None), ) -> dict: """List all org-scoped mesh networks.""" check_auth(request, x_api_key) mesh = request.app.state.mesh if not mesh: return {"networks": []} return {"networks": mesh.list_networks()} @router.get("/peers") async def list_peers( request: Request, org: str = "", x_api_key: str | None = Header(None), ) -> dict: """List peers, optionally filtered by org.""" 
check_auth(request, x_api_key) mesh = request.app.state.mesh if not mesh: return JSONResponse( {"error": "mesh not enabled"}, status_code=503, ) if org: net = mesh.get_network(org) if not net: return JSONResponse( {"error": f"unknown org: {org}"}, status_code=404, ) return { "peers": [asdict(p) for p in net.peers.list_peers()], } peers = [] for status in mesh.list_networks(): net = mesh.get_network(status["org"]) if net: peers.extend( asdict(p) for p in net.peers.list_peers() ) return {"peers": peers}


@router.post("/peers")
async def add_peer(
    request: Request,
    body: AddPeerRequest,
    x_api_key: str | None = Header(None),
) -> dict:
    """Manually add a peer to a network."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    net = mesh.get_network(body.org)
    if not net:
        return JSONResponse(
            {"error": f"unknown org: {body.org}"},
            status_code=404,
        )
    from maggy.mesh.discovery import PeerInfo

    # Peers added through this endpoint are flagged ``manual=True``.
    net.peers.register(PeerInfo(
        peer_id=body.peer_id,
        name=body.name,
        address=body.address,
        port=body.port,
        org=body.org,
        manual=True,
    ))
    return {"status": "added", "peer_id": body.peer_id}


@router.get("/quarantine")
async def quarantine_list(
    request: Request,
    org: str = "",
    x_api_key: str | None = Header(None),
) -> dict:
    """List quarantined items for an org."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    # ``org`` defaults to "" in the signature but is required in practice.
    if not org:
        return JSONResponse(
            {"error": "org parameter required"},
            status_code=422,
        )
    net = mesh.get_network(org)
    if not net:
        return JSONResponse(
            {"error": f"unknown org: {org}"},
            status_code=404,
        )
    items = [asdict(e) for e in net.quarantine.list_all()]
    return {"items": items}


@router.post("/promote")
async def promote(
    request: Request,
    body: PromoteRequest,
    x_api_key: str | None = Header(None),
) -> dict:
    """Promote a quarantined item into shared memories."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    net = mesh.get_network(body.org)
    if not net:
        return JSONResponse(
            {"error": f"unknown org: {body.org}"},
            status_code=404,
        )
    ok = net.sync.promote_from_quarantine(body.key)
    return {"promoted": ok}


================================================
FILE: maggy/maggy/api/routes_mesh_admin.py
================================================
"""Mesh P2P REST endpoints — admin operations."""

from __future__ import annotations

from fastapi import APIRouter, Header, Request
from fastapi.responses import JSONResponse

from .auth import check_auth

router = APIRouter(prefix="/api/mesh", tags=["mesh"])


@router.post("/announce")
async def announce(
    request: Request,
    x_api_key: str | None = Header(None),
) -> dict:
    """Announce self to all org mesh repos via git."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    # The GitHub token comes from the issue-tracker section of the config.
    cfg = request.app.state.cfg
    token = cfg.issue_tracker.github.token
    if not token:
        return JSONResponse(
            {"error": "no github token"},
            status_code=422,
        )
    result = await mesh.announce_all(token)
    return {"announced": result}


@router.post("/discover")
async def discover(
    request: Request,
    x_api_key: str | None = Header(None),
) -> dict:
    """Trigger git-based peer discovery for all orgs."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    cfg = request.app.state.cfg
    token = cfg.issue_tracker.github.token
    if not token:
        return JSONResponse(
            {"error": "no github token"},
            status_code=422,
        )
    result = await mesh.discover(token)
    return {"discovered": result}


@router.post("/setup")
async def setup(
    request: Request,
    x_api_key: str | None = Header(None),
) -> dict:
    """Onboarding: create mesh repos for all orgs."""
    check_auth(request, x_api_key)
    mesh = request.app.state.mesh
    if not mesh:
        return JSONResponse(
            {"error": "mesh not enabled"},
            status_code=503,
        )
    cfg = request.app.state.cfg
    token = cfg.issue_tracker.github.token
    if not token:
        return JSONResponse(
            {"error": "no github token"},
            status_code=422,
        )
    result = await mesh.setup_repos(token)
    return {"repos_created": result}


================================================
FILE: maggy/maggy/api/routes_monitor.py
================================================
"""API routes for monitor service — tracker polling."""

from __future__ import annotations

from fastapi import APIRouter, Request

# NOTE(review): none of the endpoints in this module call check_auth or accept
# an X-API-Key header, unlike most sibling routers; confirm that leaving
# /api/monitor/* (including POST /stop) unauthenticated is intentional.
router = APIRouter(prefix="/api/monitor", tags=["monitor"])


@router.get("/status")
async def monitor_status(request: Request) -> dict:
    """Get active monitor status."""
    svc = getattr(request.app.state, "monitor", None)
    if not svc:
        return {"active": 0, "monitors": []}
    return svc.status()


@router.post("/start")
async def monitor_start(request: Request) -> dict:
    """Start monitoring current project's tracker."""
    svc = getattr(request.app.state, "monitor", None)
    if not svc:
        return {"ok": False, "error": "monitor not configured"}
    # NOTE(review): nothing is started here; the handler only reports how many
    # monitors are already active. Confirm against the monitor service.
    return {"ok": True, "active": len(svc.list_active())}


@router.post("/stop")
async def monitor_stop(request: Request) -> dict:
    """Stop all monitors."""
    svc = getattr(request.app.state, "monitor", None)
    if not svc:
        return {"ok": False}
    # NOTE(review): removes entries while iterating list_active(); presumably
    # that returns a copy rather than the live collection. Verify.
    for cfg in svc.list_active():
        svc.remove(cfg.project_key)
    return {"ok": True}


================================================
FILE: maggy/maggy/api/routes_observability.py
================================================
"""Observability signal REST endpoints."""

from __future__ import annotations

from fastapi import APIRouter, Header, HTTPException, Request
from pydantic import BaseModel

from .auth import check_auth

router = APIRouter(
    prefix="/api/observability",
    tags=["observability"],
)


class _SignalIn(BaseModel):
    # Request body for POST /api/observability/record.
    project: str
    signal_type: str
    value: float


@router.get("/signals/{project}")
async def get_signals(
    project: str,
    request: Request,
    x_api_key: str | None = Header(None),
    limit: int = 20,
) -> list[dict]:
    """Get recent signals for a project."""
    check_auth(request, x_api_key)
    obs = request.app.state.observability
    if not obs:
        return []
    # The requested limit is capped at 100 (no lower bound is applied).
    return obs.recent_signals(project, min(limit, 100))


@router.post("/record", status_code=201)
async def record_signal(
    body: _SignalIn,
    request: Request,
    x_api_key: str | None = Header(None),
) -> dict:
    """Record an observability signal."""
    check_auth(request, x_api_key)
    obs = request.app.state.observability
    if not obs:
        raise HTTPException(503, "Not configured")
    obs.record_signal(body.project, body.signal_type, body.value)
    return {"status": "recorded"}


================================================
FILE: maggy/maggy/api/routes_planning.py
================================================
"""Planning REST endpoints."""

from __future__ import annotations

from dataclasses import asdict

from fastapi import APIRouter, Header, Request
from pydantic import BaseModel, Field

from .auth import check_auth

router = APIRouter(prefix="/api/planning", tags=["planning"])


class PlanGenerateRequest(BaseModel):
    # Request body for POST /api/planning/generate.
    task: str = Field(..., min_length=1, max_length=2000)
    blast_score: int = Field(default=0, ge=0, le=10)
    files: list[str] | None = None


@router.post("/generate")
async def generate_plan(
    request: Request,
    body: PlanGenerateRequest,
    x_api_key: str | None = Header(None),
) -> dict:
    """Generate a plan for a task."""
    check_auth(request, x_api_key)
    svc = request.app.state.planning
    if not svc:
        return {"error": "planning not configured"}
    from maggy.planning import PlanRequest

    req = PlanRequest(
        task=body.task,
        blast_score=body.blast_score,
        file_context=body.files,
    )
    result = svc.plan_task(req)
    plan = result["plan"]
    response = {
        "mode": result["mode"],
        "plan": asdict(plan),
    }
    # "diff" is optional in the planner result and only echoed when present.
    if result.get("diff"):
        response["diff"] = asdict(result["diff"])
    return response


================================================
FILE: maggy/maggy/api/routes_process.py
================================================
"""Process
Intelligence REST routes — /api/process/*.""" from __future__ import annotations import logging from fastapi import APIRouter, Header, HTTPException, Request from pydantic import BaseModel logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/process", tags=["process"]) def _auth(request: Request, x_api_key: str | None) -> None: cfg = request.app.state.cfg if cfg.dashboard.auth_mode == "local": return expected = cfg.dashboard.api_key if not expected or x_api_key != expected: raise HTTPException(401, "Invalid or missing X-API-Key") def _require_process(request: Request) -> None: if not getattr(request.app.state, "process", None): raise HTTPException(503, "Process Intelligence not configured") class AnalyzeRequest(BaseModel): project_key: str @router.post("/analyze") async def analyze( request: Request, body: AnalyzeRequest, x_api_key: str | None = Header(None), ) -> dict: """Trigger full PR analysis (background).""" _auth(request, x_api_key) _require_process(request) svc = request.app.state.process try: report = await svc.analyze(body.project_key) except ValueError as e: raise HTTPException(400, str(e)) except Exception as e: logger.exception("Analysis failed for %s", body.project_key) raise HTTPException(502, f"Analysis failed: {e}") return { "status": "completed", "project_key": body.project_key, "total_prs": report.total_prs, "summary": report.summary, } @router.get("/report/{project_key}") async def get_report( request: Request, project_key: str, x_api_key: str | None = Header(None), ) -> dict: """Get latest process report.""" _auth(request, x_api_key) _require_process(request) report = request.app.state.process.get_report(project_key) if not report: raise HTTPException(404, "No report found. 
Run /api/process/analyze first.") return report @router.get("/health/{project_key}") async def get_health( request: Request, project_key: str, x_api_key: str | None = Header(None), ) -> dict: """Get process health metrics.""" _auth(request, x_api_key) _require_process(request) health = request.app.state.process.get_health(project_key) if not health: raise HTTPException(404, "No health data. Run /api/process/analyze first.") return health ================================================ FILE: maggy/maggy/api/routes_projects.py ================================================ """Project registry REST endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, HTTPException, Request from pydantic import BaseModel from .auth import check_auth router = APIRouter(prefix="/api/projects", tags=["projects"]) class _ProjectIn(BaseModel): name: str repo: str path: str default_branch: str = "main" @router.get("") async def list_projects( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: """List all registered projects.""" check_auth(request, x_api_key) registry = request.app.state.registry if not registry: return [] return [ {"name": p.name, "repo": p.repo, "path": p.path} for p in registry.list() ] @router.get("/{name}") async def get_project( name: str, request: Request, x_api_key: str | None = Header(None), ) -> dict: """Get a single project by name.""" check_auth(request, x_api_key) registry = request.app.state.registry if not registry: raise HTTPException(404, "Not configured") project = registry.get(name) if not project: raise HTTPException(404, f"{name!r} not found") return { "name": project.name, "repo": project.repo, "path": project.path, } @router.post("", status_code=201) async def add_project( body: _ProjectIn, request: Request, x_api_key: str | None = Header(None), ) -> dict: """Register a new project.""" check_auth(request, x_api_key) registry = request.app.state.registry if not registry: raise HTTPException(503, 
"Not configured") from maggy.config import ProjectConfig project = ProjectConfig( name=body.name, repo=body.repo, path=body.path, default_branch=body.default_branch, ) try: registry.add(project) except ValueError as exc: raise HTTPException(409, str(exc)) from exc return {"name": project.name, "status": "created"} @router.delete("/{name}") async def remove_project( name: str, request: Request, x_api_key: str | None = Header(None), ) -> dict: """Remove a project by name.""" check_auth(request, x_api_key) registry = request.app.state.registry if not registry: raise HTTPException(503, "Not configured") if not registry.remove(name): raise HTTPException(404, f"{name!r} not found") return {"name": name, "status": "removed"} ================================================ FILE: maggy/maggy/api/routes_routing.py ================================================ """Routing REST endpoints.""" from __future__ import annotations from fastapi import APIRouter, Header, Request from .auth import check_auth router = APIRouter(prefix="/api/routing", tags=["routing"]) @router.get("/heatmap") async def heatmap( request: Request, x_api_key: str | None = Header(None), ) -> list[dict]: """Return reward heatmap for dashboard.""" check_auth(request, x_api_key) svc = request.app.state.routing if not svc: return [] return svc.get_heatmap() @router.get("/decide") async def decide( request: Request, blast: int = 0, task_type: str = "general", security: bool = False, x_api_key: str | None = Header(None), ) -> dict: """Get routing decision for given context.""" check_auth(request, x_api_key) svc = request.app.state.routing if not svc: return {"error": "routing not configured"} from maggy.routing import RoutingContext ctx = RoutingContext(blast, task_type, security) decision = svc.route(ctx) return { "primary": decision.primary, "validator": decision.validator, "fallback": decision.fallback_chain, "reason": decision.reason, } @router.get("/rules") async def rules( request: Request, x_api_key: 
str | None = Header(None), ) -> dict: """Return routing rules summary.""" check_auth(request, x_api_key) svc = request.app.state.routing if not svc: return {"mode": "unconfigured"} r = svc.rules overrides = { k: {"model": v.model, "reason": v.reason} for k, v in r.task_type_overrides.items() } perf = { k: { "strengths": v.strengths, "success_rate": v.success_rate, "tasks_completed": v.tasks_completed, } for k, v in r.model_performance.items() } return { "mode": svc.cfg.routing.mode, "task_type_overrides": overrides, "model_performance": perf, "conventions_count": len(r.conventions), } ================================================ FILE: maggy/maggy/api/routes_setup.py ================================================ """Setup and onboarding routes — detect missing config, guide users.""" from __future__ import annotations from fastapi import APIRouter, Request from pydantic import BaseModel, Field from maggy import config as config_mod router = APIRouter(prefix="/api/setup", tags=["setup"]) class ConfigureRequest(BaseModel): org_name: str = "" github_org: str = "" github_repos: list[str] = Field(default_factory=list) competitor_categories: list[str] = Field( default_factory=list, ) def _step(label: str, ok: bool, hint: str = "") -> dict: """Build a single setup step status.""" return { "label": label, "status": "done" if ok else "missing", "hint": hint, } def _build_steps(cfg) -> list[dict]: """Detect what's configured and what's missing.""" gh = cfg.issue_tracker.github return [ _step("GitHub token", bool(gh.token), ""), _step("GitHub organization", bool(gh.org), ""), _step( "GitHub repositories", bool(gh.repos), "Select repos to track issues from", ), _step( "AI provider", bool(cfg.ai.api_key) or _has_claude_cli(), "", ), _step("Codebases", bool(cfg.codebases), ""), ] def _has_claude_cli() -> bool: """Check if claude CLI is available.""" import shutil return shutil.which("claude") is not None def _discover_summary() -> dict: """Run discovery and return 
summary.""" from maggy.discovery import ( discover_cli_auth, discover_clis, discover_env_tokens, ) return { "clis": discover_clis(), "cli_auth": discover_cli_auth(), "tokens": discover_env_tokens(), } @router.get("/status") async def setup_status(request: Request) -> dict: """What's configured, what's missing.""" cfg = request.app.state.cfg steps = _build_steps(cfg) done = sum(1 for s in steps if s["status"] == "done") discovery = _discover_summary() return { "configured": request.app.state.mode == "full", "mode": request.app.state.mode, "steps": steps, "progress": f"{done}/{len(steps)}", "codebases": len(cfg.codebases), "github_org": cfg.issue_tracker.github.org, "discovery": discovery, } @router.post("/configure") async def configure( request: Request, body: ConfigureRequest, ) -> dict: """Update config sections dynamically.""" cfg = request.app.state.cfg if body.org_name: cfg.org.name = body.org_name if body.github_org: cfg.issue_tracker.github.org = body.github_org if body.github_repos: cfg.issue_tracker.github.repos = body.github_repos if body.competitor_categories: cfg.competitors.categories = body.competitor_categories config_mod.save(cfg) return {"saved": True} @router.post("/reload") async def reload_config(request: Request) -> dict: """Reload config and reinitialize services.""" from maggy.main import reconfigure reconfigure(request.app) mode = request.app.state.mode return {"mode": mode, "reloaded": True} @router.get("/discover-repos") async def discover_repos(request: Request) -> dict: """Return repos found on disk, grouped by org.""" from maggy.discovery import full_discovery result = full_discovery() return { "github_org": result.github_org, "github_orgs": result.github_orgs, "repos": [ {"key": r["key"], "path": r["path"]} for r in result.repos ], "cli_auth": result.cli_auth, "clis": result.clis, } @router.post("/auto-configure") async def auto_configure(request: Request) -> dict: """Run auto-discovery, save config, reload.""" cfg = 
config_mod.auto_configure() request.app.state.cfg = cfg from maggy.main import reconfigure reconfigure(request.app) return { "mode": request.app.state.mode, "codebases": len(cfg.codebases), "github_org": cfg.issue_tracker.github.org, "github_repos": cfg.issue_tracker.github.repos, "has_token": bool(cfg.issue_tracker.github.token), } @router.get("/cli-models") async def cli_models() -> dict: """Auto-discover AI CLIs and their capabilities.""" from maggy.adapters.cli_discovery import discover_all result = discover_all() profiles = [] for name, p in result.profiles.items(): profiles.append({ "name": name, "installed": p.installed, "version": p.version, "prompt_flag": p.prompt_flag, "work_dir_flag": p.work_dir_flag, "auto_approve": p.auto_approve_flag, "afk": p.afk_flag, }) installed = [p["name"] for p in profiles if p["installed"]] return { "profiles": profiles, "installed": installed, "ready": len(installed) > 0, } ================================================ FILE: maggy/maggy/budget.py ================================================ """Token budget manager — tracks spend per provider with daily limits.""" from __future__ import annotations import sqlite3 import tempfile from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Iterator from maggy.config import MaggyConfig def _today_utc() -> str: return datetime.now(timezone.utc).date().isoformat() @contextmanager def _connect(path: Path) -> Iterator[sqlite3.Connection]: try: conn = _open_conn(path) except sqlite3.OperationalError: fallback = Path(tempfile.gettempdir()) / "maggy" / path.name conn = _open_conn(fallback) try: yield conn finally: conn.close() def _open_conn(path: Path) -> sqlite3.Connection: path.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(str(path), timeout=30.0) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA busy_timeout=30000") conn.row_factory = sqlite3.Row 
return conn


# DDL for the spend ledger: one row per recorded spend event, indexed by
# (day, provider).
SCHEMA = """ CREATE TABLE IF NOT EXISTS spend ( id INTEGER PRIMARY KEY AUTOINCREMENT, provider TEXT NOT NULL, model TEXT NOT NULL, cost_usd REAL NOT NULL, input_tokens INTEGER NOT NULL DEFAULT 0, output_tokens INTEGER NOT NULL DEFAULT 0, day TEXT NOT NULL, created_at TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS idx_spend_day ON spend(day, provider); """


@dataclass(frozen=True)
class ProviderBudget:
    """Budget limit and preferred model for a provider."""

    provider: str
    daily_limit_usd: float
    model_preference: str


class TaskSpendTracker:
    """Track task-level spend and repeated edits."""

    def __init__(self, max_spend: float):
        """Start with zero spend and no recorded edits.

        Args:
            max_spend: Spend level at which ``is_exceeded`` becomes true.
        """
        self.max_spend = max_spend
        self._spent = 0.0
        # Maps a file path to the number of times record_edit saw it.
        self.files_edited: dict[str, int] = {}

    def record(self, cost: float) -> None:
        """Add ``cost`` to the running total."""
        self._spent += cost

    def total(self) -> float:
        """Return the accumulated spend."""
        return self._spent

    def is_exceeded(self) -> bool:
        """Return True once the total reaches or passes ``max_spend``."""
        return self._spent >= self.max_spend

    def record_edit(self, file_path: str) -> None:
        """Increment the edit counter for ``file_path``."""
        count = self.files_edited.get(file_path, 0)
        self.files_edited[file_path] = count + 1

    def detect_loop(self, threshold: int = 3) -> list[str]:
        """Return the paths edited at least ``threshold`` times."""
        return [
            path for path, count in self.files_edited.items()
            if count >= threshold
        ]


class BudgetManager:
    """Track token spend per provider with daily limits."""

    def __init__(self, cfg: MaggyConfig):
        """Read budget settings from ``cfg`` and create the spend table."""
        self.daily_limit = cfg.budget.daily_limit_usd
        self._plan = cfg.budget.plan
        self.providers = list(cfg.budget.providers)
        # Lookup table keyed by provider name.
        self._provider_budgets = {
            item.provider: item for item in self.providers
        }
        self.warning_threshold = cfg.budget.warning_threshold
        # budget.db lives in the parent directory of cfg.storage.path.
        db_dir = Path(cfg.storage.path).expanduser().parent
        self._db_path = db_dir / "budget.db"
        self._init_db()

    def _init_db(self) -> None:
        """Run SCHEMA against the budget database."""
        with _connect(self._db_path) as conn:
            conn.executescript(SCHEMA)

    def record_spend(
        self,
        provider: str,
        model: str,
        cost_usd: float,
        input_tokens: int = 0,
        output_tokens: int = 0,
    ) -> None:
        now = datetime.now(timezone.utc)
        with _connect(self._db_path) as conn:
            conn.execute(
                "INSERT INTO spend "
"(provider,model,cost_usd,input_tokens,output_tokens,day,created_at) " "VALUES (?,?,?,?,?,?,?)", (provider, model, cost_usd, input_tokens, output_tokens, now.date().isoformat(), now.isoformat()), ) conn.commit() def today_spend(self, provider: str | None = None) -> float: today = _today_utc() sql = "SELECT COALESCE(SUM(cost_usd),0) FROM spend WHERE day=?" params: list = [today] if provider: sql += " AND provider=?" params.append(provider) with _connect(self._db_path) as conn: row = conn.execute(sql, params).fetchone() return float(row[0]) def today_tokens(self, provider: str | None = None) -> dict: today = _today_utc() sql = ("SELECT COALESCE(SUM(input_tokens),0)," "COALESCE(SUM(output_tokens),0) FROM spend WHERE day=?") params: list = [today] if provider: sql += " AND provider=?" params.append(provider) with _connect(self._db_path) as conn: row = conn.execute(sql, params).fetchone() return {"input": int(row[0]), "output": int(row[1])} def budget_status(self) -> dict: spent = self.today_spend() ratio = spent / self.daily_limit if self.daily_limit > 0 else 0 status = "exhausted" if ratio >= 1.0 else ( "warning" if ratio >= self.warning_threshold else "ok") tokens = self.today_tokens() return { "spent_today_usd": round(spent, 4), "daily_limit_usd": self.daily_limit, "utilization": round(ratio, 3), "status": status, "plan": self._plan, "input_tokens": tokens["input"], "output_tokens": tokens["output"], } def by_provider(self) -> list[dict]: today = _today_utc() with _connect(self._db_path) as conn: rows = conn.execute( "SELECT provider, SUM(cost_usd) as total " "FROM spend WHERE day=? 
GROUP BY provider",
                (today,),
            ).fetchall()
        # One entry per provider that has at least one spend row today.
        return [
            {"provider": r["provider"], "spent_usd": round(r["total"], 4)}
            for r in rows
        ]

    def is_exhausted(
        self, provider: str | None = None,
    ) -> bool:
        """Check if daily budget is exhausted."""
        # The comparison is always against the global daily limit, even when
        # a provider is given (only the spend side is filtered).
        # NOTE(review): with daily_limit <= 0 this returns True at zero spend,
        # whereas budget_status() reports "ok" for the same configuration —
        # confirm which behavior is intended.
        spent = self.today_spend(provider)
        return spent >= self.daily_limit

    def is_provider_exhausted(self, provider: str) -> bool:
        """Check provider-specific budget when configured."""
        budget = self._provider_budgets.get(provider)
        # Providers without their own budget entry fall back to the global
        # limit check.
        if budget is None:
            return self.is_exhausted(provider)
        return self.today_spend(provider) >= budget.daily_limit_usd

    def cheapest_available(self) -> str | None:
        """Return preferred model for the first provider with budget left."""
        # Providers are tried in the order they appear in self.providers.
        for budget in self.providers:
            if not self.is_provider_exhausted(budget.provider):
                return budget.model_preference
        return None


================================================
FILE: maggy/maggy/calibration/__init__.py
================================================
"""Calibration exports."""

from .tracker import CalibrationTracker

__all__ = ["CalibrationTracker"]


================================================
FILE: maggy/maggy/calibration/tracker.py
================================================
"""SQLite-backed model calibration tracking."""

from __future__ import annotations

import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

# DDL for the calibration table: one row per (predicted, actual) observation.
SCHEMA = """ CREATE TABLE IF NOT EXISTS calibration ( id INTEGER PRIMARY KEY AUTOINCREMENT, model TEXT NOT NULL, task_type TEXT NOT NULL, predicted REAL NOT NULL, actual REAL NOT NULL ); CREATE INDEX IF NOT EXISTS idx_calibration_model ON calibration(model); """


@contextmanager
def _connect(path: Path) -> Iterator[sqlite3.Connection]:
    """Yield a connection to ``path`` (creating parent dirs) and close it."""
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(path), timeout=30.0)
    conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()


class CalibrationTracker:
    def __init__(self, db_path: Path):
        self._db_path = db_path
        self._init_db()

    def record(
self, model: str, task_type: str, predicted: float, actual: float, ) -> None: with _connect(self._db_path) as conn: conn.execute( "INSERT INTO calibration (model, task_type, predicted, actual) " "VALUES (?, ?, ?, ?)", (model, task_type, predicted, actual), ) conn.commit() def accuracy(self, model: str) -> float: errors = self._errors(model) if not errors: return 0.0 score = sum(max(0.0, 1.0 - err) for err in errors) / len(errors) return round(score, 6) def calibration_error(self, model: str) -> float: errors = self._errors(model) if not errors: return 0.0 return round(sum(errors) / len(errors), 6) def _errors(self, model: str) -> list[float]: with _connect(self._db_path) as conn: rows = conn.execute( "SELECT predicted, actual FROM calibration WHERE model = ?", (model,), ).fetchall() return [abs(row["predicted"] - row["actual"]) for row in rows] def _init_db(self) -> None: with _connect(self._db_path) as conn: conn.executescript(SCHEMA) ================================================ FILE: maggy/maggy/checkpoint.py ================================================ """JSON checkpoint persistence for fallback chains.""" from __future__ import annotations import json from pathlib import Path DEFAULT_DIR = Path.home() / ".maggy" / "checkpoints" class CheckpointManager: def __init__(self, base_dir: Path = DEFAULT_DIR): self.base_dir = base_dir.expanduser() def write(self, session_id: str, data: dict) -> None: self.base_dir.mkdir(parents=True, exist_ok=True) payload = _normalize(data) target = self._path(session_id) tmp = target.with_suffix(".tmp") tmp.write_text(json.dumps(payload, indent=2)) tmp.replace(target) def read(self, session_id: str) -> dict | None: path = self._path(session_id) if not path.exists(): return None try: return json.loads(path.read_text()) except (json.JSONDecodeError, OSError): return None def delete(self, session_id: str) -> bool: path = self._path(session_id) if not path.exists(): return False path.unlink() return True def list_checkpoints(self) 
-> list[str]: if not self.base_dir.exists(): return [] names = [path.stem for path in self.base_dir.glob("*.json")] return sorted(names) def _path(self, session_id: str) -> Path: safe_id = _sanitize_id(session_id) target = (self.base_dir / f"{safe_id}.json").resolve() if not str(target).startswith(str(self.base_dir.resolve())): raise ValueError(f"Invalid session id: {session_id!r}") return target def _sanitize_id(session_id: str) -> str: import re if not session_id or not re.fullmatch(r"[a-zA-Z0-9_\-]+", session_id): raise ValueError(f"Invalid session id: {session_id!r}") return session_id def _normalize(data: dict) -> dict: return { "goal": str(data.get("goal", "")), "constraints": list(data.get("constraints", [])), "progress": list(data.get("progress", [])), "model_history": list(data.get("model_history", [])), "current_subgoal": str(data.get("current_subgoal", "")), "fatigue_score": float(data.get("fatigue_score", 0.0)), } ================================================ FILE: maggy/maggy/cikg/__init__.py ================================================ """Competitive Intelligence Knowledge Graph.""" ================================================ FILE: maggy/maggy/cikg/graph.py ================================================ """KnowledgeGraphService — CRUD operations for CIKG.""" from __future__ import annotations import json import sqlite3 from pathlib import Path from .models import Edge, Node from .storage import SCHEMA, _connect class KnowledgeGraphService: """SQLite-backed knowledge graph — CRUD only.""" def __init__(self, db_path: Path): self._db_path = db_path with _connect(self._db_path) as conn: conn.executescript(SCHEMA) def add_node(self, node: Node) -> None: with _connect(self._db_path) as conn: conn.execute( "INSERT OR REPLACE INTO nodes VALUES (?,?,?,?,?,?)", (node.id, node.node_type, node.name, node.description, json.dumps(node.metadata), node.created_at), ) conn.commit() def get_node(self, node_id: str) -> Node | None: with 
_connect(self._db_path) as conn: row = conn.execute( "SELECT * FROM nodes WHERE id=?", (node_id,), ).fetchone() if not row: return None return _row_to_node(row) def list_nodes(self, node_type: str | None = None) -> list[Node]: with _connect(self._db_path) as conn: if node_type: rows = conn.execute( "SELECT * FROM nodes WHERE node_type=?", (node_type,), ).fetchall() else: rows = conn.execute("SELECT * FROM nodes").fetchall() return [_row_to_node(r) for r in rows] def add_edge(self, edge: Edge) -> None: with _connect(self._db_path) as conn: conn.execute( "INSERT OR REPLACE INTO edges VALUES (?,?,?,?,?)", (edge.source_id, edge.target_id, edge.edge_type, edge.weight, json.dumps(edge.metadata)), ) conn.commit() def get_edges(self, node_id: str, direction: str = "out") -> list[Edge]: with _connect(self._db_path) as conn: edges: list[Edge] = [] if direction in ("out", "both"): for r in conn.execute( "SELECT * FROM edges WHERE source_id=?", (node_id,), ).fetchall(): edges.append(_row_to_edge(r)) if direction in ("in", "both"): for r in conn.execute( "SELECT * FROM edges WHERE target_id=?", (node_id,), ).fetchall(): edges.append(_row_to_edge(r)) return edges def neighbors(self, node_id: str) -> list[Node]: edges = self.get_edges(node_id, "both") ids = set() for e in edges: ids.add(e.source_id) ids.add(e.target_id) ids.discard(node_id) return [n for n in (self.get_node(i) for i in ids) if n] def delete_node(self, node_id: str) -> None: with _connect(self._db_path) as conn: conn.execute("DELETE FROM nodes WHERE id=?", (node_id,)) conn.execute( "DELETE FROM edges WHERE source_id=? 
OR target_id=?",
                (node_id, node_id),
            )
            conn.commit()


def _row_to_node(r: sqlite3.Row) -> Node:
    """Build a Node from a ``nodes`` row, decoding the JSON metadata column."""
    return Node(
        id=r["id"], node_type=r["node_type"], name=r["name"],
        description=r["description"],
        metadata=json.loads(r["metadata"]),
        created_at=r["created_at"],
    )


def _row_to_edge(r: sqlite3.Row) -> Edge:
    """Build an Edge from an ``edges`` row, decoding the JSON metadata column."""
    return Edge(
        source_id=r["source_id"], target_id=r["target_id"],
        edge_type=r["edge_type"], weight=r["weight"],
        metadata=json.loads(r["metadata"]),
    )


================================================
FILE: maggy/maggy/cikg/models.py
================================================
"""CIKG node and edge models."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime, timezone

# Allowed values for Node.node_type (enforced in Node.__post_init__).
NODE_TYPES = (
    "codebase", "competitor", "feature", "market_segment",
    "product", "technology", "trend",
)

# Allowed values for Edge.edge_type (enforced in Edge.__post_init__).
EDGE_TYPES = (
    "has_feature", "competes_with", "targets_market",
    "uses_technology", "protaige_has", "protaige_lacks",
    "threatens",
)


@dataclass
class Node:
    """A node in the knowledge graph."""

    id: str
    node_type: str
    name: str
    description: str = ""
    metadata: dict = field(default_factory=dict)
    # Defaults to the UTC creation time as an ISO-8601 string.
    created_at: str = field(
        default_factory=lambda: datetime.now(
            timezone.utc
        ).isoformat()
    )

    def __post_init__(self) -> None:
        """Reject node types that are not listed in NODE_TYPES."""
        if self.node_type not in NODE_TYPES:
            raise ValueError(f"Invalid node_type: {self.node_type!r}")


@dataclass
class Edge:
    """A directed edge between two nodes."""

    source_id: str
    target_id: str
    edge_type: str
    weight: float = 1.0
    metadata: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject edge types that are not listed in EDGE_TYPES."""
        if self.edge_type not in EDGE_TYPES:
            raise ValueError(f"Invalid edge_type: {self.edge_type!r}")


@dataclass
class MarketScore:
    """Result of a market scoring query."""

    feature: str
    gap_count: int = 0
    threat_level: str = "low"  # low | medium | high
    trend_alignment: float = 0.0
    recommendation: str = ""


================================================
FILE: maggy/maggy/cikg/queries.py
================================================
"""CIKG
query functions — gap analysis and market scoring.""" from __future__ import annotations from .graph import KnowledgeGraphService from .models import MarketScore, Node def find_gaps(graph: KnowledgeGraphService, feature_name: str) -> MarketScore: """Score a feature against the competitive landscape.""" feature_ids = _matching_ids(graph, "feature", feature_name) results = [] for node in graph.list_nodes("competitor"): has = bool(feature_ids & _targets_for(graph, node.id, "has_feature")) results.append({ "entity_id": node.id, "entity": node.name, "feature": feature_name, "status": "has" if has else "lacks", }) have_it = sum(1 for r in results if r["status"] == "has") total = len(results) threat = _threat_level(have_it, total) return MarketScore( feature=feature_name, gap_count=total - have_it, threat_level=threat, recommendation=_recommend(feature_name, have_it, total, threat), ) def find_gaps_raw(graph: KnowledgeGraphService, feature: str) -> list[dict]: """Return raw gap results per competitor.""" feature_ids = _matching_ids(graph, "feature", feature) results = [] for node in graph.list_nodes("competitor"): has = bool(feature_ids & _targets_for(graph, node.id, "has_feature")) results.append({ "entity_id": node.id, "entity": node.name, "feature": feature, "status": "has" if has else "lacks", }) return sorted(results, key=lambda r: r["entity"]) def compare_entities(graph: KnowledgeGraphService, id_a: str, id_b: str) -> dict: """Compare two entities by their features.""" a_feat = _targets_for(graph, id_a, "has_feature") b_feat = _targets_for(graph, id_b, "has_feature") related = graph.get_edges(id_a, "out") + graph.get_edges(id_b, "out") rels = [ {"source_id": e.source_id, "target_id": e.target_id, "edge_type": e.edge_type} for e in related if {e.source_id, e.target_id} == {id_a, id_b} ] return { "shared": sorted(a_feat & b_feat), "only_a": sorted(a_feat - b_feat), "only_b": sorted(b_feat - a_feat), "relationships": rels, } def get_landscape(graph: 
KnowledgeGraphService) -> dict: """Return competitive landscape summary.""" competitors = graph.list_nodes("competitor") features = graph.list_nodes("feature") techs = graph.list_nodes("technology") return { "competitors": len(competitors), "features_tracked": len(features), "technologies": len(techs), "top_competitors": [c.name for c in competitors[:10]], } def get_segment_landscape(graph: KnowledgeGraphService, segment: str) -> dict: """Return landscape for a specific market segment.""" seg_nodes = _matching_nodes(graph, "market_segment", segment) if not seg_nodes: return _empty_landscape(segment) seg_id = seg_nodes[0].id comp_ids = [ e.source_id for e in graph.get_edges(seg_id, "in") if e.edge_type == "targets_market" ] names = [graph.get_node(i).name for i in comp_ids if graph.get_node(i)] feats = set().union(*( _targets_for(graph, i, "has_feature") for i in comp_ids )) techs = set().union(*( _targets_for(graph, i, "uses_technology") for i in comp_ids )) threats = sum( 1 for i in comp_ids for e in graph.get_edges(i, "out") if e.edge_type == "threatens" and e.target_id in comp_ids ) return { "segment": seg_nodes[0].name, "competitors": len(comp_ids), "features_tracked": len(feats), "technologies": len(techs), "threat_count": threats, "top_competitors": sorted(names)[:10], } def _matching_ids(graph: KnowledgeGraphService, node_type: str, query: str) -> set[str]: return {n.id for n in _matching_nodes(graph, node_type, query)} def _matching_nodes(graph: KnowledgeGraphService, node_type: str, query: str) -> list[Node]: val = query.lower() return [n for n in graph.list_nodes(node_type) if val in n.name.lower() or val == n.id.lower()] def _targets_for(graph: KnowledgeGraphService, node_id: str, edge_type: str) -> set[str]: return {e.target_id for e in graph.get_edges(node_id, "out") if e.edge_type == edge_type} def _threat_level(have_it: int, total: int) -> str: if total == 0: return "low" ratio = have_it / total if ratio > 0.7: return "high" return "medium" if ratio 
> 0.3 else "low"


def _recommend(feature: str, have: int, total: int, threat: str) -> str:
    """Build the recommendation sentence for a feature.

    Args:
        feature: Feature name, used only in the "no competitor" message.
        have: Number of competitors that have the feature.
        total: Number of competitors considered.
        threat: One of "high", "medium" or "low"; any other value raises
            KeyError when ``have`` is non-zero.
    """
    if have == 0:
        return f"No competitor has '{feature}' — potential differentiator"
    suffix = {"high": "Table stakes — must have.",
              "medium": "Growing trend.",
              "low": "Differentiator opportunity."}[threat]
    return f"{have}/{total} competitors have this. {suffix}"


def _empty_landscape(segment: str) -> dict:
    """Return an all-zero landscape result labelled with ``segment``."""
    return {
        "segment": segment, "competitors": 0,
        "features_tracked": 0, "technologies": 0,
        "threat_count": 0, "top_competitors": [],
    }


================================================
FILE: maggy/maggy/cikg/storage.py
================================================
"""SQLite helpers for the competitive graph."""

from __future__ import annotations

import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

# DDL for the graph: a nodes table keyed by id and an edges table keyed by
# (source_id, target_id, edge_type), plus lookup indexes.
SCHEMA = """ CREATE TABLE IF NOT EXISTS nodes ( id TEXT PRIMARY KEY, node_type TEXT NOT NULL, name TEXT NOT NULL, description TEXT DEFAULT '', metadata TEXT DEFAULT '{}', created_at TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS idx_nodes_type ON nodes(node_type); CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name); CREATE TABLE IF NOT EXISTS edges ( source_id TEXT NOT NULL, target_id TEXT NOT NULL, edge_type TEXT NOT NULL, weight REAL DEFAULT 1.0, metadata TEXT DEFAULT '{}', PRIMARY KEY (source_id, target_id, edge_type) ); CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_id); CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_id); """


@contextmanager
def _connect(path: Path) -> Iterator[sqlite3.Connection]:
    """Yield a connection to ``path`` and close it afterwards.

    Creates the parent directory if needed, switches the journal to WAL,
    sets a 30 s busy timeout and returns rows as ``sqlite3.Row``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=30000")
    conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()


================================================
FILE: maggy/maggy/cli.py
================================================
"""Maggy CLI — terminal interface for the
engineering platform."""

from __future__ import annotations

import typer

from maggy.cli_client import MaggyClient
from maggy.cli_output import (
    console,
    dump_json,
    render_budget,
    render_competitors,
    render_health,
    render_inbox,
    render_models,
    render_route,
    render_sessions,
)

app = typer.Typer(
    name="maggy",
    help="Maggy — AI Engineering Platform",
    no_args_is_help=False,
)

# Single client instance shared by every command in this module.
_client = MaggyClient()


def _ensure() -> bool:
    # Make sure the server answers its health check, starting it if needed;
    # exits the CLI with status 1 when it cannot be reached.
    if not _client._check_health():
        console.print("[dim]Starting Maggy server...[/dim]")
        if not _client.ensure_server():
            console.print("[red]Cannot reach Maggy server.[/red]")
            raise typer.Exit(1)
    return True


@app.callback(invoke_without_command=True)
def main(ctx: typer.Context) -> None:
    """Interactive REPL (in project) or dashboard."""
    # When a subcommand was given, the callback does nothing itself.
    if ctx.invoked_subcommand is not None:
        return
    _ensure()
    from maggy.cli_chat import detect_project, run_chat
    project = detect_project(_client)
    # Inside a known project: open the routed chat REPL; otherwise serve.
    if project:
        run_chat(_client, project, routed=True)
    else:
        serve()


@app.command()
def serve() -> None:
    """Start the Maggy server + web dashboard."""
    from maggy.main import main as start_server
    start_server()


@app.command()
def status(json_out: bool = typer.Option(False, "--json")) -> None:
    """Show server health and config summary."""
    _ensure()
    data = _client.health()
    dump_json(data) if json_out else render_health(data)


@app.command()
def inbox(
    refresh: bool = typer.Option(False, "--refresh"),
    json_out: bool = typer.Option(False, "--json"),
) -> None:
    """Show AI-ranked task inbox."""
    _ensure()
    data = _client.inbox(refresh=refresh)
    # --json always dumps the payload, even when there are no items.
    if json_out:
        dump_json(data)
    elif not data.get("items"):
        console.print("[dim]No tasks in inbox.[/dim]")
    else:
        render_inbox(data)


@app.command()
def sessions(json_out: bool = typer.Option(False, "--json")) -> None:
    """List active AI sessions across projects."""
    _ensure()
    data = _client.activity()
    dump_json(data) if json_out else render_sessions(data)


@app.command()
def chat(
    project: str = typer.Argument(..., help="Project key"),
    direct: bool =
typer.Option(False, "--direct"), ) -> None: """Interactive chat with a project's AI session.""" _ensure() from maggy.cli_chat import run_chat run_chat(_client, project, routed=not direct) @app.command() def spawn( task: str = typer.Argument(..., help="Task description"), ) -> None: """Spawn a background AI session.""" _ensure() from maggy.cli_chat import detect_project from maggy.cli_sessions import spawn_session project = detect_project(_client) if not project: console.print("[red]Not in a project directory.[/red]") raise typer.Exit(1) spawn_session(_client, task, project) @app.command() def ps() -> None: """List all managed sessions (chat + executor).""" _ensure() from maggy.cli_sessions import list_all list_all(_client) @app.command() def kill( session_id: str = typer.Argument(..., help="Session ID"), ) -> None: """Stop a managed session.""" _ensure() from maggy.cli_sessions import kill_session kill_session(_client, session_id) @app.command() def execute( task_id: str = typer.Argument(..., help="Task ID"), plan: bool = typer.Option(False, "--plan"), ) -> None: """Execute a task via the TDD pipeline.""" _ensure() mode = "plan" if plan else "tdd" data = _client.execute(task_id, mode) console.print( f"[green]Started[/green] session " f"[bold]{data.get('session_id', '?')}[/bold] " f"({mode} mode)", ) @app.command() def route( blast: int = typer.Argument(..., help="Complexity 1-10"), task_type: str = typer.Option("general", "--type"), json_out: bool = typer.Option(False, "--json"), ) -> None: """Get routing decision for a complexity score.""" _ensure() data = _client.route(blast, task_type) dump_json(data) if json_out else render_route(data) @app.command() def budget(json_out: bool = typer.Option(False, "--json")) -> None: """Show per-provider token budget.""" _ensure() data = _client.budget_summary() dump_json(data) if json_out else render_budget(data) @app.command() def models(json_out: bool = typer.Option(False, "--json")) -> None: """Show model performance 
heatmap."""
    _ensure()
    data = _client.models_heatmap()
    dump_json(data) if json_out else render_models(data)


@app.command()
def competitors(
    briefing: bool = typer.Option(False, "--briefing"),
    json_out: bool = typer.Option(False, "--json"),
) -> None:
    """Show competitor intelligence."""
    _ensure()
    # --briefing switches the data source from the news list to the summary.
    if briefing:
        data = _client.competitors_briefing()
    else:
        data = _client.competitors_news()
    if json_out:
        dump_json(data)
    elif briefing:
        console.print(data.get("summary", "No briefing available."))
    else:
        render_competitors(data)


@app.command()
def process(
    project: str = typer.Argument(..., help="Project key"),
    json_out: bool = typer.Option(False, "--json"),
) -> None:
    """Show process health for a project."""
    _ensure()
    data = _client.process_health(project)
    dump_json(data) if json_out else console.print_json(data=data)


@app.command()
def config(json_out: bool = typer.Option(False, "--json")) -> None:
    """Show current configuration (redacted)."""
    _ensure()
    # NOTE(review): json_out is accepted but never consulted — the output
    # always goes through dump_json. Confirm whether a rendered mode was
    # intended, as in the other commands.
    dump_json(_client.config())


================================================
FILE: maggy/maggy/cli_chat.py
================================================
"""Interactive chat REPL for Maggy CLI with model routing."""

from __future__ import annotations

import os

from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown
from rich.prompt import Prompt
from rich.spinner import Spinner

from maggy.cli_repl_cmds import SessionState, dispatch
from maggy.cli_welcome import render_welcome
from maggy.services.session_detect import detect_all

console = Console()

# Inputs (compared lower-cased) that end the REPL.
EXIT_WORDS = frozenset({"exit", "bye", "quit", "/exit", "/bye"})
# Substrings of an error message that trigger the account-switch guide.
_QUOTA_MARKERS = ("rate_limit", "quota", "exceeded", "429")


def detect_project(client) -> str | None:
    """Auto-detect project from current working directory."""
    return client.detect_project(os.getcwd())


def run_chat(
    client, project: str, routed: bool = True,
) -> None:
    session, resumed = _find_or_create(client, project)
    sid = session.get("id", "?")
    wd = session.get("working_dir",
"?")
    render_welcome(project, session, client)
    _show_resume_info(client, sid, wd)
    state = SessionState(session_id=sid, working_dir=wd)
    _repl_loop(client, state, routed)
    console.print("[dim]Session saved. Bye.[/dim]")


def _find_or_create(client, project: str) -> tuple[dict, bool]:
    """Return ``(session, resumed)`` for ``project``.

    The first existing chat session whose ``project_key`` equals ``project``
    is reused (``resumed`` is True); otherwise a new one is created.
    """
    for s in client.chat_sessions():
        if s.get("project_key") == project:
            return s, True
    return client.chat_create(project), False


def _show_resume_info(client, sid: str, wd: str) -> None:
    """Print prior CLI sessions detected in ``wd`` and the last 3 messages."""
    detected = detect_all(wd)
    if detected.sessions:
        # Each entry is shown as "<cli>(<first 8 chars of session id>)".
        parts = [f"{s.cli}({s.session_id[:8]})" for s in detected.sessions]
        console.print(f"[dim]Prior: {', '.join(parts)}[/dim]")
    for msg in client.chat_history(sid).get("messages", [])[-3:]:
        role = msg.get("role", "?")
        # Message text is truncated to 120 characters for the preview.
        text = msg.get("content", "")[:120]
        tag = "[cyan]You[/cyan]" if role == "user" else "[green]Maggy[/green]"
        console.print(f"  {tag}: {text}")


def _repl_loop(client, state: SessionState, routed: bool) -> None:
    """Read prompts until exit, handling slash commands and streaming replies.

    Built-in commands are checked first, then ``dispatch``; anything else is
    sent to the chat endpoint (routed or direct, per ``routed``).
    """
    # Set by /blast and passed to the next routed message only.
    blast_override: int | None = None
    while True:
        try:
            text = Prompt.ask("[bold cyan]>[/bold cyan]")
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C / Ctrl-D at the prompt ends the loop.
            console.print()
            break
        stripped = text.strip()
        if not stripped:
            continue
        if stripped == "/quit" or stripped.lower() in EXIT_WORDS:
            break
        if stripped == "/history":
            _show_history(client, state.session_id)
            continue
        if stripped == "/sessions":
            _show_sessions(client)
            continue
        if stripped == "/clear":
            console.clear()
            continue
        if stripped.startswith("/monitor"):
            data = _call_safe(client.monitor_status)
            console.print(f"[dim]Monitors: {data.get('active', 0)} active[/dim]")
            continue
        if stripped.startswith("/screenshot"):
            _handle_screenshot(stripped)
            continue
        if stripped.startswith("/blast"):
            blast_override = _parse_blast(stripped)
            continue
        # Remaining commands are offered to dispatch(); a truthy result
        # means the input was consumed.
        if dispatch(stripped, client, state):
            continue
        if routed:
            chunks = client.chat_send_routed(
                state.session_id, stripped,
                blast=blast_override,
                allowed_models=state.allowed_models or None,
            )
        else:
            chunks = client.chat_send_stream(
                state.session_id, stripped,
            )
        _stream_chunks(chunks)
blast_override = None def _parse_blast(text: str) -> int | None: parts = text.split() if len(parts) >= 2: try: val = max(1, min(10, int(parts[1]))) console.print(f"[dim]Blast override: {val}[/dim]") return val except ValueError: pass console.print("[dim]Usage: /blast N (1-10)[/dim]") return None def _stream_chunks(chunks) -> None: full, err = "", "" try: with Live( Spinner("dots", text="Thinking..."), console=console, refresh_per_second=8, ) as live: for chunk in chunks: ct = chunk.get("type", "") if ct == "routing": _show_routing(chunk) elif ct == "queued": pos = chunk.get("position", "?") live.update(Markdown(f"[dim]Queued (position {pos})[/dim]")) elif ct in ("warning", "agent_status"): console.print(f"[dim]{chunk.get('content', chunk.get('status', ''))}[/dim]") elif ct in ("text", "result"): full += chunk.get("content", "") live.update(Markdown(full)) elif ct == "error": err = chunk.get("content", "") elif ct == "done": break except KeyboardInterrupt: console.print("\n[dim]Interrupted[/dim]") except Exception as e: err = str(e) if err: console.print(f"[red]Error:[/red] {err}") if any(m in err.lower() for m in _QUOTA_MARKERS): from maggy.services.account_guide import render_switch_guide render_switch_guide("anthropic") def _call_safe(fn, default=None): try: return fn() except (Exception, SystemExit): return default if default is not None else {} def _handle_screenshot(text: str) -> None: """Send image to Qwen3-VL for analysis.""" from maggy.services.vision import analyze_image parts = text.split(None, 2) if len(parts) < 2: console.print("[dim]Usage: /screenshot <path> [prompt][/dim]") return path = parts[1] prompt = parts[2] if len(parts) > 2 else None console.print(f"[dim]Analyzing {path}...[/dim]") _stream_chunks(analyze_image(path, prompt)) def _show_routing(chunk: dict) -> None: console.print(f"[dim][{chunk.get('model', '?')}] blast={chunk.get('blast', '?')} {chunk.get('reason', '')}[/dim]") def _show_history(client, session_id: str) -> None: msgs = 
client.chat_history(session_id).get("messages", []) if not msgs: console.print("[dim]No messages yet.[/dim]") return for msg in msgs: role, content = msg.get("role", "?"), msg.get("content", "") tag = "[cyan]You[/cyan]" if role == "user" else "[green]Maggy[/green]" console.print(f" {tag}: {content[:120]}") def _show_sessions(client) -> None: sessions = client.chat_sessions() if not sessions: console.print("[dim]No chat sessions.[/dim]") return for s in sessions: sid = s.get("id", "?")[:8] proj = s.get("project_key", "?") n = s.get("messages", 0) console.print(f" [bold]{sid}[/bold] {proj} ({n} msgs)") ================================================ FILE: maggy/maggy/cli_client.py ================================================ """HTTP client for Maggy REST API.""" from __future__ import annotations import json import os import signal import subprocess import sys import time from urllib.parse import urlparse import httpx import typer from maggy.config import CONFIG_DIR DEFAULT_URL = "http://127.0.0.1:8080" HEALTH_TIMEOUT = 2.0 START_WAIT = 45.0 START_POLL = 1.0 class MaggyClient: """Thin wrapper over Maggy's REST API.""" def __init__(self, base_url: str = DEFAULT_URL): self.base_url = base_url.rstrip("/") # ── Server lifecycle ───────────────────────── def _check_health(self) -> bool: try: r = httpx.get( f"{self.base_url}/api/health", timeout=HEALTH_TIMEOUT, ) return r.status_code == 200 except (httpx.ConnectError, httpx.ReadTimeout): return False def _get_port(self) -> int: parsed = urlparse(self.base_url) return parsed.port or 8080 def _kill_stale_port(self) -> None: """Kill any process holding our port.""" try: result = subprocess.run( ["lsof", "-ti", f":{self._get_port()}"], capture_output=True, text=True, timeout=5, ) except (subprocess.SubprocessError, OSError): return for line in result.stdout.strip().splitlines(): try: os.kill(int(line.strip()), signal.SIGTERM) except (ValueError, ProcessLookupError, PermissionError): continue time.sleep(0.5) def 
_start_server(self) -> None: """Spawn server, logging to server.log.""" CONFIG_DIR.mkdir(parents=True, exist_ok=True) log = open(CONFIG_DIR / "server.log", "a") subprocess.Popen( [sys.executable, "-m", "maggy.main"], stdout=log, stderr=log, ) def ensure_server(self) -> bool: """Return True if server is reachable.""" if self._check_health(): return True self._kill_stale_port() self._start_server() deadline = time.monotonic() + START_WAIT while time.monotonic() < deadline: time.sleep(START_POLL) if self._check_health(): return True return False # ── API calls ──────────────────────────────── def _handle_error(self, r: httpx.Response) -> None: if r.is_success: return try: detail = r.json().get("detail", r.text) except Exception: detail = r.text from rich.console import Console Console(stderr=True).print( f"[red]Error {r.status_code}:[/red] {detail}", ) raise typer.Exit(1) def get(self, path: str, **params) -> dict | list: r = httpx.get( f"{self.base_url}{path}", params=params or None, timeout=30.0, ) self._handle_error(r) return r.json() def post(self, path: str, body: dict) -> dict: r = httpx.post( f"{self.base_url}{path}", json=body, timeout=60.0, ) self._handle_error(r) return r.json() def health(self) -> dict: return self.get("/api/health") def inbox(self, refresh: bool = False) -> dict: return self.get("/api/inbox", refresh=refresh) def activity(self) -> dict: return self.get("/api/activity") def route(self, blast: int, task_type: str) -> dict: return self.get( "/api/routing/decide", blast=blast, task_type=task_type, ) def budget_summary(self) -> dict: return self.get("/api/budget") def competitors_news(self, limit: int = 50) -> list: return self.get("/api/competitors/news", limit=limit) def competitors_briefing(self) -> dict: return self.get("/api/competitors/news/summary") def models_heatmap(self) -> list: return self.get("/api/routing/heatmap") def routing_rules(self) -> dict: return self.get("/api/routing/rules") def budget_by_provider(self) -> list: return 
self.get("/api/budget/by-provider") def process_health(self, project: str) -> dict: return self.get(f"/api/process/health/{project}") def config(self) -> dict: return self.get("/api/config") def execute(self, task_id: str, mode: str) -> dict: return self.post( "/api/execute", {"task_id": task_id, "mode": mode}, ) def sessions(self) -> list: return self.get("/api/execute/sessions") # ── Chat ────────────────────────────────────── def chat_create(self, project_key: str) -> dict: return self.post( "/api/chat/sessions", {"project_key": project_key}, ) def chat_sessions(self) -> list: return self.get("/api/chat/sessions") def chat_history(self, session_id: str) -> dict: return self.get(f"/api/chat/sessions/{session_id}") def chat_send_stream( self, session_id: str, message: str, ): """Yield parsed SSE chunks from chat endpoint.""" url = ( f"{self.base_url}" f"/api/chat/sessions/{session_id}/send" ) with httpx.stream( "POST", url, json={"message": message}, timeout=120.0, ) as r: for line in r.iter_lines(): if line.startswith("data: "): yield json.loads(line[6:]) def chat_send_routed( self, session_id: str, message: str, blast: int | None = None, allowed_models: list[str] | None = None, ): """Yield SSE chunks from routed chat endpoint.""" url = ( f"{self.base_url}" f"/api/chat/sessions/{session_id}/send-routed" ) body: dict = {"message": message} if blast is not None: body["blast_score"] = blast if allowed_models: body["allowed_models"] = allowed_models with httpx.stream( "POST", url, json=body, timeout=120.0, ) as r: for line in r.iter_lines(): if line.startswith("data: "): yield json.loads(line[6:]) def detect_project(self, cwd: str) -> str | None: """Match cwd against configured codebases.""" try: cfg = self.config() except Exception: return None for cb in cfg.get("codebases", []): if cwd.startswith(cb.get("path", "")): return cb.get("key") return None # ── Session management ───────────────────────── def spawn(self, task: str, project: str) -> dict: return self.post( 
"/api/execute", {"task_id": task, "mode": "tdd", "project_key": project}, ) def all_sessions(self) -> list: """Merge chat + executor sessions.""" chat = self.chat_sessions() executor = self.sessions() combined = [] for s in chat: combined.append({ "id": s.get("id"), "project": s.get("project_key", ""), "model": "claude", "status": s.get("status", ""), "type": "chat", "messages": s.get("messages", 0), }) for s in executor: combined.append({ "id": s.get("id"), "project": s.get("task_id", ""), "model": s.get("model", "?"), "status": s.get("status", ""), "type": "executor", "messages": 0, }) return combined def kill_session(self, session_id: str) -> dict: r = httpx.delete( f"{self.base_url}" f"/api/chat/sessions/{session_id}", timeout=10.0, ) self._handle_error(r) return r.json() # ── Monitor ──────────────────────────────────── def monitor_status(self) -> dict: return self.get("/api/monitor/status") def monitor_start(self) -> dict: return self.post("/api/monitor/start", {}) def monitor_stop(self) -> dict: return self.post("/api/monitor/stop", {}) # ── Health ───────────────────────────────────── def health_dashboard(self) -> dict: return self.get("/api/engram/diagnostics") def engram_diagnostics(self) -> dict: return self.get("/api/engram/diagnostics") ================================================ FILE: maggy/maggy/cli_output.py ================================================ """Rich terminal formatters for Maggy CLI output.""" from __future__ import annotations import json import sys from rich.console import Console from rich.panel import Panel from rich.table import Table console = Console() def _is_pipe() -> bool: return not sys.stdout.isatty() def dump_json(data) -> None: """Print raw JSON for piping / --json flag.""" print(json.dumps(data, indent=2)) # ── Status ────────────────────────────────────── def render_health(data: dict) -> None: t = Table(show_header=False, box=None, padding=(0, 2)) t.add_column(style="bold") t.add_column() t.add_row("Status", 
f"[green]{data.get('status', '?')}[/green]") t.add_row("Mode", data.get("mode", "?")) t.add_row("Org", data.get("org", "?")) t.add_row("Codebases", str(data.get("codebases", 0))) t.add_row("Provider", data.get("provider", "?")) console.print(Panel(t, title="Maggy Status", border_style="blue")) # ── Inbox ─────────────────────────────────────── def render_inbox(data: dict) -> None: items = data.get("items", []) if not items: console.print("[dim]No tasks in inbox.[/dim]") return t = Table(title=f"Inbox ({len(items)} tasks)") t.add_column("#", style="bold", width=4) t.add_column("Title", min_width=30) t.add_column("Labels") t.add_column("Reason", style="dim") for item in items: labels = ", ".join(item.get("labels", [])[:3]) t.add_row( str(item.get("rank", "")), item.get("title", "")[:60], labels, item.get("ai_reason", "")[:40], ) console.print(t) # ── Sessions ──────────────────────────────────── def render_sessions(data: dict | list) -> None: items = data if isinstance(data, list) else data.get("sessions", []) if not items: console.print("[dim]No active sessions.[/dim]") return t = Table(title=f"Active Sessions ({len(items)})") t.add_column("PID", width=8) t.add_column("CLI") t.add_column("Project") t.add_column("Status") t.add_column("Agent") for s in items: cli = s.get("cli") or s.get("tool") or "?" agent = s.get("agent_name") or "" t.add_row( str(s.get("pid", "")), cli, s.get("project", "?"), s.get("status", "?"), agent, ) console.print(t) # ── Route ─────────────────────────────────────── def _model_name(val) -> str: if isinstance(val, dict): return val.get("name", "?") return str(val) if val else "?" 
def render_route(data: dict) -> None:
    """Render a routing decision (primary, validator, fallback chain, reason)."""
    t = Table(show_header=False, box=None, padding=(0, 2))
    t.add_column(style="bold")
    t.add_column()
    primary = _model_name(data.get("primary"))
    t.add_row("Primary", f"[green]{primary}[/green]")
    validator = data.get("validator")
    if validator:
        t.add_row("Validator", _model_name(validator))
    fallback = data.get("fallback", [])
    if fallback:
        names = [_model_name(f) for f in fallback]
        t.add_row("Fallback", " → ".join(names))
    t.add_row("Reason", str(data.get("reason", "")))
    console.print(Panel(t, title="Routing Decision", border_style="yellow"))


# ── Budget ──────────────────────────────────────


def render_budget(data: dict) -> None:
    """Render today's spend against the daily limit with a 20-cell usage bar.

    Reads `spent_today_usd`, `daily_limit_usd`, `status` and an optional
    `providers` list (each with `name`, `used`, `limit`) from `data`.
    """
    bar_cells = 20
    spent = data.get("spent_today_usd", 0)
    limit = data.get("daily_limit_usd", 0)
    pct = (spent / limit * 100) if limit else 0
    # Clamp so overspend (>100%) or odd negative values cannot draw a bar
    # wider than the track; the percentage text still shows the real figure.
    bar_len = max(0, min(bar_cells, int(pct / 5)))
    color = "red" if pct > 80 else "green"
    bar = f"[{color}]{'█' * bar_len}[/{color}]{'░' * (bar_cells - bar_len)}"
    t = Table(show_header=False, box=None, padding=(0, 2))
    t.add_column(style="bold")
    t.add_column()
    t.add_row("Spent today", f"${spent:.2f}")
    t.add_row("Daily limit", f"${limit:.2f}")
    t.add_row("Utilization", f"{pct:.0f}% {bar}")
    t.add_row("Status", data.get("status", "?"))
    # Per-provider breakdown if available
    providers = data.get("providers", [])
    if providers:
        t.add_row("", "")
        for p in providers:
            p_used = p.get("used", 0)
            p_limit = p.get("limit", 0)
            t.add_row(
                p.get("name", "?"),
                f"${p_used:.2f} / ${p_limit:.2f}",
            )
    console.print(Panel(t, title="Budget", border_style="green"))


# ── Competitors ─────────────────────────────────


def render_competitors(news: list) -> None:
    """Render up to 20 competitor news items (date, event type, headline)."""
    if not news:
        console.print("[dim]No competitor news.[/dim]")
        return
    t = Table(title=f"Competitor Intel ({len(news)} items)")
    t.add_column("Date", width=12)
    t.add_column("Type")
    t.add_column("Headline", min_width=40)
    for item in news[:20]:
        t.add_row(
            # Guard against explicit nulls before slicing.
            str(item.get("date") or "?")[:10],
            item.get("event_type", "?"),
            str(item.get("headline") or "")[:60],
        )
    console.print(t)


# ── Models
────────────────────────────────────── def render_models(heatmap: list) -> None: if not heatmap: console.print("[dim]No model performance data.[/dim]") return t = Table(title="Model Performance Heatmap") t.add_column("Model") t.add_column("Task Type") t.add_column("Reward", justify="right") for entry in heatmap: reward = entry.get("reward", 0) color = "green" if reward >= 0.8 else "yellow" if reward >= 0.5 else "red" t.add_row( entry.get("model", "?"), entry.get("task_type", "?"), f"[{color}]{reward:.2f}[/{color}]", ) console.print(t) ================================================ FILE: maggy/maggy/cli_repl_cmds.py ================================================ """REPL slash command handlers for Maggy CLI.""" from __future__ import annotations from dataclasses import dataclass, field from pathlib import Path from rich.console import Console from rich.markdown import Markdown from rich.panel import Panel from rich.table import Table console = Console() _KNOWN_MODELS = ("local", "kimi", "claude", "codex") def _call(fn, d=None): try: return fn() except (Exception, SystemExit): return d if d is not None else {} @dataclass class SessionState: """Mutable session-level state for REPL.""" session_id: str = "" working_dir: str = "" allowed_models: list[str] = field(default_factory=list) def dispatch(cmd: str, client, state: SessionState) -> bool: """Route a slash command. 
Returns True if handled.""" parts = cmd.strip().split(None, 1) name, args = parts[0].lower(), parts[1] if len(parts) > 1 else "" simple = { "/stats": cmd_stats, "/budget": cmd_budget, "/route": cmd_route, "/models": cmd_models, "/config": cmd_config, "/health": cmd_health, } if name in simple: simple[name](client) return True if name == "/use": cmd_use(args, state) elif name == "/claude-md": cmd_claude_md(state) elif name == "/help": cmd_help() else: return False return True def cmd_stats(client) -> None: b = _call(client.budget_summary) t = Table(title="Stats") t.add_column("Metric", style="bold") t.add_column("Value") t.add_row("Spent", f"${b.get('spent_today_usd', 0):.2f} / ${b.get('daily_limit_usd', 0):.2f}") in_t, out_t = b.get("input_tokens", 0), b.get("output_tokens", 0) if in_t or out_t: t.add_row("Tokens", f"{in_t:,} in / {out_t:,} out") t.add_row("Status", b.get("status", "?")) for p in _call(client.budget_by_provider, []): t.add_row(f" {p.get('provider', '?')}", f"${p.get('spent_usd', 0):.2f}") for h in _call(client.models_heatmap, [])[:8]: r, c = h.get("avg_reward", 0), "green" if h.get("avg_reward", 0) >= 0.8 else "yellow" t.add_row(f" {h.get('model', '?')} ({h.get('task_type', '')})", f"[{c}]{r:.2f}[/{c}] ({h.get('samples', 0)})") console.print(t) def cmd_budget(client) -> None: b = _call(client.budget_summary) spent, limit = b.get("spent_today_usd", 0), b.get("daily_limit_usd", 0) pct = (spent / limit * 100) if limit else 0 bl, c = min(20, int(pct / 5)), "red" if pct > 80 else "green" bar = f"[{c}]{'█' * bl}[/{c}]{'░' * (20 - bl)}" t = Table(show_header=False, box=None, padding=(0, 2)) t.add_column(style="bold") t.add_column() if b.get("plan") == "subscription": t.add_row("Plan", "[green]Subscription[/green]") else: t.add_row("Spent", f"${spent:.2f} / ${limit:.2f}") t.add_row("Usage", f"{pct:.0f}% {bar}") t.add_row("Status", b.get("status", "?")) for p in _call(client.budget_by_provider, []): t.add_row(p.get("provider", "?"), f"${p.get('spent_usd', 
0):.2f}") console.print(Panel(t, title="Budget", border_style="green")) def cmd_route(client) -> None: data = _call(client.routing_rules) t = Table(title=f"Routing ({data.get('mode', '?')})") t.add_column("Task Type", style="bold") t.add_column("Model") t.add_column("Reason", style="dim") for tt, info in data.get("task_type_overrides", {}).items(): t.add_row(tt, info.get("model", "?"), info.get("reason", "")) console.print(t) console.print("[dim]Blast: 1-3 cheap | 4-6 medium | 7-10 premium[/dim]") perf = data.get("model_performance", {}) if not perf: return pt = Table(title="Model Performance") pt.add_column("Model", style="bold") pt.add_column("Strengths") pt.add_column("Rate", justify="right") for model, info in perf.items(): pt.add_row(model, ", ".join(info.get("strengths", [])), f"{info.get('success_rate', 0):.0%}") console.print(pt) def cmd_models(client) -> None: heatmap = _call(client.models_heatmap, []) t = Table(title="Model Rewards") for col in ("Model", "Task Type", "Blast Tier"): t.add_column(col) t.add_column("Reward", justify="right") t.add_column("N", justify="right") if not heatmap: for m in _KNOWN_MODELS: t.add_row(m, "-", "-", "-", "0") else: for h in heatmap: r = h.get("avg_reward", 0) c = "green" if r >= 0.8 else "yellow" if r >= 0.5 else "red" t.add_row(h.get("model", "?"), h.get("task_type", "?"), h.get("blast_tier", "?"), f"[{c}]{r:.2f}[/{c}]", str(h.get("samples", 0))) console.print(t) def cmd_use(args: str, state: SessionState) -> None: """Set allowed models for this session.""" if not args or args.strip().lower() == "all": state.allowed_models = [] console.print("[dim]Routing: all models enabled[/dim]") return models = [m.strip() for m in args.split(",") if m.strip()] bad = [m for m in models if m not in _KNOWN_MODELS] if bad: console.print(f"[yellow]Unknown: {', '.join(bad)}. 
Known: {', '.join(_KNOWN_MODELS)}[/yellow]") state.allowed_models = models console.print(f"[dim]Routing restricted to: {', '.join(models)}[/dim]") def cmd_config(client) -> None: """Show configuration summary.""" cfg = _call(client.config) t = Table(show_header=False, box=None, padding=(0, 2)) t.add_column(style="bold") t.add_column() cbs = cfg.get("codebases", []) t.add_row("Codebases", str(len(cbs))) for cb in cbs[:5]: t.add_row(f" {cb.get('key', '?')}", cb.get("path", "")) t.add_row("Routing", cfg.get("routing", {}).get("mode", "dynamic")) t.add_row("Limit", f"${cfg.get('budget', {}).get('daily_limit_usd', 0):.2f}") console.print(Panel(t, title="Config", border_style="blue")) def cmd_claude_md(state: SessionState) -> None: """Show project's CLAUDE.md.""" wd = Path(state.working_dir) for name in ("CLAUDE.md", ".claude/CLAUDE.md"): path = wd / name if path.exists(): console.print(Markdown(path.read_text())) return console.print("[dim]CLAUDE.md not found in project.[/dim]") def cmd_health(client) -> None: """Memory system health dashboard.""" data = _call(client.health_dashboard) eng = data if "health_score" in data else data.get("engram", {}) mn, score = data.get("mnemos", {}), eng.get("health_score", 0) c = "green" if score >= 0.7 else "yellow" if score >= 0.4 else "red" t = Table(show_header=False, box=None, padding=(0, 2)) t.add_column(style="bold") t.add_column() t.add_row("Engram", f"[{c}]{score:.0%}[/{c}] ({eng.get('active', 0)}/{eng.get('total', 0)})") t.add_row("Mnemos", f"{mn.get('state', '?')} ({mn.get('composite', 0):.2f})") console.print(Panel(t, title="Health", border_style="green")) _HELP = """\ [bold]Commands:[/bold] /stats Budget+perf /budget Breakdown /route Rules+tiers /models Reward heatmap /health Memory health /monitor Trackers /screenshot F Analyze image with Qwen3-VL /claude-md CLAUDE.md /use M Restrict models /config Settings /blast N Override /history Messages /sessions List /clear Screen /quit Exit /help This help""" def cmd_help() -> 
None: console.print(_HELP) ================================================ FILE: maggy/maggy/cli_sessions.py ================================================ """Session management for Maggy CLI — spawn, list, kill.""" from __future__ import annotations from rich.console import Console from rich.table import Table console = Console() def spawn_session(client, task: str, project: str) -> None: """Spawn a background execution session.""" data = client.spawn(task, project) sid = data.get("session_id", "?") console.print( f"[green]Spawned[/green] session " f"[bold]{sid}[/bold] for {project}", ) def list_all(client) -> None: """Show all sessions (chat + executor).""" sessions = client.all_sessions() if not sessions: console.print("[dim]No active sessions.[/dim]") return t = Table(title="All Sessions") t.add_column("ID", width=12) t.add_column("Project") t.add_column("Model") t.add_column("Type") t.add_column("Status") for s in sessions: t.add_row( str(s.get("id", "?")), s.get("project", "?"), s.get("model", "?"), s.get("type", "?"), s.get("status", "?"), ) console.print(t) def kill_session(client, session_id: str) -> None: """Kill a session by ID.""" client.kill_session(session_id) console.print( f"[yellow]Killed[/yellow] session [bold]{session_id}[/bold]", ) ================================================ FILE: maggy/maggy/cli_welcome.py ================================================ """Rich welcome banner for Maggy CLI startup.""" from __future__ import annotations import os from rich.console import Console from rich.panel import Panel from rich.table import Table console = Console() VERSION = "0.5" def render_welcome( project: str, session: dict, client, ) -> None: """Print a rich 2-column welcome panel.""" t = Table(show_header=False, box=None, padding=(0, 2)) t.add_column(style="bold") t.add_column() _add_project_rows(t, project, session) _add_system_rows(t, client, session) label = "Resuming" if session.get("messages", 0) else "New" title = f"Maggy v{VERSION} - 
{label}" console.print(Panel(t, title=title, border_style="cyan")) console.print( "[dim]/help for commands | /stats for budget[/dim]\n", ) def _add_project_rows( t: Table, project: str, session: dict, ) -> None: """Left-side project info.""" wd = session.get("working_dir") or os.getcwd() short_wd = _shorten(wd, 35) msgs = session.get("messages", 0) sid = session.get("id", "?")[:8] t.add_row("Project", f"[bold]{project}[/bold]") t.add_row("Dir", short_wd) t.add_row("Session", f"{sid} ({msgs} msgs)") _KNOWN_MODELS = ("local", "kimi", "gpt", "claude", "codex") def _add_system_rows( t: Table, client, session: dict, ) -> None: """Right-side system state.""" budget = _safe_call(client.budget_summary) if isinstance(budget, dict) and budget.get("plan") == "subscription": t.add_row("Budget", "[green]Subscription[/green]") else: spent = budget.get("spent_today_usd", 0) if isinstance(budget, dict) else 0 limit = budget.get("daily_limit_usd", 0) if isinstance(budget, dict) else 0 t.add_row("Budget", f"${spent:.2f} / ${limit:.2f}") models = _safe_call(client.models_heatmap) count = len(models) if models else len(_KNOWN_MODELS) label = f"{len(models)} tracked" if models else f"{count} available" t.add_row("Models", label) status = budget.get("status", "?") if isinstance(budget, dict) else "?" t.add_row("Status", f"[green]{status}[/green]") _add_health_row(t, client) def _add_health_row(t: Table, client) -> None: """Show engram health score inline.""" diag = _safe_call(client.engram_diagnostics) if not isinstance(diag, dict): return score = diag.get("health_score", 0) color = "green" if score >= 0.7 else "yellow" if score >= 0.4 else "red" t.add_row("Memory", f"[{color}]{score:.0%}[/{color}]") def _safe_call(fn): """Call a client method, return empty on failure.""" try: return fn() or [] except Exception: return [] def _shorten(path: str, max_len: int) -> str: """Truncate long paths with ellipsis.""" if len(path) <= max_len: return path return "..." 
+ path[-(max_len - 3) :] ================================================ FILE: maggy/maggy/config.py ================================================ """Config loader for Maggy — reads ~/.maggy/config.yaml with env overrides.""" from __future__ import annotations import os import tempfile from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any import yaml CONFIG_DIR = Path(os.environ.get("MAGGY_HOME", "~/.maggy")).expanduser() CONFIG_PATH = CONFIG_DIR / "config.yaml" if TYPE_CHECKING: from maggy.budget import ProviderBudget def _default_storage_path() -> str: return _safe_storage_path(CONFIG_DIR / "maggy.db") def _safe_storage_path(path: str | Path) -> str: target = Path(path).expanduser() try: target.parent.mkdir(parents=True, exist_ok=True) probe = target.parent / ".write-test" probe.write_text("") probe.unlink() return str(target) except OSError: fallback = Path(tempfile.gettempdir()) / "maggy" / "maggy.db" fallback.parent.mkdir(parents=True, exist_ok=True) return str(fallback) @dataclass class GitHubConfig: org: str = "" repos: list[str] = field(default_factory=list) labels: list[str] = field(default_factory=list) token: str = "" @dataclass class AsanaConfig: workspace_id: str = "" boards: dict[str, str] = field(default_factory=dict) token: str = "" @dataclass class LinearConfig: workspace: str = "" token: str = "" @dataclass class IssueTrackerConfig: provider: str = "github" github: GitHubConfig = field(default_factory=GitHubConfig) asana: AsanaConfig = field(default_factory=AsanaConfig) linear: LinearConfig = field(default_factory=LinearConfig) @dataclass class CodebaseConfig: path: str key: str @dataclass class ProjectConfig: name: str repo: str path: str default_branch: str icpg: bool = True cikg: bool = False @dataclass class OKRItem: id: str title: str keywords: list[str] = field(default_factory=list) @dataclass class OKRConfig: source: str = "skip" items: list[OKRItem] = field(default_factory=list) 
@dataclass class CompetitorsConfig: categories: list[str] = field(default_factory=list) seed: list[str] = field(default_factory=list) @dataclass class AIConfig: provider: str = "anthropic" model: str = "claude-sonnet-4-5-20250929" api_key: str = "" max_budget_usd_per_execute: float = 5.0 @dataclass class StorageConfig: backend: str = "sqlite" path: str = field(default_factory=_default_storage_path) @dataclass class DashboardConfig: host: str = "127.0.0.1" port: int = 8080 auth_mode: str = "local" api_key: str = "" @dataclass class OrgConfig: name: str = "Your Org" domain: str = "" @dataclass class BootstrapConfig: path: str = "" @dataclass class ModelTierConfig: name: str = "" provider: str = "" model: str = "" complexity_range: list[int] = field(default_factory=lambda: [0, 10]) strengths: list[str] = field(default_factory=list) cost_per_1k: float = 0.0 @dataclass class BudgetConfig: daily_limit_usd: float = 10.0 max_spend_per_task: float = 5.0 warning_threshold: float = 0.8 plan: str = "daily" providers: list["ProviderBudget"] = field(default_factory=list) @dataclass class RoutingConfig: mode: str = "dynamic" tiers: list[ModelTierConfig] = field(default_factory=list) @dataclass class MeshConfig: enabled: bool = False peer_id: str = "" port: int = 8080 org_key_secret: str = "" orgs: list[str] = field(default_factory=list) exclude_orgs: list[str] = field(default_factory=list) manual_peers: list[str] = field(default_factory=list) tunnel_url: str = "" git_discovery: bool = True share_interval: int = 600 @dataclass class HeartbeatConfig: enabled: bool = True history_interval: int = 1800 engram_interval: int = 3600 improve_interval: int = 3600 mesh_interval: int = 300 @dataclass class MaggyConfig: org: OrgConfig = field(default_factory=OrgConfig) issue_tracker: IssueTrackerConfig = field(default_factory=IssueTrackerConfig) codebases: list[CodebaseConfig] = field(default_factory=list) projects: list[ProjectConfig] = field(default_factory=list) competitors: 
CompetitorsConfig = field(default_factory=CompetitorsConfig) okrs: OKRConfig = field(default_factory=OKRConfig) ai: AIConfig = field(default_factory=AIConfig) storage: StorageConfig = field(default_factory=StorageConfig) dashboard: DashboardConfig = field(default_factory=DashboardConfig) bootstrap: BootstrapConfig = field(default_factory=BootstrapConfig) budget: BudgetConfig = field(default_factory=BudgetConfig) routing: RoutingConfig = field(default_factory=RoutingConfig) mesh: MeshConfig = field(default_factory=MeshConfig) heartbeat: HeartbeatConfig = field(default_factory=HeartbeatConfig) def codebase_paths(self) -> dict[str, Path]: """Return {key: expanded_path} for all configured codebases.""" return {c.key: Path(c.path).expanduser() for c in self.codebases} def resolve_bootstrap_path(self) -> Path | None: """Find Maggy install. Checks config, then ~/.claude/.bootstrap-dir.""" if self.bootstrap.path: return Path(self.bootstrap.path).expanduser() marker = Path.home() / ".claude" / ".bootstrap-dir" if marker.exists(): return Path(marker.read_text().strip()).expanduser() return None def _merge_env(cfg: MaggyConfig) -> MaggyConfig: """Override config with env vars where defined. 
Env wins over file.""" cfg.issue_tracker.github.token = os.environ.get("GITHUB_TOKEN", cfg.issue_tracker.github.token) # Fall back to git credential helper if no env var if not cfg.issue_tracker.github.token: cfg.issue_tracker.github.token = _git_credential_token() cfg.issue_tracker.asana.token = os.environ.get("ASANA_API_KEY", cfg.issue_tracker.asana.token) cfg.issue_tracker.linear.token = os.environ.get("LINEAR_API_KEY", cfg.issue_tracker.linear.token) cfg.ai.api_key = os.environ.get("ANTHROPIC_API_KEY", cfg.ai.api_key) cfg.dashboard.api_key = os.environ.get("MAGGY_API_KEY", cfg.dashboard.api_key) cfg.mesh.org_key_secret = os.environ.get("MAGGY_MESH_SECRET", cfg.mesh.org_key_secret) return cfg def _git_credential_token() -> str: """Read GitHub token from git credential helper.""" from maggy.discovery import discover_git_token return discover_git_token() def _from_dict(data: dict[str, Any]) -> MaggyConfig: """Build MaggyConfig from loaded YAML dict. Tolerates missing sections.""" from maggy.budget import ProviderBudget it_raw = data.get("issue_tracker") or {} tracker = IssueTrackerConfig( provider=it_raw.get("provider", "github"), github=GitHubConfig(**(it_raw.get("github") or {})), asana=AsanaConfig(**(it_raw.get("asana") or {})), linear=LinearConfig(**(it_raw.get("linear") or {})), ) okr_raw = data.get("okrs") or {} okrs = OKRConfig( source=okr_raw.get("source", "skip"), items=[OKRItem(**item) for item in (okr_raw.get("items") or [])], ) routing_raw = data.get("routing") or {} routing = RoutingConfig( mode=routing_raw.get("mode", "dynamic"), tiers=[ ModelTierConfig(**t) for t in (routing_raw.get("tiers") or []) ], ) budget_raw = data.get("budget") or {} providers = [ ProviderBudget(**item) for item in (budget_raw.get("providers") or []) ] storage_raw = data.get("storage") or {} return MaggyConfig( org=OrgConfig(**(data.get("org") or {})), issue_tracker=tracker, codebases=[CodebaseConfig(**c) for c in (data.get("codebases") or [])], projects=[ProjectConfig(**p) 
for p in (data.get("projects") or [])], competitors=CompetitorsConfig(**(data.get("competitors") or {})), okrs=okrs, ai=AIConfig(**(data.get("ai") or {})), storage=StorageConfig( backend=storage_raw.get("backend", "sqlite"), path=_safe_storage_path( storage_raw.get("path", _default_storage_path()) ), ), dashboard=DashboardConfig(**(data.get("dashboard") or {})), bootstrap=BootstrapConfig(**(data.get("bootstrap") or {})), budget=BudgetConfig( daily_limit_usd=budget_raw.get("daily_limit_usd", 10.0), warning_threshold=budget_raw.get("warning_threshold", 0.8), providers=providers, ), routing=routing, mesh=MeshConfig(**(data.get("mesh") or {})), heartbeat=HeartbeatConfig(**(data.get("heartbeat") or {})), ) _CACHED: MaggyConfig | None = None def _has_provider_credentials(cfg: MaggyConfig) -> bool: """Check if config has full provider credentials.""" if cfg.issue_tracker.provider == "github": gh = cfg.issue_tracker.github return bool(gh.org and gh.repos and gh.token) if cfg.issue_tracker.provider == "asana": az = cfg.issue_tracker.asana return bool(az.workspace_id and az.token) return False def _has_cli_history( home: Path | None = None, ) -> bool: """Check if any CLI data directories exist.""" root = home or Path.home() for d in (".claude", ".codex", ".kimi"): if (root / d).exists(): return True return False def auto_configure( home: Path | None = None, persist: bool = True, ) -> MaggyConfig: """Build config from auto-discovery.""" from maggy.discovery import full_discovery result = full_discovery(home) cfg = MaggyConfig( codebases=[ CodebaseConfig(path=r["path"], key=r["key"]) for r in result.repos ], ) if result.github_org: cfg.issue_tracker.github.org = result.github_org # Auto-populate repos matching the primary org if result.github_org: cfg.issue_tracker.github.repos = _repos_for_org( result.repos, result.github_org, ) if persist: save(cfg) return _merge_env(cfg) def _repos_for_org( repos: list[dict], org: str, ) -> list[str]: """Filter repo names belonging to a 
GitHub org.""" from maggy.discovery import infer_github_org matched: list[str] = [] for repo in repos: repo_org = infer_github_org(Path(repo["path"])) if repo_org == org: matched.append(repo["key"]) return matched def load(refresh: bool = False) -> MaggyConfig: """Load config from ~/.maggy/config.yaml, with env var overrides. Cached.""" global _CACHED if _CACHED is not None and not refresh: return _CACHED if not CONFIG_PATH.exists(): _CACHED = _merge_env(MaggyConfig()) return _CACHED with open(CONFIG_PATH) as f: data = yaml.safe_load(f) or {} _CACHED = _merge_env(_from_dict(data)) return _CACHED def save(cfg: MaggyConfig) -> None: """Write config back to ~/.maggy/config.yaml.""" CONFIG_DIR.mkdir(parents=True, exist_ok=True) # Convert dataclass → dict, strip empty tokens (they come from env) from dataclasses import asdict d = asdict(cfg) # Don't persist tokens — those come from env for section in ("github", "asana", "linear"): d.get("issue_tracker", {}).get(section, {}).pop("token", None) d.get("ai", {}).pop("api_key", None) d.get("dashboard", {}).pop("api_key", None) with open(CONFIG_PATH, "w") as f: yaml.safe_dump(d, f, sort_keys=False) global _CACHED _CACHED = None # force reload on next load() def is_configured() -> bool: """Check if Maggy has enough to be useful. Full mode: provider credentials present. Local mode: CLI history dirs exist (zero-config). 
""" if CONFIG_PATH.exists(): cfg = load(refresh=True) if _has_provider_credentials(cfg): return True if _has_cli_history(): return True return False ================================================ FILE: maggy/maggy/contracts/__init__.py ================================================ """Contracts exports.""" from .generator import ContractGenerator __all__ = ["ContractGenerator"] ================================================ FILE: maggy/maggy/contracts/generator.py ================================================ """Generate lightweight contract tests from postconditions.""" from __future__ import annotations import re class ContractGenerator: def from_postcondition(self, postcondition: str, symbol: str) -> str: test_name = _test_name(symbol) return ( f"def {test_name}() -> None:\n" f' """Contract for {symbol}."""\n' f" # Postcondition: {postcondition}\n" f" raise NotImplementedError(" f"\"Verify: {postcondition}\")\n" ) def _test_name(symbol: str) -> str: short = symbol.split(".")[-2:] slug = "_".join(short).lower() slug = re.sub(r"[^a-z0-9_]+", "_", slug) return f"test_{slug}_contract" ================================================ FILE: maggy/maggy/coordination/__init__.py ================================================ ================================================ FILE: maggy/maggy/coordination/lock_manager.py ================================================ """SQLite-backed file locks for multi-agent coordination.""" from __future__ import annotations import sqlite3 from contextlib import contextmanager from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Iterator LOCK_TTL = timedelta(minutes=30) SCHEMA = """ CREATE TABLE IF NOT EXISTS locks ( file_path TEXT NOT NULL, agent_id TEXT NOT NULL, acquired_at TEXT NOT NULL, expires_at TEXT NOT NULL ); CREATE UNIQUE INDEX IF NOT EXISTS idx_locks_file_path ON locks(file_path); CREATE INDEX IF NOT EXISTS idx_locks_expires_at ON locks(expires_at); """ @contextmanager 
class LockManager:
    """Advisory per-file locks persisted in a SQLite database.

    Each row of the ``locks`` table records which agent holds a file path and
    when that claim expires. Every public method first deletes rows whose
    expiry has passed, so stale claims never block other agents.
    """

    # Bound parameters per lookup statement. SQLite releases before 3.32
    # reject statements with more than 999 parameters, so large path lists
    # are queried in batches comfortably below that limit.
    _MAX_SQL_PARAMS = 500

    def __init__(self, db_path: Path):
        """Remember the database location and create the schema if needed."""
        self._db_path = db_path
        self._init_db()

    def acquire(self, file_path: str, agent_id: str) -> bool:
        """Claim ``file_path`` for ``agent_id``.

        Returns:
            True when the lock was taken, or refreshed because the agent
            already held it; False when another agent holds it.
        """
        now, expires = _timestamps()
        with _connect(self._db_path) as conn:
            self._expire_locks(conn, now)
            try:
                conn.execute(
                    "INSERT INTO locks(file_path, agent_id, acquired_at, expires_at) "
                    "VALUES (?, ?, ?, ?)",
                    (file_path, agent_id, now, expires),
                )
                conn.commit()
                return True
            except sqlite3.IntegrityError:
                # The unique index on file_path rejected the insert, so the
                # path is already locked. A repeat call by the holder
                # refreshes the expiry instead of failing.
                row = conn.execute(
                    "SELECT agent_id FROM locks WHERE file_path = ?",
                    (file_path,),
                ).fetchone()
                if row and row["agent_id"] == agent_id:
                    conn.execute(
                        "UPDATE locks SET acquired_at = ?, expires_at = ? "
                        "WHERE file_path = ?",
                        (now, expires, file_path),
                    )
                    conn.commit()
                    return True
                return False

    def release(self, file_path: str, agent_id: str) -> bool:
        """Drop the lock on ``file_path`` if ``agent_id`` holds it.

        Returns:
            True when a row was deleted, False otherwise.
        """
        with _connect(self._db_path) as conn:
            self._expire_locks(conn, _now())
            cur = conn.execute(
                "DELETE FROM locks WHERE file_path = ? AND agent_id = ?",
                (file_path, agent_id),
            )
            conn.commit()
            return cur.rowcount > 0

    def release_all(self, agent_id: str) -> int:
        """Drop every lock held by ``agent_id`` and return how many were removed."""
        with _connect(self._db_path) as conn:
            self._expire_locks(conn, _now())
            cur = conn.execute("DELETE FROM locks WHERE agent_id = ?", (agent_id,))
            conn.commit()
            return cur.rowcount

    def conflicts(self, file_paths: list[str]) -> list[str]:
        """Return the entries of ``file_paths`` that are currently locked.

        The result keeps the order (and any duplicates) of the input. Paths
        are looked up in batches so arbitrarily long lists stay within
        SQLite's per-statement parameter limit.
        """
        if not file_paths:
            return []
        locked = set()
        step = self._MAX_SQL_PARAMS
        with _connect(self._db_path) as conn:
            self._expire_locks(conn, _now())
            for start in range(0, len(file_paths), step):
                batch = file_paths[start:start + step]
                marks = ", ".join("?" for _ in batch)
                rows = conn.execute(
                    f"SELECT file_path FROM locks WHERE file_path IN ({marks})",
                    batch,
                ).fetchall()
                locked.update(row["file_path"] for row in rows)
        return [path for path in file_paths if path in locked]

    def _expire_locks(self, conn: sqlite3.Connection, now: str) -> None:
        """Delete locks whose ``expires_at`` is at or before ``now``."""
        conn.execute("DELETE FROM locks WHERE expires_at <= ?", (now,))

    def _init_db(self) -> None:
        """Create the locks table and its indexes when they do not exist."""
        with _connect(self._db_path) as conn:
            conn.executescript(SCHEMA)
@dataclass
class DiscoveryResult:
    """Aggregate of everything auto-discovery learned about this machine."""

    # CLI name -> executable path found on PATH.
    clis: dict[str, str] = field(default_factory=dict)
    # CLI name -> whether stored credentials were detected.
    cli_auth: dict[str, bool] = field(default_factory=dict)
    # One dict per git repository found while scanning.
    repos: list[dict] = field(default_factory=list)
    # Project names ranked by activity.
    active_projects: list[str] = field(default_factory=list)
    # Credential name -> whether it is available.
    tokens: dict[str, bool] = field(default_factory=dict)
    github_org: str = ""
    github_orgs: list[str] = field(default_factory=list)
    timestamp: str = ""


def discover_clis() -> dict[str, str]:
    """Map every known CLI that resolves on PATH to its executable path.

    CLIs that cannot be located are left out of the result.
    """
    candidates = ((cli, shutil.which(cli)) for cli in CLI_NAMES)
    return {cli: location for cli, location in candidates if location}
def _has_json_key(path: Path, key: str) -> bool:
    """Return True when ``path`` holds a JSON object with a truthy ``key``.

    Missing, unreadable, malformed, or non-object files all yield False.
    """
    if not path.exists():
        return False
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return False
    # A top-level list/string/number has no keys; calling .get() would raise.
    return isinstance(data, dict) and bool(data.get(key))


def discover_git_token() -> str:
    """Ask git's credential helper for a github.com password/token.

    Returns:
        The stored secret, or "" when git is unavailable, times out, or has
        no credential for the host.
    """
    try:
        result = subprocess.run(
            ["git", "credential", "fill"],
            input="protocol=https\nhost=github.com\n\n",
            capture_output=True,
            text=True,
            timeout=5,
            # Without this, git falls back to prompting on the controlling
            # terminal when no helper answers, stalling until the timeout.
            env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
        )
    except (subprocess.SubprocessError, OSError):
        return ""
    for line in result.stdout.splitlines():
        if line.startswith("password="):
            return line.split("=", 1)[1]
    return ""


def discover_repos(
    home: Path | None = None,
) -> list[dict]:
    """Scan the well-known directories under ``home`` for git repositories.

    Args:
        home: Root to search; defaults to the current user's home directory.

    Returns:
        At most 30 dicts with ``path`` and ``key`` (directory name) entries.
    """
    root = home or Path.home()
    repos: list[dict] = []
    for dirname in SCAN_DIRS:
        parent = root / dirname
        if not parent.exists():
            continue
        _scan_dir(parent, repos, depth=0)
        if len(repos) >= 30:
            break
    return repos[:30]


def _scan_dir(
    parent: Path,
    repos: list[dict],
    depth: int,
) -> None:
    """Append repositories found under ``parent`` to ``repos``.

    Recursion stops below depth 3 or once 30 repositories were collected.
    Hidden directories are skipped, and a directory that contains ``.git``
    is recorded without descending into it.
    """
    if depth > 3 or len(repos) >= 30:
        return
    try:
        for child in sorted(parent.iterdir()):
            if not child.is_dir():
                continue
            if child.name.startswith("."):
                continue
            git_dir = child / ".git"
            if git_dir.is_dir():
                repos.append({
                    "path": str(child),
                    "key": child.name,
                })
            else:
                _scan_dir(child, repos, depth + 1)
    except OSError:
        # Best effort: unreadable or vanished directories are skipped
        # (PermissionError is a subclass of OSError).
        pass


def discover_active_projects(
    claude_dir: Path | None = None,
) -> list[str]:
    """Rank projects by how often they appear in Claude's prompt history.

    Args:
        claude_dir: Directory containing ``history.jsonl``; defaults to
            ``~/.claude``.

    Returns:
        Up to 15 project directory names, most frequent first. Lines that
        are not JSON objects with a string ``project`` are ignored.
    """
    cdir = claude_dir or (Path.home() / ".claude")
    history = cdir / "history.jsonl"
    if not history.exists():
        return []
    from collections import Counter
    counts: Counter[str] = Counter()
    try:
        # Stream the file so a large history is never held in memory, and
        # tolerate stray bytes instead of failing on a decode error.
        with history.open(encoding="utf-8", errors="replace") as fh:
            for line in fh:
                if not line.strip():
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(entry, dict):
                    continue
                project = entry.get("project", "")
                if not isinstance(project, str) or not project:
                    continue
                name = Path(project).name
                if name:
                    counts[name] += 1
    except OSError:
        return []
    return [p for p, _ in counts.most_common(15)]


def discover_env_tokens() -> dict[str, bool]:
    """Report which credentials are available, without exposing them.

    ``GIT_CREDENTIAL`` is only probed (and only present in the result) when
    ``GITHUB_TOKEN`` is not set in the environment.
    """
    tokens = {
        "GITHUB_TOKEN": bool(
            os.environ.get("GITHUB_TOKEN"),
        ),
        "ANTHROPIC_API_KEY": bool(
            os.environ.get("ANTHROPIC_API_KEY"),
        ),
        "ASANA_API_KEY": bool(
            os.environ.get("ASANA_API_KEY"),
        ),
    }
    # Fall back to git credential helper for GitHub
    if not tokens["GITHUB_TOKEN"]:
        tokens["GIT_CREDENTIAL"] = bool(
            discover_git_token(),
        )
    return tokens


def infer_github_org(repo_path: Path) -> str:
    """Return the GitHub owner of ``repo_path``'s ``origin`` remote, or ""."""
    try:
        result = subprocess.run(
            ["git", "remote", "get-url", "origin"],
            capture_output=True,
            text=True,
            cwd=str(repo_path),
            timeout=5,
        )
        url = result.stdout.strip()
        return _parse_org_from_url(url)
    except (subprocess.SubprocessError, OSError):
        return ""


def _parse_org_from_url(url: str) -> str:
    """Extract the owner segment from a GitHub remote URL.

    Handles scp-style (``git@github.com:org/repo``), ``https://`` and
    ``ssh://`` remotes; returns "" for anything that is not a GitHub URL.
    """
    if "github.com:" in url:
        segments = url.split("github.com:")[-1].split("/")
        # In URLs with a scheme (ssh://git@ssh.github.com:443/org/repo) the
        # text after the colon is a port, not the owner.
        if "://" in url and segments[0].isdigit():
            segments = segments[1:]
        return segments[0] if segments else ""
    if "github.com/" in url:
        parts = url.split("github.com/")[-1]
        return parts.split("/")[0]
    return ""


def discover_all_orgs(repos: list[dict]) -> list[str]:
    """Return the sorted, de-duplicated GitHub owners of ``repos``."""
    orgs: set[str] = set()
    for repo in repos:
        org = infer_github_org(Path(repo["path"]))
        if org:
            orgs.add(org)
    return sorted(orgs)
@dataclass
class AmnesiaProfile:
    """Counts describing a memory store plus a derived health score."""

    total_memories: int = 0
    active_count: int = 0
    superseded_count: int = 0
    facts: int = 0
    decisions: int = 0
    code_refs: int = 0
    handoffs: int = 0

    @property
    def health_score(self) -> float:
        """Blend of active ratio (60%) and type diversity (40%), in 0.0-1.0.

        An empty store scores 0.0. Diversity is the share of the four memory
        types that have at least one active record.
        """
        if self.total_memories == 0:
            return 0.0
        per_type = (self.facts, self.decisions, self.code_refs, self.handoffs)
        kinds_present = len([count for count in per_type if count > 0])
        active_ratio = self.active_count / self.total_memories
        diversity = kinds_present / 4.0
        score = active_ratio * 0.6 + diversity * 0.4
        return round(score, 3)


def diagnose(
    store: EngramStore,
    namespace: str | None = None,
) -> AmnesiaProfile:
    """Summarise the records of ``store`` (optionally one namespace).

    Reads up to 10000 records, active or not, and counts the active ones
    per memory type.
    """
    records = store.query(
        namespace=namespace,
        active_only=False,
        limit=10000,
    )
    live = [rec for rec in records if rec.is_active]
    tally = {"fact": 0, "decision": 0, "code_ref": 0, "handoff": 0}
    for rec in live:
        if rec.memory_type in tally:
            tally[rec.memory_type] += 1
    total = len(records)
    return AmnesiaProfile(
        total_memories=total,
        active_count=len(live),
        superseded_count=total - len(live),
        facts=tally["fact"],
        decisions=tally["decision"],
        code_refs=tally["code_ref"],
        handoffs=tally["handoff"],
    )
class Origin(str, Enum):
    """Where a memory came from."""

    EXPLICIT = "explicit"
    INFERRED = "inferred"
    MESH = "mesh"


class Validity(str, Enum):
    """Lifecycle state of a memory."""

    ACTIVE = "active"
    SUPERSEDED = "superseded"
    EXPIRED = "expired"


def _utc_timestamp() -> str:
    """Current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class EngramRecord:
    """One persisted memory entry."""

    engram_id: str
    namespace: str
    memory_type: str  # fact | decision | code_ref | handoff
    content: str
    origin: str = Origin.EXPLICIT
    validity: str = Validity.ACTIVE
    confidence: float = 1.0
    tags: list[str] = field(default_factory=list)
    source_task: str = ""
    # Stamped with the creation time unless the caller supplies one.
    created_at: str = field(default_factory=_utc_timestamp)
    expires_at: str = ""

    @property
    def is_active(self) -> bool:
        """True while the record's validity is ``active``."""
        return self.validity == Validity.ACTIVE

    def supersede(self) -> None:
        """Mark the record as replaced by a newer memory."""
        self.validity = Validity.SUPERSEDED
# (engram id, memory type, content) for every bootstrap memory.
_SEEDS = [
    ("seed-fact-1", "fact",
     "Maggy uses blast-score routing to pick the optimal model per task."),
    ("seed-fact-2", "fact",
     "Quality gates: max 20 lines/fn, 3 params, 2 nesting, 200 lines/file."),
    ("seed-decision-1", "decision",
     "TDD workflow: RED (failing tests) -> GREEN (pass) -> VALIDATE."),
    ("seed-decision-2", "decision",
     "Local Qwen3-Coder handles blast 0-5; Claude handles 5-10."),
    ("seed-coderef-1", "code_ref",
     "Routing tiers: process/model_router.py DEFAULT_TIERS"),
    ("seed-coderef-2", "code_ref",
     "Chat REPL: cli_chat.py _repl_loop"),
    ("seed-handoff-1", "handoff",
     "System initialized. Memory will grow as tasks are completed."),
]

_REQUIRED_TYPES = {"fact", "decision", "code_ref", "handoff"}


def seed_if_empty(store: EngramStore) -> None:
    """Write bootstrap memories for every required type that has none.

    Looks at up to 500 active records; each required memory type absent
    from them gets all of its seed entries, in the ``system`` namespace
    and tagged ``seed``.
    """
    present = set()
    for record in store.query(active_only=True, limit=500):
        present.add(record.memory_type)
    absent = _REQUIRED_TYPES.difference(present)
    if not absent:
        return
    for seed_id, kind, text in _SEEDS:
        if kind not in absent:
            continue
        store.write(EngramRecord(
            engram_id=seed_id,
            namespace="system",
            memory_type=kind,
            content=text,
            tags=["seed"],
        ))
def query(
    self,
    namespace: str | None = None,
    memory_type: str | None = None,
    active_only: bool = True,
    limit: int = 100,
) -> list[EngramRecord]:
    """Fetch records, newest first, narrowed by the given filters.

    Args:
        namespace: Only records in this namespace when truthy.
        memory_type: Only records of this type when truthy.
        active_only: Restrict to rows whose validity is ``active``.
        limit: Maximum number of records returned.
    """
    conditions: list[str] = []
    bindings: list = []
    equality_filters = (
        ("namespace", namespace),
        ("memory_type", memory_type),
    )
    for column, wanted in equality_filters:
        if wanted:
            conditions.append(f"{column} = ?")
            bindings.append(wanted)
    if active_only:
        conditions.append("validity = 'active'")
    where = ""
    if conditions:
        where = f"WHERE {' AND '.join(conditions)}"
    statement = (
        f"SELECT * FROM engrams {where} "
        f"ORDER BY created_at DESC LIMIT ?"
    )
    with _connect(self._db_path) as conn:
        fetched = conn.execute(statement, bindings + [limit]).fetchall()
    return [self._row_to_record(row) for row in fetched]
SCHEMA = """
CREATE TABLE IF NOT EXISTS escalations (
    id TEXT PRIMARY KEY,
    session_id TEXT NOT NULL,
    reason TEXT NOT NULL,
    context TEXT NOT NULL,
    agent_state TEXT NOT NULL,
    suggested_actions TEXT NOT NULL,
    created_at TEXT NOT NULL,
    resolved INTEGER NOT NULL,
    resolution TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_escalations_pending
ON escalations(resolved, created_at);
"""


@dataclass
class EscalationPacket:
    """A request for human guidance, as stored in the escalations table."""

    id: str
    session_id: str
    reason: str
    context: dict[str, object]
    agent_state: dict[str, object]
    suggested_actions: list[str]
    created_at: str
    resolved: bool
    resolution: str


@contextmanager
def _connect(path: Path) -> Iterator[sqlite3.Connection]:
    """Open the database (creating parent directories) and always close it."""
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=30000")
    conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()


class Escalator:
    """Create, list and resolve escalation packets in a SQLite database."""

    def __init__(self, db_path: Path):
        """Remember the database location and create the schema if needed."""
        self._db_path = db_path
        self._init_db()

    def escalate(
        self, session_id: str, reason: str, context: dict[str, object]
    ) -> EscalationPacket:
        """Persist a new unresolved packet and return it.

        ``agent_state`` and ``suggested_actions`` are taken from the
        same-named keys of ``context`` when they have the expected type.
        """
        packet = _build_packet(session_id, reason, context)
        with _connect(self._db_path) as conn:
            conn.execute(
                "INSERT INTO escalations VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                _serialize(packet),
            )
            conn.commit()
        return packet

    def resolve(self, escalation_id: str, guidance: str) -> EscalationPacket:
        """Mark a packet resolved with ``guidance`` and return its new state.

        Raises:
            KeyError: No packet with ``escalation_id`` exists.
        """
        with _connect(self._db_path) as conn:
            conn.execute(
                "UPDATE escalations SET resolved = 1, resolution = ? WHERE id = ?",
                (guidance, escalation_id),
            )
            conn.commit()
            row = conn.execute(
                "SELECT * FROM escalations WHERE id = ?",
                (escalation_id,),
            ).fetchone()
        if not row:
            raise KeyError(escalation_id)
        return _from_row(row)

    def list_pending(self) -> list[EscalationPacket]:
        """Return unresolved packets, oldest first."""
        with _connect(self._db_path) as conn:
            rows = conn.execute(
                "SELECT * FROM escalations WHERE resolved = 0 ORDER BY created_at",
            ).fetchall()
        return [_from_row(row) for row in rows]

    def get(self, escalation_id: str) -> EscalationPacket | None:
        """Return the packet with ``escalation_id``, or None when unknown."""
        with _connect(self._db_path) as conn:
            row = conn.execute(
                "SELECT * FROM escalations WHERE id = ?",
                (escalation_id,),
            ).fetchone()
        return _from_row(row) if row else None

    def _init_db(self) -> None:
        """Create the escalations table and index when they do not exist."""
        with _connect(self._db_path) as conn:
            conn.executescript(SCHEMA)


def _build_packet(
    session_id: str, reason: str, context: dict[str, object]
) -> EscalationPacket:
    """Assemble a fresh, unresolved packet with a new UUID and UTC timestamp."""
    return EscalationPacket(
        id=str(uuid.uuid4()),
        session_id=session_id,
        reason=reason,
        context=context,
        agent_state=_dict_field(context, "agent_state"),
        suggested_actions=_list_field(context, "suggested_actions"),
        created_at=datetime.now(timezone.utc).isoformat(),
        resolved=False,
        resolution="",
    )


def _dict_field(context: dict[str, object], key: str) -> dict[str, object]:
    """Return ``context[key]`` when it is a dict, otherwise an empty dict."""
    value = context.get(key, {})
    return value if isinstance(value, dict) else {}


def _list_field(context: dict[str, object], key: str) -> list[str]:
    """Return the string items of ``context[key]`` when it is a list, else []."""
    value = context.get(key, [])
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str)]


def _serialize(packet: EscalationPacket) -> tuple[object, ...]:
    """Flatten a packet into the column order of the escalations table.

    Values that JSON cannot represent natively (paths, datetimes, ...) are
    stored as their ``str()`` form so an unusual context never prevents the
    escalation from being recorded.
    """
    return (
        packet.id,
        packet.session_id,
        packet.reason,
        json.dumps(packet.context, default=str),
        json.dumps(packet.agent_state, default=str),
        json.dumps(packet.suggested_actions, default=str),
        packet.created_at,
        int(packet.resolved),
        packet.resolution,
    )


def _safe_json(raw: str, fallback: object) -> object:
    """Decode ``raw``; return ``fallback`` on bad JSON or a mismatched type."""
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return fallback
    # A stored value of the wrong shape (e.g. a list where a dict is
    # expected) would break consumers of the packet, so treat it as absent.
    return parsed if isinstance(parsed, type(fallback)) else fallback


def _from_row(row: sqlite3.Row) -> EscalationPacket:
    """Rebuild a packet from a database row, decoding the JSON columns."""
    return EscalationPacket(
        id=row["id"],
        session_id=row["session_id"],
        reason=row["reason"],
        context=_safe_json(row["context"], {}),
        agent_state=_safe_json(row["agent_state"], {}),
        suggested_actions=_safe_json(row["suggested_actions"], []),
        created_at=row["created_at"],
        resolved=bool(row["resolved"]),
        resolution=row["resolution"],
    )
def _header_field(event_type: str):
    """Dataclass field defaulting to a fresh ``EventHeader(event_type)``."""
    return field(default_factory=lambda: EventHeader(event_type))


@dataclass
class IntentEvent:
    """iCPG ReasonNode decomposition."""

    header: EventHeader = _header_field("intent")
    intent_text: str = ""
    reason_node_id: str = ""
    decomposed_steps: list[str] = field(default_factory=list)


@dataclass
class BindingEvent:
    """Lexon tool selection + clarify mode."""

    header: EventHeader = _header_field("binding")
    phrase: str = ""
    selected_tool: str = ""
    candidates: list[str] = field(default_factory=list)
    clarify_mode: str = ""  # self_clarify | user_clarify


@dataclass
class ExecutionEvent:
    """Tool invocation input/output/duration."""

    header: EventHeader = _header_field("execution")
    tool_name: str = ""
    input_summary: str = ""
    output_summary: str = ""
    duration_ms: int = 0
    success: bool = True


@dataclass
class MemoryEvent:
    """Mnemos within-task memory write."""

    header: EventHeader = _header_field("memory")
    memory_type: str = ""  # fact | decision | code_ref | handoff
    content: str = ""
    node_id: str = ""


@dataclass
class PersistenceEvent:
    """Engram cross-session promotion."""

    header: EventHeader = _header_field("persistence")
    engram_id: str = ""
    memory_type: str = ""
    content: str = ""
    source_namespace: str = ""
    target_namespace: str = ""


@dataclass
class OutcomeEvent:
    """Process Intelligence success/failure + reward."""

    header: EventHeader = _header_field("outcome")
    success: bool = True
    reward: float = 0.0
    metrics: dict = field(default_factory=dict)


@dataclass
class MutationEvent:
    """L2/L3/L4 self-modification."""

    header: EventHeader = _header_field("mutation")
    control_level: str = ""  # L2 | L3 | L4
    target: str = ""
    old_value: str = ""
    new_value: str = ""
    reason: str = ""


@dataclass
class MeshEvent:
    """Cross-machine sharing + quarantine status."""

    header: EventHeader = _header_field("mesh")
    peer_id: str = ""
    peer_name: str = ""
    action: str = ""  # share | receive | quarantine | promote
    memory_type: str = ""
    content_key: str = ""


# Event type name (as carried in the header) -> event dataclass.
EVENT_TYPES = {
    "intent": IntentEvent,
    "binding": BindingEvent,
    "execution": ExecutionEvent,
    "memory": MemoryEvent,
    "persistence": PersistenceEvent,
    "outcome": OutcomeEvent,
    "mutation": MutationEvent,
    "mesh": MeshEvent,
}
SCHEMA = """
CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    event_id TEXT UNIQUE NOT NULL,
    event_type TEXT NOT NULL,
    task_id TEXT NOT NULL DEFAULT '',
    project_id TEXT NOT NULL DEFAULT '',
    agent_id TEXT NOT NULL DEFAULT '',
    model_id TEXT NOT NULL DEFAULT '',
    parent_event_id TEXT NOT NULL DEFAULT '',
    timestamp TEXT NOT NULL,
    payload TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_events_task ON events(task_id);
CREATE INDEX IF NOT EXISTS idx_events_type ON events(event_type);
CREATE INDEX IF NOT EXISTS idx_events_project ON events(project_id);
CREATE INDEX IF NOT EXISTS idx_events_ts ON events(timestamp);
"""


@contextmanager
def _connect(path: Path) -> Iterator[sqlite3.Connection]:
    """Open the database (creating parent directories) and always close it."""
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=30000")
    conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()


class EventStore:
    """Append-only SQLite event store."""

    def __init__(self, db_path: Path):
        """Remember the database location and create the schema if needed."""
        self._db_path = db_path
        self._init_db()

    def _init_db(self) -> None:
        """Create the events table and its indexes when they do not exist."""
        with _connect(self._db_path) as conn:
            conn.executescript(SCHEMA)

    def write(
        self,
        header: EventHeader,
        payload: dict,
    ) -> None:
        """Append an event.

        Header fields are stored in indexed columns and ``payload`` as JSON.
        An event whose ``event_id`` already exists is silently ignored.
        """
        with _connect(self._db_path) as conn:
            conn.execute(
                "INSERT OR IGNORE INTO events "
                "(event_id, event_type, task_id, "
                "project_id, agent_id, model_id, "
                "parent_event_id, timestamp, payload) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    header.event_id,
                    header.event_type,
                    header.task_id,
                    header.project_id,
                    header.agent_id,
                    header.model_id,
                    header.parent_event_id,
                    header.timestamp,
                    json.dumps(payload),
                ),
            )
            conn.commit()

    def query(
        self,
        task_id: str | None = None,
        event_type: str | None = None,
        project_id: str | None = None,
        limit: int = 100,
    ) -> list[dict]:
        """Return decoded payloads, oldest first, matching the truthy filters."""
        clauses: list[str] = []
        params: list = []
        if task_id:
            clauses.append("task_id = ?")
            params.append(task_id)
        if event_type:
            clauses.append("event_type = ?")
            params.append(event_type)
        if project_id:
            clauses.append("project_id = ?")
            params.append(project_id)
        where = (
            f"WHERE {' AND '.join(clauses)}"
            if clauses else ""
        )
        sql = (
            f"SELECT payload FROM events {where} "
            f"ORDER BY timestamp ASC LIMIT ?"
        )
        params.append(limit)
        with _connect(self._db_path) as conn:
            rows = conn.execute(sql, params).fetchall()
        return [json.loads(r["payload"]) for r in rows]

    def count(
        self,
        event_type: str | None = None,
        project_id: str | None = None,
    ) -> int:
        """Count events matching filters."""
        clauses: list[str] = []
        params: list[str] = []
        if event_type:
            clauses.append("event_type = ?")
            params.append(event_type)
        if project_id:
            clauses.append("project_id = ?")
            params.append(project_id)
        where = (
            f"WHERE {' AND '.join(clauses)}"
            if clauses else ""
        )
        with _connect(self._db_path) as conn:
            row = conn.execute(
                f"SELECT COUNT(*) FROM events {where}",
                params,
            ).fetchone()
        return int(row[0])

    def archive_old(
        self,
        days: int = 90,
        archive_dir: Path | None = None,
    ) -> int:
        """Move events older than ``days`` into a gzipped JSONL archive.

        Args:
            days: Age threshold; events with an earlier timestamp are moved.
            archive_dir: Destination directory; defaults to
                ``events_archive`` next to the database.

        Returns:
            Number of events archived and removed from the database.
        """
        from datetime import datetime, timedelta, timezone
        cutoff = (
            datetime.now(timezone.utc) - timedelta(days=days)
        ).isoformat()
        with _connect(self._db_path) as conn:
            rows = conn.execute(
                "SELECT payload FROM events "
                "WHERE timestamp < ?",
                (cutoff,),
            ).fetchall()
            if not rows:
                return 0
            out_dir = archive_dir or (
                self._db_path.parent / "events_archive"
            )
            out_dir.mkdir(parents=True, exist_ok=True)
            archive_file = (
                out_dir / f"events_{cutoff[:10]}.jsonl.gz"
            )
            # The file name only carries the cutoff date, so a second run on
            # the same day targets the same file. Append a new gzip member
            # rather than truncating: the rows written earlier have already
            # been deleted from the database and exist nowhere else.
            with gzip.open(archive_file, "at", encoding="utf-8") as f:
                for r in rows:
                    f.write(r["payload"] + "\n")
            # Rows are deleted only after the archive was written, so a
            # failed write leaves the events in place.
            conn.execute(
                "DELETE FROM events WHERE timestamp < ?",
                (cutoff,),
            )
            conn.commit()
        return len(rows)
@dataclass
class FatigueProfile:
    """Per-model fatigue bookkeeping for one session."""

    model: str
    context_window: int
    tokens_used: int = 0
    turns: int = 0
    recovery_reads: int = 0

    @property
    def raw_utilization(self) -> float:
        """Fraction of the context window consumed, capped at 1.0.

        A non-positive ``context_window`` yields 0.0.
        """
        if self.context_window > 0:
            used_fraction = self.tokens_used / self.context_window
            return min(used_fraction, 1.0)
        return 0.0

    @property
    def fatigue_score(self) -> float:
        """Blend of context use (70%) and turn count (30%), capped at 1.0.

        The turn component saturates at 50 turns.
        """
        turn_share = min(self.turns / 50.0, 1.0)
        blended = self.raw_utilization * 0.7 + turn_share * 0.3
        return min(blended, 1.0)

    def should_checkpoint(self, threshold: float = 0.6) -> bool:
        """Return True once ``fatigue_score`` reaches ``threshold``."""
        return self.fatigue_score >= threshold


# Context window size (tokens) used for each known model key.
MODEL_CONTEXT_WINDOWS: dict[str, int] = dict(
    claude=200_000,
    gpt=128_000,
    kimi=128_000,
    deepseek=128_000,
    codex=200_000,
    local=32_000,
)


def create_profile(model: str) -> FatigueProfile:
    """Build a fresh profile for ``model``.

    Keys missing from ``MODEL_CONTEXT_WINDOWS`` get a 128,000-token
    window.
    """
    try:
        window = MODEL_CONTEXT_WINDOWS[model]
    except KeyError:
        window = 128_000
    return FatigueProfile(model=model, context_window=window)


def compare_fatigue(
    profiles: list[FatigueProfile],
) -> list[dict]:
    """Summarise profiles as dicts, most fatigued first.

    Scores and utilisation are rounded to three decimals.
    """
    ranked = sorted(
        profiles,
        key=lambda prof: prof.fatigue_score,
        reverse=True,
    )
    summary: list[dict] = []
    for prof in ranked:
        summary.append(
            {
                "model": prof.model,
                "fatigue": round(prof.fatigue_score, 3),
                "utilization": round(prof.raw_utilization, 3),
                "turns": prof.turns,
                "should_checkpoint": prof.should_checkpoint(),
            }
        )
    return summary
def search_tools(self, query: str) -> list[dict]:
    """Look up ``query`` in the registry and return plain-dict summaries.

    Each hit is reduced to its ``slug``, ``mcp_url``, ``has_mcp`` and
    ``auth_method`` attributes, in the order the registry returned them.
    """
    summaries: list[dict] = []
    for tool in self.registry.search(query):
        summaries.append(
            {
                "slug": tool.slug,
                "mcp_url": tool.mcp_url,
                "has_mcp": tool.has_mcp,
                "auth_method": tool.auth_method,
            }
        )
    return summaries
TRIGGER_THRESHOLD = 3


@dataclass
class GapRecord:
    """Snapshot of one capability gap and how often it was seen."""

    capability: str
    occurrences: int = 0
    triggered: bool = False


class GapDetector:
    """Counts capability gaps and flags each one once at a threshold."""

    def __init__(self, threshold: int = TRIGGER_THRESHOLD):
        self._gaps: Counter = Counter()
        self._threshold = threshold
        # Keys that have already produced a True from record_gap().
        self._triggered: set[str] = set()

    def record_gap(self, capability: str) -> bool:
        """Count one occurrence of ``capability``.

        The name is lower-cased and stripped before counting. Returns
        True only on the call where the count first reaches the
        threshold; later calls for the same key return False.
        """
        key = capability.lower().strip()
        self._gaps[key] += 1
        if key in self._triggered:
            return False
        if self._gaps[key] < self._threshold:
            return False
        self._triggered.add(key)
        return True

    def list_gaps(self) -> list[GapRecord]:
        """Return every recorded gap, most frequent first."""
        records: list[GapRecord] = []
        for name, seen in self._gaps.most_common():
            records.append(
                GapRecord(
                    capability=name,
                    occurrences=seen,
                    triggered=name in self._triggered,
                )
            )
        return records

    def top_gaps(self, n: int = 5) -> list[GapRecord]:
        """Return the first ``n`` entries of ``list_gaps()``."""
        ranked = self.list_gaps()
        return ranked[:n]

    def reset(self, capability: str) -> None:
        """Forget the counter and triggered flag for ``capability``."""
        key = capability.lower().strip()
        self._gaps.pop(key, None)
        self._triggered.discard(key)
""" from __future__ import annotations import json from dataclasses import dataclass from pathlib import Path @dataclass class ToolInfo: """A registered MCP tool.""" slug: str mcp_url: str = "" has_mcp: str = "Community" auth_method: str = "API Key" enabled: bool = True class ForgeRegistry: """Project-aware tool registry.""" def __init__(self, forge_path: Path | None = None): self._tools: dict[str, ToolInfo] = {} self._forge_path = forge_path self._load_registry() def _load_registry(self) -> None: """Load from mcp-forge if available.""" if not self._forge_path: return reg_file = self._forge_path / "src" / "mcp_registry.py" if not reg_file.exists(): return # Parse KNOWN_SERVERS from the registry self._tools = _parse_registry(reg_file) def search(self, query: str) -> list[ToolInfo]: """Search tools by slug or keyword.""" q = query.lower() return [ t for t in self._tools.values() if q in t.slug or q in t.mcp_url.lower() ] def get(self, slug: str) -> ToolInfo | None: return self._tools.get(slug) def list_all(self) -> list[ToolInfo]: return list(self._tools.values()) def set_enabled(self, slug: str, enabled: bool) -> bool: tool = self._tools.get(slug) if not tool: return False tool.enabled = enabled return True @property def count(self) -> int: return len(self._tools) def _parse_registry(path: Path) -> dict[str, ToolInfo]: """Extract KNOWN_SERVERS entries from registry file.""" tools: dict[str, ToolInfo] = {} content = path.read_text() # Find dict literals in KNOWN_SERVERS list import re pattern = r'\{[^}]+\}' for match in re.finditer(pattern, content): try: # Clean Python dict to JSON-compatible raw = match.group() raw = raw.replace("'", '"') data = json.loads(raw) slug = data.get("slug", "") if slug: tools[slug] = ToolInfo( slug=slug, mcp_url=data.get("mcp_url", ""), has_mcp=data.get("has_mcp", "Community"), auth_method=data.get("auth_method", ""), ) except (json.JSONDecodeError, KeyError): continue return tools ================================================ FILE: 
maggy/maggy/heartbeat/__init__.py ================================================ """Heartbeat — background scheduler for periodic jobs.""" ================================================ FILE: maggy/maggy/heartbeat/jobs.py ================================================ """Built-in heartbeat jobs — wire to existing services.""" from __future__ import annotations import logging from datetime import datetime, timezone from maggy.engram.record import Validity logger = logging.getLogger(__name__) async def refresh_history(app) -> None: """Re-parse CLI session data.""" history = getattr(app.state, "history", None) if not history: return try: history.analyze() except Exception as exc: logger.warning("refresh_history failed: %s", exc) raise async def expire_engrams(app) -> None: """Mark expired engrams.""" engram = getattr(app.state, "engram", None) if not engram: return try: records = engram.query(active_only=True, limit=500) now = datetime.now(timezone.utc) for rec in records: if _is_expired(rec, now): rec.validity = Validity.expired engram.write(rec) except Exception as exc: logger.warning("expire_engrams failed: %s", exc) raise def _is_expired(rec, now) -> bool: """Check if an engram's TTL has elapsed.""" tags = getattr(rec, "tags", []) or [] ttl_tag = next((t for t in tags if t.startswith("ttl:")), None) if not ttl_tag: return False try: ttl = int(ttl_tag.split(":")[1]) except (IndexError, ValueError): return False created = rec.created_at if not created: return False created_dt = datetime.fromisoformat(created) return (now - created_dt).total_seconds() > ttl * 3600 async def self_improve(app) -> None: """Run self-improvement analysis.""" introspector = getattr(app.state, "introspector", None) if not introspector: return try: introspector.analyze() except Exception as exc: logger.warning("self_improve failed: %s", exc) raise async def mesh_heartbeat(app) -> None: """Discover peers, announce self, publish shares.""" mesh = getattr(app.state, "mesh", None) if not 
TICK_INTERVAL = 1.0  # seconds between scheduler ticks


@dataclass
class Job:
    """A registered periodic job together with its run bookkeeping."""

    name: str
    # Called with no arguments by the scheduler; the result is awaited.
    fn: Callable[..., Awaitable[None]]
    interval_seconds: int
    # ISO-8601 timestamp of the most recent attempt; "" until first run.
    last_run: str = ""
    run_count: int = 0
    # str() of the exception from the most recent attempt; "" on success.
    last_error: str = ""
    enabled: bool = True

    def is_due(self) -> bool:
        """Return True if the job never ran or its interval has elapsed."""
        if not self.last_run:
            return True
        last = datetime.fromisoformat(self.last_run)
        elapsed = (datetime.now(timezone.utc) - last).total_seconds()
        return elapsed >= self.interval_seconds


class HeartbeatScheduler:
    """Runs registered async jobs on fixed intervals in a background task."""

    def __init__(self) -> None:
        self._jobs: dict[str, Job] = {}
        self._task: asyncio.Task | None = None

    def register(
        self,
        name: str,
        fn: Callable,
        interval: int,
    ) -> None:
        """Add a job.

        Raises:
            ValueError: If ``name`` is already registered.
        """
        if name in self._jobs:
            raise ValueError(f"Job '{name}' already registered")
        self._jobs[name] = Job(
            name=name, fn=fn, interval_seconds=interval,
        )

    async def tick(self) -> None:
        """Run every enabled job that is currently due, one after another."""
        # Iterate over a snapshot: a job may register another job while
        # it runs, and mutating the dict mid-iteration raises
        # RuntimeError, which would end the background loop.
        for job in list(self._jobs.values()):
            if not job.enabled or not job.is_due():
                continue
            await self._run_job(job)

    async def _run_job(self, job: Job) -> None:
        """Run one job, recording the outcome instead of propagating errors."""
        try:
            await job.fn()
            job.last_error = ""
        except Exception as exc:
            job.last_error = str(exc)
            logger.warning("Job %s failed: %s", job.name, exc)
        # Failed attempts also count, so a broken job waits a full
        # interval before it is retried.
        job.last_run = datetime.now(timezone.utc).isoformat()
        job.run_count += 1

    async def trigger(self, name: str) -> dict:
        """Run ``name`` immediately, regardless of schedule or ``enabled``.

        Returns:
            ``{"ok": <no error recorded>, "name": name}``.

        Raises:
            KeyError: If no job with that name is registered.
        """
        if name not in self._jobs:
            raise KeyError(name)
        job = self._jobs[name]
        await self._run_job(job)
        return {"ok": not job.last_error, "name": name}

    async def start(self) -> None:
        """Start the background loop; a no-op if it is already running."""
        # Overwriting a live task would orphan it: stop() only cancels
        # the task it holds, so the old loop would keep running jobs.
        if self._task is not None and not self._task.done():
            return
        self._task = asyncio.create_task(self._loop())
        logger.info("Heartbeat started — %d jobs", len(self._jobs))

    async def stop(self) -> None:
        """Cancel the background loop and wait for it to finish."""
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        logger.info("Heartbeat stopped")

    async def _loop(self) -> None:
        """Tick forever, sleeping ``TICK_INTERVAL`` seconds between ticks."""
        while True:
            await self.tick()
            await asyncio.sleep(TICK_INTERVAL)

    def status(self) -> list[dict]:
        """Return one bookkeeping dict per job, in registration order."""
        return [
            {
                "name": j.name,
                "interval": j.interval_seconds,
                "last_run": j.last_run,
                "run_count": j.run_count,
                "last_error": j.last_error,
                "enabled": j.enabled,
            }
            for j in self._jobs.values()
        ]
def aggregate_by_provider(
    sessions: list[SessionEntry],
) -> list[ProviderUsage]:
    """Summarise sessions per provider, ordered by provider name.

    For every provider this reports the session count, total prompts,
    total minutes (sessions without a duration count as 0) and the
    sorted set of model names seen.
    """
    buckets: dict[str, list[SessionEntry]] = defaultdict(list)
    for entry in sessions:
        buckets[entry.provider].append(entry)

    usage: list[ProviderUsage] = []
    for name in sorted(buckets):
        group = buckets[name]
        total_minutes = sum(
            entry.duration_minutes or 0 for entry in group
        )
        model_names = {
            model for entry in group for model in entry.models_used
        }
        usage.append(
            ProviderUsage(
                provider=name,
                session_count=len(group),
                prompt_count=sum(entry.prompt_count for entry in group),
                total_minutes=total_minutes,
                models_used=sorted(model_names),
            )
        )
    return usage
+= 1 by_date[dt.strftime("%Y-%m-%d")] += s.prompt_count return TimeDistribution( by_hour=dict(by_hour), by_weekday=dict(by_weekday), by_date=dict(by_date), ) def extract_top_topics( sessions: list[SessionEntry], ) -> list[str]: """Frequency-rank topics across all sessions.""" counts: Counter[str] = Counter() for s in sessions: for t in s.topics: counts[t] += 1 return [t for t, _ in counts.most_common(10)] def detect_patterns( sessions: list[SessionEntry], ) -> list[str]: """Generate human-readable pattern observations.""" if not sessions: return [] patterns: list[str] = [] _detect_provider_dominance(sessions, patterns) _detect_session_stats(sessions, patterns) _detect_project_focus(sessions, patterns) return patterns def _detect_provider_dominance( sessions: list[SessionEntry], patterns: list[str], ) -> None: """Check if one provider dominates usage.""" counts = Counter(s.provider for s in sessions) total = len(sessions) for prov, count in counts.most_common(1): pct = count * 100 // total if pct >= 70: patterns.append( f"{pct}% of sessions use {prov}" ) def _detect_session_stats( sessions: list[SessionEntry], patterns: list[str], ) -> None: """Compute average session statistics.""" avg_prompts = ( sum(s.prompt_count for s in sessions) // len(sessions) ) durations = [ s.duration_minutes for s in sessions if s.duration_minutes is not None ] if durations: avg_min = sum(durations) / len(durations) patterns.append( f"Average session: {avg_prompts} prompts, " f"{avg_min:.0f} minutes" ) else: patterns.append( f"Average session: {avg_prompts} prompts" ) def _detect_project_focus( sessions: list[SessionEntry], patterns: list[str], ) -> None: """Detect high-activity projects.""" by_proj = Counter(s.project for s in sessions) for proj, count in by_proj.most_common(1): if count >= 5: patterns.append( f"Project '{proj}' had {count} sessions" f" — high focus" ) def _merge_topics( sessions: list[SessionEntry], ) -> list[str]: """Merge topics across sessions by frequency.""" 
@dataclass
class SessionEntry:
    """One CLI session, normalised across providers."""

    session_id: str
    provider: str  # "claude" | "codex" | "kimi"
    project: str
    started_at: str
    ended_at: str
    prompt_count: int
    tool_use_count: int
    models_used: list[str] = field(default_factory=list)
    git_branch: str = ""
    topics: list[str] = field(default_factory=list)
    summary: str = ""

    @property
    def duration_minutes(self) -> float | None:
        """Minutes between ``started_at`` and ``ended_at``.

        Returns None when either timestamp is empty or the pair cannot
        be parsed and subtracted as ISO-8601 datetimes.
        """
        if not (self.started_at and self.ended_at):
            return None
        try:
            began = datetime.fromisoformat(self.started_at)
            finished = datetime.fromisoformat(self.ended_at)
            elapsed = finished - began
        except (ValueError, TypeError):
            return None
        return elapsed.total_seconds() / 60
class HistoryParser(ABC):
    """Interface every CLI history parser implements.

    Subclasses set ``provider`` and override all three methods below.
    """

    provider: str

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether this CLI's local data can be found."""

    @abstractmethod
    def parse_sessions(
        self, limit: int = 500,
    ) -> list[SessionEntry]:
        """Return up to ``limit`` sessions as ``SessionEntry`` objects."""

    @abstractmethod
    def session_count(self) -> int:
        """Return how many sessions this CLI has on record."""
================================================ FILE: maggy/maggy/history/parsers/claude.py ================================================ """Claude Code history parser — reads ~/.claude/ local state.""" from __future__ import annotations import json import logging from collections import defaultdict from datetime import datetime, timezone from pathlib import Path from maggy.history.models import SessionEntry from .base import HistoryParser logger = logging.getLogger(__name__) def _millis_to_iso(ms: int | float) -> str: """Convert Unix milliseconds to ISO-8601.""" dt = datetime.fromtimestamp(ms / 1000, tz=timezone.utc) return dt.isoformat() def _read_jsonl(path: Path) -> list[dict]: """Read JSONL file, skip bad lines.""" if not path.exists(): return [] results: list[dict] = [] try: for line in path.read_text().splitlines(): line = line.strip() if not line: continue try: results.append(json.loads(line)) except json.JSONDecodeError: continue except OSError: return [] return results def _extract_topics(prompts: list[str]) -> list[str]: """Extract keyword topics from prompt texts.""" from collections import Counter words: list[str] = [] for text in prompts: for w in text.lower().split(): if len(w) > 3 and w.isalpha(): words.append(w) counts = Counter(words) return [w for w, _ in counts.most_common(5)] class ClaudeHistoryParser(HistoryParser): """Parse Claude Code session history.""" provider = "claude" def __init__(self, claude_dir: Path | None = None): self._dir = claude_dir or ( Path.home() / ".claude" ) def is_available(self) -> bool: history = self._dir / "history.jsonl" return history.exists() def session_count(self) -> int: entries = _read_jsonl(self._dir / "history.jsonl") ids = {e.get("sessionId") for e in entries} ids.discard(None) return len(ids) def parse_sessions( self, limit: int = 500, ) -> list[SessionEntry]: entries = _read_jsonl(self._dir / "history.jsonl") if not entries: return [] grouped = self._group_by_session(entries) sessions: 
list[SessionEntry] = [] for sid, items in list(grouped.items())[:limit]: session = self._build_entry(sid, items) sessions.append(session) return sessions def _group_by_session( self, entries: list[dict], ) -> dict[str, list[dict]]: grouped: dict[str, list[dict]] = defaultdict(list) for e in entries: sid = e.get("sessionId") if sid: grouped[sid].append(e) return dict(grouped) def _build_entry( self, sid: str, items: list[dict], ) -> SessionEntry: timestamps = [ i["timestamp"] for i in items if "timestamp" in i ] project = items[0].get("project", "") prompts = [ i.get("display", "") for i in items if i.get("display") ] summary = prompts[0] if prompts else "" started = _millis_to_iso(min(timestamps)) if timestamps else "" ended = _millis_to_iso(max(timestamps)) if timestamps else "" # Try reading transcript for richer data extra = self._parse_transcript(sid, project) return SessionEntry( session_id=sid, provider="claude", project=self._slug(project), started_at=started, ended_at=ended, prompt_count=len(items), tool_use_count=extra.get("tool_uses", 0), models_used=extra.get("models", []), git_branch=extra.get("branch", ""), topics=_extract_topics(prompts), summary=summary, ) def _slug(self, project_path: str) -> str: """Extract project name from path.""" if not project_path: return "" return Path(project_path).name def _find_transcript( self, sid: str, project: str, ) -> Path | None: """Locate transcript JSONL by session ID.""" projects_dir = self._dir / "projects" if not projects_dir.exists(): return None slug = project.replace("/", "-").lstrip("-") direct = projects_dir / slug / f"{sid}.jsonl" if direct.exists(): return direct # Search all project dirs for the session for d in projects_dir.iterdir(): if not d.is_dir(): continue f = d / f"{sid}.jsonl" if f.exists(): return f return None def _parse_transcript( self, sid: str, project: str, ) -> dict: """Read session transcript for models/tools/branch.""" if not project: return {} transcript = self._find_transcript(sid, 
project) if not transcript: return {} entries = _read_jsonl(transcript) models: set[str] = set() tool_uses = 0 branch = "" for e in entries: etype = e.get("type", "") if etype == "assistant": m = e.get("model", "") if m: models.add(m) content = e.get("message", {}).get( "content", [] ) if isinstance(content, list): tool_uses += sum( 1 for b in content if isinstance(b, dict) and b.get("type") == "tool_use" ) elif etype == "user" and not branch: branch = e.get("gitBranch", "") return { "models": sorted(models), "tool_uses": tool_uses, "branch": branch, } ================================================ FILE: maggy/maggy/history/parsers/codex.py ================================================ """Codex CLI history parser — reads ~/.codex/ local state.""" from __future__ import annotations import json import logging from collections import defaultdict from datetime import datetime, timezone from pathlib import Path from maggy.history.models import SessionEntry from .base import HistoryParser logger = logging.getLogger(__name__) def _seconds_to_iso(ts: int | float) -> str: """Convert Unix seconds to ISO-8601.""" dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat() def _read_jsonl(path: Path) -> list[dict]: """Read JSONL file, skip bad lines.""" if not path.exists(): return [] results: list[dict] = [] try: for line in path.read_text().splitlines(): line = line.strip() if not line: continue try: results.append(json.loads(line)) except json.JSONDecodeError: continue except OSError: return [] return results def _extract_topics(texts: list[str]) -> list[str]: """Extract keyword topics from prompt texts.""" from collections import Counter words: list[str] = [] for text in texts: for w in text.lower().split(): if len(w) > 3 and w.isalpha(): words.append(w) counts = Counter(words) return [w for w, _ in counts.most_common(5)] class CodexHistoryParser(HistoryParser): """Parse OpenAI Codex CLI session history.""" provider = "codex" def __init__(self, codex_dir: 
Path | None = None): self._dir = codex_dir or ( Path.home() / ".codex" ) def is_available(self) -> bool: index = self._dir / "session_index.jsonl" return index.exists() def session_count(self) -> int: entries = _read_jsonl( self._dir / "session_index.jsonl" ) return len(entries) def parse_sessions( self, limit: int = 500, ) -> list[SessionEntry]: index = _read_jsonl( self._dir / "session_index.jsonl" ) if not index: return [] history = _read_jsonl( self._dir / "history.jsonl" ) prompts_by_sid = self._group_prompts(history) sessions: list[SessionEntry] = [] for entry in index[:limit]: sid = entry.get("id", "") if not sid: continue session = self._build_entry( entry, prompts_by_sid.get(sid, []), ) sessions.append(session) return sessions def _group_prompts( self, history: list[dict], ) -> dict[str, list[dict]]: grouped: dict[str, list[dict]] = defaultdict(list) for h in history: sid = h.get("session_id", "") if sid: grouped[sid].append(h) return dict(grouped) def _build_entry( self, index_entry: dict, prompts: list[dict], ) -> SessionEntry: sid = index_entry.get("id", "") thread_name = index_entry.get("thread_name", "") updated = index_entry.get("updated_at", "") timestamps = [ p["ts"] for p in prompts if "ts" in p ] texts = [ p.get("text", "") for p in prompts if p.get("text") ] started = _seconds_to_iso(min(timestamps)) if timestamps else updated ended = _seconds_to_iso(max(timestamps)) if timestamps else updated return SessionEntry( session_id=sid, provider="codex", project=thread_name, started_at=started, ended_at=ended, prompt_count=len(prompts), tool_use_count=0, models_used=[], topics=_extract_topics(texts), summary=thread_name or ( texts[0][:100] if texts else "" ), ) ================================================ FILE: maggy/maggy/history/parsers/kimi.py ================================================ """Kimi CLI history parser — reads ~/.kimi/ local state.""" from __future__ import annotations import json import logging from datetime import datetime, 
timezone from pathlib import Path from maggy.history.models import SessionEntry from .base import HistoryParser logger = logging.getLogger(__name__) def _float_to_iso(ts: float) -> str: """Convert Unix float seconds to ISO-8601.""" dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat() def _read_jsonl(path: Path) -> list[dict]: """Read JSONL file, skip bad lines.""" if not path.exists(): return [] results: list[dict] = [] try: for line in path.read_text().splitlines(): line = line.strip() if not line: continue try: results.append(json.loads(line)) except json.JSONDecodeError: continue except OSError: return [] return results def _extract_topics(texts: list[str]) -> list[str]: """Extract keyword topics from texts.""" from collections import Counter words: list[str] = [] for text in texts: for w in text.lower().split(): if len(w) > 3 and w.isalpha(): words.append(w) counts = Counter(words) return [w for w, _ in counts.most_common(5)] class KimiHistoryParser(HistoryParser): """Parse Moonshot Kimi CLI session history.""" provider = "kimi" def __init__(self, kimi_dir: Path | None = None): self._dir = kimi_dir or ( Path.home() / ".kimi" ) def is_available(self) -> bool: sessions = self._dir / "sessions" return sessions.exists() and sessions.is_dir() def session_count(self) -> int: return len(self._find_session_dirs()) def parse_sessions( self, limit: int = 500, ) -> list[SessionEntry]: dirs = self._find_session_dirs() sessions: list[SessionEntry] = [] for d in dirs[:limit]: entry = self._parse_session_dir(d) if entry: sessions.append(entry) return sessions def _find_session_dirs(self) -> list[Path]: """Find all session UUID directories.""" sessions_root = self._dir / "sessions" if not sessions_root.exists(): return [] dirs: list[Path] = [] for hash_dir in sessions_root.iterdir(): if not hash_dir.is_dir(): continue for uuid_dir in hash_dir.iterdir(): if not uuid_dir.is_dir(): continue ctx = uuid_dir / "context.jsonl" if ctx.exists(): dirs.append(uuid_dir) 
return dirs def _parse_session_dir( self, session_dir: Path, ) -> SessionEntry | None: context = _read_jsonl( session_dir / "context.jsonl" ) if not context: return None user_msgs = [ e for e in context if e.get("role") == "user" ] prompts = [] for e in user_msgs: c = e.get("content", "") if isinstance(c, str): prompts.append(c) elif isinstance(c, list): prompts.append(str(c[0]) if c else "") summary = prompts[0][:100] if prompts else "" wire = self._parse_wire(session_dir) return SessionEntry( session_id=session_dir.name, provider="kimi", project="", started_at=wire.get("started", ""), ended_at=wire.get("ended", ""), prompt_count=len(user_msgs), tool_use_count=wire.get("steps", 0), models_used=[], topics=_extract_topics(prompts), summary=summary, ) def _parse_wire(self, session_dir: Path) -> dict: """Extract timestamps and step counts from wire.""" entries = _read_jsonl( session_dir / "wire.jsonl" ) if not entries: return {} timestamps: list[float] = [] steps = 0 for e in entries: ts = e.get("timestamp") if isinstance(ts, (int, float)): timestamps.append(float(ts)) msg_str = e.get("message", "") if "StepBegin" in str(msg_str): steps += 1 result: dict = {"steps": steps} if timestamps: result["started"] = _float_to_iso( min(timestamps) ) result["ended"] = _float_to_iso( max(timestamps) ) return result ================================================ FILE: maggy/maggy/history/service.py ================================================ """History analysis service — orchestrates the full pipeline.""" from __future__ import annotations import logging from pathlib import Path from .analyzer import build_report from .models import HistoryReport from .parsers.claude import ClaudeHistoryParser from .parsers.codex import CodexHistoryParser from .parsers.kimi import KimiHistoryParser from .store import HistoryStore logger = logging.getLogger(__name__) class HistoryService: """Orchestrates session history analysis.""" def __init__( self, db_path: Path | None = None, cli_dirs: 
def analyze(self) -> HistoryReport:
    """Run the pipeline: parse every CLI, build a report, persist both."""
    sessions = self._collect_sessions()
    report = build_report(sessions)

    # Session rows are written only when something was parsed; the
    # report itself is stored unconditionally.
    if sessions:
        self._store.save_sessions(sessions)
    self._store.save_report(report)

    provider_total = len(report.providers)
    logger.info(
        "History analysis: %d sessions, %d prompts, "
        "%d providers",
        report.total_sessions,
        report.total_prompts,
        provider_total,
    )
    return report


def _collect_sessions(self) -> list:
    """Gather sessions from each parser that reports itself available.

    A parser that raises is logged with its traceback and skipped, so
    one broken history source does not block the others.
    """
    collected = []
    for source in self._parsers:
        if source.is_available():
            try:
                batch = source.parse_sessions()
                collected.extend(batch)
                logger.info(
                    "Parsed %d sessions from %s",
                    len(batch), source.provider,
                )
            except Exception:
                logger.exception(
                    "Failed to parse %s history",
                    source.provider,
                )
        else:
            logger.debug(
                "%s not available, skipping",
                source.provider,
            )
    return collected


def get_report(self) -> dict | None:
    """Return the most recently stored report, or None if there is none."""
    latest = self._store.load_latest_report()
    return latest


def get_sessions(
    self,
    provider: str | None = None,
) -> list[dict]:
    """Return stored session records, optionally for a single provider."""
    records = self._store.load_sessions(provider=provider)
    return records


def available_providers(self) -> list[str]:
    """Names of the parsers whose CLI history is currently available."""
    names = []
    for source in self._parsers:
        if source.is_available():
            names.append(source.provider)
    return names
HistoryReport, SessionEntry SCHEMA = """ CREATE TABLE IF NOT EXISTS sessions ( id INTEGER PRIMARY KEY AUTOINCREMENT, session_id TEXT NOT NULL, provider TEXT NOT NULL, project TEXT NOT NULL, started_at TEXT NOT NULL, ended_at TEXT NOT NULL DEFAULT '', prompt_count INTEGER NOT NULL DEFAULT 0, tool_use_count INTEGER NOT NULL DEFAULT 0, models_used TEXT NOT NULL DEFAULT '[]', git_branch TEXT NOT NULL DEFAULT '', topics TEXT NOT NULL DEFAULT '[]', summary TEXT NOT NULL DEFAULT '', ingested_at TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS idx_hsess_provider ON sessions(provider); CREATE INDEX IF NOT EXISTS idx_hsess_project ON sessions(project); CREATE TABLE IF NOT EXISTS history_reports ( id INTEGER PRIMARY KEY AUTOINCREMENT, generated_at TEXT NOT NULL, payload TEXT NOT NULL ); """ @contextmanager def _connect(path: Path) -> Iterator[sqlite3.Connection]: path.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(str(path), timeout=30.0) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA busy_timeout=30000") conn.row_factory = sqlite3.Row try: yield conn finally: conn.close() class HistoryStore: """SQLite-backed session history storage.""" def __init__(self, db_path: Path): self._db_path = db_path with _connect(self._db_path) as conn: conn.executescript(SCHEMA) def save_sessions( self, sessions: list[SessionEntry], ) -> None: """Save parsed session entries.""" now = datetime.now(timezone.utc).isoformat() with _connect(self._db_path) as conn: for s in sessions: conn.execute( "INSERT INTO sessions " "(session_id, provider, project, " "started_at, ended_at, prompt_count, " "tool_use_count, models_used, " "git_branch, topics, summary, " "ingested_at) " "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", ( s.session_id, s.provider, s.project, s.started_at, s.ended_at, s.prompt_count, s.tool_use_count, json.dumps(s.models_used), s.git_branch, json.dumps(s.topics), s.summary, now, ), ) conn.commit() def load_sessions( self, provider: str | None = None, limit: int = 500, ) -> 
list[dict]: """Load stored session records.""" with _connect(self._db_path) as conn: if provider: rows = conn.execute( "SELECT * FROM sessions " "WHERE provider = ? " "ORDER BY started_at DESC " "LIMIT ?", (provider, limit), ).fetchall() else: rows = conn.execute( "SELECT * FROM sessions " "ORDER BY started_at DESC " "LIMIT ?", (limit,), ).fetchall() return [self._row_to_dict(r) for r in rows] def save_report(self, report: HistoryReport) -> None: """Save an analysis report.""" payload = json.dumps(asdict(report)) with _connect(self._db_path) as conn: conn.execute( "INSERT INTO history_reports " "(generated_at, payload) VALUES (?, ?)", (report.generated_at, payload), ) conn.commit() def load_latest_report(self) -> dict | None: """Load the most recent report.""" with _connect(self._db_path) as conn: row = conn.execute( "SELECT payload FROM history_reports " "ORDER BY id DESC LIMIT 1", ).fetchone() if not row: return None return json.loads(row["payload"]) def _row_to_dict(self, r: sqlite3.Row) -> dict: """Convert a session row to dict.""" return { "session_id": r["session_id"], "provider": r["provider"], "project": r["project"], "started_at": r["started_at"], "ended_at": r["ended_at"], "prompt_count": r["prompt_count"], "tool_use_count": r["tool_use_count"], "models_used": json.loads(r["models_used"]), "git_branch": r["git_branch"], "topics": json.loads(r["topics"]), "summary": r["summary"], } ================================================ FILE: maggy/maggy/improve/__init__.py ================================================ """Self-improvement — signal collection and analysis.""" ================================================ FILE: maggy/maggy/improve/analyzer.py ================================================ """Analyze collected signals and produce recommendations.""" from __future__ import annotations from .models import Recommendation, SignalBundle LOW_REWARD = 0.4 HIGH_FAILURE_RATE = 0.2 LOW_USAGE_RATE = 0.05 LOW_HEALTH = 0.5 HIGH_UTILIZATION = 0.9 def 
def analyze_routing(signals: SignalBundle) -> list[Recommendation]:
    """One warning per entry listed under ``routing["underperformers"]``."""
    return [
        Recommendation(
            category="routing",
            severity="warning",
            message=(
                f"Model {entry.get('model', '?')} underperforms on "
                f"{entry.get('task_type', '?')} "
                f"(avg reward {entry.get('avg_reward', 0):.2f})."
            ),
            suggestion="Consider routing to a different model.",
            data=entry,
        )
        for entry in signals.routing.get("underperformers", [])
    ]


def analyze_failures(signals: SignalBundle) -> list[Recommendation]:
    """Emit one action item when the failure rate reaches the threshold."""
    rate = signals.events.get("failure_rate", 0)
    findings: list[Recommendation] = []
    if not rate < HIGH_FAILURE_RATE:
        findings.append(Recommendation(
            category="reliability",
            severity="action",
            message=f"Execution failure rate is {rate:.0%}.",
            suggestion="Check tool configuration and logs.",
            data=signals.events,
        ))
    return findings


def analyze_usage(signals: SignalBundle) -> list[Recommendation]:
    """Report providers whose share of sessions is below LOW_USAGE_RATE.

    Returns nothing when the recorded session total is zero.
    """
    per_provider = signals.history.get("by_provider", {})
    session_total = signals.history.get("sessions", 0)
    if session_total == 0:
        return []

    shares = [
        (name, used / session_total)
        for name, used in per_provider.items()
    ]
    return [
        Recommendation(
            category="usage",
            severity="info",
            message=(
                f"{provider} used in only "
                f"{ratio:.0%} of sessions."
            ),
            suggestion="Consider removing or promoting it.",
            data={"provider": provider, "ratio": ratio},
        )
        for provider, ratio in shares
        if ratio < LOW_USAGE_RATE
    ]


def analyze_gaps(signals: SignalBundle) -> list[Recommendation]:
    """One action item per capability gap listed under ``forge["gaps"]``."""
    return [
        Recommendation(
            category="capability",
            severity="action",
            message=(
                f"Capability '{gap.get('name', '?')}' "
                f"requested {gap.get('count', 0)} times."
            ),
            suggestion="Consider building an MCP server.",
            data=gap,
        )
        for gap in signals.forge.get("gaps", [])
    ]


def analyze_memory(signals: SignalBundle) -> list[Recommendation]:
    """Warn when the engram health score is below LOW_HEALTH.

    A missing score is treated as 1.0, i.e. healthy.
    """
    score = signals.engram.get("health_score", 1.0)
    findings: list[Recommendation] = []
    if not score >= LOW_HEALTH:
        findings.append(Recommendation(
            category="memory",
            severity="warning",
            message=f"Memory health is {score:.2f}.",
            suggestion="Run engram cleanup or review superseded records.",
            data=signals.engram,
        ))
    return findings


def analyze_cost(signals: SignalBundle) -> list[Recommendation]:
    """Emit an action item when budget utilization reaches HIGH_UTILIZATION."""
    util = signals.budget.get("utilization", 0)
    findings: list[Recommendation] = []
    if not util < HIGH_UTILIZATION:
        findings.append(Recommendation(
            category="cost",
            severity="action",
            message=f"Budget utilization at {util:.0%}.",
            suggestion="Increase daily_limit_usd or optimize routing.",
            data=signals.budget,
        ))
    return findings


def analyze_all(signals: SignalBundle) -> list[Recommendation]:
    """Run every analyzer in a fixed order and concatenate the results."""
    analyzers = (
        analyze_routing,
        analyze_failures,
        analyze_usage,
        analyze_gaps,
        analyze_memory,
        analyze_cost,
    )
    return [rec for check in analyzers for rec in check(signals)]
class Introspector:
    """Collects signals, analyzes them, and persists the outcome."""

    def __init__(self, app_state) -> None:
        self._state = app_state
        self._last_report: ImprovementReport | None = None

    def analyze(self) -> ImprovementReport:
        """Run one collect, analyze, report, persist cycle.

        The resulting report is cached and also returned.
        """
        bundle = collect_all(self._state)
        report = self._build_report(bundle, analyze_all(bundle))
        self._persist(report)
        self._last_report = report
        return report

    def get_report(self) -> ImprovementReport | None:
        """Report from the latest ``analyze()`` call, or None before any."""
        return self._last_report

    def _build_report(self, signals, recs) -> ImprovementReport:
        """Assemble the report from a signal bundle and its recommendations."""
        sources = (
            signals.routing, signals.events,
            signals.history, signals.forge,
            signals.engram, signals.budget,
        )
        # "total_signals" counts the signal groups that are non-empty.
        populated = len([src for src in sources if src])

        # Keep the messages of the first three action-level items.
        urgent = []
        for rec in recs:
            if rec.severity == "action":
                urgent.append(rec.message)
                if len(urgent) == 3:
                    break

        health = self._health_summary(signals)
        stamp = datetime.now(timezone.utc).isoformat()
        return ImprovementReport(
            generated_at=stamp,
            total_signals=populated,
            recommendations=recs,
            health_summary=health,
            top_actions=urgent,
        )

    def _health_summary(self, s: SignalBundle) -> dict:
        """Per-area health scores; areas with no signals are omitted."""
        scores: dict = {}
        if s.routing:
            flagged = s.routing.get("underperformers", [])
            scores["routing"] = 0.5 if len(flagged) else 1.0
        if s.engram:
            scores["memory"] = s.engram.get("health_score", 1.0)
        if s.events:
            scores["reliability"] = round(
                1.0 - s.events.get("failure_rate", 0), 2
            )
        if s.budget:
            scores["cost"] = round(
                1.0 - s.budget.get("utilization", 0), 2
            )
        return scores

    def _persist(self, report: ImprovementReport) -> None:
        """Write the report to the engram store and event emitter.

        Each sink is used only if the app state has a truthy attribute
        for it.
        """
        sinks = (
            ("engram", self._write_engram),
            ("events", self._emit_mutations),
        )
        for attr, write in sinks:
            target = getattr(self._state, attr, None)
            if target:
                write(target, report)

    def _write_engram(self, engram, report) -> None:
        """Store a one-line summary record; failures are logged, not raised."""
        from maggy.engram.record import EngramRecord
        import uuid

        try:
            engram.write(EngramRecord(
                engram_id=uuid.uuid4().hex[:12],
                namespace="self-improvement",
                memory_type="fact",
                content=f"Report: {len(report.recommendations)} recs",
                tags=["auto-improve"],
            ))
        except Exception as exc:
            logger.warning("Failed to write engram: %s", exc)

    def _emit_mutations(self, events, report) -> None:
        """Emit an advisory MutationEvent per action-level recommendation.

        A failure on one event is logged and the rest are still attempted.
        """
        from maggy.event_spine.events import MutationEvent
        from maggy.event_spine.header import EventHeader

        actionable = (
            rec for rec in report.recommendations
            if rec.severity == "action"
        )
        for rec in actionable:
            try:
                events.emit(MutationEvent(
                    header=EventHeader(event_type="mutation"),
                    control_level="advisory",
                    target=rec.category,
                    old_value="",
                    new_value=rec.suggestion,
                    reason=rec.message,
                ))
            except Exception as exc:
                logger.warning("Failed to emit: %s", exc)
def collect_history(history) -> dict:
    """Summarise the latest history report returned by ``get_report()``.

    With no report, returns zero sessions and no patterns (and no
    ``by_provider`` key).
    """
    report = history.get_report()
    if report:
        return {
            "sessions": report.get("total_sessions", 0),
            "patterns": report.get("patterns", []),
            "by_provider": report.get("by_provider", {}),
        }
    return {"sessions": 0, "patterns": []}


def collect_forge(forge) -> dict:
    """Return the capability gaps from ``get_gaps()`` with their count."""
    found = forge.get_gaps()
    return {"gaps": found, "count": len(found)}


def collect_engram(engram) -> dict:
    """Run engram diagnostics and return the health figures as a dict."""
    from maggy.engram.diagnostics import diagnose

    profile = diagnose(engram)
    fields = (
        ("health_score", profile.health_score),
        ("total", profile.total_memories),
        ("active", profile.active_count),
        ("superseded", profile.superseded_count),
    )
    return dict(fields)


def collect_budget(budget) -> dict:
    """Return whatever ``budget.budget_status()`` reports, unchanged."""
    status = budget.budget_status()
    return status


def collect_all(app_state) -> SignalBundle:
    """Collect signals from every service that is set on ``app_state``.

    Services whose attribute is falsy are skipped and their bundle
    field keeps its default.
    """
    bundle = SignalBundle(
        collected_at=datetime.now(timezone.utc).isoformat(),
    )
    # Attribute on app_state / field on the bundle, paired with its collector.
    collectors = (
        ("routing", collect_routing),
        ("events", collect_events),
        ("history", collect_history),
        ("forge", collect_forge),
        ("engram", collect_engram),
        ("budget", collect_budget),
    )
    for attr, collect in collectors:
        service = getattr(app_state, attr)
        if service:
            setattr(bundle, attr, collect(service))
    return bundle
================================================ FILE: maggy/maggy/lexon/disambiguate.py ================================================ """Confidence-gated disambiguation for ambiguous intents.""" from __future__ import annotations from dataclasses import dataclass SELF_CLARIFY_THRESHOLD = 0.5 USER_CLARIFY_THRESHOLD = 0.3 @dataclass class DisambiguationResult: """Outcome of disambiguation attempt.""" resolved: bool tool: str = "" mode: str = "" # self_clarify | user_clarify | none suggestions: list[str] | None = None def disambiguate( confidence: float, candidates: list[str], ) -> DisambiguationResult: """Determine disambiguation strategy. >= 0.7: auto-resolve (no disambiguation needed) 0.5-0.7: self-clarify (use context to pick) 0.3-0.5: user-clarify (ask the user) < 0.3: reject (too ambiguous) """ if confidence >= 0.7 and candidates: return DisambiguationResult( resolved=True, tool=candidates[0], mode="none", ) if confidence >= SELF_CLARIFY_THRESHOLD and candidates: return DisambiguationResult( resolved=True, tool=candidates[0], mode="self_clarify", suggestions=candidates[:3], ) if confidence >= USER_CLARIFY_THRESHOLD and candidates: return DisambiguationResult( resolved=False, mode="user_clarify", suggestions=candidates[:5], ) return DisambiguationResult( resolved=False, mode="none", suggestions=candidates[:3] if candidates else None, ) ================================================ FILE: maggy/maggy/lexon/personalization.py ================================================ """Implicit learning — tracks 5 user behavior signals.""" from __future__ import annotations from collections import Counter from dataclasses import dataclass, field @dataclass class UserSignals: """Five implicit signals for personalization.""" tool_frequency: Counter = field( default_factory=Counter ) correction_pairs: list[tuple[str, str]] = field( default_factory=list ) preferred_aliases: dict[str, str] = field( default_factory=dict ) rejection_count: Counter = field( 
class PersonalizationEngine:
    """Accumulates implicit user-behaviour signals for intent parsing."""

    def __init__(self):
        self._signals = UserSignals()

    def record_use(self, tool: str) -> None:
        """Signal 1: bump the usage count for ``tool``."""
        usage = self._signals.tool_frequency
        usage[tool] = usage[tool] + 1

    def record_correction(
        self, wrong: str, correct: str,
    ) -> None:
        """Signal 2: remember that the user replaced ``wrong`` with ``correct``."""
        pair = (wrong, correct)
        self._signals.correction_pairs.append(pair)

    def record_alias(
        self, phrase: str, tool: str,
    ) -> None:
        """Signal 3: map the lower-cased phrase to the tool the user chose."""
        key = phrase.lower()
        self._signals.preferred_aliases[key] = tool

    def record_rejection(self, tool: str) -> None:
        """Signal 4: bump the count of rejected suggestions for ``tool``."""
        rejected = self._signals.rejection_count
        rejected[tool] = rejected[tool] + 1

    def get_preferred(self, phrase: str) -> str | None:
        """Tool previously recorded for this phrase (case-insensitive), if any."""
        key = phrase.lower()
        return self._signals.preferred_aliases.get(key)

    def top_tools(self, n: int = 5) -> list[str]:
        """Names of the ``n`` most frequently used tools, most used first."""
        ranked = self._signals.tool_frequency.most_common(n)
        names = []
        for name, _count in ranked:
            names.append(name)
        return names

    @property
    def signals(self) -> UserSignals:
        """The live signal container (not a copy)."""
        return self._signals
CONFIDENCE_THRESHOLD = 0.82
TOP2_GAP = 0.15

DEFAULT_TOOL_MANIFEST = {
    "deploy": ["vercel_deploy", "docker_push"],
    "test": ["pytest", "vitest", "jest"],
    "fix": ["code_edit", "patch"],
    "create": ["file_create", "scaffold"],
    "delete": ["file_delete", "cleanup"],
    "update": ["code_edit", "config_update"],
    "search": ["grep", "glob", "find"],
    "review": ["code_review", "pr_review"],
}


class LexonRouter:
    """Routes user phrases to tools using two tiers.

    Tier 1: Fast keyword/terminology lookup
    Tier 2: LLM-based intent classification (stub)
    """

    # Characters removed from both ends of a word when the raw word does
    # not resolve, so "deploy." or "(test)" still hit the keyword tier.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self, config: dict[str, object] | None = None):
        self._config = config or {}
        self._terms = TerminologyMap()
        self._personal = PersonalizationEngine()
        self._tool_map = self._load_tool_manifest()

    def route(self, phrase: str) -> LexonRecord:
        """Route a phrase to a tool.

        Order of precedence: a learned per-phrase preference (fixed
        confidence 0.95), then the tier-1 keyword lookup, then the
        tier-2 classifier stub.
        """
        preferred = self._personal.get_preferred(phrase)
        if preferred:
            return LexonRecord(
                phrase=phrase,
                resolved_tool=preferred,
                confidence=0.95,
                candidates=[preferred],
            )
        tier1 = self._route_tier1(phrase)
        if tier1:
            return tier1
        return self._llm_classify(phrase)

    def learn(self, phrase: str, tool: str) -> None:
        """Record a confirmed tool selection (usage count + phrase alias)."""
        self._personal.record_use(tool)
        self._personal.record_alias(phrase, tool)

    @property
    def terminology(self) -> TerminologyMap:
        return self._terms

    @property
    def personalization(self) -> PersonalizationEngine:
        return self._personal

    def _load_tool_manifest(self) -> dict[str, list[str]]:
        """Build the canonical-term -> candidate-tools map from config.

        Uses ``config["tool_manifest"]`` when it is a dict, keeping only
        entries whose value is a list. Falls back to the defaults when
        the setting is missing, not a dict, or has no usable entries.
        The result never shares list objects with
        ``DEFAULT_TOOL_MANIFEST``, so mutating a router's map cannot
        alter the module-level defaults.
        """
        manifest = self._config.get("tool_manifest", DEFAULT_TOOL_MANIFEST)
        loaded: dict[str, list[str]] = {}
        if isinstance(manifest, dict):
            loaded = {
                str(key): [str(item) for item in value]
                for key, value in manifest.items()
                if isinstance(value, list)
            }
        if loaded:
            return loaded
        return {
            key: list(tools)
            for key, tools in DEFAULT_TOOL_MANIFEST.items()
        }

    def _llm_classify(self, phrase: str) -> LexonRecord:
        """Tier-2 stub: unresolved record, confidence 0.55, mode "llm"."""
        return LexonRecord(
            phrase=phrase,
            confidence=0.55,
            disambiguation_mode="llm",
        )

    def _route_tier1(self, phrase: str) -> LexonRecord | None:
        """Match the first word that resolves to a term in the tool map.

        Each whitespace-separated word is tried as written and then,
        if that fails, with surrounding punctuation stripped. The raw
        form goes first so an alias that itself contains punctuation
        still matches. Returns None when no word matches.
        """
        for raw in phrase.lower().split():
            stripped = raw.strip(self._EDGE_PUNCTUATION)
            # dict.fromkeys de-duplicates while keeping raw-first order.
            for token in dict.fromkeys((raw, stripped)):
                canonical = self._terms.resolve(token)
                if canonical and canonical in self._tool_map:
                    return self._resolve_manifest_match(
                        phrase, self._tool_map[canonical],
                    )
        return None

    def _resolve_manifest_match(
        self,
        phrase: str,
        candidates: list[str],
    ) -> LexonRecord:
        """Turn a keyword hit into a record, or defer to tier 2.

        Defers when the keyword confidence is below
        CONFIDENCE_THRESHOLD or the top-two gap is below TOP2_GAP.
        """
        confidence = self._keyword_confidence(candidates)
        if confidence < CONFIDENCE_THRESHOLD:
            return self._llm_classify(phrase)
        if self._top2_gap(candidates) < TOP2_GAP:
            return self._llm_classify(phrase)
        result = disambiguate(confidence, candidates)
        return LexonRecord(
            phrase=phrase,
            resolved_tool=result.tool if result.resolved else "",
            confidence=confidence,
            candidates=candidates,
            disambiguation_mode=result.mode,
        )

    def _keyword_confidence(self, candidates: list[str]) -> float:
        """Fixed confidence by candidate count: 1 -> 0.9, 2 -> 0.84, else 0.8."""
        if len(candidates) == 1:
            return 0.9
        if len(candidates) == 2:
            return 0.84
        return 0.8

    def _top2_gap(self, candidates: list[str]) -> float:
        """Fixed top-two gap by candidate count: <=1 -> 1.0, 2 -> 0.18, else 0.1."""
        if len(candidates) <= 1:
            return 1.0
        if len(candidates) == 2:
            return 0.18
        return 0.1
Level 1: Canonical terms (e.g., "deploy") Level 2: Synonyms (e.g., "ship", "push", "release") Level 3: Project-specific aliases (learned over time) """ from __future__ import annotations from dataclasses import dataclass, field @dataclass class TermEntry: """A canonical term with synonyms.""" canonical: str synonyms: list[str] = field(default_factory=list) aliases: list[str] = field(default_factory=list) DEFAULT_TERMS: list[TermEntry] = [ TermEntry("deploy", ["ship", "push", "release", "publish"]), TermEntry("test", ["check", "verify", "validate", "qa"]), TermEntry("fix", ["repair", "patch", "resolve", "debug"]), TermEntry("create", ["add", "build", "make", "generate"]), TermEntry("delete", ["remove", "drop", "destroy", "clean"]), TermEntry("update", ["modify", "change", "edit", "revise"]), TermEntry("search", ["find", "lookup", "query", "locate"]), TermEntry("review", ["inspect", "audit", "examine", "check"]), ] class TerminologyMap: """Three-level terminology resolution.""" def __init__( self, terms: list[TermEntry] | None = None, ): # Deep copy to avoid mutating module-level defaults if terms is not None: self._terms = terms else: self._terms = [ TermEntry( t.canonical, list(t.synonyms), list(t.aliases), ) for t in DEFAULT_TERMS ] self._index = self._build_index() def _build_index(self) -> dict[str, str]: idx: dict[str, str] = {} for t in self._terms: idx[t.canonical] = t.canonical for s in t.synonyms: idx[s] = t.canonical for a in t.aliases: idx[a] = t.canonical return idx def resolve(self, word: str) -> str | None: """Resolve a word to its canonical form.""" return self._index.get(word.lower()) def add_alias(self, canonical: str, alias: str) -> bool: """Add a project-specific alias (Level 3).""" for t in self._terms: if t.canonical == canonical: t.aliases.append(alias.lower()) self._index[alias.lower()] = canonical return True return False def list_terms(self) -> list[TermEntry]: return list(self._terms) ================================================ FILE: 
maggy/maggy/main.py ================================================ """Maggy FastAPI app entrypoint.""" from __future__ import annotations import logging from contextlib import asynccontextmanager from pathlib import Path from fastapi import FastAPI from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles from starlette.middleware.base import ( BaseHTTPMiddleware, RequestResponseEndpoint, ) from starlette.requests import Request from starlette.responses import Response from maggy import config as config_mod from maggy import providers from maggy.api.routes import router as api_router from maggy.api.routes_budget import router as budget_router from maggy.api.routes_cikg import router as cikg_router from maggy.api.routes_deploy import router as deploy_router from maggy.api.routes_engram import router as engram_router from maggy.api.routes_events import router as events_router from maggy.api.routes_forge import router as forge_router from maggy.api.routes_heartbeat import router as heartbeat_router from maggy.api.routes_history import router as history_router from maggy.api.routes_improve import router as improve_router from maggy.api.routes_lexon import router as lexon_router from maggy.api.routes_mesh import router as mesh_router from maggy.api.routes_mesh_admin import router as mesh_admin_router from maggy.api.routes_planning import router as planning_router from maggy.api.routes_process import router as process_router from maggy.api.routes_routing import router as routing_router from maggy.api.routes_chat import router as chat_router from maggy.api.routes_escalation import router as escalation_router from maggy.api.routes_observability import router as observability_router from maggy.api.routes_projects import router as projects_router from maggy.api.routes_setup import router as setup_router from maggy.api.routes_users import router as users_router from maggy.mesh.ws_server import router as ws_mesh_router from maggy.budget import 
BudgetManager from maggy.event_spine.emitter import EventEmitter from maggy.event_spine.store import EventStore from maggy.history.service import HistoryService from maggy.process.service import ProcessService from maggy.routing import RoutingService from maggy.services.competitor import CompetitorService from maggy.services.executor import ExecutorService from maggy.services.inbox import InboxService logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s") logger = logging.getLogger("maggy") _TIER1_ATTRS = ("budget", "routing", "events", "cikg", "planning", "deploy", "forge", "engram", "lexon", "mesh", "activity", "registry", "escalator", "observability") _TIER2_ATTRS = ("provider", "inbox", "competitors", "executor", "process") def _init_tier1(app: FastAPI, cfg) -> None: """Tier 1: local-only services.""" db_dir = Path(cfg.storage.path).expanduser().parent app.state.budget = BudgetManager(cfg) app.state.routing = RoutingService(cfg) app.state.events = EventEmitter(EventStore(db_dir / "events.db")) from maggy.cikg.graph import KnowledgeGraphService app.state.cikg = KnowledgeGraphService(db_dir / "cikg.db") from maggy.planning import PlanningService app.state.planning = PlanningService(cfg) from maggy.deploy import DeployService app.state.deploy = DeployService() from maggy.forge.connector import ForgeConnector app.state.forge = ForgeConnector() from maggy.engram.store import EngramStore app.state.engram = EngramStore(db_dir / "engram.db") from maggy.engram.seed import seed_if_empty seed_if_empty(app.state.engram) from maggy.lexon.router import LexonRouter app.state.lexon = LexonRouter() _init_mesh(app, cfg) from maggy.services.activity import ActivityService app.state.activity = ActivityService() app.state.history = HistoryService(db_path=db_dir / "history.db") from maggy.improve.service import Introspector app.state.introspector = Introspector(app.state) from maggy.services.chat import ChatManager app.state.chat = 
def _init_mesh(app: FastAPI, cfg) -> None:
    """Attach a MeshManager to ``app.state.mesh``, or None when disabled.

    Mesh needs both ``cfg.mesh.enabled`` and a non-empty
    ``cfg.mesh.org_key_secret``. If it is enabled without a secret, a
    warning is logged and mesh is left off.
    """
    mesh_cfg = cfg.mesh
    if not (mesh_cfg.enabled and mesh_cfg.org_key_secret):
        # Reaching here with enabled=True means the secret is missing.
        if mesh_cfg.enabled:
            logger.warning("Mesh disabled: MAGGY_MESH_SECRET not set")
        app.state.mesh = None
        return

    from maggy.mesh.manager import MeshManager
    from maggy.mesh.org_scanner import effective_orgs
    from maggy.mesh.store import MeshStore

    storage_root = Path(cfg.storage.path).expanduser().parent
    manager = MeshManager(cfg.mesh, MeshStore(storage_root / "mesh.db"))
    orgs = effective_orgs(cfg.mesh.orgs, [], cfg.mesh.exclude_orgs)
    for org in orgs:
        manager.add_network(org)
    app.state.mesh = manager


def _set_mode(app: FastAPI, cfg) -> None:
    """Start the Tier 2 services, or null them out without credentials.

    Sets ``app.state.mode`` to "full" when provider credentials are
    present and to "local" otherwise.
    """
    if not config_mod._has_provider_credentials(cfg):
        for name in _TIER2_ATTRS:
            setattr(app.state, name, None)
        app.state.mode = "local"
        return

    provider = providers.build(cfg)
    app.state.provider = provider
    app.state.inbox = InboxService(cfg, app.state.provider)
    app.state.competitors = CompetitorService(cfg)
    app.state.executor = ExecutorService(cfg, app.state.provider)
    app.state.process = ProcessService(cfg)
    app.state.mode = "full"
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown lifecycle.

    Starts the heartbeat scheduler and seeds services, then yields
    while the app runs. The scheduler is stopped in a ``finally`` block
    so it is shut down even if bootstrap raises or the app exits with
    an error.
    """
    await _start_heartbeat(app)
    try:
        await _bootstrap(app)
        yield
    finally:
        heartbeat = getattr(app.state, "heartbeat", None)
        if heartbeat:
            await heartbeat.stop()


async def _bootstrap(app: FastAPI) -> None:
    """Seed services with data on first startup.

    Each step runs only if its service is set on ``app.state``. A
    failure in one step is logged as a warning and the remaining steps
    still run.
    """
    history = getattr(app.state, "history", None)
    if history:
        try:
            history.analyze()
        except Exception as e:
            logger.warning("Bootstrap history failed: %s", e)

    introspector = getattr(app.state, "introspector", None)
    if introspector:
        try:
            introspector.analyze()
        except Exception as e:
            logger.warning("Bootstrap improve failed: %s", e)

    cikg = getattr(app.state, "cikg", None)
    cfg = getattr(app.state, "cfg", None)
    if cikg and cfg:
        try:
            _seed_cikg(cikg, cfg)
        except Exception as e:
            logger.warning("Bootstrap CIKG failed: %s", e)


def _seed_cikg(cikg, cfg) -> None:
    """Build initial knowledge graph from configured codebases.

    Adds one ``codebase`` node per configured codebase whose path
    exists, then its language nodes. Codebases with a missing path are
    skipped.
    """
    from datetime import datetime, timezone
    from maggy.cikg.models import Node

    # One timestamp is shared by every node created in this pass.
    now = datetime.now(timezone.utc).isoformat()
    for cb in cfg.codebases:
        path = Path(cb.path).expanduser()
        if not path.exists():
            continue
        cikg.add_node(Node(
            id=f"codebase:{cb.key}",
            node_type="codebase",
            name=cb.key,
            description=str(path),
            metadata={"path": str(path)},
            created_at=now,
        ))
        _add_language_nodes(cikg, cb.key, path, now)
class _NoCacheStatic(BaseHTTPMiddleware):
    """Middleware that marks responses for paths starting with /static as uncacheable."""

    async def dispatch(
        self,
        request: Request,
        call_next: RequestResponseEndpoint,
    ) -> Response:
        """Run the downstream handler, then stamp ``Cache-Control: no-store`` on static paths."""
        downstream = await call_next(request)
        if not request.url.path.startswith("/static"):
            return downstream
        downstream.headers["Cache-Control"] = "no-store"
        return downstream
RuntimeError( f"dashboard.auth_mode=\"local\" is only safe on loopback. " f"You configured host={cfg.dashboard.host!r} — set auth_mode=\"token\" and MAGGY_API_KEY, " f"or bind to 127.0.0.1." ) app = FastAPI(title="Maggy", version="0.1.0", lifespan=lifespan) app.add_middleware(_NoCacheStatic) app.state.cfg = cfg app.state.configured = config_mod.is_configured() if app.state.configured: _init_tier1(app, cfg) else: for attr in _TIER1_ATTRS: setattr(app.state, attr, None) from maggy.services.activity import ActivityService app.state.activity = ActivityService() app.state.history = HistoryService() app.state.introspector = None from maggy.services.chat import ChatManager app.state.chat = ChatManager(cfg) _set_mode(app, cfg) logger.info("Maggy ready (%s) — codebases=%d", app.state.mode, len(cfg.codebases)) for r in _ROUTERS: app.include_router(r) static_dir = Path(__file__).parent / "static" if static_dir.exists(): app.mount("/static", StaticFiles(directory=str(static_dir)), name="static") @app.get("/") async def index(): return FileResponse( str(static_dir / "index.html"), headers={"Cache-Control": "no-store"}, ) return app def reconfigure(app: FastAPI) -> None: """Reload config and reinitialize services.""" cfg = config_mod.load(refresh=True) app.state.cfg = cfg app.state.configured = config_mod.is_configured() if app.state.configured: _init_tier1(app, cfg) _set_mode(app, cfg) logger.info("Reconfigured — mode=%s", app.state.mode) app = create_app() def _print_banner(host: str, port: int) -> None: """Print startup banner with usage instructions.""" url = f"http://{host}:{port}" print("\n\033[1;38;5;208m Maggy\033[0m") print(f" Dashboard: \033[4m{url}\033[0m") print() print( " \033[33mKeep this terminal open\033[0m" " — Maggy runs here." ) print( " Use other terminals for Claude Code" " sessions." ) print( " Maggy Chat auto-connects to all" " active sessions." 
) print( "\n Press Ctrl+C to stop.\n" ) def main() -> None: """Console script entrypoint.""" import uvicorn cfg = config_mod.load() _print_banner(cfg.dashboard.host, cfg.dashboard.port) uvicorn.run( "maggy.main:app", host=cfg.dashboard.host, port=cfg.dashboard.port, reload=False, ) if __name__ == "__main__": main() ================================================ FILE: maggy/maggy/mesh/__init__.py ================================================ """Maggy Mesh — P2P memory sharing between instances.""" ================================================ FILE: maggy/maggy/mesh/discovery.py ================================================ """Peer discovery — registry with optional SQLite backing.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import datetime, timezone @dataclass class PeerInfo: """Known mesh peer.""" peer_id: str name: str address: str port: int = 8080 org: str = "" last_seen: str = field( default_factory=lambda: datetime.now( timezone.utc ).isoformat() ) manual: bool = False class PeerRegistry: """Registry of known mesh peers.""" def __init__(self, store=None, org: str = ""): self._store = store self._org = org self._peers: dict[str, PeerInfo] = {} if store and org: self._load_from_store() def _load_from_store(self) -> None: for row in self._store.list_peers(self._org): self._peers[row["peer_id"]] = PeerInfo( peer_id=row["peer_id"], name=row["name"], address=row["address"], port=row["port"], org=row.get("org", self._org), last_seen=row.get("last_seen", ""), manual=bool(row.get("manual", 0)), ) def register(self, peer: PeerInfo) -> None: if self._store and self._org: self._store.upsert_peer( peer.peer_id, peer.name, peer.address, peer.port, self._org, ) self._peers[peer.peer_id] = peer def unregister(self, peer_id: str) -> bool: if self._store and self._org: self._store.remove_peer(peer_id, self._org) if peer_id in self._peers: del self._peers[peer_id] return True return False def get(self, peer_id: str) -> 
async def ensure_mesh_repo(
    org: str,
    token: str,
    private: bool = True,
) -> bool:
    """Make sure the ``{org}/maggy-mesh`` repository exists.

    Returns True when the repository lookup answers 200 or when the
    create request answers 200/201; otherwise logs a warning and
    returns False.
    """
    lookup_url = f"{GITHUB_API}/repos/{org}/{REPO_NAME}"
    create_url = f"{GITHUB_API}/orgs/{org}/repos"
    new_repo = {
        "name": REPO_NAME,
        "private": private,
        "description": "Maggy mesh peer discovery",
        "auto_init": True,
    }

    async with httpx.AsyncClient(
        timeout=TIMEOUT, headers=_headers(token),
    ) as github:
        lookup = await github.get(lookup_url)
        if lookup.status_code == 200:
            return True
        # Any non-200 lookup falls through to a create attempt.
        created = await github.post(create_url, json=new_repo)

    status = created.status_code
    if status not in (200, 201):
        logger.warning(
            "Failed to create %s/%s: %s", org, REPO_NAME, status,
        )
        return False
    logger.info("Created %s/%s", org, REPO_NAME)
    return True
async def read_peers(
    org: str,
    token: str,
) -> list[dict]:
    """Read all peer announcements from {org}/maggy-mesh.

    Lists ``peers/`` through the GitHub Contents API and decodes every
    ``*.json`` entry. Directory listings describe each file but do not
    embed its base64 ``content``, so any entry that arrives without
    content is fetched individually before decoding.

    Returns an empty list when the listing request is not a 200 or its
    body is not a list. Entries that cannot be fetched or decoded are
    skipped.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import header.
    from urllib.parse import quote

    listing_url = f"{GITHUB_API}/repos/{org}/{REPO_NAME}/contents/peers"
    peers: list[dict] = []
    async with httpx.AsyncClient(
        timeout=TIMEOUT,
        headers=_headers(token),
    ) as client:
        resp = await client.get(listing_url)
        if resp.status_code != 200:
            return []
        items = resp.json()
        if not isinstance(items, list):
            return []
        for item in items:
            if not isinstance(item, dict):
                continue
            name = item.get("name", "")
            if not isinstance(name, str) or not name.endswith(".json"):
                continue
            peer = _decode_peer(item)
            if peer is None and not item.get("content"):
                # Listing entry without inline content: fetch the file.
                file_resp = await client.get(
                    f"{listing_url}/{quote(name)}",
                )
                if file_resp.status_code == 200:
                    body = file_resp.json()
                    if isinstance(body, dict):
                        peer = _decode_peer(body)
            if peer:
                peers.append(peer)
    return peers


def _decode_peer(item: dict) -> dict | None:
    """Decode the base64 JSON ``content`` of a Contents API entry.

    Returns the decoded JSON object, or None when ``content`` is missing
    or empty, is not valid base64/JSON, or decodes to something other
    than a JSON object (callers read fields with ``.get``).
    """
    raw_content = item.get("content")
    if not raw_content:
        return None
    try:
        decoded = json.loads(base64.b64decode(raw_content))
    except (ValueError, TypeError, RecursionError):
        # ValueError covers bad base64, bad UTF-8 and bad JSON.
        return None
    return decoded if isinstance(decoded, dict) else None
f"/contents/{path}", ) if resp.status_code != 200: return False sha = resp.json().get("sha", "") resp = await client.delete( f"{GITHUB_API}/repos/{org}/{REPO_NAME}" f"/contents/{path}", json={ "message": f"remove {peer_id}", "sha": sha, }, ) return resp.status_code == 200 ================================================ FILE: maggy/maggy/mesh/manager.py ================================================ """MeshManager — orchestrates multiple org networks.""" from __future__ import annotations import logging import platform from .discovery import PeerInfo from .git_discovery import ( Announcement, announce, ensure_mesh_repo, read_peers, ) from .network import Network, build_network from .store import MeshStore logger = logging.getLogger(__name__) class MeshManager: """Manages all org-scoped mesh networks.""" def __init__(self, cfg, store: MeshStore) -> None: self._cfg = cfg self._store = store self._networks: dict[str, Network] = {} def add_network(self, org: str) -> Network: net = build_network( org, self._cfg.org_key_secret, self._store, ) self._networks[org] = net return net def get_network(self, org: str) -> Network | None: return self._networks.get(org) def list_networks(self) -> list[dict]: return [n.status() for n in self._networks.values()] @property def total_peers(self) -> int: return sum( n.peers.count for n in self._networks.values() ) async def discover(self, token: str) -> dict: """Read peers from git for all networks.""" result: dict[str, int] = {} for org, net in self._networks.items(): if not self._cfg.git_discovery: continue peers = await read_peers(org, token) for p in peers: pid = p.get("peer_id", "") if pid == self._cfg.peer_id: continue net.peers.register(PeerInfo( peer_id=pid, name=p.get("name", ""), address=p.get("address", ""), port=p.get("port", 8080), org=org, )) result[org] = len(peers) return result async def announce_all(self, token: str) -> dict: """Announce self to all org mesh repos.""" address = self._resolve_address() result: 
dict[str, bool] = {} for org in self._networks: ann = Announcement( peer_id=self._cfg.peer_id, name=platform.node(), address=address, port=self._cfg.port, org=org, ) ok = await announce(org, ann, token) result[org] = ok return result async def setup_repos(self, token: str) -> dict: """Create mesh repos for all networks.""" result: dict[str, bool] = {} for org in self._networks: ok = await ensure_mesh_repo(org, token) result[org] = ok return result def _resolve_address(self) -> str: if self._cfg.tunnel_url: return self._cfg.tunnel_url return f"ws://127.0.0.1:{self._cfg.port}" ================================================ FILE: maggy/maggy/mesh/memory.py ================================================ """Typed memory categories for Mesh sharing.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import datetime, timezone from enum import Enum class MemoryType(str, Enum): SCORE = "score" PATTERN = "pattern" POLICY = "policy" GAP = "gap" @dataclass class SharedMemory: """A unit of shared memory in the Mesh.""" key: str memory_type: str content: dict = field(default_factory=dict) source_peer: str = "" confidence: float = 1.0 created_at: str = field( default_factory=lambda: datetime.now( timezone.utc ).isoformat() ) @property def is_trusted(self) -> bool: return self.confidence >= 0.5 ================================================ FILE: maggy/maggy/mesh/network.py ================================================ """Network — one isolated mesh per GitHub org.""" from __future__ import annotations import logging from dataclasses import dataclass from .discovery import PeerRegistry from .quarantine import QuarantineStore from .store import MeshStore from .sync import SyncEngine from .transport import derive_org_key logger = logging.getLogger(__name__) @dataclass class Network: """A single org-scoped mesh network.""" org: str org_key: str peers: PeerRegistry sync: SyncEngine quarantine: QuarantineStore def status(self) -> dict: 
def effective_orgs(
    scanned: list[str],
    manual: list[str],
    excluded: list[str],
) -> list[str]:
    """Return the sorted union of scanned and manual orgs, minus the excluded ones."""
    blocked = set(excluded)
    candidates = {*scanned, *manual}
    return sorted(org for org in candidates if org not in blocked)
DECAY_PER_HOP = 0.1
MIN_CONFIDENCE = 0.1


def _utc_now_iso() -> str:
    """Current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class Provenance:
    """Origin and hop count of a shared item, used to derive its confidence."""

    origin_peer: str
    hops: int = 0
    base_confidence: float = 1.0
    received_at: str = field(default_factory=_utc_now_iso)

    @property
    def effective_confidence(self) -> float:
        """Base confidence reduced by DECAY_PER_HOP per hop, floored at MIN_CONFIDENCE."""
        remaining = self.base_confidence - (self.hops * DECAY_PER_HOP)
        return max(remaining, MIN_CONFIDENCE)

    def add_hop(self) -> Provenance:
        """Return a new record one hop further away.

        Origin and base confidence carry over; ``received_at`` is not
        copied, so the new record gets a fresh timestamp.
        """
        return Provenance(
            self.origin_peer,
            self.hops + 1,
            self.base_confidence,
        )
def collect_policies(introspector, peer_id: str) -> list[SharedMemory]:
    """Turn "action"-severity recommendations into shareable policy memories.

    Returns an empty list when there is no introspector or it has no
    report.
    """
    report = introspector.get_report() if introspector else None
    if not report:
        return []
    return [
        SharedMemory(
            key=f"policy:{item.category}",
            memory_type="policy",
            content={
                "message": item.message,
                "suggestion": item.suggestion,
            },
            source_peer=peer_id,
        )
        for item in report.recommendations
        if item.severity == "action"
    ]
"" quarantined_at: str = field( default_factory=lambda: datetime.now( timezone.utc ).isoformat() ) class QuarantineStore: """Manages quarantined data from mesh peers.""" def __init__(self, store=None, org: str = ""): self._entries: dict[str, QuarantineEntry] = {} self._store = store self._org = org if store and org: self._load_from_store() def _load_from_store(self) -> None: for row in self._store.list_quarantined(self._org): self._entries[row["key"]] = QuarantineEntry( key=row["key"], source_peer=row["source_peer"], reason=row["reason"], content=row.get("content", {}), memory_type=row.get("memory_type", ""), ) def quarantine( self, key: str, source: str, reason: str, content: dict, memory_type: str = "", ) -> QuarantineEntry: entry = QuarantineEntry( key=key, source_peer=source, reason=reason, content=content, memory_type=memory_type, ) self._entries[key] = entry if self._store and self._org: self._store.quarantine_item( self._org, key, source, reason, content, ) return entry def get(self, key: str) -> QuarantineEntry | None: return self._entries.get(key) def list_all(self) -> list[QuarantineEntry]: return list(self._entries.values()) def promote(self, key: str) -> QuarantineEntry | None: """Remove from quarantine and return entry for acceptance.""" entry = self._entries.pop(key, None) if self._store and self._org: self._store.promote_item(self._org, key) return entry def reject(self, key: str) -> bool: """Permanently reject quarantined item.""" if key in self._entries: del self._entries[key] if self._store and self._org: self._store.promote_item(self._org, key) return True return key is not None @property def count(self) -> int: return len(self._entries) ================================================ FILE: maggy/maggy/mesh/store.py ================================================ """SQLite backing for mesh peers, memories, and quarantine.""" from __future__ import annotations import json import sqlite3 import threading from datetime import datetime, timezone 
from pathlib import Path

SCHEMA = """
CREATE TABLE IF NOT EXISTS peers (
    peer_id TEXT NOT NULL,
    name TEXT NOT NULL,
    address TEXT NOT NULL,
    port INTEGER NOT NULL DEFAULT 8080,
    org TEXT NOT NULL,
    last_seen TEXT NOT NULL,
    manual INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (peer_id, org)
);
CREATE TABLE IF NOT EXISTS shared_memories (
    key TEXT NOT NULL,
    org TEXT NOT NULL,
    memory_type TEXT NOT NULL,
    content TEXT NOT NULL,
    source_peer TEXT NOT NULL,
    confidence REAL NOT NULL DEFAULT 1.0,
    created_at TEXT NOT NULL,
    PRIMARY KEY (key, org)
);
CREATE TABLE IF NOT EXISTS quarantine (
    key TEXT NOT NULL,
    org TEXT NOT NULL,
    source_peer TEXT NOT NULL,
    reason TEXT NOT NULL,
    content TEXT NOT NULL,
    quarantined_at TEXT NOT NULL,
    PRIMARY KEY (key, org)
);
"""


def _now() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


class MeshStore:
    """SQLite-backed mesh storage with connection reuse.

    One connection is opened with ``check_same_thread=False`` and every
    statement runs under a single lock. All rows are scoped by ``org``.
    """

    def __init__(self, db_path: Path) -> None:
        """Open (creating if needed) the database at ``db_path`` and apply SCHEMA."""
        self._db = db_path
        self._lock = threading.Lock()
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(
            str(db_path), timeout=30.0,
            check_same_thread=False,
        )
        self._conn.execute("PRAGMA journal_mode=WAL")
        self._conn.execute("PRAGMA busy_timeout=30000")
        self._conn.row_factory = sqlite3.Row
        self._conn.executescript(SCHEMA)

    # ── Peers ──────────────────────────────────────────

    def upsert_peer(
        self, peer_id: str, name: str,
        address: str, port: int, org: str,
        manual: bool | None = None,
    ) -> None:
        """Insert or refresh a peer row, stamping ``last_seen`` with now.

        ``manual=None`` (the default) keeps the flag already stored for
        this peer, or 0 for a new row, so a routine refresh does not
        clear it. Pass True/False to set it explicitly.
        """
        with self._lock:
            if manual is None:
                existing = self._conn.execute(
                    "SELECT manual FROM peers "
                    "WHERE peer_id=? AND org=?",
                    (peer_id, org),
                ).fetchone()
                flag = int(existing["manual"]) if existing else 0
            else:
                flag = int(bool(manual))
            self._conn.execute(
                "INSERT OR REPLACE INTO peers "
                "(peer_id, name, address, port, org, last_seen, manual) "
                "VALUES (?,?,?,?,?,?,?)",
                (peer_id, name, address, port, org, _now(), flag),
            )
            self._conn.commit()

    def get_peer(
        self, peer_id: str, org: str,
    ) -> dict | None:
        """Return the peer row as a dict, or None when absent."""
        with self._lock:
            row = self._conn.execute(
                "SELECT * FROM peers "
                "WHERE peer_id=? AND org=?",
                (peer_id, org),
            ).fetchone()
        return dict(row) if row else None

    def list_peers(
        self, org: str | None = None,
    ) -> list[dict]:
        """Return peers for ``org``, or all peers when ``org`` is falsy."""
        with self._lock:
            if org:
                rows = self._conn.execute(
                    "SELECT * FROM peers WHERE org=?",
                    (org,),
                ).fetchall()
            else:
                rows = self._conn.execute(
                    "SELECT * FROM peers",
                ).fetchall()
        return [dict(r) for r in rows]

    def remove_peer(
        self, peer_id: str, org: str,
    ) -> bool:
        """Delete a peer row; True when a row was deleted."""
        with self._lock:
            cur = self._conn.execute(
                "DELETE FROM peers "
                "WHERE peer_id=? AND org=?",
                (peer_id, org),
            )
            self._conn.commit()
        return cur.rowcount > 0

    # ── Memories ───────────────────────────────────────

    def write_memory(
        self, org: str, key: str,
        memory_type: str, content: dict,
        source_peer: str, confidence: float = 1.0,
    ) -> None:
        """Insert or replace a shared memory; ``content`` is stored as JSON text."""
        with self._lock:
            self._conn.execute(
                "INSERT OR REPLACE INTO shared_memories "
                "(key, org, memory_type, content, "
                "source_peer, confidence, created_at) "
                "VALUES (?,?,?,?,?,?,?)",
                (key, org, memory_type,
                 json.dumps(content), source_peer,
                 confidence, _now()),
            )
            self._conn.commit()

    def list_memories(self, org: str) -> list[dict]:
        """Return shared memories for ``org`` with ``content`` parsed from JSON."""
        with self._lock:
            rows = self._conn.execute(
                "SELECT * FROM shared_memories WHERE org=?",
                (org,),
            ).fetchall()
        return [
            {**dict(r), "content": json.loads(r["content"])}
            for r in rows
        ]

    # ── Quarantine ─────────────────────────────────────

    def quarantine_item(
        self, org: str, key: str,
        source: str, reason: str,
        content: dict,
    ) -> None:
        """Insert or replace a quarantine row; ``content`` is stored as JSON text."""
        with self._lock:
            self._conn.execute(
                "INSERT OR REPLACE INTO quarantine "
                "(key, org, source_peer, reason, "
                "content, quarantined_at) "
                "VALUES (?,?,?,?,?,?)",
                (key, org, source, reason,
                 json.dumps(content), _now()),
            )
            self._conn.commit()

    def promote_item(
        self, org: str, key: str,
    ) -> bool:
        """Delete a quarantine row; True when a row was deleted."""
        with self._lock:
            cur = self._conn.execute(
                "DELETE FROM quarantine "
                "WHERE key=? AND org=?",
                (key, org),
            )
            self._conn.commit()
        return cur.rowcount > 0

    def list_quarantined(self, org: str) -> list[dict]:
        """Return quarantine rows for ``org`` with ``content`` parsed from JSON."""
        with self._lock:
            rows = self._conn.execute(
                "SELECT * FROM quarantine WHERE org=?",
                (org,),
            ).fetchall()
        return [
            {**dict(r), "content": json.loads(r["content"])}
            for r in rows
        ]

    def close(self) -> None:
        """Close the database connection."""
        # Take the lock so close cannot interleave with a statement
        # running on another thread.
        with self._lock:
            self._conn.close()
def verify_message(
    raw: str, org_key: str,
) -> MeshMessage | None:
    """Verify signature and timestamp, then deserialize.

    ``raw`` is the JSON envelope ``{"payload", "sig", "ts"}`` built by
    ``sign_message``. Returns the deserialized message, or None when the
    envelope is malformed, the timestamp is missing, non-numeric or
    outside the MAX_AGE_SECONDS window, the signature does not match,
    or the payload cannot be deserialized. Never raises on bad input,
    because ``raw`` comes straight off the wire.
    """
    try:
        envelope = json.loads(raw)
    except (ValueError, TypeError):
        return None
    if not isinstance(envelope, dict):
        return None

    payload = envelope.get("payload")
    sig = envelope.get("sig")
    ts = envelope.get("ts", 0)
    if not isinstance(payload, str) or not isinstance(sig, str):
        return None
    # hmac.compare_digest raises TypeError on non-ASCII str input.
    if not sig.isascii():
        return None
    if isinstance(ts, bool) or not isinstance(ts, (int, float)):
        return None

    try:
        age = abs(time.time() - ts)
    except OverflowError:
        # JSON integers too large to convert to float.
        return None
    # Written as ``not <=`` so that NaN is rejected too.
    if not age <= MAX_AGE_SECONDS:
        logger.debug("Rejected stale message (age=%.0fs)", age)
        return None

    if not verify_hmac(f"{payload}:{ts}", org_key, sig):
        return None
    try:
        return MeshMessage.deserialize(payload)
    except (ValueError, TypeError):
        # Authentic envelope whose payload is not a valid message.
        logger.debug("Rejected undecodable message payload")
        return None
@router.websocket("/ws/mesh")
async def mesh_ws(websocket: WebSocket) -> None:
    """WebSocket endpoint for mesh peers.

    Accepts the socket, closes it with code 1008 when no mesh manager is
    attached to the app state, and otherwise hands the connection to
    ``_handle_connection``. Disconnects and timeouts are logged at debug
    level; any other error is logged as a warning.
    """
    await websocket.accept()

    mesh = getattr(websocket.app.state, "mesh", None)
    if mesh:
        try:
            await _handle_connection(websocket, mesh)
        except WebSocketDisconnect:
            logger.debug("Mesh peer disconnected")
        except asyncio.TimeoutError:
            logger.debug("Mesh peer timed out")
        except Exception as err:
            logger.warning("Mesh WS error: %s", err)
    else:
        await websocket.close(code=1008, reason="Mesh not enabled")
timeout=HELLO_TIMEOUT, ) org, msg = _authenticate(raw, manager) if not msg or not org: await websocket.close(code=1008, reason="Auth failed") return net = manager.get_network(org) if not net: await websocket.close(code=1008, reason="Unknown org") return peers = [ {"peer_id": p.peer_id, "address": p.address, "port": p.port} for p in net.peers.list_peers() ] reply = create_hello(manager._cfg.peer_id, "server") reply.payload["peers"] = peers signed = sign_message(reply, net.org_key) await websocket.send_text(signed) await _message_loop(websocket, net) async def _message_loop(websocket, net) -> None: """Rate-limited message receive loop.""" invalid_count = 0 while True: data = await asyncio.wait_for( websocket.receive_text(), timeout=MSG_TIMEOUT, ) incoming = verify_message(data, net.org_key) if not incoming: invalid_count += 1 if invalid_count >= MAX_INVALID: logger.warning("Too many invalid messages") break continue invalid_count = 0 await _dispatch(incoming, net) def _authenticate( raw: str, manager, ) -> tuple[str | None, MeshMessage | None]: """Try to authenticate a HELLO message.""" try: envelope = json.loads(raw) payload_str = envelope.get("payload", "") msg = MeshMessage.deserialize(payload_str) org = msg.payload.get("org", "") except (json.JSONDecodeError, KeyError, TypeError): return None, None if msg.msg_type != MessageType.HELLO: return None, None net = manager.get_network(org) if not net: return None, None verified = verify_message(raw, net.org_key) if not verified: return None, None return org, verified async def _dispatch(msg: MeshMessage, net) -> None: """Handle incoming message by type.""" if msg.msg_type == MessageType.SHARE: from .memory import SharedMemory mem = SharedMemory( key=msg.payload.get("key", ""), memory_type=msg.payload.get("memory_type", ""), content=msg.payload.get("content", {}), source_peer=msg.sender_id, confidence=msg.payload.get("confidence", 1.0), ) net.sync.sync_incoming([mem]) elif msg.msg_type == MessageType.HEARTBEAT: 
net.peers.update_seen(msg.sender_id) ================================================ FILE: maggy/maggy/mnemos/__init__.py ================================================ """Mnemos helpers for fatigue and signal tracking.""" from .fatigue import FatigueTracker from .signals import SignalLog __all__ = ["FatigueTracker", "SignalLog"] ================================================ FILE: maggy/maggy/mnemos/fatigue.py ================================================ """Cross-model fatigue tracking for Mnemos.""" from __future__ import annotations VALID_DIMENSIONS = frozenset({ "context_load", "turn_pressure", "reread_ratio", "handoff_risk", }) class FatigueTracker: """Track fatigue across four compression signals.""" def __init__(self, context_window: int = 200_000): self.context_window = context_window self.dimensions: dict[str, float] = { d: 0.0 for d in VALID_DIMENSIONS } def record(self, dimension: str, value: float) -> None: if dimension not in VALID_DIMENSIONS: raise ValueError( f"Unknown dimension {dimension!r}. 
class SignalLog:
    """Append and read Mnemos signal history stored as JSON Lines."""

    def __init__(self, path: Path):
        self._path = path

    def append(self, signal: dict) -> None:
        """Append one signal as a single JSON line.

        The parent directory is created if it does not exist.
        """
        self._path.parent.mkdir(parents=True, exist_ok=True)
        with self._path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(signal) + "\n")

    def recent(self, n: int) -> list[dict]:
        """Return up to the last ``n`` signals, oldest first.

        Blank lines are ignored and lines that are not valid JSON (for
        example a record truncated by an interrupted write) are skipped,
        so one damaged line does not make the whole history unreadable.
        Fewer than ``n`` entries may be returned.
        """
        if n <= 0 or not self._path.exists():
            return []
        from collections import deque

        tail: deque = deque(maxlen=n)
        with self._path.open(encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if line:
                    tail.append(line)

        entries: list[dict] = []
        for line in tail:
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError:
                continue
        return entries
@dataclass
class PlanDiff:
    """Outcome of comparing a primary plan against a counter plan.

    Entries are bucketed as agreed by both plans, conflicting, or present
    in only one of the two plans.
    """

    agreed: list[str] = field(default_factory=list)
    conflicts: list[dict] = field(default_factory=list)
    primary_only: list[str] = field(default_factory=list)
    counter_only: list[str] = field(default_factory=list)

    @property
    def conflict_count(self) -> int:
        """Number of recorded conflicts."""
        return len(self.conflicts)

    @property
    def agreement_ratio(self) -> float:
        """Fraction of all bucketed entries that are agreed.

        An entirely empty diff counts as full agreement (1.0).
        """
        buckets = (
            self.agreed,
            self.conflicts,
            self.primary_only,
            self.counter_only,
        )
        entry_total = sum(map(len, buckets))
        return len(self.agreed) / entry_total if entry_total else 1.0
class PlanningService:
    """Dual-plan orchestrator.

    Produces a primary plan and, for tasks at or above the blast
    threshold, a counter plan plus a diff between the two.
    """

    def __init__(self, cfg: MaggyConfig):
        self.cfg = cfg

    def should_dual_plan(self, blast_score: int) -> bool:
        """Only dual-plan for tasks at or above the threshold."""
        return blast_score >= DUAL_PLAN_THRESHOLD

    def generate_plan(
        self,
        task: str,
        model: str,
        files: list[str] | None = None,
    ) -> Plan:
        """Generate a plan (stub — real impl calls LLM).

        Each step receives its own copy of ``files`` so that mutating one
        step's file list cannot affect another step or the caller's list.
        """
        steps = [
            PlanStep(
                description=f"Analyze {task}",
                files=list(files or []),
                blast_estimate=1,
            ),
            PlanStep(
                description=f"Implement {task}",
                files=list(files or []),
                blast_estimate=2,
            ),
            PlanStep(
                description=f"Test {task}",
                blast_estimate=1,
            ),
        ]
        return Plan(
            task=task,
            model=model,
            steps=steps,
            total_blast=sum(s.blast_estimate for s in steps),
        )

    def diff_plans(
        self,
        primary: Plan,
        counter: Plan,
    ) -> PlanDiff:
        """Compare two plans by step description and produce a diff.

        Descriptions are de-duplicated, and every output list keeps the
        order in which steps appear in their plan, so the result is the
        same on every run (set iteration order is not).

        Returns:
            PlanDiff with descriptions shared by both plans, pairs of
            differing descriptions judged similar by ``_similar``, and
            the remaining descriptions unique to each plan.
        """
        # dict.fromkeys de-duplicates while preserving first-seen order.
        p_descs = list(dict.fromkeys(s.description for s in primary.steps))
        c_descs = list(dict.fromkeys(s.description for s in counter.steps))
        p_set, c_set = set(p_descs), set(c_descs)

        agreed = [d for d in p_descs if d in c_set]
        primary_only = [d for d in p_descs if d not in c_set]
        counter_only = [d for d in c_descs if d not in p_set]

        conflicts = [
            {"primary": po, "counter": co}
            for po in primary_only
            for co in counter_only
            if _similar(po, co)
        ]
        conflicted_primary = {c["primary"] for c in conflicts}
        conflicted_counter = {c["counter"] for c in conflicts}

        return PlanDiff(
            agreed=agreed,
            conflicts=conflicts,
            primary_only=[
                p for p in primary_only if p not in conflicted_primary
            ],
            counter_only=[
                c for c in counter_only if c not in conflicted_counter
            ],
        )

    def plan_task(self, req: PlanRequest) -> dict:
        """Full planning flow for a task.

        Returns:
            A dict with ``mode`` ("single" or "dual"), ``plan`` and
            ``diff``; dual mode also includes ``counter_plan``.
        """
        primary = self.generate_plan(
            req.task, "claude", req.file_context,
        )
        if not self.should_dual_plan(req.blast_score):
            return {
                "mode": "single",
                "plan": primary,
                "diff": None,
            }
        counter = self.generate_plan(
            req.task, "codex", req.file_context,
        )
        diff = self.diff_plans(primary, counter)
        return {
            "mode": "dual",
            "plan": primary,
            "counter_plan": counter,
            "diff": diff,
        }
def discover_local(project_path: Path) -> dict:
    """Discover tools from local filesystem markers.

    Only the presence of well-known files and directories under
    ``project_path`` is checked; ``pyproject.toml`` is additionally
    scanned for the substrings "ruff" and "mypy".

    Returns:
        Dict with the keys ``ci``, ``quality``, ``review`` and ``deps``,
        each mapping to a list of detected tool names.
    """
    result: dict[str, list[str]] = {
        "ci": [],
        "quality": [],
        "review": [],
        "deps": [],
    }

    def has_any(*relative_paths: str) -> bool:
        return any((project_path / rel).exists() for rel in relative_paths)

    # CI/CD
    if has_any(".github/workflows"):
        result["ci"].append("github_actions")
    if has_any("Jenkinsfile"):
        result["ci"].append("jenkins")
    if has_any(".circleci"):
        result["ci"].append("circleci")
    if has_any(".gitlab-ci.yml"):
        result["ci"].append("gitlab_ci")

    # Code quality — cover the standard ESLint config file names,
    # not just the .json/.js variants.
    if has_any(
        ".eslintrc.json",
        ".eslintrc.js",
        ".eslintrc.cjs",
        ".eslintrc.yml",
        ".eslintrc.yaml",
        "eslint.config.js",
        "eslint.config.mjs",
        "eslint.config.cjs",
    ):
        result["quality"].append("eslint")

    pyproject = project_path / "pyproject.toml"
    if pyproject.is_file():
        try:
            # Explicit encoding with replacement: a stray non-UTF-8 byte
            # (or a non-UTF-8 locale default) must not abort discovery.
            content = pyproject.read_text(
                encoding="utf-8", errors="replace"
            )
        except OSError as exc:
            logger.debug("Could not read %s: %s", pyproject, exc)
            content = ""
        if "ruff" in content:
            result["quality"].append("ruff")
        if "mypy" in content:
            result["quality"].append("mypy")
    if has_any(".pre-commit-config.yaml"):
        result["quality"].append("pre-commit")

    # Review tools — GitHub reads CODEOWNERS from root, .github/ or docs/.
    if has_any("CODEOWNERS", ".github/CODEOWNERS", "docs/CODEOWNERS"):
        result["review"].append("codeowners")

    # Dependency management
    if has_any(".github/dependabot.yml", ".github/dependabot.yaml"):
        result["deps"].append("dependabot")
    if has_any(
        "renovate.json",
        "renovate.json5",
        ".renovaterc",
        ".renovaterc.json",
        ".github/renovate.json",
    ):
        result["deps"].append("renovate")

    return result
result["protection"].append( "branch_protection" ) except httpx.HTTPError: pass # Check recent PR comments for bots try: resp = await client.get( f"{GITHUB_API}/repos/{repo}/pulls", params={"state": "all", "per_page": "5"}, ) if resp.status_code == 200: for pr in resp.json()[:3]: cr = await client.get( f"{GITHUB_API}/repos/{repo}" f"/pulls/{pr['number']}/comments", params={"per_page": "10"}, ) if cr.status_code == 200: for c in cr.json(): user = (c.get("user") or {}).get( "login", "" ).lower() if "coderabbit" in user: result["bots"].append( "coderabbit" ) if "dependabot" in user: result["bots"].append( "dependabot" ) # Deduplicate result["bots"] = list(set(result["bots"])) except httpx.HTTPError: pass return result ================================================ FILE: maggy/maggy/process/github_prs.py ================================================ """GitHub PR fetcher — reads PRs, reviews, and CI checks. Reuses patterns from providers/github_issues.py (httpx async, headers, error handling). Fetches up to 200 PRs per repo. 
""" from __future__ import annotations import logging import httpx from .models import CheckRecord, PRRecord, ReviewRecord logger = logging.getLogger(__name__) GITHUB_API = "https://api.github.com" DEFAULT_TIMEOUT = 15 def _headers(token: str) -> dict[str, str]: return { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", } async def fetch_prs( repo: str, token: str, limit: int = 200, ) -> list[PRRecord]: """Fetch merged PRs with reviews and checks.""" raw_prs = await _fetch_pr_list(repo, token, limit) records: list[PRRecord] = [] async with httpx.AsyncClient( timeout=DEFAULT_TIMEOUT, headers=_headers(token) ) as client: for pr_data in raw_prs: detail = await _fetch_pr_detail( client, repo, pr_data["number"] ) pr = _parse_pr(detail or pr_data) pr.reviews = await _fetch_reviews( client, repo, pr.number ) if pr.head_sha: pr.checks = await _fetch_checks( client, repo, pr.head_sha ) pr.files = await _fetch_files( client, repo, pr.number ) records.append(pr) return records async def _fetch_pr_list( repo: str, token: str, limit: int, ) -> list[dict]: """Paginate through /pulls endpoint.""" results: list[dict] = [] page = 1 per_page = min(limit, 100) async with httpx.AsyncClient( timeout=DEFAULT_TIMEOUT, headers=_headers(token) ) as client: while len(results) < limit: resp = await client.get( f"{GITHUB_API}/repos/{repo}/pulls", params={ "state": "all", "sort": "updated", "direction": "desc", "per_page": str(per_page), "page": str(page), }, ) if resp.status_code != 200: _log_error(repo, "pulls", resp) break batch = resp.json() if not batch: break results.extend(batch) page += 1 return results[:limit] async def _fetch_pr_detail( client: httpx.AsyncClient, repo: str, pr_number: int, ) -> dict | None: """Fetch single PR detail (has additions/deletions).""" resp = await client.get( f"{GITHUB_API}/repos/{repo}/pulls/{pr_number}" ) if resp.status_code != 200: return None return resp.json() def _parse_pr(data: dict) 
-> PRRecord: """Convert raw GitHub PR JSON to PRRecord.""" return PRRecord( number=data.get("number", 0), title=data.get("title", ""), author=(data.get("user") or {}).get("login", ""), state=_pr_state(data), created_at=data.get("created_at", ""), merged_at=data.get("merged_at"), additions=data.get("additions", 0), deletions=data.get("deletions", 0), changed_files=data.get("changed_files", 0), head_sha=(data.get("head") or {}).get("sha", ""), base_branch=(data.get("base") or {}).get("ref", ""), ) def _pr_state(data: dict) -> str: if data.get("merged_at"): return "merged" return data.get("state", "open") async def _fetch_reviews( client: httpx.AsyncClient, repo: str, pr_number: int, ) -> list[ReviewRecord]: """Fetch all reviews for a PR.""" resp = await client.get( f"{GITHUB_API}/repos/{repo}/pulls/{pr_number}/reviews" ) if resp.status_code != 200: return [] return [ ReviewRecord( reviewer=(r.get("user") or {}).get("login", ""), state=r.get("state", ""), body=r.get("body") or "", submitted_at=r.get("submitted_at", ""), ) for r in resp.json() ] async def _fetch_checks( client: httpx.AsyncClient, repo: str, sha: str, ) -> list[CheckRecord]: """Fetch CI check runs for a commit.""" resp = await client.get( f"{GITHUB_API}/repos/{repo}/commits/{sha}/check-runs" ) if resp.status_code != 200: return [] return [ CheckRecord( name=c.get("name", ""), conclusion=c.get("conclusion") or "pending", started_at=c.get("started_at", ""), completed_at=c.get("completed_at") or "", ) for c in resp.json().get("check_runs", []) ] async def _fetch_files( client: httpx.AsyncClient, repo: str, pr_number: int, ) -> list[str]: """Fetch file paths changed in a PR.""" resp = await client.get( f"{GITHUB_API}/repos/{repo}/pulls/{pr_number}/files", params={"per_page": "100"}, ) if resp.status_code != 200: return [] return [ f.get("filename", "") for f in resp.json() if f.get("filename") ] def _log_error( repo: str, endpoint: str, resp: httpx.Response ) -> None: body = (resp.text or 
"")[:200].replace("\n", " ") logger.warning( "GitHub /repos/%s/%s returned %s: %s", repo, endpoint, resp.status_code, body, ) ================================================ FILE: maggy/maggy/process/model_router.py ================================================ """Dynamic model routing — routes tasks to models by complexity. Not just fallback chains: intelligent routing based on task complexity, security sensitivity, and task type. Simple tasks go to cheap models, complex tasks to premium, security-critical get dual validation. """ from __future__ import annotations from dataclasses import dataclass, field from .models import ModelTier DEFAULT_TIERS: list[ModelTier] = [ ModelTier( name="local", provider="ollama", model="qwen3-coder:30b-a3b-q8_0", cost_rank=1, complexity_min=0, complexity_max=5, strengths=["formatting", "simple_edits", "crud", "feature"], ), ModelTier( name="kimi", provider="moonshot", model="kimi-k2", cost_rank=2, complexity_min=0, complexity_max=5, strengths=["documentation", "simple_tasks"], ), ModelTier( name="codex", provider="openai", model="codex", cost_rank=3, complexity_min=4, complexity_max=10, strengths=["code_generation", "api_design", "review"], ), ModelTier( name="claude", provider="anthropic", model="claude-sonnet-4", cost_rank=4, complexity_min=5, complexity_max=10, strengths=["complex_reasoning", "security", "architecture"], ), ] @dataclass class RoutingDecision: """Result of dynamic model routing.""" primary: ModelTier validator: ModelTier | None = None reason: str = "" fallback_chain: list[str] = field(default_factory=list) def route_task( complexity_score: int, task_type: str = "general", security_sensitive: bool = False, tiers: list[ModelTier] | None = None, stakes: str = "low", ) -> RoutingDecision: """Route a task to the optimal model tier. Args: complexity_score: 0-10 from polyphony scoring task_type: "bug", "feature", "refactor", "test", etc. 
security_sensitive: True for auth/billing/PII tasks tiers: Custom tiers (defaults to DEFAULT_TIERS) """ available = tiers or DEFAULT_TIERS primaries = [ t for t in available if t.role == "primary" ] validators = [ t for t in available if t.role == "validator" ] primary = _select_primary( complexity_score, task_type, primaries, stakes, ) validator = _select_validator( complexity_score, security_sensitive, validators, stakes, ) fallback = _build_fallback(primary, primaries) reason = _build_reason( primary, complexity_score, task_type, security_sensitive ) return RoutingDecision( primary=primary, validator=validator, reason=reason, fallback_chain=fallback, ) def _select_primary( score: int, task_type: str, tiers: list[ModelTier], stakes: str = "low", ) -> ModelTier: """Pick the cheapest tier that handles the complexity.""" candidates = [ t for t in tiers if t.complexity_min <= score <= t.complexity_max ] if not candidates: return tiers[-1] # Fallback to most capable candidates.sort(key=lambda t: t.cost_rank) # High stakes or security: skip cheapest tiers high_risk = ( stakes == "high" or task_type in ("security", "auth", "billing") ) if high_risk: capable = [ c for c in candidates if c.cost_rank >= 3 ] if capable: return capable[0] return candidates[0] def _select_validator( score: int, security_sensitive: bool, validators: list[ModelTier], stakes: str = "low", ) -> ModelTier | None: """Add validation for high-risk tasks.""" if not validators: return None if score >= 8 or security_sensitive or stakes == "high": return validators[0] return None def _build_fallback( primary: ModelTier, tiers: list[ModelTier], ) -> list[str]: """Build fallback chain: next tier up, then next.""" above = [ t for t in tiers if t.cost_rank > primary.cost_rank ] above.sort(key=lambda t: t.cost_rank) return [t.name for t in above] def _build_reason( primary: ModelTier, score: int, task_type: str, security_sensitive: bool, ) -> str: """Human-readable routing explanation.""" parts = 
@dataclass
class PRRecord:
    """A pull request together with metrics derived from it."""

    number: int
    title: str
    author: str
    state: str  # open, closed, merged
    created_at: str
    merged_at: str | None
    additions: int
    deletions: int
    changed_files: int
    head_sha: str
    base_branch: str
    reviews: list[ReviewRecord] = field(default_factory=list)
    checks: list[CheckRecord] = field(default_factory=list)
    files: list[str] = field(default_factory=list)

    @property
    def total_lines(self) -> int:
        """Added plus deleted lines."""
        return self.additions + self.deletions

    @property
    def review_rounds(self) -> int:
        """Number of reviews whose state is CHANGES_REQUESTED."""
        change_requests = [
            review for review in self.reviews
            if review.state == "CHANGES_REQUESTED"
        ]
        return len(change_requests)

    @property
    def time_to_merge_hours(self) -> float | None:
        """Hours between creation and merge.

        Returns None when either timestamp is missing or does not match
        the ``YYYY-MM-DDTHH:MM:SSZ`` layout.
        """
        if not (self.merged_at and self.created_at):
            return None
        from datetime import datetime, timezone

        def as_utc(stamp):
            parsed = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")
            return parsed.replace(tzinfo=timezone.utc)

        try:
            opened = as_utc(self.created_at)
            landed = as_utc(self.merged_at)
        except (ValueError, TypeError):
            return None
        elapsed = landed - opened
        return elapsed.total_seconds() / 3600

    @property
    def ci_passed(self) -> bool:
        """True when every check concluded success/neutral/skipped.

        A PR with no checks counts as passed.
        """
        acceptable = ("success", "neutral", "skipped")
        for check in self.checks:
            if check.conclusion not in acceptable:
                return False
        return True
def identify_bottlenecks(
    velocity: VelocitySignal | None,
    prs: list[PRRecord],
) -> list[str]:
    """Explain, as human-readable messages, why PRs are slow.

    Checks the aggregate velocity numbers against fixed thresholds and
    compares the average merge time of large PRs (over 500 lines) with
    that of small PRs (200 lines or fewer).

    Returns:
        One message per detected bottleneck; a single placeholder
        message when there is no velocity data or nothing was detected.
    """
    if not velocity:
        return ["Insufficient data — no merged PRs found"]

    threshold_checks = (
        (
            velocity.avg_time_to_merge_hours > 48,
            f"Slow merge: avg {velocity.avg_time_to_merge_hours:.0f}h "
            f"(target: <24h)",
        ),
        (
            velocity.avg_review_rounds > 1.5,
            f"High review churn: avg {velocity.avg_review_rounds:.1f} "
            f"rounds (target: <1.5)",
        ),
        (
            velocity.avg_pr_size > 500,
            f"Large PRs: avg {velocity.avg_pr_size:.0f} lines "
            f"(target: <300)",
        ),
    )
    findings: list[str] = [
        message for triggered, message in threshold_checks if triggered
    ]

    # Size-velocity correlation, over PRs that have a merge time.
    big_prs = [
        pr for pr in prs
        if pr.total_lines > 500 and pr.time_to_merge_hours is not None
    ]
    small_prs = [
        pr for pr in prs
        if pr.total_lines <= 200 and pr.time_to_merge_hours is not None
    ]
    if big_prs and small_prs:
        big_avg = _avg_merge_time(big_prs)
        small_avg = _avg_merge_time(small_prs)
        if big_avg and small_avg and big_avg > small_avg * 2:
            slowdown = big_avg / small_avg
            findings.append(
                f"Large PRs take {slowdown:.1f}x longer to merge"
            )

    return findings or ["No major bottlenecks detected"]
with: {files})" fixes.append(fix) return fixes def generate_routing_recs( prs: list[PRRecord], ) -> list[dict]: """Recommend model routing per task pattern.""" recs: list[dict] = [] # Count security-related PRs sec_prs = [ p for p in prs if _is_security_related(p) ] if sec_prs: recs.append({ "pattern": "Security/auth changes", "model": "claude", "validation": "codex", "reason": ( f"{len(sec_prs)} security PRs found — " f"route to Claude + Codex validation" ), }) # Count test-only PRs test_prs = [ p for p in prs if _is_test_only(p) ] if test_prs: recs.append({ "pattern": "Test-only changes", "model": "kimi", "validation": None, "reason": ( f"{len(test_prs)} test-only PRs — " f"route to Kimi (cheaper)" ), }) # Count doc changes doc_prs = [p for p in prs if _is_docs(p)] if doc_prs: recs.append({ "pattern": "Documentation changes", "model": "kimi", "validation": None, "reason": ( f"{len(doc_prs)} doc PRs — " f"route to Kimi" ), }) # Complex multi-file changes complex_prs = [ p for p in prs if p.changed_files >= 10 ] if complex_prs: recs.append({ "pattern": "Multi-file refactors (10+ files)", "model": "claude", "validation": "codex", "reason": ( f"{len(complex_prs)} complex PRs — " f"route to Claude" ), }) return recs def _avg_merge_time(prs: list[PRRecord]) -> float | None: times = [ p.time_to_merge_hours for p in prs if p.time_to_merge_hours is not None ] if not times: return None return sum(times) / len(times) def _is_security_related(pr: PRRecord) -> bool: keywords = {"auth", "security", "token", "session"} title = pr.title.lower() return any(k in title for k in keywords) or any( "auth" in f or "security" in f for f in pr.files ) def _is_test_only(pr: PRRecord) -> bool: if not pr.files: return False return all( "test" in f.lower() or "spec" in f.lower() for f in pr.files ) def _is_docs(pr: PRRecord) -> bool: if not pr.files: return False return all( f.endswith(".md") or "doc" in f.lower() for f in pr.files ) ================================================ FILE: 
def generate_summary(report: ProcessReport) -> str:
    """Render the report as Markdown text.

    Always emits the title and PR count; the velocity, review-theme,
    CI-failure, routing and pre-emptive-fix sections appear only when
    the report has data for them. Review and CI sections list at most
    five entries, and correlated files at most three.
    """
    out: list[str] = [
        f"## Process Report: {report.project_key}",
        f"Analyzed {report.total_prs} PRs",
        "",
    ]

    # Velocity
    velocity = report.velocity
    if velocity:
        out += [
            "### PR Velocity",
            f"- Avg time to merge: {velocity.avg_time_to_merge_hours:.1f}h",
            f"- Median time to merge: "
            f"{velocity.median_time_to_merge_hours:.1f}h",
            f"- Avg review rounds: {velocity.avg_review_rounds:.1f}",
            f"- Avg PR size: {velocity.avg_pr_size:.0f} lines",
            "",
        ]

    # Review patterns
    if report.review_signals:
        out.append("### Recurring Review Themes")
        out.extend(
            f"- **{theme.reviewer}** flags "
            f"*{theme.theme.replace('_', ' ')}* "
            f"({theme.count}x)"
            for theme in report.review_signals[:5]
        )
        out.append("")

    # CI failures
    if report.ci_signals:
        out.append("### CI Failure Patterns")
        for check in report.ci_signals[:5]:
            out.append(
                f"- **{check.check_name}**: fails "
                f"{check.failure_rate:.0%} of runs"
            )
            if check.correlated_files:
                related = ", ".join(check.correlated_files[:3])
                out.append(f" Correlated with: {related}")
        out.append("")

    # Routing
    if report.routing_recommendations:
        out.append("### Model Routing Recommendations")
        for entry in report.routing_recommendations:
            target = entry.get("model", "?")
            label = entry.get("pattern", "?")
            out.append(f"- {label} -> **{target}**")
            checker = entry.get("validation")
            if checker:
                out.append(f" + validation by **{checker}**")
        out.append("")

    # Fixes
    if report.preemptive_fixes:
        out.append("### Pre-emptive Fixes")
        out.extend(f"- {item}" for item in report.preemptive_fixes)
        out.append("")

    return "\n".join(out)
class ProcessService:
    """Coordinates the process-intelligence pipeline for one Maggy config.

    ``analyze`` fetches PRs, extracts signals, derives recommendations and
    persists a report; ``get_report`` / ``get_health`` read the most recent
    persisted report back from the store.
    """

    def __init__(self, cfg: MaggyConfig):
        """Keep ``cfg`` and open the process store.

        The database file is ``process.db`` inside the parent directory of
        the user-expanded ``cfg.storage.path``.
        """
        storage_root = Path(cfg.storage.path).expanduser().parent
        self.cfg = cfg
        self.store = ProcessStore(storage_root / "process.db")

    async def analyze(self, project_key: str) -> ProcessReport:
        """Run the full analysis pipeline for ``project_key`` and persist it.

        Raises:
            ValueError: If no GitHub token is configured, or if the project
                key cannot be mapped to a repository. The token check runs
                first, so it wins when both are missing.
        """
        repo = self._resolve_repo(project_key)
        token = self.cfg.issue_tracker.github.token
        if not token:
            raise ValueError("GITHUB_TOKEN not configured")
        if not repo:
            raise ValueError(f"No repo found for project '{project_key}'")

        logger.info("Analyzing %s — fetching PRs from %s", project_key, repo)

        # 1. Fetch PRs
        prs = await github_prs.fetch_prs(repo=repo, token=token, limit=200)
        logger.info("Fetched %d PRs from %s", len(prs), repo)

        # 2. Extract signals
        review_signals = extract_review_signals(prs)
        ci_signals = extract_ci_signals(prs)
        velocity = extract_velocity_signals(prs)

        # 3. Find patterns
        # NOTE(review): the return value of identify_bottlenecks is discarded
        # here — confirm whether it is called only for side effects.
        identify_bottlenecks(velocity, prs)
        fixes = generate_preemptive_fixes(review_signals, ci_signals)
        routing = generate_routing_recs(prs)

        # 4. Build report (timestamp is an ISO-8601 string in UTC)
        report = ProcessReport(
            project_key=project_key,
            generated_at=datetime.now(timezone.utc).isoformat(),
            total_prs=len(prs),
            velocity=velocity,
            review_signals=review_signals,
            ci_signals=ci_signals,
            routing_recommendations=routing,
            preemptive_fixes=fixes,
        )
        report.summary = generate_summary(report)

        # 5. Persist
        self.store.save_report(report)
        logger.info(
            "Process report saved for %s: %d PRs, "
            "%d review signals, %d CI signals",
            project_key,
            len(prs),
            len(review_signals),
            len(ci_signals),
        )
        return report

    def get_report(self, project_key: str) -> dict | None:
        """Return the latest stored report payload for ``project_key``."""
        return self.store.load_latest_report(project_key)

    def get_health(self, project_key: str) -> dict | None:
        """Return the latest stored report, or ``None`` when it is missing/empty.

        NOTE(review): this currently hands back the same payload as
        ``get_report`` (no separate health formatting is applied) — confirm
        that this is what callers expect.
        """
        latest = self.store.load_latest_report(project_key)
        return latest if latest else None

    def _resolve_repo(self, project_key: str) -> str | None:
        """Map ``project_key`` to a GitHub ``org/repo`` name.

        Configured repos are tried first (matching on the part after the
        last ``/``). Otherwise a codebase whose ``key`` matches is combined
        with the configured org, using the codebase directory name as slug.
        Returns ``None`` when neither lookup succeeds.
        """
        github_cfg = self.cfg.issue_tracker.github

        for full_name in github_cfg.repos:
            if full_name.split("/")[-1] == project_key:
                return full_name

        # Try matching against codebase keys
        for codebase in self.cfg.codebases:
            if codebase.key != project_key:
                continue
            slug = Path(codebase.path).name
            if github_cfg.org:
                return f"{github_cfg.org}/{slug}"

        return None
def extract_review_signals(
    prs: list[PRRecord],
) -> list[ReviewSignal]:
    """Find themes a reviewer raises on at least two different PRs.

    Each non-empty review body is lower-cased and matched against every
    entry of ``REVIEW_THEMES``. A PR number is recorded at most once per
    (reviewer, theme) pair.

    Returns:
        One ``ReviewSignal`` per (reviewer, theme) seen on two or more PRs,
        carrying the PR count and up to five example PR numbers, ordered by
        count, highest first.
    """
    # reviewer -> theme -> PR numbers (unique, in first-seen order)
    seen: dict[str, dict[str, list[int]]] = {}

    for pr in prs:
        for review in pr.reviews:
            if not review.body:
                continue
            by_theme = seen.setdefault(review.reviewer, {})
            lowered = review.body.lower()
            for theme, keywords in REVIEW_THEMES.items():
                if not _matches_theme(lowered, keywords):
                    continue
                numbers = by_theme.setdefault(theme, [])
                if pr.number not in numbers:
                    numbers.append(pr.number)

    recurring = [
        ReviewSignal(
            reviewer=reviewer,
            theme=theme,
            count=len(numbers),
            example_prs=numbers[:5],
        )
        for reviewer, by_theme in seen.items()
        for theme, numbers in by_theme.items()
        if len(numbers) >= 2
    ]
    return sorted(recurring, key=lambda signal: signal.count, reverse=True)
def extract_velocity_signals(
    prs: list[PRRecord],
) -> VelocitySignal | None:
    """Compute PR velocity metrics over the merged PRs in ``prs``.

    Only PRs whose ``state`` is ``"merged"`` are considered. Average and
    median time-to-merge use the merged PRs that report a merge time;
    review rounds and PR size are averaged over all merged PRs.

    Returns:
        A ``VelocitySignal`` with rounded metrics, or ``None`` when there are
        no merged PRs or none of them reports ``time_to_merge_hours``.
    """
    # Function-scope import keeps this block self-contained.
    from statistics import median

    merged = [p for p in prs if p.state == "merged"]
    if not merged:
        return None

    merge_times = [
        hours
        for hours in (p.time_to_merge_hours for p in merged)
        if hours is not None
    ]
    if not merge_times:
        return None

    avg_time = sum(merge_times) / len(merge_times)
    # A true median: for an even number of samples this averages the two
    # middle values instead of reporting the upper-middle element.
    median_time = median(merge_times)

    # ``merged`` is non-empty here, so these divisions are safe.
    avg_rounds = sum(p.review_rounds for p in merged) / len(merged)
    avg_size = sum(p.total_lines for p in merged) / len(merged)

    return VelocitySignal(
        avg_time_to_merge_hours=round(avg_time, 1),
        median_time_to_merge_hours=round(median_time, 1),
        avg_review_rounds=round(avg_rounds, 2),
        avg_pr_size=round(avg_size, 1),
        total_prs_analyzed=len(merged),
    )
Stores PR records, signals, and reports. Follows the WAL + busy_timeout pattern from maggy/services/inbox.py. """ from __future__ import annotations import json import logging import sqlite3 from datetime import datetime, timezone from pathlib import Path from .models import ProcessReport logger = logging.getLogger(__name__) def _connect(path: Path) -> sqlite3.Connection: """Open SQLite with WAL mode for concurrency.""" db = sqlite3.connect(path, timeout=30.0) db.execute("PRAGMA journal_mode=WAL") db.execute("PRAGMA foreign_keys=ON") db.execute("PRAGMA busy_timeout=30000") return db class ProcessStore: """SQLite store for process intelligence.""" def __init__(self, db_path: Path): self.db_path = db_path self.db_path.parent.mkdir(parents=True, exist_ok=True) self._init_tables() def _init_tables(self) -> None: with _connect(self.db_path) as db: db.execute(""" CREATE TABLE IF NOT EXISTS pr_data ( id INTEGER PRIMARY KEY AUTOINCREMENT, project_key TEXT NOT NULL, fetched_at TEXT NOT NULL, payload TEXT NOT NULL ) """) db.execute(""" CREATE TABLE IF NOT EXISTS reports ( id INTEGER PRIMARY KEY AUTOINCREMENT, project_key TEXT NOT NULL, generated_at TEXT NOT NULL, payload TEXT NOT NULL ) """) db.execute( "CREATE INDEX IF NOT EXISTS idx_pr_project " "ON pr_data(project_key)" ) db.execute( "CREATE INDEX IF NOT EXISTS idx_report_project " "ON reports(project_key)" ) def save_pr_data( self, project_key: str, data: list[dict] ) -> None: """Store raw PR data as JSON.""" now = datetime.now(timezone.utc).isoformat() with _connect(self.db_path) as db: db.execute( "DELETE FROM pr_data WHERE project_key = ?", (project_key,), ) db.execute( "INSERT INTO pr_data " "(project_key, fetched_at, payload) " "VALUES (?, ?, ?)", (project_key, now, json.dumps(data)), ) def load_pr_data( self, project_key: str ) -> list[dict] | None: """Load cached PR data. Returns None if none.""" with _connect(self.db_path) as db: row = db.execute( "SELECT payload FROM pr_data " "WHERE project_key = ? 
def save_report(self, report: ProcessReport) -> None:
    """Insert ``report`` as a new JSON row in the ``reports`` table.

    The payload always holds the project key, timestamp, PR count, summary,
    pre-emptive fixes and routing recommendations. ``velocity``,
    ``review_signals`` and ``ci_signals`` are added only when the
    corresponding report attribute is truthy; at most ten review signals
    and ten CI signals are stored.
    """
    payload = {
        "project_key": report.project_key,
        "generated_at": report.generated_at,
        "total_prs": report.total_prs,
        "summary": report.summary,
        "preemptive_fixes": report.preemptive_fixes,
        "routing_recommendations": report.routing_recommendations,
    }

    velocity = report.velocity
    if velocity:
        velocity_fields = (
            "avg_time_to_merge_hours",
            "median_time_to_merge_hours",
            "avg_review_rounds",
            "avg_pr_size",
            "total_prs_analyzed",
        )
        payload["velocity"] = {
            name: getattr(velocity, name) for name in velocity_fields
        }

    if report.review_signals:
        stored_reviews = []
        for signal in report.review_signals[:10]:
            stored_reviews.append({
                "reviewer": signal.reviewer,
                "theme": signal.theme,
                "count": signal.count,
            })
        payload["review_signals"] = stored_reviews

    if report.ci_signals:
        stored_checks = []
        for signal in report.ci_signals[:10]:
            stored_checks.append({
                "check_name": signal.check_name,
                # Failure rate is stored with three decimal places.
                "failure_rate": round(signal.failure_rate, 3),
                "failure_count": signal.failure_count,
            })
        payload["ci_signals"] = stored_checks

    insert_sql = (
        "INSERT INTO reports (project_key, generated_at, payload) "
        "VALUES (?, ?, ?)"
    )
    with _connect(self.db_path) as db:
        row = (report.project_key, report.generated_at, json.dumps(payload))
        db.execute(insert_sql, row)
class AsanaProvider:
    """Asana-backed implementation of the issue-tracker provider interface.

    Simpler than the zenloop prototype: no hardcoded USER_GIDS.
    ``list_followed`` resolves the authenticated user's GID through
    ``/users/me`` when no user id is supplied.
    """

    # Field selections sent to Asana through ``opt_fields``.
    _TASK_FIELDS = (
        "name,notes,completed,assignee.name,projects.name,"
        "modified_at,created_at,permalink_url,tags.name"
    )
    _STORY_FIELDS = "type,text,created_at,created_by.name,resource_subtype"
    _FOLLOWED_FIELDS = "name,notes,assignee.name,projects.name,modified_at,permalink_url"
    _SEARCH_FIELDS = (
        "name,notes,completed,assignee.name,projects.name,modified_at,permalink_url"
    )
    # Status strings (lower-cased, stripped) that mark a task as completed.
    _DONE_STATUSES = ("done", "closed", "complete", "completed", "resolved")

    def __init__(self, workspace_id: str, boards: dict[str, str], token: str):
        """Store the workspace id, the board-name -> project-gid map and the token."""
        self.workspace_id = workspace_id
        # boards: {"dev": "project_gid", "bugs": "other_gid"}
        self.boards = boards
        self.token = token
        # Cached GID of the authenticated user; filled lazily by _get_my_gid.
        self._my_gid: str = ""

    def provider_name(self) -> str:
        """Return the provider identifier, ``"asana"``."""
        return "asana"

    def _headers(self) -> dict[str, str]:
        """Build the bearer-token authorization header."""
        return {"Authorization": f"Bearer {self.token}"}

    def _to_task(self, t: dict) -> Task:
        """Convert one Asana task payload into a ``Task``.

        Missing or null fields fall back to empty values; the board is the
        name of the first listed project, if any.
        """
        owner = t.get("assignee") or {}
        projects = t.get("projects") or []
        tags = t.get("tags") or []
        return Task(
            id=t.get("gid", ""),
            title=t.get("name", ""),
            description=t.get("notes", "") or "",
            status="closed" if t.get("completed") else "open",
            assignee=owner.get("name", ""),
            url=t.get("permalink_url", ""),
            labels=[tag.get("name", "") for tag in tags],
            board=projects[0].get("name", "") if projects else "",
            created_at=t.get("created_at", ""),
            updated_at=t.get("modified_at", ""),
            raw=t,
        )

    async def _get_my_gid(self, client: httpx.AsyncClient) -> str:
        """Return the authenticated user's GID, fetching it once and caching it.

        Returns the (possibly still empty) cached value when the lookup
        does not answer with HTTP 200.
        """
        if not self._my_gid:
            resp = await client.get(f"{ASANA_BASE}/users/me", headers=self._headers())
            if resp.status_code == 200:
                self._my_gid = resp.json().get("data", {}).get("gid", "")
        return self._my_gid

    async def list_tasks(self, board: str | None = None, state: str = "open", limit: int = 50) -> list[Task]:
        """List tasks from one configured board, or from all of them.

        An unknown or missing ``board`` name queries every configured board.
        Boards whose request does not return HTTP 200 are skipped. Results
        are sorted by ``updated_at`` (newest first) and cut to ``limit``.
        """
        if not self.boards:
            return []

        # Which boards to query
        if board and board in self.boards:
            project_gids = [self.boards[board]]
        else:
            project_gids = list(self.boards.values())

        want_open = state == "open"
        want_closed = state == "closed"
        collected: list[Task] = []

        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            for project_gid in project_gids:
                params = {
                    "opt_fields": self._TASK_FIELDS,
                    "limit": str(min(limit, 100)),
                }
                # `completed_since=now` makes Asana drop tasks completed before
                # this instant (open + just-now-completed remain). It is left
                # out entirely when completed tasks are wanted, because Asana's
                # validator rejects an empty string.
                if want_open:
                    params["completed_since"] = "now"

                resp = await client.get(f"{ASANA_BASE}/projects/{project_gid}/tasks", params=params)
                if resp.status_code != 200:
                    continue

                for item in resp.json().get("data", []):
                    # completed_since is only a lower bound on completion time,
                    # so the requested state still has to be filtered here.
                    is_done = bool(item.get("completed"))
                    if (want_open and is_done) or (want_closed and not is_done):
                        continue
                    collected.append(self._to_task(item))

        collected.sort(key=lambda task: task.updated_at, reverse=True)
        return collected[:limit]

    async def get_task(self, task_id: str) -> Task | None:
        """Fetch a single task; ``None`` when the response is not HTTP 200."""
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.get(
                f"{ASANA_BASE}/tasks/{task_id}",
                params={"opt_fields": self._TASK_FIELDS},
            )
            if resp.status_code == 200:
                return self._to_task(resp.json().get("data", {}))
            return None

    async def get_comments(self, task_id: str) -> list[Comment]:
        """Return the task's stories whose subtype is ``comment_added``.

        Returns an empty list when the response is not HTTP 200.
        """
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.get(
                f"{ASANA_BASE}/tasks/{task_id}/stories",
                params={"opt_fields": self._STORY_FIELDS},
            )
            if resp.status_code != 200:
                return []
            return [
                Comment(
                    id=story.get("gid", ""),
                    author=(story.get("created_by") or {}).get("name", ""),
                    text=story.get("text", ""),
                    created_at=story.get("created_at", ""),
                )
                for story in resp.json().get("data", [])
                if story.get("resource_subtype") == "comment_added"
            ]

    async def add_comment(self, task_id: str, text: str) -> Comment | None:
        """Post ``text`` as a story on the task.

        Returns the created comment, or ``None`` unless the response status
        is 200 or 201. The submitted ``text`` is used when the response
        carries none.
        """
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.post(
                f"{ASANA_BASE}/tasks/{task_id}/stories",
                headers={**self._headers(), "Content-Type": "application/json"},
                json={"data": {"text": text}},
            )
            if resp.status_code not in (200, 201):
                return None
            created = resp.json().get("data", {})
            return Comment(
                id=created.get("gid", ""),
                author=(created.get("created_by") or {}).get("name", ""),
                text=created.get("text", text),
                created_at=created.get("created_at", ""),
            )

    async def update_status(self, task_id: str, status: str) -> bool:
        """Set the task's ``completed`` flag from a free-form status string.

        Done-like statuses mark the task completed; any other status marks
        it not completed. Returns ``True`` only on HTTP 200.
        """
        mark_done = status.lower().strip() in self._DONE_STATUSES
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.put(
                f"{ASANA_BASE}/tasks/{task_id}",
                headers={**self._headers(), "Content-Type": "application/json"},
                json={"data": {"completed": mark_done}},
            )
            return resp.status_code == 200

    async def list_followed(self, user_id: str | None = None, limit: int = 50) -> list[Task]:
        """List incomplete tasks followed by ``user_id`` (default: the token's user).

        Returns an empty list without a workspace id, when no user GID can
        be determined, or when the search does not return HTTP 200.
        """
        if not self.workspace_id:
            return []
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            follower = user_id or await self._get_my_gid(client)
            if not follower:
                return []
            resp = await client.get(
                f"{ASANA_BASE}/workspaces/{self.workspace_id}/tasks/search",
                params={
                    "followers.any": follower,
                    "completed": "false",
                    "sort_by": "modified_at",
                    "opt_fields": self._FOLLOWED_FIELDS,
                    "limit": str(min(limit, 100)),
                },
            )
            if resp.status_code != 200:
                return []
            return [self._to_task(item) for item in resp.json().get("data", [])]

    async def search_tasks(self, query: str, limit: int = 20) -> list[Task]:
        """Full-text search for tasks in the workspace.

        Returns an empty list without a workspace id or when the search
        does not return HTTP 200.
        """
        if not self.workspace_id:
            return []
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.get(
                f"{ASANA_BASE}/workspaces/{self.workspace_id}/tasks/search",
                params={
                    "text": query,
                    "opt_fields": self._SEARCH_FIELDS,
                    "limit": str(min(limit, 100)),
                },
            )
            if resp.status_code != 200:
                return []
            return [self._to_task(item) for item in resp.json().get("data", [])]
@dataclass
class Comment:
    """Provider-agnostic comment attached to a task."""

    id: str  # Provider-native comment ID, stored as a string
    author: str  # Name or login of the commenter; providers pass "" when unknown
    text: str  # Comment body text
    created_at: str = ""  # Creation timestamp string as reported by the provider
class GitHubIssuesProvider:
    """GitHub Issues implementation of the issue-tracker provider interface.

    Several repositories are handled transparently: ``list_tasks`` merges
    issues from every configured repo. Task IDs have the form
    ``"slug/number"`` (e.g. ``"api/123"``) so they can be mapped back to
    the repository they came from.
    """

    # Status strings (lower-cased, stripped) that close an issue.
    _CLOSING_STATUSES = ("done", "closed", "complete", "completed", "resolved")

    def __init__(self, org: str, repos: list[str], token: str, labels: list[str] | None = None):
        """Store org, full repo names, token and the optional label filter."""
        self.org = org
        self.repos = repos  # Full names: ["org/api", "org/web"]
        self.token = token
        self.label_filter = labels or []

    def provider_name(self) -> str:
        """Return the provider identifier, ``"github"``."""
        return "github"

    def _headers(self) -> dict[str, str]:
        """Build the authorization, media-type and API-version headers."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }

    def _encode_id(self, repo: str, number: int) -> str:
        """Build a ``slug/number`` task ID (slug = repo name without org)."""
        return f"{repo.split('/')[-1]}/{number}"

    def _decode_id(self, task_id: str) -> tuple[str, int] | None:
        """Parse a ``slug/number`` ID into ``(full_repo_name, number)``.

        Malformed input yields ``None`` instead of raising, so callers can
        answer with a not-found result rather than an error. A slug that
        matches no configured repo falls back to ``"<org>/<slug>"`` when an
        org is set.
        """
        if not task_id:
            return None
        slug, sep, tail = task_id.partition("/")
        if not sep or not tail.isdigit():
            return None
        number = int(tail)

        suffix = "/" + slug
        configured = next((r for r in self.repos if r.endswith(suffix)), None)
        if configured is not None:
            return configured, number
        # Fallback: assume org prefix (for repos not in the configured list)
        if self.org:
            return f"{self.org}/{slug}", number
        return None

    def _to_task(self, repo: str, issue: dict) -> Task:
        """Convert one GitHub issue payload from ``repo`` into a ``Task``."""
        assignee = issue.get("assignee") or {}
        creator = issue.get("user") or {}
        label_names = [
            label["name"]
            for label in issue.get("labels", [])
            if isinstance(label, dict)
        ]
        return Task(
            id=self._encode_id(repo, issue["number"]),
            title=issue.get("title", ""),
            description=issue.get("body") or "",
            status=issue.get("state", "open"),
            assignee=assignee.get("login", ""),
            author=creator.get("login", ""),
            url=issue.get("html_url", ""),
            labels=label_names,
            board=repo.split("/")[-1],
            created_at=issue.get("created_at", ""),
            updated_at=issue.get("updated_at", ""),
            raw=issue,
        )

    @staticmethod
    def _to_comment(payload: dict) -> Comment:
        """Convert one GitHub comment payload into a ``Comment``."""
        return Comment(
            id=str(payload["id"]),
            author=(payload.get("user") or {}).get("login", ""),
            text=payload.get("body", ""),
            created_at=payload.get("created_at", ""),
        )

    def _tasks_from_search(self, payload: dict) -> list[Task]:
        """Convert search results to tasks, skipping pull requests."""
        found: list[Task] = []
        for issue in payload.get("items", []):
            if "pull_request" in issue:
                continue
            # Derive "owner/name" from the last two segments of repository_url.
            repo_url = issue.get("repository_url", "")
            repo = "/".join(repo_url.rstrip("/").split("/")[-2:])
            found.append(self._to_task(repo, issue))
        return found

    def _repo_qualifiers(self) -> str:
        """Return the ``repo:...`` search qualifiers for all configured repos."""
        return " ".join(f"repo:{name}" for name in self.repos)

    async def list_tasks(self, board: str | None = None, state: str = "open", limit: int = 50) -> list[Task]:
        """List issues across repos (or one repo if `board` given). Excludes PRs.

        The per-repo page size is ``limit`` divided evenly across the
        selected repos (at least 1). Results are sorted by ``updated_at``
        (newest first) and cut to ``limit``.
        """
        selected = [r for r in self.repos if not board or r.endswith("/" + board)]
        if not selected:
            return []

        page_size = max(1, limit // max(len(selected), 1))
        collected: list[Task] = []

        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            for repo in selected:
                params: dict[str, str] = {
                    "state": state,
                    "per_page": str(page_size),
                    "sort": "updated",
                }
                if self.label_filter:
                    params["labels"] = ",".join(self.label_filter)

                resp = await client.get(f"{GITHUB_API}/repos/{repo}/issues", params=params)
                if resp.status_code != 200:
                    # Logged at WARNING so misconfiguration (bad token, renamed
                    # repo, missing read scope) is visible rather than showing
                    # up as a silently empty inbox. The status code and the
                    # first 200 characters of the body are included.
                    excerpt = (resp.text or "")[:200].replace("\n", " ")
                    logger.warning(
                        "GitHub /repos/%s/issues returned %s: %s",
                        repo, resp.status_code, excerpt,
                    )
                    continue

                # GitHub returns PRs in /issues — filter them out
                collected.extend(
                    self._to_task(repo, issue)
                    for issue in resp.json()
                    if "pull_request" not in issue
                )

        collected.sort(key=lambda task: task.updated_at, reverse=True)
        return collected[:limit]

    async def get_task(self, task_id: str) -> Task | None:
        """Fetch one issue; ``None`` for a malformed ID or a non-200 response."""
        target = self._decode_id(task_id)
        if target is None:
            return None
        repo, number = target
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.get(f"{GITHUB_API}/repos/{repo}/issues/{number}")
            if resp.status_code == 200:
                return self._to_task(repo, resp.json())
            return None

    async def get_comments(self, task_id: str) -> list[Comment]:
        """List an issue's comments; empty for a malformed ID or non-200 response."""
        target = self._decode_id(task_id)
        if target is None:
            return []
        repo, number = target
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.get(f"{GITHUB_API}/repos/{repo}/issues/{number}/comments")
            if resp.status_code != 200:
                return []
            return [self._to_comment(item) for item in resp.json()]

    async def add_comment(self, task_id: str, text: str) -> Comment | None:
        """Post a comment; ``None`` for a malformed ID or a status other than 200/201."""
        target = self._decode_id(task_id)
        if target is None:
            return None
        repo, number = target
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.post(
                f"{GITHUB_API}/repos/{repo}/issues/{number}/comments",
                json={"body": text},
            )
            if resp.status_code not in (200, 201):
                return None
            return self._to_comment(resp.json())

    async def update_status(self, task_id: str, status: str) -> bool:
        """GitHub issues only have open/closed — map any "done-like" status to closed.

        Every other status sets the issue to ``open``. Returns ``True`` only
        on HTTP 200, and ``False`` for a malformed ID.
        """
        target = self._decode_id(task_id)
        if target is None:
            return False
        repo, number = target
        is_closing = status.lower().strip() in self._CLOSING_STATUSES
        new_state = "closed" if is_closing else "open"
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            resp = await client.patch(
                f"{GITHUB_API}/repos/{repo}/issues/{number}",
                json={"state": new_state},
            )
            return resp.status_code == 200

    async def list_followed(self, user_id: str | None = None, limit: int = 50) -> list[Task]:
        """Issues assigned to or mentioning the authenticated user across configured repos.

        Refuses to run without repos — otherwise the GitHub search query
        would have no repo filter and hit every public issue on the site.
        Returns an empty list when the user lookup or the search does not
        answer with HTTP 200.
        """
        if not self.repos:
            return []
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            # Figure out the user if not provided
            if not user_id:
                me = await client.get(f"{GITHUB_API}/user")
                if me.status_code != 200:
                    return []
                user_id = me.json().get("login", "")

            # Use search API: is:open + assignee/mentions + repo filter
            query = (
                f"is:issue is:open ({self._repo_qualifiers()}) "
                f"(assignee:{user_id} OR mentions:{user_id})"
            )
            resp = await client.get(
                f"{GITHUB_API}/search/issues",
                params={"q": query, "sort": "updated", "per_page": str(limit)},
            )
            if resp.status_code != 200:
                return []
            return self._tasks_from_search(resp.json())

    async def search_tasks(self, query: str, limit: int = 20) -> list[Task]:
        """Search issues in the configured repos for ``query``.

        Same guard as ``list_followed``: without repos the query would
        search all public GitHub issues, which is never wanted.
        """
        if not self.repos:
            return []
        async with httpx.AsyncClient(timeout=15, headers=self._headers()) as client:
            q = f"is:issue {query} {self._repo_qualifiers()}"
            resp = await client.get(
                f"{GITHUB_API}/search/issues",
                params={"q": q, "per_page": str(limit)},
            )
            if resp.status_code != 200:
                return []
            return self._tasks_from_search(resp.json())
async def add_comment(self, task_id: str, text: str) -> Comment | None:
    """Post ``text`` as an update on the Monday item ``task_id``.

    The comment body is embedded as a fully escaped string literal, so
    quotes, backslashes, newlines and other control characters in ``text``
    can neither break nor alter the generated mutation. ``task_id`` is
    interpolated into the query unquoted and must therefore be numeric.

    Returns:
        The created ``Comment`` (author left empty), or ``None`` when
        ``task_id`` is not numeric or the API returns no ``create_update``
        payload.
    """
    # Function-scope import keeps this block self-contained.
    import json

    item_id = str(task_id).strip()
    if not (item_id.isascii() and item_id.isdigit()):
        # Refuse anything that could inject extra GraphQL into the query.
        return None

    # JSON string escaping (\" \\ \n \uXXXX ...) is also valid for GraphQL
    # string literals; ensure_ascii=False keeps non-ASCII text verbatim.
    body_literal = json.dumps(text, ensure_ascii=False)
    q = (
        f"mutation {{ create_update(item_id: {item_id}, "
        f"body: {body_literal}) {{ id body }} }}"
    )
    data = await self._query(q)
    update = data.get("create_update", {})
    if not update:
        return None
    return Comment(
        id=update.get("id", ""),
        author="",
        text=update.get("body", text),
    )
================================================ FILE: maggy/maggy/recovery/__init__.py ================================================ ================================================ FILE: maggy/maggy/recovery/rollback.py ================================================ """Git-backed rollback savepoints for Maggy sessions.""" from __future__ import annotations import asyncio import re _SAFE_ID = re.compile(r"^[a-zA-Z0-9_\-]+$") def _validate_session_id(session_id: str) -> None: if not _SAFE_ID.match(session_id): raise ValueError(f"Invalid session_id: {session_id!r}") class RollbackManager: async def create_savepoint(self, session_id: str, working_dir: str) -> str: _validate_session_id(session_id) tag = _tag_name(session_id) code, output = await _run_git(working_dir, "tag", tag) if code != 0: raise RuntimeError(output or f"failed to create {tag}") return tag async def rollback(self, session_id: str, working_dir: str) -> bool: _validate_session_id(session_id) code, _ = await _run_git(working_dir, "reset", "--hard", _tag_name(session_id)) return code == 0 async def list_savepoints(self, working_dir: str) -> list[str]: code, output = await _run_git(working_dir, "tag", "--list", "maggy-save-*") if code != 0 or not output: return [] return output.splitlines() async def delete_savepoint(self, session_id: str, working_dir: str) -> bool: code, _ = await _run_git(working_dir, "tag", "-d", _tag_name(session_id)) return code == 0 async def _run_git(working_dir: str, *args: str) -> tuple[int, str]: proc = await asyncio.create_subprocess_exec( "git", *args, cwd=working_dir, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, ) stdout, _ = await proc.communicate() text = (stdout or b"").decode("utf-8", errors="replace").strip() return proc.returncode or 0, text def _tag_name(session_id: str) -> str: return f"maggy-save-{session_id}" ================================================ FILE: maggy/maggy/registry.py ================================================ 
class ProjectRegistry:
    """In-memory table of configured projects, keyed by project name."""

    def __init__(self, cfg: MaggyConfig):
        # Later entries with a repeated name replace earlier ones.
        self._projects = {}
        for entry in cfg.projects:
            self._projects[entry.name] = entry

    def list(self) -> list[ProjectConfig]:
        """Return all registered projects in insertion order."""
        return [*self._projects.values()]

    def get(self, name: str) -> ProjectConfig | None:
        """Return the project called ``name``, or None when unknown."""
        return self._projects.get(name)

    def add(self, project: ProjectConfig) -> None:
        """Register ``project``; raise ValueError if its name is taken."""
        key = project.name
        if key in self._projects:
            raise ValueError(f"Project {project.name!r} already exists")
        self._projects[key] = project

    def remove(self, name: str) -> bool:
        """Drop the project called ``name``; return whether one was removed."""
        removed = self._projects.pop(name, None)
        return removed is not None
MIN_CALIBRATION_ACCURACY = 0.5


@dataclass
class RoutingContext:
    """Input context for a routing decision."""

    blast_score: int = 0
    task_type: str = "general"
    security_sensitive: bool = False
    project_key: str = ""
    pipeline_phase: str = ""
    stakes: str = "low"


class RoutingService:
    """Blast-score aware routing with rule overrides."""

    def __init__(self, cfg: MaggyConfig):
        self.cfg = cfg
        self.rewards = RewardTable(cfg)
        # The calibration database lives next to the configured storage file.
        db_dir = Path(cfg.storage.path).expanduser().parent
        self.calibration = CalibrationTracker(
            db_dir / "calibration.db",
        )
        self.rules = load_rules()

    def route(self, ctx: RoutingContext) -> RoutingDecision:
        """Pick the best model for this task context.

        Precedence: (1) a rules override for the task type / pipeline phase,
        (2) the model the reward table has learned to be best for this task
        type and blast tier, provided it passes the calibration check and is
        a known tier, (3) plain blast-score routing with the calibration
        penalty applied.
        """
        forced = apply_override(
            self.rules, ctx.task_type, ctx.pipeline_phase,
        )
        if forced:
            return self._forced_decision(forced, ctx)

        learned = self.rewards.best_model(
            ctx.task_type, self._blast_tier(ctx.blast_score),
        )
        if learned and self._is_calibrated(learned):
            # The reward table stores model *names*. Every other path hands
            # RoutingDecision a tier object (``primary.name`` is read below),
            # so resolve the name first; an unknown name falls through to
            # blast-score routing instead of producing a malformed decision.
            tier = _find_tier(learned)
            if tier is not None:
                return RoutingDecision(
                    primary=tier,
                    validator=None,
                    fallback_chain=[],
                    reason=(
                        f"Learned: best for {ctx.task_type} "
                        f"at blast {ctx.blast_score}"
                    ),
                )

        decision = route_task(
            ctx.blast_score,
            ctx.task_type,
            ctx.security_sensitive,
            stakes=ctx.stakes,
        )
        return self._penalize_uncalibrated(decision)

    def record_outcome(
        self,
        model: str,
        task_type: str,
        blast_score: int,
        reward: float,
    ) -> None:
        """Record task outcome for learning.

        Feeds the reward table, the calibration tracker and the rules'
        per-model performance record; a reward above 0.0 counts as success.
        """
        tier = self._blast_tier(blast_score)
        self.rewards.record(model, task_type, tier, reward)
        # NOTE(review): the reward is passed for both trailing arguments of
        # CalibrationTracker.record — confirm against the tracker's signature.
        self.calibration.record(model, task_type, reward, reward)
        success = reward > 0.0
        rules_record(self.rules, model, task_type, success)

    def reload_rules(self) -> None:
        """Reload rules from disk (after Maggy self-update)."""
        self.rules = load_rules()

    def get_heatmap(self) -> list[dict]:
        """Return reward heatmap data for dashboard."""
        return self.rewards.heatmap()

    def _blast_tier(self, score: int) -> str:
        """Bucket a blast score: <=3 is low, <=6 is medium, otherwise high."""
        if score <= 3:
            return "low"
        if score <= 6:
            return "medium"
        return "high"

    def _is_calibrated(self, model: str) -> bool:
        """Return True when the accuracy is exactly 0.0 or meets the minimum."""
        acc = self.calibration.accuracy(model)
        return acc == 0.0 or acc >= MIN_CALIBRATION_ACCURACY

    def _forced_decision(
        self, model_name: str, ctx: RoutingContext,
    ) -> RoutingDecision:
        """Build decision from a rules override.

        An override naming an unknown tier falls back to blast-score routing.
        High blast (>= 8), security-sensitive or high-stakes work gets the
        "codex" tier as validator.
        """
        tier = _find_tier(model_name)
        if tier is None:
            return route_task(
                ctx.blast_score,
                ctx.task_type,
                ctx.security_sensitive,
                stakes=ctx.stakes,
            )
        validator = None
        if ctx.blast_score >= 8 or ctx.security_sensitive or ctx.stakes == "high":
            validator = _find_tier("codex")
        return RoutingDecision(
            primary=tier,
            validator=validator,
            fallback_chain=[],
            reason=f"Rule override: {ctx.task_type}"
            f"{f'/{ctx.pipeline_phase}' if ctx.pipeline_phase else ''}"
            f" → {model_name}",
        )

    def _penalize_uncalibrated(
        self, decision: RoutingDecision,
    ) -> RoutingDecision:
        """Promote the first fallback when the primary fails calibration.

        The decision is returned unchanged when the primary is calibrated or
        there is no fallback to promote.
        """
        if not self._is_calibrated(decision.primary.name):
            chain = decision.fallback_chain
            if chain:
                return RoutingDecision(
                    primary=chain[0],
                    validator=decision.validator,
                    fallback_chain=chain[1:],
                    reason="Calibration penalty",
                )
        return decision


def _find_tier(name: str):
    """Look up a ModelTier by name from defaults."""
    for t in DEFAULT_TIERS:
        if t.name == name:
            return t
    return None
MIN_CONFIDENCE = 0.6


@dataclass
class ModelOverride:
    """Force a specific model for a task type or phase."""

    model: str
    reason: str = ""
    confidence: float = 1.0
    source: str = "rule"


@dataclass
class PerformanceRecord:
    """Tracked model performance from outcomes."""

    strengths: list[str] = field(default_factory=list)
    weaknesses: list[str] = field(default_factory=list)
    tasks_completed: int = 0
    success_rate: float = 0.0


@dataclass
class Convention:
    """A team convention injected into prompts."""

    text: str
    applies_to: list[str] = field(default_factory=list)
    source: str = "manual"


@dataclass
class StakesLevel:
    """Patterns for a single stakes level."""

    file_patterns: list[str] = field(default_factory=list)
    task_types: list[str] = field(default_factory=list)
    keywords: list[str] = field(default_factory=list)


@dataclass
class StakesPatterns:
    """Stakes classification config — high/medium/low."""

    high: StakesLevel = field(default_factory=StakesLevel)
    medium: StakesLevel = field(default_factory=StakesLevel)
    low: StakesLevel = field(default_factory=StakesLevel)


@dataclass
class CascadePolicy:
    """Cascade execution policy."""

    enabled: bool = True
    min_blast: int = 5
    min_stakes: str = "medium"
    max_attempts: int = 3
    quality_threshold: int = 3


@dataclass
class RoutingRules:
    """All routing rules Maggy uses for orchestration."""

    version: int = 1
    updated_at: str = ""
    task_type_overrides: dict[str, ModelOverride] = field(
        default_factory=dict,
    )
    pipeline_phases: dict[str, ModelOverride] = field(
        default_factory=dict,
    )
    model_performance: dict[str, PerformanceRecord] = field(
        default_factory=dict,
    )
    conventions: list[Convention] = field(default_factory=list)
    project_conventions: dict[str, list[Convention]] = field(
        default_factory=dict,
    )
    stakes: StakesPatterns = field(default_factory=StakesPatterns)
    cascade: CascadePolicy = field(default_factory=CascadePolicy)


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def apply_override(
    rules: RoutingRules,
    task_type: str,
    phase: str | None = None,
) -> str | None:
    """Return model name if rules override routing.

    A pipeline-phase override is consulted first, then a task-type override.
    An override only applies when its confidence reaches ``MIN_CONFIDENCE``
    and its model is not the ``"auto"`` sentinel. ``"auto"`` means "no forced
    model" in either table, so it is never returned as a model name.

    Returns:
        The forced model name, or None when normal routing should proceed.
    """
    candidates = []
    if phase and phase in rules.pipeline_phases:
        candidates.append(rules.pipeline_phases[phase])
    if task_type in rules.task_type_overrides:
        candidates.append(rules.task_type_overrides[task_type])
    for override in candidates:
        if override.model != "auto" and _trusted(override):
            return override.model
    return None


def record_outcome(
    rules: RoutingRules,
    model: str,
    task_type: str,
    success: bool,
    path: Path | None = None,
) -> None:
    """Update performance data from a task outcome.

    Mutates ``rules`` in place (creating the model's record on first use),
    stamps ``updated_at`` and saves the rules to ``path``.
    """
    # Imported here because routing_rules_io refers back to this module.
    from maggy.routing_rules_io import save

    perf = rules.model_performance.get(model)
    if perf is None:
        perf = PerformanceRecord()
        rules.model_performance[model] = perf
    _update_perf(perf, task_type, success)
    rules.updated_at = _now_iso()
    save(rules, path)


def learn_override(
    rules: RoutingRules,
    task_type: str,
    model: str,
    reason: str,
    confidence: float = 0.7,
    path: Path | None = None,
) -> None:
    """Maggy learns a new routing override from data.

    Replaces any existing override for ``task_type`` with one whose source
    is ``"learned"``, stamps ``updated_at`` and saves the rules to ``path``.
    """
    from maggy.routing_rules_io import save

    rules.task_type_overrides[task_type] = ModelOverride(
        model=model,
        reason=reason,
        confidence=confidence,
        source="learned",
    )
    rules.updated_at = _now_iso()
    save(rules, path)


def conventions_for(
    rules: RoutingRules,
    task_type: str,
    project_key: str | None = None,
) -> str:
    """Return conventions text relevant to a task type.

    Global conventions come first, then those of ``project_key`` (if known).
    A convention applies when its ``applies_to`` contains ``"all"`` or the
    task type. Returns ``""`` when nothing applies.
    """
    all_convs = list(rules.conventions)
    if project_key and project_key in rules.project_conventions:
        all_convs.extend(rules.project_conventions[project_key])
    lines = [
        f"- {c.text}"
        for c in all_convs
        if "all" in c.applies_to or task_type in c.applies_to
    ]
    if not lines:
        return ""
    return "## Team Conventions\n" + "\n".join(lines)


def _trusted(override: ModelOverride) -> bool:
    """Return True when the override's confidence reaches MIN_CONFIDENCE."""
    return override.confidence >= MIN_CONFIDENCE


def _update_perf(
    perf: PerformanceRecord,
    task_type: str,
    success: bool,
) -> None:
    """Fold one outcome into ``perf``.

    Increments the task count, recomputes the running success rate (rounded
    to 3 decimals) and files the task type under strengths on success or
    weaknesses on failure, without duplicates.
    """
    total = perf.tasks_completed
    rate = perf.success_rate
    new_total = total + 1
    perf.tasks_completed = new_total
    perf.success_rate = round(
        (rate * total + (1.0 if success else 0.0)) / new_total, 3,
    )
    if success and task_type not in perf.strengths:
        perf.strengths.append(task_type)
    if not success and task_type not in perf.weaknesses:
        perf.weaknesses.append(task_type)
# Seed tables. Override rows are (model, reason, confidence, source);
# performance rows are (strengths, weaknesses, tasks_completed, success_rate).
_OVERRIDES = {
    "docs": ("claude", "Not prose-optimized", 0.9, "benchmark"),
    "security": ("claude", "Deep reasoning needed", 1.0, "rule"),
    "architecture": ("claude", "Cross-context awareness", 0.8, "rule"),
    "tests": ("claude", "Test generation", 0.9, "benchmark"),
    "planning": ("claude", "Structured reasoning", 0.8, "rule"),
}

_PHASES = {
    "spec": ("claude", "Comprehensive docs", 1.0, "rule"),
    "tdd_red": ("claude", "Test design expertise", 0.9, "rule"),
    "tdd_green": ("auto", "Blast-score routing", 1.0, "rule"),
    "review": ("claude", "Security+arch depth", 1.0, "rule"),
}

_PERF = {
    "claude": (["security", "tests", "docs", "architecture"], ["cost"], 6, 1.0),
    "codex": (["code_generation", "api_design", "bug", "feature"], ["docs"], 5, 1.0),
    "kimi": (["schema", "simple_tasks", "docs"], ["complex_reasoning"], 1, 1.0),
    "local": (["code_formatting", "simple_edits", "feature"], ["docs", "prose"], 1, 1.0),
}


def default_conventions() -> list[Convention]:
    """Build the seed conventions, each tagged with source "claude-bootstrap"."""
    seeded = []
    for text, scopes in _CONV_DATA:
        seeded.append(Convention(text=text, applies_to=scopes, source="claude-bootstrap"))
    return seeded


def default_stakes() -> StakesPatterns:
    """Build the seed high/medium/low stakes classification patterns."""
    high = StakesLevel(
        file_patterns=["auth", "billing", "payment", "migration",
                       "security", "deploy", "infra", ".env"],
        task_types=["security", "auth", "billing", "migration"],
        keywords=["production", "customer data", "breaking change"],
    )
    medium = StakesLevel(
        file_patterns=["api", "routes", "models", "schema", "database"],
        task_types=["feature", "refactor"],
    )
    low = StakesLevel(
        file_patterns=[],
        task_types=["docs", "formatting", "tests"],
    )
    return StakesPatterns(high=high, medium=medium, low=low)


def default_rules() -> RoutingRules:
    """Assemble the first-run rules from the seed tables in this module."""
    task_overrides = {}
    for task_type, (model, reason, confidence, source) in _OVERRIDES.items():
        task_overrides[task_type] = ModelOverride(
            model=model, reason=reason, confidence=confidence, source=source,
        )
    phase_overrides = {}
    for phase, (model, reason, confidence, source) in _PHASES.items():
        phase_overrides[phase] = ModelOverride(
            model=model, reason=reason, confidence=confidence, source=source,
        )
    performance = {}
    for model, (strengths, weaknesses, completed, rate) in _PERF.items():
        performance[model] = PerformanceRecord(
            strengths=strengths,
            weaknesses=weaknesses,
            tasks_completed=completed,
            success_rate=rate,
        )
    return RoutingRules(
        version=1,
        updated_at=_now_iso(),
        conventions=default_conventions(),
        stakes=default_stakes(),
        cascade=CascadePolicy(),
        task_type_overrides=task_overrides,
        pipeline_phases=phase_overrides,
        model_performance=performance,
    )
def save(rules: RoutingRules, path: Path | None = None) -> None:
    """Serialize ``rules`` to YAML at ``path`` (default: ``RULES_PATH``).

    Missing parent directories are created first.
    """
    destination = path if path else RULES_PATH
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = yaml.safe_dump(to_dict(rules), sort_keys=False)
    destination.write_text(payload)


def load(path: Path | None = None) -> RoutingRules:
    """Read rules from ``path`` (default: ``RULES_PATH``).

    A missing file is seeded with the default rules and written out. A file
    that loads without any conventions gets the default conventions and is
    written back.
    """
    from maggy.routing_rules_defaults import default_conventions, default_rules

    source = path if path else RULES_PATH
    if source.exists():
        loaded = from_yaml(source)
        if not loaded.conventions:
            loaded.conventions = default_conventions()
            save(loaded, source)
        return loaded

    seeded = default_rules()
    save(seeded, source)
    return seeded


def to_dict(rules: RoutingRules) -> dict:
    """Convert ``rules`` into the plain nested dict that is dumped as YAML."""

    def convention_row(conv) -> dict:
        return {
            "text": conv.text,
            "applies_to": conv.applies_to,
            "source": conv.source,
        }

    out: dict = {}
    out["version"] = rules.version
    out["updated_at"] = rules.updated_at
    out["stakes_patterns"] = _stakes_to_dict(rules.stakes)
    out["cascade_policy"] = _cascade_to_dict(rules.cascade)
    out["conventions"] = [convention_row(c) for c in rules.conventions]
    out["project_conventions"] = {
        project: [convention_row(c) for c in convs]
        for project, convs in rules.project_conventions.items()
    }
    out["task_type_overrides"] = {
        name: _override_to_dict(ov)
        for name, ov in rules.task_type_overrides.items()
    }
    out["pipeline_phases"] = {
        name: _override_to_dict(ov)
        for name, ov in rules.pipeline_phases.items()
    }
    out["model_performance"] = {
        name: _perf_to_dict(rec)
        for name, rec in rules.model_performance.items()
    }
    return out


def from_yaml(path: Path) -> RoutingRules:
    """Build a RoutingRules from the YAML document stored at ``path``.

    Absent or empty sections fall back to empty containers; an empty
    cascade section yields the default CascadePolicy.
    """
    from maggy.routing_rules import (
        CascadePolicy,
        Convention,
        ModelOverride,
        PerformanceRecord,
        RoutingRules,
    )

    doc = yaml.safe_load(path.read_text()) or {}

    def mapping(key: str) -> dict:
        return doc.get(key) or {}

    task_overrides = {
        name: ModelOverride(**spec)
        for name, spec in mapping("task_type_overrides").items()
    }
    phase_overrides = {
        name: ModelOverride(**spec)
        for name, spec in mapping("pipeline_phases").items()
    }
    performance = {
        name: PerformanceRecord(**spec)
        for name, spec in mapping("model_performance").items()
    }
    shared = [Convention(**item) for item in (doc.get("conventions") or [])]
    per_project: dict[str, list] = {
        project: [Convention(**item) for item in items]
        for project, items in mapping("project_conventions").items()
    }
    stakes = _stakes_from_dict(mapping("stakes_patterns"))
    cascade = CascadePolicy(**mapping("cascade_policy"))
    return RoutingRules(
        version=doc.get("version", 1),
        updated_at=doc.get("updated_at", ""),
        task_type_overrides=task_overrides,
        pipeline_phases=phase_overrides,
        model_performance=performance,
        conventions=shared,
        project_conventions=per_project,
        stakes=stakes,
        cascade=cascade,
    )


def _stakes_to_dict(stakes: StakesPatterns) -> dict:
    """Serialize the three stakes levels under "high"/"medium"/"low"."""
    return {
        tier: _level_to_dict(getattr(stakes, tier))
        for tier in ("high", "medium", "low")
    }


def _level_to_dict(level: StakesLevel) -> dict:
    """Serialize one stakes level's pattern lists."""
    return dict(
        file_patterns=level.file_patterns,
        task_types=level.task_types,
        keywords=level.keywords,
    )


def _cascade_to_dict(cascade: CascadePolicy) -> dict:
    """Serialize the cascade policy's fields."""
    return dict(
        enabled=cascade.enabled,
        min_blast=cascade.min_blast,
        min_stakes=cascade.min_stakes,
        max_attempts=cascade.max_attempts,
        quality_threshold=cascade.quality_threshold,
    )


def _override_to_dict(v: ModelOverride) -> dict:
    """Serialize one model override."""
    return dict(
        model=v.model,
        reason=v.reason,
        confidence=v.confidence,
        source=v.source,
    )


def _perf_to_dict(v: PerformanceRecord) -> dict:
    """Serialize one model's performance record."""
    return dict(
        strengths=v.strengths,
        weaknesses=v.weaknesses,
        tasks_completed=v.tasks_completed,
        success_rate=v.success_rate,
    )
SCHEMA = """
CREATE TABLE IF NOT EXISTS rewards (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    model TEXT NOT NULL,
    task_type TEXT NOT NULL,
    blast_tier TEXT NOT NULL,
    reward REAL NOT NULL,
    recorded_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_rewards_lookup
    ON rewards(model, task_type, blast_tier);
"""

MIN_SAMPLES = 5
DECAY_RATE = 0.95


@contextmanager
def _connect(path: Path) -> Iterator[sqlite3.Connection]:
    """Yield a connection to the database at ``path``, closing it afterwards.

    Creates the parent directory if needed, switches the database to WAL
    journaling, sets a 30 s busy timeout and returns rows as ``sqlite3.Row``.
    Nothing is committed on exit; callers commit explicitly.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA busy_timeout=30000")
    conn.row_factory = sqlite3.Row
    try:
        yield conn
    finally:
        conn.close()


class RewardTable:
    """SQLite-backed reward table with time decay."""

    def __init__(self, cfg: MaggyConfig):
        # The score database sits next to the configured storage file.
        db_dir = Path(cfg.storage.path).expanduser().parent
        self._db_path = db_dir / "model_scores.db"
        self._init_db()

    def _init_db(self) -> None:
        """Create the rewards table and lookup index if they do not exist."""
        with _connect(self._db_path) as conn:
            conn.executescript(SCHEMA)

    def record(
        self,
        model: str,
        task_type: str,
        blast_tier: str,
        reward: float,
    ) -> None:
        """Record a reward observation, timestamped with the current UTC time."""
        now = datetime.now(timezone.utc).isoformat()
        with _connect(self._db_path) as conn:
            conn.execute(
                "INSERT INTO rewards "
                "(model, task_type, blast_tier, "
                "reward, recorded_at) "
                "VALUES (?, ?, ?, ?, ?)",
                (model, task_type, blast_tier, reward, now),
            )
            conn.commit()

    def best_model(
        self,
        task_type: str,
        blast_tier: str,
    ) -> str | None:
        """Return best model, or None if insufficient data.

        Each reward is multiplied by ``DECAY_RATE ** age_in_days`` and the
        decayed rewards are averaged per model over that model's sample
        count. Only models with at least ``MIN_SAMPLES`` observations for
        this task type and blast tier are eligible.
        """
        with _connect(self._db_path) as conn:
            rows = conn.execute(
                "SELECT model, reward, recorded_at "
                "FROM rewards "
                "WHERE task_type = ? AND blast_tier = ?",
                (task_type, blast_tier),
            ).fetchall()
        if not rows:
            return None

        # Timestamps are written in UTC, so the age must be measured against
        # the UTC date as well. Comparing with the local date makes fresh rows
        # look like they come from tomorrow west of UTC.
        today = datetime.now(timezone.utc).date()
        scores: dict[str, tuple[float, int]] = {}
        for row in rows:
            recorded = datetime.fromisoformat(row["recorded_at"])
            if recorded.tzinfo is not None:
                recorded = recorded.astimezone(timezone.utc)
            # Clamp at zero: a negative age (clock skew) would otherwise give
            # a weight above 1 and inflate the reward instead of decaying it.
            age_days = max((today - recorded.date()).days, 0)
            weighted = row["reward"] * (DECAY_RATE ** age_days)
            total, count = scores.get(row["model"], (0.0, 0))
            scores[row["model"]] = (total + weighted, count + 1)

        candidates = {
            model: total / count
            for model, (total, count) in scores.items()
            if count >= MIN_SAMPLES
        }
        if not candidates:
            return None
        return max(candidates, key=candidates.get)

    def heatmap(self) -> list[dict]:
        """Return reward averages for dashboard.

        One dict per (model, task_type, blast_tier) group with the plain,
        undecayed average reward rounded to 3 decimals and the sample count.
        """
        with _connect(self._db_path) as conn:
            rows = conn.execute(
                "SELECT model, task_type, blast_tier, "
                "AVG(reward) as avg_reward, "
                "COUNT(*) as n "
                "FROM rewards "
                "GROUP BY model, task_type, blast_tier",
            ).fetchall()
        return [
            {
                "model": r["model"],
                "task_type": r["task_type"],
                "blast_tier": r["blast_tier"],
                "avg_reward": round(r["avg_reward"], 3),
                "samples": r["n"],
            }
            for r in rows
        ]
# Home-directory folder name -> (provider, command that re-authenticates).
_PROVIDERS = {
    ".claude": ("anthropic", "claude auth login"),
    ".codex": ("openai", "codex auth login"),
}


@dataclass
class AccountProfile:
    """Represents a CLI auth profile."""

    name: str
    provider: str
    auth_command: str
    is_active: bool = False


def detect_accounts(home: Path | None = None) -> list[AccountProfile]:
    """List a profile for every known CLI folder that exists under ``home``.

    ``home`` defaults to the current user's home directory. The profile name
    is the folder name without its leading dot(s).
    """
    base = home if home else Path.home()
    return [
        AccountProfile(
            name=folder.lstrip("."),
            provider=provider,
            auth_command=command,
        )
        for folder, (provider, command) in _PROVIDERS.items()
        if (base / folder).exists()
    ]


def suggest_switch(provider: str) -> str:
    """Return the text telling the user how to switch ``provider`` accounts.

    Unknown providers get a generic re-authentication hint.
    """
    guides = {
        "anthropic": (
            "Claude quota hit. Switch account:\n"
            " claude auth login\n"
            "Then restart your session."
        ),
        "openai": (
            "OpenAI/Codex quota hit. Switch account:\n"
            " codex auth login\n"
            "Then restart your session."
        ),
    }
    try:
        return guides[provider]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which simply compared
        # unequal to both provider names in an if-chain.
        return f"Quota hit for {provider}. Re-authenticate."
def render_switch_guide(provider: str) -> None: """Print Rich-formatted switch instructions.""" guide = suggest_switch(provider) console.print(f"[yellow]{guide}[/yellow]") ================================================ FILE: maggy/maggy/services/activity.py ================================================ """CLI activity scanner — detects running sessions and recent prompts.""" from __future__ import annotations import json import logging import re import subprocess from dataclasses import asdict, dataclass from datetime import datetime, timezone from pathlib import Path logger = logging.getLogger(__name__) @dataclass class ActiveSession: """A currently running CLI session.""" cli: str session_id: str project: str project_path: str status: str # "running" | "agent" last_prompt: str agent_name: str team_name: str pid: int @dataclass class RecentPrompt: """A recent user prompt from CLI history.""" cli: str text: str project: str timestamp: str session_id: str class ActivityService: """Scans CLI histories and processes.""" def get_activity(self) -> dict: sessions = _scan_processes() prompts = _recent_prompts() return { "sessions": [asdict(s) for s in sessions], "recent": [asdict(p) for p in prompts], } # ── Process scanning ────────────────────────────── def _scan_processes() -> list[ActiveSession]: """Find running claude/codex/kimi processes.""" try: result = subprocess.run( ["ps", "aux"], capture_output=True, text=True, timeout=5, ) lines = result.stdout.splitlines() except (subprocess.SubprocessError, OSError): return [] return _parse_claude_processes( [line for line in lines if "claude" in line.lower()], ) def _parse_claude_processes( lines: list[str], ) -> list[ActiveSession]: """Parse ps aux lines for Claude CLI sessions.""" sessions: list[ActiveSession] = [] for line in lines: if not _is_cli_process(line): continue pid = _extract_pid(line) if not pid: continue cwd = _get_cwd(pid) project = Path(cwd).name if cwd else "" agent = _extract_flag(line, 
"--agent-name") team = _extract_flag(line, "--team-name") status = "agent" if agent else "running" sessions.append(ActiveSession( cli="claude", session_id="", project=project, project_path=cwd, status=status, last_prompt="", agent_name=agent, team_name=team, pid=pid, )) return sessions def _is_cli_process(line: str) -> bool: """Filter real CLI processes from app helpers.""" lower = line.lower() if "claude.app" in lower: return False if "grep" in lower: return False if "claude helper" in lower: return False return bool(re.search( r'(?:^|/|\s)claude\s+--', line, )) def _extract_pid(line: str) -> int: """Extract PID from ps aux line.""" parts = line.split() if len(parts) >= 2: try: return int(parts[1]) except ValueError: pass return 0 def _extract_flag(line: str, flag: str) -> str: """Extract --flag value from command line.""" idx = line.find(flag) if idx < 0: return "" rest = line[idx + len(flag):].strip() if not rest: return "" return rest.split()[0] if rest else "" def _get_cwd(pid: int) -> str: """Get working directory of a process (macOS).""" try: result = subprocess.run( ["lsof", "-p", str(pid), "-Fn"], capture_output=True, text=True, timeout=3, ) for line in result.stdout.splitlines(): if line.startswith("n") and "/" in line: path = line[1:] if Path(path).is_dir(): return path except (subprocess.SubprocessError, OSError): pass return "" # ── History scanning ────────────────────────────── def _recent_prompts( claude_dir: Path | None = None, codex_dir: Path | None = None, kimi_dir: Path | None = None, limit: int = 15, ) -> list[RecentPrompt]: """Read recent prompts from all CLI histories.""" home = Path.home() c_dir = claude_dir or (home / ".claude") x_dir = codex_dir or (home / ".codex") k_dir = kimi_dir or (home / ".kimi") prompts: list[RecentPrompt] = [] prompts.extend(_read_claude_history(c_dir)) prompts.extend(_read_codex_history(x_dir)) prompts.extend(_read_kimi_history(k_dir)) prompts.sort(key=lambda p: p.timestamp, reverse=True) return prompts[:limit] 
def _read_claude_history( claude_dir: Path, ) -> list[RecentPrompt]: """Parse ~/.claude/history.jsonl.""" path = claude_dir / "history.jsonl" if not path.exists(): return [] prompts: list[RecentPrompt] = [] try: for line in _tail_lines(path, 50): try: entry = json.loads(line) except json.JSONDecodeError: continue text = entry.get("display", "") if not text: continue ts = entry.get("timestamp", 0) project = entry.get("project", "") prompts.append(RecentPrompt( cli="claude", text=text[:200], project=Path(project).name if project else "", timestamp=_ms_to_iso(ts), session_id=entry.get("sessionId", ""), )) except OSError: pass return prompts def _read_codex_history( codex_dir: Path, ) -> list[RecentPrompt]: """Parse ~/.codex/history.jsonl.""" path = codex_dir / "history.jsonl" if not path.exists(): return [] prompts: list[RecentPrompt] = [] try: for line in _tail_lines(path, 50): try: entry = json.loads(line) except json.JSONDecodeError: continue text = entry.get("text", "") if not text: continue ts = entry.get("ts", 0) prompts.append(RecentPrompt( cli="codex", text=text[:200], project="", timestamp=_s_to_iso(ts), session_id=entry.get("session_id", ""), )) except OSError: pass return prompts def _read_kimi_history( kimi_dir: Path, ) -> list[RecentPrompt]: """Parse ~/.kimi/user-history/*.jsonl.""" hist_dir = kimi_dir / "user-history" if not hist_dir.is_dir(): return [] prompts: list[RecentPrompt] = [] try: for f in sorted( hist_dir.glob("*.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True, )[:3]: mtime = datetime.fromtimestamp( f.stat().st_mtime, tz=timezone.utc, ).isoformat() for line in _tail_lines(f, 10): try: entry = json.loads(line) except json.JSONDecodeError: continue text = entry.get("content", "") if text: prompts.append(RecentPrompt( cli="kimi", text=text[:200], project="", timestamp=mtime, session_id=f.stem, )) except OSError: pass return prompts # ── Helpers ─────────────────────────────────────── def _tail_lines(path: Path, n: int) -> list[str]: 
"""Read last N non-empty lines from a file.""" try: lines = path.read_text().splitlines() return [line for line in lines if line.strip()][-n:] except OSError: return [] def _ms_to_iso(ms: int | float) -> str: """Convert milliseconds epoch to ISO string.""" if not ms: return "" try: dt = datetime.fromtimestamp( ms / 1000, tz=timezone.utc, ) return dt.isoformat() except (ValueError, OSError): return "" def _s_to_iso(s: int | float) -> str: """Convert seconds epoch to ISO string.""" if not s: return "" try: dt = datetime.fromtimestamp(s, tz=timezone.utc) return dt.isoformat() except (ValueError, OSError): return "" ================================================ FILE: maggy/maggy/services/ai_client.py ================================================ """AI client — uses API key or falls back to CLI subscription.""" from __future__ import annotations import asyncio import logging import shutil logger = logging.getLogger(__name__) async def ai_complete( prompt: str, cfg, model: str = "", ) -> str | None: """Get AI completion. 
Tries API key, then CLI.""" target_model = model or cfg.ai.model if cfg.ai.api_key: return await _api_complete( prompt, cfg.ai.api_key, target_model, ) if shutil.which("claude"): return await _cli_complete(prompt, "claude") if shutil.which("codex"): return await _cli_complete(prompt, "codex") return None async def _api_complete( prompt: str, api_key: str, model: str, ) -> str | None: """Call Anthropic API directly.""" try: import anthropic client = anthropic.AsyncAnthropic(api_key=api_key) msg = await client.messages.create( model=model, max_tokens=2000, messages=[{"role": "user", "content": prompt}], ) return msg.content[0].text except Exception as e: logger.warning("API completion failed: %s", e) return None async def _cli_complete( prompt: str, cli: str, ) -> str | None: """Call AI via CLI subscription (claude/codex).""" try: process = await asyncio.create_subprocess_exec( cli, "-p", prompt, "--output-format", "text", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await asyncio.wait_for( process.communicate(), timeout=120, ) if process.returncode == 0: return stdout.decode().strip() logger.warning( "%s CLI failed (rc=%d): %s", cli, process.returncode, stderr.decode()[:200], ) except asyncio.TimeoutError: logger.warning("%s CLI timed out", cli) except OSError as e: logger.warning("%s CLI not available: %s", cli, e) return None ================================================ FILE: maggy/maggy/services/cascade.py ================================================ """Cascade execution — quality-gate-based model escalation. Try cheapest model first, evaluate output quality, escalate to next tier if quality gate fails. Max 3 attempts. 
logger = logging.getLogger(__name__)


@dataclass
class CascadeAttempt:
    """Record of a single cascade attempt."""

    model: str
    success: bool
    score: int = 0
    output: str = ""
    cost_usd: float = 0.0


@dataclass
class CascadeResult:
    """Result of cascade execution."""

    model: str
    output: str
    attempts: list[CascadeAttempt] = field(default_factory=list)
    escalated: bool = False
    cost_usd: float = 0.0


async def cascade_execute(
    pi: PiAdapter,
    chain: list[str],
    prompt: str,
    wd: str,
    quality_gate: Callable[[str], int],
    max_attempts: int = 3,
    quality_threshold: int = 3,
) -> CascadeResult:
    """Try cheapest model, escalate on quality gate failure.

    Models are tried in ``chain`` order. The first output whose gate score
    reaches ``quality_threshold`` is returned immediately. If none does, the
    best-scoring successful attempt is returned (the earliest one on ties).

    Args:
        pi: Adapter whose ``send_prompt(model, prompt, wd)`` runs one model.
        chain: Model names, cheapest first.
        prompt: Prompt sent to each model.
        wd: Working directory passed through to the adapter.
        quality_gate: Scores an output; may be a plain function or a
            coroutine function (an awaitable result is awaited).
        max_attempts: Upper bound on models tried (default 3).
        quality_threshold: Minimum score that stops escalation (default 3).

    Returns:
        A CascadeResult listing every attempt. ``cost_usd`` is the cost of
        the returned attempt only. When every attempt failed, ``model`` and
        ``output`` are empty strings.
    """
    import inspect

    attempts: list[CascadeAttempt] = []
    best: CascadeAttempt | None = None
    limit = min(len(chain), max(max_attempts, 0))
    for index, model in enumerate(chain[:limit]):
        result = await pi.send_prompt(model, prompt, wd)
        cost = getattr(result, "cost_usd", 0.0)
        if not result.success:
            attempts.append(CascadeAttempt(model, False))
            logger.info("Cascade: %s failed, escalating", model)
            continue

        score = quality_gate(result.output)
        if inspect.isawaitable(score):
            # The annotation promises a plain callable, but async gates are
            # supported too; awaiting a bare int would raise TypeError.
            score = await score
        attempt = CascadeAttempt(model, True, score, result.output, cost)
        attempts.append(attempt)
        # Start from None rather than a score-0 placeholder so that a
        # successful attempt scoring 0 is still kept as the fallback output.
        if best is None or score > best.score:
            best = attempt
        if score >= quality_threshold:
            return CascadeResult(
                model, result.output, attempts,
                escalated=index > 0, cost_usd=cost,
            )
        logger.info(
            "Cascade: %s scored %d, escalating", model, score,
        )

    if best is None:
        best = CascadeAttempt("", False)
    return CascadeResult(
        best.model, best.output, attempts,
        escalated=len(attempts) > 1, cost_usd=best.cost_usd,
    )
MAX_QUEUE = 5


def _utc_timestamp() -> str:
    """Current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ChatMessage:
    """One entry of a chat transcript."""

    role: str  # "user" | "assistant"
    content: str
    timestamp: str = field(default_factory=_utc_timestamp)


@dataclass
class ChatSession:
    """State of one interactive Claude Code session."""

    id: str
    claude_session_id: str
    project_key: str
    working_dir: str
    messages: list[ChatMessage] = field(default_factory=list)
    status: str = "idle"
    created_at: str = field(default_factory=_utc_timestamp)
    pid: int = 0
    history_context: str = ""
    # Bounded backlog of messages that arrived while a reply was streaming.
    pending_queue: deque = field(
        default_factory=lambda: deque(maxlen=MAX_QUEUE),
    )


def enqueue_msg(session: ChatSession, message: str) -> int:
    """Queue ``message`` on the session.

    Returns the 1-based queue position of the message, or -1 when the
    queue already holds ``MAX_QUEUE`` entries (nothing is added then).
    """
    backlog = session.pending_queue
    if len(backlog) < MAX_QUEUE:
        backlog.append(message)
        return len(backlog)
    return -1
async def send(
    self, session_id: str, message: str,
) -> AsyncGenerator[dict, None]:
    """Deliver ``message`` to a session and yield the response events.

    If the session is idle, the message is streamed right away, followed
    by anything queued in the meantime. If a reply is already streaming,
    the message is queued and a single ``queued`` event (or an ``error``
    event when the queue is full) is yielded.

    Raises:
        ValueError: if ``session_id`` is unknown.
    """
    session = self._sessions.get(session_id)
    if not session:
        raise ValueError(f"Session {session_id} not found")

    session_lock = self._locks.setdefault(
        session_id, asyncio.Lock(),
    )

    if not session_lock.locked():
        async with session_lock:
            async for event in stream_message(session, message):
                yield event
            async for event in self._drain_queue(session):
                yield event
        return

    # Busy: park the message for the stream that currently owns the lock.
    position = enqueue_msg(session, message)
    if position < 0:
        yield {"type": "error", "content": "Queue full."}
    else:
        yield {"type": "queued", "position": position}
Path(c.path).expanduser().resolve() for c in self.cfg.codebases ] for root in roots: try: candidate.relative_to(root) return str(candidate) except ValueError: continue raise ValueError( f"Path {path!r} is not inside any configured " f"codebase. Allowed: {[str(r) for r in roots]}" ) def _resolve_project(self, project_key: str) -> str: """Map project_key to validated working directory.""" for cb in self.cfg.codebases: if cb.key == project_key: path = Path(cb.path).expanduser().resolve() return str(path) raise ValueError( f"Project '{project_key}' not found in codebases" ) ================================================ FILE: maggy/maggy/services/chat_context.py ================================================ """Chat context builder — resolves history and session IDs. Handles the three context gaps: 1. Path-based history matching (not just project name) 2. Recent prompt injection from activity data 3. Claude session_id lookup for true --resume """ from __future__ import annotations import json import logging from pathlib import Path logger = logging.getLogger(__name__) def build_project_context( history, working_dir: str, project_key: str, recent_prompts: list[dict], ) -> str: """Build full context string for a project.""" parts = [] hist = _match_history(history, working_dir, project_key) if hist: parts.append(hist) prompts = _format_recent_prompts(recent_prompts, project_key) if prompts: parts.append(prompts) return "\n\n".join(parts) def _match_history( history, working_dir: str, project_key: str, ) -> str: """Match history using report data (path-aware).""" if not history: return "" report = history.get_report() if report: return _match_from_report( report, working_dir, project_key, ) return "" def _match_from_report( report: dict, working_dir: str, project_key: str, ) -> str: """Match project in the aggregated history report.""" projects = report.get("projects", []) if not projects: return "" candidates = _path_candidates(working_dir, project_key) matched = [ p 
def resolve_claude_session_id(
    working_dir: str,
) -> str:
    """Find the latest Claude session_id for a project.

    Reads ~/.claude/history.jsonl from the end and returns the
    ``sessionId`` of the most recent entry whose ``project`` equals
    ``working_dir`` (trailing slashes ignored).

    Returns "" when the file is missing or unreadable, or when no entry
    matches. Lines that are blank, not valid JSON, or not JSON objects
    are skipped.
    """
    history_path = Path.home() / ".claude" / "history.jsonl"
    try:
        # errors="replace": one badly encoded line must not hide every
        # other entry. A missing file surfaces as OSError as well.
        raw = history_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""

    target = working_dir.rstrip("/")
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    for line in reversed(raw.split("\n")):
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except ValueError:  # includes json.JSONDecodeError
            continue
        if not isinstance(entry, dict):
            continue
        project = entry.get("project") or ""
        if not isinstance(project, str) or not project:
            continue
        if project.rstrip("/") == target:
            sid = entry.get("sessionId") or ""
            if isinstance(sid, str) and sid:
                return sid
    return ""
"docstring", "comment", "spec", "jsdoc", "write", }), "tests": frozenset({ "test", "spec", "coverage", "mock", "fixture", "assert", "pytest", "jest", "vitest", }), "frontend": frozenset({ "component", "css", "style", "ui", "layout", "responsive", "tailwind", "react", "vue", }), } DEFAULT_BLAST = 5 _RETRIEVAL = re.compile( r"\b(find|get|show|check|where|list|read|look|grab|pick)\b", re.IGNORECASE, ) _MUTATION = re.compile( r"\b(create|add|build|implement|write|refactor|migrate" r"|redesign|overhaul|deploy)\b", re.IGNORECASE, ) def estimate_blast(message: str) -> int: """Estimate blast score (1-10) from message text.""" if not message.strip(): return DEFAULT_BLAST words = set(re.findall(r"[a-zA-Z]+", message.lower())) has_kw = words & (HIGH_KEYWORDS | MID_KEYWORDS | LOW_KEYWORDS) if len(words) <= 3 and not has_kw: return 1 high = len(words & HIGH_KEYWORDS) mid = len(words & MID_KEYWORDS) low = len(words & LOW_KEYWORDS) score = _keyword_score(high, mid, low) return _apply_intent(message, score) def _keyword_score(high: int, mid: int, low: int) -> int: """Score based on keyword tier counts.""" if high >= 2: return min(9, 7 + high - 2) if high == 1: return 7 if low >= 2 and mid == 0: return 2 if low >= 1 and mid == 0: return 3 if mid >= 2: return 6 if mid >= 1: return 5 return 1 def _apply_intent(message: str, score: int) -> int: """Cap score for retrieval-only messages.""" is_retrieval = bool(_RETRIEVAL.search(message)) is_mutation = bool(_MUTATION.search(message)) if is_retrieval and not is_mutation and score < 7: return min(score, 3) return score def estimate_type(message: str) -> str: """Estimate task type from message keywords.""" words = set(re.findall(r"[a-zA-Z]+", message.lower())) best_type = "general" best_count = 0 for ttype, keywords in TYPE_KEYWORDS.items(): count = len(words & keywords) if count > best_count: best_count = count best_type = ttype return best_type @dataclass class RouteDecision: """Result of routing a chat message.""" model: str reason: str 
class RoutedChat:
    """Picks a model for a chat message via the blast-score router."""

    def __init__(self, routing, budget):
        # routing must expose route(ctx); budget is only stored here.
        self._routing = routing
        self._budget = budget

    def decide(
        self,
        message: str,
        blast_override: int | None = None,
        type_override: str | None = None,
    ) -> RouteDecision:
        """Return the routing decision for ``message``.

        A truthy override replaces the corresponding estimate; a falsy
        one (``None``, ``0``, ``""``) falls back to the estimate.
        """
        blast = blast_override if blast_override else estimate_blast(message)
        task_type = type_override if type_override else estimate_type(message)

        routed = self._routing.route(
            RoutingContext(blast_score=blast, task_type=task_type),
        )
        return RouteDecision(
            model=self._model_name(routed.primary),
            reason=routed.reason,
            blast=blast,
            task_type=task_type,
        )

    def _model_name(self, primary) -> str:
        """Normalise a routed model to its name string."""
        if not isinstance(primary, str):
            return str(getattr(primary, "name", primary))
        return primary
def parse_chunk(
    text: str, session: ChatSession,
) -> dict | None:
    """Parse a stream-json line from Claude.

    Args:
        text: One stripped line of CLI output.
        session: Session whose ``claude_session_id`` is filled in from the
            first event that carries a ``session_id``.

    Returns:
        * ``{"type": "text", ...}`` for assistant events and for any line
          that is not a JSON object (plain text, or JSON scalars/arrays),
        * ``{"type": "result", ...}`` for the final result event, with
          ``cost_usd`` / token counts when present,
        * ``None`` for every other event type.
    """
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return {"type": "text", "content": text}
    # "42", "null" or '"x"' are valid JSON but not events; treat them like
    # any other raw output line instead of failing on the dict operations.
    if not isinstance(data, dict):
        return {"type": "text", "content": text}

    if "session_id" in data and not session.claude_session_id:
        session.claude_session_id = data["session_id"]

    msg_type = data.get("type", "")
    if msg_type == "assistant":
        return _extract_assistant(data)
    if msg_type == "result":
        content = data.get("result")
        if not isinstance(content, str):
            content = "" if content is None else str(content)
        chunk: dict = {"type": "result", "content": content}
        cost = data.get("cost_usd")
        if cost is not None:
            chunk["cost_usd"] = float(cost)
        usage = data.get("usage")
        if isinstance(usage, dict):
            chunk["input_tokens"] = int(usage.get("input_tokens") or 0)
            chunk["output_tokens"] = int(usage.get("output_tokens") or 0)
        return chunk
    return None


def _extract_assistant(data: dict) -> dict:
    """Extract text from assistant message.

    Concatenates the ``text`` of every ``{"type": "text"}`` content block.
    Non-dict blocks and a missing or non-dict ``message`` are ignored. A
    non-list ``content`` is returned as its string form.
    """
    message = data.get("message")
    content = message.get("content", "") if isinstance(message, dict) else ""
    if isinstance(content, list):
        parts = [
            str(block.get("text") or "")
            for block in content
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        return {"type": "text", "content": "".join(parts)}
    return {"type": "text", "content": str(content)}
async def stream_message(
    session: ChatSession, message: str,
) -> AsyncGenerator[dict, None]:
    """Run a single message through Claude CLI.

    Records the user message on the session, launches the CLI in the
    session's working directory (with ``CLAUDECODE`` removed from the
    environment) and yields one chunk per parsed output line. When the
    run ends, the concatenated chunk contents are stored as the assistant
    message.

    Yields:
        An optional context-pressure warning first, then parsed chunks;
        an ``{"type": "error", ...}`` chunk if the CLI is missing or the
        run fails.

    The child process is killed and reaped if the consumer stops
    iterating early or reading fails, so no CLI process is left behind.
    """
    from maggy.services.chat import ChatMessage

    session.messages.append(
        ChatMessage(role="user", content=message),
    )
    session.status = "streaming"
    pressure = check_context_pressure(session)
    if pressure:
        yield pressure

    cmd = build_cmd(session, message)
    response_text = ""
    proc = None

    try:
        env = {
            k: v for k, v in os.environ.items()
            if k != "CLAUDECODE"
        }
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            # stderr is merged in, so non-JSON lines do reach parse_chunk.
            stderr=asyncio.subprocess.STDOUT,
            cwd=session.working_dir,
            env=env,
        )
        session.pid = proc.pid or 0

        async for line in proc.stdout:
            text = line.decode("utf-8", errors="replace").strip()
            if not text:
                continue
            chunk = parse_chunk(text, session)
            if chunk:
                response_text += chunk.get("content", "")
                yield chunk

        await proc.wait()
        session.status = "idle"
    except FileNotFoundError:
        session.status = "error"
        yield {"type": "error", "content": "claude CLI not found"}
    except Exception as e:
        session.status = "error"
        yield {"type": "error", "content": str(e)}
    finally:
        # Reached with a live process when the generator is closed early
        # (e.g. client disconnect) or an error interrupted reading.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # already gone
            await proc.wait()
        if session.status == "streaming":
            session.status = "idle"

    if response_text:
        session.messages.append(
            ChatMessage(role="assistant", content=response_text),
        )
@dataclass
class Checkpoint:
    """Model-agnostic task checkpoint."""

    goal: str = ""
    constraints: list[str] = field(default_factory=list)
    progress: list[str] = field(default_factory=list)
    working_state: str = ""
    file_context: list[str] = field(default_factory=list)
    source_model: str = ""
    created_at: str = ""  # ISO-8601 UTC; filled in by serialize() if empty

    def serialize(self) -> str:
        """Serialize to JSON for storage/transfer.

        Stamps ``created_at`` with the current UTC time on first use, so
        this method mutates the instance once.
        """
        if not self.created_at:
            self.created_at = datetime.now(
                timezone.utc
            ).isoformat()
        return json.dumps(asdict(self), indent=2)

    @classmethod
    def deserialize(cls, data: str) -> Checkpoint:
        """Reconstruct from JSON.

        Keys that are not fields of this class are ignored, so a
        checkpoint written by a newer version still loads.

        Raises:
            json.JSONDecodeError: if ``data`` is not valid JSON.
            TypeError: if the JSON document is not an object.
        """
        d = json.loads(data)
        if not isinstance(d, dict):
            raise TypeError("checkpoint JSON must be an object")
        known = cls.__dataclass_fields__
        return cls(**{k: v for k, v in d.items() if k in known})

    def to_prompt(self) -> str:
        """Format as a structured prompt for the new model.

        Empty sections are omitted; at most 10 key files are listed.
        """
        parts = [
            "## Task Checkpoint (from previous model session)",
            f"**Goal:** {self.goal}",
        ]
        if self.constraints:
            parts.append("**Constraints:**")
            for c in self.constraints:
                parts.append(f" - {c}")
        if self.progress:
            parts.append("**Progress so far:**")
            for p in self.progress:
                parts.append(f" - {p}")
        if self.working_state:
            parts.append(
                f"**Current state:** {self.working_state}"
            )
        if self.file_context:
            parts.append("**Key files:**")
            for f in self.file_context[:10]:
                parts.append(f" - {f}")
        parts.append(
            "\nPlease confirm you understand this context "
            "before proceeding."
        )
        return "\n".join(parts)


def create_checkpoint(
    goal: str,
    progress: list[str],
    model: str,
    working_state: str = "",
    files: list[str] | None = None,
    constraints: list[str] | None = None,
) -> Checkpoint:
    """Create a checkpoint from current session state.

    ``files`` and ``constraints`` default to empty lists; ``model`` is
    recorded as the checkpoint's ``source_model``.
    """
    return Checkpoint(
        goal=goal,
        constraints=constraints or [],
        progress=progress,
        working_state=working_state,
        file_context=files or [],
        source_model=model,
    )
""" if not raw: return None # feedparser exposes parsed tuple when it can try: from email.utils import parsedate_to_datetime dt = parsedate_to_datetime(raw) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) return dt.astimezone(timezone.utc) except (TypeError, ValueError): pass # Fall through: try ISO 8601 (atom feeds, Google News sometimes) try: dt = datetime.fromisoformat(raw.replace("Z", "+00:00")) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) return dt.astimezone(timezone.utc) except (TypeError, ValueError): return None def _is_safe_feed_url(url: str) -> bool: """Reject RSS URLs that would let an attacker hit internal services. Blocks non-HTTP(S), bare hostnames without scheme, and any host whose resolved IPs include loopback, link-local, private, or multicast ranges. Prevents SSRF via AI-discovered or user-edited competitor registry. """ try: parsed = urlparse(url) except Exception: return False if parsed.scheme not in ("http", "https"): return False host = (parsed.hostname or "").strip().lower() if not host or host in ("localhost",): return False # Block bare IP strings that are themselves private try: ip = ipaddress.ip_address(host) return not (ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_multicast or ip.is_reserved or ip.is_unspecified) except ValueError: pass # Hostname: resolve and check every returned address try: infos = socket.getaddrinfo(host, None) except socket.gaierror: return False for info in infos: addr = info[4][0] try: ip = ipaddress.ip_address(addr.split("%")[0]) # strip scope id on v6 except ValueError: return False if (ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_multicast or ip.is_reserved or ip.is_unspecified): return False return True class CompetitorService: def __init__(self, cfg: MaggyConfig): self.cfg = cfg self.competitors_path = Path(cfg.storage.path).expanduser().parent / "competitors.json" self.db_path = Path(cfg.storage.path).expanduser() self._init_db() def _init_db(self) -> 
def load_registry(self) -> dict[str, dict]:
    """Load the competitor registry from ``self.competitors_path``.

    Returns:
        The parsed registry. An empty dict is returned when the file does
        not exist, cannot be read or parsed, or does not contain a JSON
        object; the last two cases are logged as warnings.
    """
    path = self.competitors_path
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception as exc:
        # Still best-effort, but no longer silent: an unreadable registry
        # otherwise looks exactly like "no competitors yet".
        logger.warning("Could not load competitor registry %s: %s", path, exc)
        return {}
    if not isinstance(data, dict):
        # Callers iterate .items()/.values(); anything else would crash them.
        logger.warning(
            "Competitor registry %s is not a JSON object; ignoring it", path,
        )
        return {}
    return data
def list_all(self) -> list[dict]:
    """Return every competitor record from the registry, in stored order."""
    registry = self.load_registry()
    return [record for record in registry.values()]
Called by heartbeat or on-demand.""" registry = self.load_registry() rss_new = 0 news_new = 0 for cid, comp in registry.items(): try: rss_new += await self._check_rss(cid, comp) except Exception as e: logger.debug("RSS %s: %s", cid, e) try: news_new += await self._check_google_news(cid, comp) except Exception as e: logger.debug("News %s: %s", cid, e) return {"rss": rss_new, "news": news_new, "total_competitors": len(registry)} def _get_cursor(self, key: str) -> str: with _connect_sqlite(self.db_path) as db: row = db.execute("SELECT cursor FROM feed_cursors WHERE feed_key = ?", (key,)).fetchone() return row[0] if row else "" def _set_cursor(self, key: str, cursor: str) -> None: with _connect_sqlite(self.db_path) as db: db.execute( "INSERT INTO feed_cursors (feed_key, cursor) VALUES (?, ?) " "ON CONFLICT(feed_key) DO UPDATE SET cursor = excluded.cursor", (key, cursor), ) def _classify(self, title: str) -> str: t = title.lower() if any(w in t for w in ["launch", "release", "introduces", "announces new", "ships"]): return "feature_launch" if any(w in t for w in ["pricing", "price", "cost", "free tier"]): return "pricing_change" if any(w in t for w in ["funding", "raises", "series", "valuation", "investment"]): return "funding" if any(w in t for w in ["acquir", "acquisition", "merge", "bought"]): return "acquisition" if any(w in t for w in ["partner", "integration with", "teams up"]): return "partnership" return "news" def _log_event(self, competitor_id: str, competitor_name: str, event_type: str, title: str, url: str, source: str) -> None: # Deterministic ID so the same article logged twice (cursor reset, # overlapping scans) becomes a no-op instead of a duplicate row. 
async def _check_rss(self, cid: str, comp: dict) -> int:
    """Fetch a competitor's blog RSS feed and log entries newer than the cursor.

    Args:
        cid: Competitor id; also used to build the cursor key ``rss:<cid>``.
        comp: Registry record; the feed URL is read from
            ``comp["social"]["blog_rss"]``.

    Returns:
        Number of entries passed to ``_log_event`` in this run. 0 when no
        feed URL is configured, the URL fails the safety check, the HTTP
        status is >= 400, or fetching/parsing raises.
    """
    rss_url = (comp.get("social") or {}).get("blog_rss")
    if not rss_url:
        return 0
    # The URL may come from AI discovery or manual edits; vet it first.
    if not _is_safe_feed_url(rss_url):
        logger.warning("Skipping unsafe RSS URL for %s: %s", cid, rss_url)
        return 0
    cursor_key = f"rss:{cid}"
    last_cursor = self._get_cursor(cursor_key)
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.get(rss_url)
            if resp.status_code >= 400:
                return 0
            feed = feedparser.parse(resp.text)
    except Exception:
        # Best-effort: any fetch or parse failure counts as "nothing new".
        return 0
    # The cursor is persisted as an ISO-8601 string (see _set_cursor call
    # below) and parsed back to a datetime here; all comparisons below are
    # made on datetimes, not on the stored strings.
    last_cursor_dt = _parse_feed_date(last_cursor) if last_cursor else None
    new_items = 0
    latest_dt = last_cursor_dt
    # Only the first 10 entries of the feed are examined.
    for entry in feed.entries[:10]:
        pub_raw = entry.get("published", entry.get("updated", ""))
        pub_dt = _parse_feed_date(pub_raw)
        # Skip entries already seen (we have a cursor AND the entry's parsed date is <= cursor).
        # Entries without a parseable date are never skipped here, so they
        # are passed to _log_event (and counted in the return value) on every run.
        if pub_dt and last_cursor_dt and pub_dt <= last_cursor_dt:
            continue
        title = entry.get("title", "")
        link = entry.get("link", "")
        # Track the newest publication date seen to advance the cursor.
        if pub_dt and (latest_dt is None or pub_dt > latest_dt):
            latest_dt = pub_dt
        self._log_event(cid, comp.get("name", cid), "blog_post", f"{comp.get('name','')}: {title}", link, "rss")
        new_items += 1
    # Persist the cursor only when it actually moved.
    if latest_dt and latest_dt != last_cursor_dt:
        self._set_cursor(cursor_key, latest_dt.isoformat())
    return new_items
list[dict]: with _connect_sqlite(self.db_path) as db: db.row_factory = sqlite3.Row rows = db.execute( "SELECT * FROM competitor_news ORDER BY created_at DESC LIMIT ?", (limit,), ).fetchall() return [dict(r) for r in rows] # ── Daily briefing (cached per day) ────────────────────────────────── async def get_daily_briefing(self, refresh: bool = False) -> dict: today = date.today().isoformat() if not refresh: with _connect_sqlite(self.db_path) as db: row = db.execute( "SELECT summary, signal_count, generated_at FROM briefing_cache WHERE date = ?", (today,), ).fetchone() if row: return {"date": today, "summary": row[0], "total_signals": row[1], "generated_at": row[2]} # Regenerate news = self.get_news(limit=80) if not news: return {"date": today, "summary": "No competitor news yet. Run a scan first.", "total_signals": 0} digest = [f"[{n['event_type']}] {n['competitor_name']}: {n['title']}" for n in news[:50]] domain = self.cfg.org.domain or "our domain" prompt = f"""You are the competitive intelligence analyst for {self.cfg.org.name} in the {domain} space. Write a daily competitive landscape briefing for {today}. Structure: 1. **Top Signals Today** — 3-5 most important moves (acquisitions, launches, partnerships) with specific competitor names 2. **Market Trends** — patterns across multiple signals (AI adoption, consolidation, pricing shifts) 3. **Implications for {self.cfg.org.name}** — 2-3 specific, actionable takeaways Be specific with competitor names and facts. No generic advice. Under 250 words. 
Signals ({len(digest)} total): {chr(10).join(digest)}""" try: summary = await ai_complete(prompt, self.cfg) if not summary: return {"date": today, "summary": "No AI provider available for briefing.", "total_signals": len(news)} except Exception as e: return {"date": today, "summary": f"Failed to generate briefing: {e}", "total_signals": len(news)} generated_at = datetime.now(timezone.utc).isoformat() with _connect_sqlite(self.db_path) as db: db.execute( "INSERT INTO briefing_cache (date, summary, signal_count, generated_at) VALUES (?, ?, ?, ?) " "ON CONFLICT(date) DO UPDATE SET summary = excluded.summary, signal_count = excluded.signal_count, generated_at = excluded.generated_at", (today, summary, len(news), generated_at), ) return {"date": today, "summary": summary, "total_signals": len(news), "generated_at": generated_at} ================================================ FILE: maggy/maggy/services/context_compactor.py ================================================ """Context compactor — summarize old messages to fit context window. When conversation length exceeds 80% of the model's context window, old messages are summarized into a single system message while keeping the most recent messages intact. 
""" from __future__ import annotations import logging from dataclasses import dataclass from typing import Awaitable, Callable logger = logging.getLogger(__name__) COMPACT_THRESHOLD = 0.80 CHARS_PER_TOKEN = 4 SummarizerFn = Callable[[str], Awaitable[str]] @dataclass class CompactionResult: messages: list[dict] tokens_saved: int = 0 summary: str = "" def estimate_tokens(messages: list[dict]) -> int: """Rough token estimate based on char count / 4.""" total = sum(len(m.get("content", "")) for m in messages) return total // CHARS_PER_TOKEN def should_compact(messages: list[dict], context_window: int) -> bool: """Check if messages exceed 80% of context window.""" tokens = estimate_tokens(messages) return tokens > int(context_window * COMPACT_THRESHOLD) async def compact( messages: list[dict], keep_recent: int = 6, summarizer: SummarizerFn | None = None, ) -> CompactionResult: """Summarize old messages, keep recent ones.""" if len(messages) <= keep_recent: return CompactionResult(messages=messages) old = messages[:-keep_recent] recent = messages[-keep_recent:] old_text = _format_for_summary(old) old_tokens = estimate_tokens(old) try: if summarizer is None: return CompactionResult(messages=messages) summary = await summarizer(old_text) except Exception as exc: logger.debug("Compaction failed: %s", exc) return CompactionResult(messages=messages) summary_msg = {"role": "system", "content": summary} new_tokens = estimate_tokens([summary_msg]) return CompactionResult( messages=[summary_msg, *recent], tokens_saved=max(0, old_tokens - new_tokens), summary=summary, ) def _format_for_summary(messages: list[dict]) -> str: """Format messages into text for summarization.""" parts: list[str] = [] for m in messages: role = m.get("role", "unknown") content = m.get("content", "")[:500] parts.append(f"{role}: {content}") return "\n".join(parts) ================================================ FILE: maggy/maggy/services/convention_inferrer.py 
================================================ """LLM-based dynamic convention inference from project fingerprint. Collects filesystem signals (file tree, config snippets, git log) and sends them to a cheap/local model to infer project-specific conventions that the static rule table doesn't cover. """ from __future__ import annotations import logging import re import subprocess from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.adapters.pi import PiAdapter from maggy.routing_rules import Convention, RoutingRules logger = logging.getLogger(__name__) MAX_CONVENTIONS = 10 MAX_FINGERPRINT = 4000 FALLBACK_MODELS = ["local", "kimi"] SKIP_DIRS = frozenset({ ".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build", ".next", ".cache", ".tox", ".mypy_cache", ".ruff_cache", "egg-info", }) CONFIG_FILES = [ "pyproject.toml", "package.json", "Makefile", "docker-compose.yml", "Dockerfile", "tsconfig.json", ".env.example", "Cargo.toml", "go.mod", "Gemfile", "mix.exs", "build.gradle", "pom.xml", ] PROMPT_TEMPLATE = ( "Analyze this project and list its development conventions.\n" "Each convention must be one line starting with '- '.\n" "Focus on: build tools, test runners, deployment, migrations,\n" "package managers, CI/CD, linting, coding patterns.\n" "Be specific — mention exact commands and tool names.\n" "Max 10 conventions. 
No explanations, just the list.\n\n" "{fingerprint}" ) def collect_fingerprint(working_dir: str) -> str: """Build compact project fingerprint for LLM analysis.""" root = Path(working_dir) parts = [_file_tree(root), _config_snippets(root), _git_log(root)] return "\n".join(p for p in parts if p)[:MAX_FINGERPRINT] def parse_conventions(text: str) -> list[Convention]: """Extract '- convention' lines from LLM response.""" from maggy.routing_rules import Convention as Conv convs: list[Conv] = [] for line in text.splitlines(): m = re.match(r"^-\s+(.{5,200})$", line.strip()) if m: convs.append(Conv(m.group(1).strip(), ["all"], "llm-inferred")) if len(convs) >= MAX_CONVENTIONS: break return convs async def infer_conventions( pi: PiAdapter, working_dir: str, ) -> list[Convention]: """Send fingerprint to LLM, parse conventions from response.""" fp = collect_fingerprint(working_dir) if len(fp.strip()) < 20: return [] prompt = PROMPT_TEMPLATE.format(fingerprint=fp) for model in FALLBACK_MODELS: result = await pi.send_prompt(model, prompt, working_dir, max_turns=1, timeout=60) if result.success and result.output.strip(): return parse_conventions(result.output) logger.debug("Inference failed on %s: %s", model, result.error) return [] async def ensure_inferred( rules: RoutingRules, project_key: str, working_dir: str, pi: PiAdapter, ) -> None: """Run LLM inference if not already cached for this project.""" if not project_key: return existing = rules.project_conventions.get(project_key, []) if any(c.source == "llm-inferred" for c in existing): return try: convs = await infer_conventions(pi, working_dir) except Exception as exc: logger.warning("Convention inference failed: %s", exc) return if not convs: return existing_texts = {c.text for c in existing} new = [c for c in convs if c.text not in existing_texts] rules.project_conventions.setdefault(project_key, []).extend(new) def _file_tree(root: Path) -> str: """List files/dirs to depth 2, excluding noise.""" lines = ["## Project 
Files"] try: for p in sorted(root.iterdir()): if p.name in SKIP_DIRS or p.name.startswith("."): continue lines.append(p.name + ("/" if p.is_dir() else "")) if p.is_dir(): for child in sorted(p.iterdir()): if child.name in SKIP_DIRS: continue lines.append(f" {child.name}") except OSError: pass return "\n".join(lines[:80]) def _config_snippets(root: Path) -> str: """Read first 300 chars of known config files.""" parts: list[str] = [] for name in CONFIG_FILES: path = root / name if path.is_file(): try: text = path.read_text(errors="ignore")[:300] parts.append(f"## {name}\n{text}") except OSError: continue return "\n".join(parts) def _git_log(root: Path) -> str: """Recent commit messages via git log --oneline -10.""" if not (root / ".git").is_dir(): return "" try: r = subprocess.run( ["git", "log", "--oneline", "-10"], cwd=root, capture_output=True, text=True, timeout=5, ) if r.returncode == 0 and r.stdout.strip(): return f"## Recent Commits\n{r.stdout.strip()}" except (OSError, subprocess.TimeoutExpired): pass return "" ================================================ FILE: maggy/maggy/services/convention_scanner.py ================================================ """Declarative filesystem scanner for project-specific conventions. Scans a project directory for config files, lock files, and directory structures to auto-detect tooling conventions (e.g. supabase vs alembic, npm vs pnpm, pytest vs jest). """ from __future__ import annotations import re from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.routing_rules import Convention, RoutingRules @dataclass class ScanRule: """A filesystem marker that implies a convention.""" marker: str convention: str applies_to: list[str] = field(default_factory=lambda: ["all"]) content_match: str = "" is_dir: bool = False RULES: list[ScanRule] = [ ScanRule( "supabase/migrations", is_dir=True, convention="Use `supabase db push` for migrations. 
RLS policies required.", ), ScanRule( "alembic.ini", convention="Use `alembic revision --autogenerate` for schema changes.", ), ScanRule( "package-lock.json", convention="Package manager: npm. Use `npm install`, not yarn/pnpm.", ), ScanRule( "pnpm-lock.yaml", convention="Package manager: pnpm. Use `pnpm install`, not npm/yarn.", ), ScanRule( "yarn.lock", convention="Package manager: yarn. Use `yarn add`, not npm/pnpm.", ), ScanRule( "pyproject.toml", content_match=r"\[tool\.ruff\]", convention="Linter: ruff. Run `ruff check .` before committing.", ), ScanRule( "pyproject.toml", content_match=r"\[tool\.pytest", convention="Testing: pytest. Run `pytest` for tests.", applies_to=["feature", "bug", "all"], ), ScanRule( "pytest.ini", convention="Testing: pytest. Run `pytest` for tests.", applies_to=["feature", "bug", "all"], ), ScanRule( "docker-compose.yml", convention="Use Docker Compose for local services. `docker compose up -d`.", ), ScanRule( ".github/workflows", is_dir=True, convention="CI: GitHub Actions. Check workflow status before merging.", ), ScanRule( "Makefile", convention="Project uses Make. Check `make help` for available targets.", ), ScanRule( "tailwind.config.js", convention="Styling: Tailwind CSS. Use utility classes, not custom CSS.", applies_to=["feature"], ), ScanRule( "tailwind.config.ts", convention="Styling: Tailwind CSS. 
Use utility classes, not custom CSS.", applies_to=["feature"], ), ] def scan_project(working_dir: str) -> list[Convention]: """Scan project directory, return detected conventions.""" from maggy.routing_rules import Convention as Conv root = Path(working_dir) found: list[Conv] = [] seen: set[str] = set() for rule in RULES: if not _matches(root, rule): continue if rule.convention in seen: continue seen.add(rule.convention) found.append(Conv(rule.convention, list(rule.applies_to), "auto-detected")) return found def ensure_scanned( rules: RoutingRules, project_key: str, working_dir: str, ) -> None: """Scan project if not already cached in rules.""" if project_key in rules.project_conventions: return convs = scan_project(working_dir) rules.project_conventions[project_key] = convs def _matches(root: Path, rule: ScanRule) -> bool: """Check if a scan rule matches the project directory.""" target = root / rule.marker if rule.is_dir: return target.is_dir() if not target.is_file(): return False if not rule.content_match: return True try: text = target.read_text(errors="ignore")[:4096] return bool(re.search(rule.content_match, text)) except OSError: return False ================================================ FILE: maggy/maggy/services/executor.py ================================================ from __future__ import annotations import asyncio import logging import uuid from datetime import datetime, timezone from pathlib import Path from maggy.adapters.pi import PiAdapter, RunResult from maggy.budget import BudgetManager from maggy.checkpoint import CheckpointManager from maggy.config import MaggyConfig from maggy.coordination.lock_manager import LockManager from maggy.escalation.protocol import Escalator from maggy.mnemos.fatigue import FatigueTracker from maggy.mnemos.signals import SignalLog from maggy.providers.base import IssueTrackerProvider from maggy.recovery.rollback import RollbackManager from maggy.routing import RoutingService from maggy.services import 
executor_helpers as H from maggy.services import executor_prompts as P from maggy.services.executor_types import SessionCtx, StepSpec from maggy.services.planner import DualPlanner logger = logging.getLogger(__name__) class ExecutorService: def __init__(self, cfg: MaggyConfig, provider: IssueTrackerProvider, status_cb=None): self.cfg, self.provider = cfg, provider self._pi = PiAdapter() self._routing = RoutingService(cfg) self._budget = BudgetManager(cfg) self._sessions: dict[str, dict] = {} self._bg_tasks: set[asyncio.Task] = set() db = Path(cfg.storage.path).expanduser().parent self._fatigue = FatigueTracker() self._signals = SignalLog(db / "signals.jsonl") self._locks = LockManager(db / "locks.db") self._rollback = RollbackManager() self._checkpoint = CheckpointManager(db / "checkpoints") self._escalator = Escalator(db / "escalations.db") self._planner, self._status_cb = DualPlanner(self._pi), status_cb async def start(self, task_id: str, mode: str = "tdd", working_dir: str | None = None) -> str: if mode not in ("tdd", "plan"): raise ValueError(f"Unknown mode {mode!r}") task = await self.provider.get_task(task_id) if not task: raise ValueError(f"Task {task_id} not found") wd = H.resolve_working_dir(self.cfg, working_dir, task) sid = uuid.uuid4().hex[:10] self._sessions[sid] = dict( id=sid, task_id=task_id, task_title=task.title, mode=mode, working_dir=wd, status="running", started_at=datetime.now(timezone.utc).isoformat(), output="") self._locks.acquire(wd, sid) ctx = SessionCtx(self._sessions[sid], task, wd) bg = asyncio.create_task(self._run(ctx, mode)) self._bg_tasks.add(bg) bg.add_done_callback(self._bg_tasks.discard) return sid def get_session(self, sid: str) -> dict | None: return self._sessions.get(sid) def list_sessions(self) -> list[dict]: return list(self._sessions.values()) async def _run(self, ctx: SessionCtx, mode: str) -> None: try: from maggy.services.convention_inferrer import ensure_inferred from maggy.services.convention_scanner import 
ensure_scanned pk = str(ctx.task.raw.get("project_key", "")) ensure_scanned(self._routing.rules, pk, ctx.wd) await ensure_inferred(self._routing.rules, pk, ctx.wd, self._pi) ctx.icpg = await H.build_icpg_context(self.cfg, ctx.task) await (self._run_plan(ctx) if mode == "plan" else self._run_tdd(ctx)) except Exception as e: logger.exception("Execution failed") ctx.session["status"], ctx.session["error"] = "failed", str(e) finally: self._locks.release_all(ctx.session["id"]) self._checkpoint.delete(ctx.task.id.replace("/", "-")) async def _run_plan(self, ctx: SessionCtx) -> None: result = await self._run_model(ctx, P.plan_prompt(ctx.task, ctx.icpg, self._routing), 5) ctx.session["output"] = result.output[:10000] ctx.session["status"] = "completed" if result.success else "failed" if not result.success: ctx.session["error"] = result.output[:500] elif result.output: await H.post_plan(self.provider, ctx.task.id, result.output) async def _run_tdd(self, ctx: SessionCtx) -> None: if H.blast_score(ctx.task) >= 7: await self._dual_plan(ctx) prompt = P.analysis_prompt(ctx.task, ctx.icpg, self._routing) ok, analysis = await self._reviewed_step(ctx, StepSpec("ANALYZE", prompt, 5)) if not ok: return prompt = P.tests_prompt(ctx.task, ctx.icpg, analysis, self._routing) ok, _ = await self._reviewed_step(ctx, StepSpec("WRITE TESTS", prompt, 15)) if not ok: return if not await self._verify_red(ctx): return await H.save_rollback(self._rollback, ctx.session["id"], ctx.wd) prompt = P.impl_prompt(ctx.task, ctx.icpg, self._routing) ok, _ = await self._reviewed_step(ctx, StepSpec("IMPLEMENT", prompt, 25)) if not ok: await H.try_rollback(self._rollback, ctx.session["id"], ctx.wd) H.maybe_escalate(self._escalator, ctx.session, ctx.task) return if not await self._verify_green(ctx): await H.try_rollback(self._rollback, ctx.session["id"], ctx.wd) return ctx.session["status"] = "completed" ctx.session["completed_at"] = datetime.now(timezone.utc).isoformat() async def _reviewed_step(self, ctx: 
SessionCtx, step: StepSpec) -> tuple[bool, str]: for attempt in range(2): ok, output = await self._run_step(ctx, step) if not ok: return ok, output if await self._review_step(ctx, step, output): return True, output if attempt == 0: ctx.session["output"] += f"\n--- RETRY {step.label} ---\n" ctx.session.update(status="failed", error=f"Review gate failed for {step.label}") return False, output async def _run_step(self, ctx: SessionCtx, step: StepSpec) -> tuple[bool, str]: result = await self._run_model(ctx, step.prompt, step.max_turns) ctx.session["output"] += f"\n=== {step.label} ===\n{result.output[:2000]}\n" H.log_signal(self._signals, ctx.session["id"], step.label, result) if not result.success: ctx.session["status"] = "failed" return result.success, result.output async def _review_step(self, ctx: SessionCtx, step: StepSpec, output: str) -> bool: from maggy.services.output_reviewer import review_output review = await review_output(self._pi, step.label, output, ctx.wd) ctx.session["output"] += f"\n--- REVIEW {step.label}: {review.score}/5 ---\n" return review.score >= 3 async def _run_model(self, ctx: SessionCtx, prompt: str, turns: int) -> RunResult: decision = H.route_model(ctx.task, self._routing) name = H.model_name(decision.primary) H.write_checkpoint(self._checkpoint, ctx.task, name) self._emit_status(name, "running") result = await self._send(decision, name, prompt, ctx) self._emit_status(name, "done") if result.model != name and (e := self._pi.get_model(result.model)): self._fatigue.on_model_switch(e.context_window) H.track_fatigue(self._fatigue, result) if result.cost_usd > 0 or result.input_tokens > 0: self._budget.record_spend( decision.primary.provider, result.model, result.cost_usd, result.input_tokens, result.output_tokens) return result async def _send(self, decision, name, prompt, ctx): cascade = self._routing.rules.cascade if not cascade.enabled or H.blast_score(ctx.task) < cascade.min_blast: return await self._pi.send_with_fallback(name, prompt, 
ctx.wd) from maggy.services.cascade import cascade_execute from maggy.services.output_reviewer import review_output chain = [name] + decision.fallback_chain async def gate(output: str) -> int: return (await review_output(self._pi, "CASCADE", output, ctx.wd)).score cr = await cascade_execute(self._pi, chain, prompt, ctx.wd, gate) return RunResult(model=cr.model, success=bool(cr.output), output=cr.output, cost_usd=cr.cost_usd) def _emit_status(self, agent: str, status: str) -> None: if self._status_cb: self._status_cb({"type": "agent_status", "agent": agent, "status": status}) async def _verify_red(self, ctx: SessionCtx) -> bool: from maggy.services.tdd_verifier import verify_tests_exist, verify_tests_fail for check, prefix in [(verify_tests_exist, "RED: no tests"), (verify_tests_fail, "RED")]: r = await check(ctx.wd) if not r.passed: ctx.session["status"], ctx.session["error"] = "failed", f"{prefix}: {r.detail}" return False ctx.session["output"] += f"\n=== RED ===\n{r.detail}\n" return True async def _verify_green(self, ctx: SessionCtx) -> bool: from maggy.services.tdd_verifier import verify_coverage, verify_lint, verify_tests_pass if not (green := await verify_tests_pass(ctx.wd)).passed: ctx.session["status"], ctx.session["error"] = "failed", f"GREEN: {green.detail}" return False for label, check in [("LINT", verify_lint), ("COVERAGE", verify_coverage)]: if not (r := await check(ctx.wd)).passed: ctx.session["output"] += f"\n=== {label} ===\n{r.detail}\n" ctx.session["output"] += "\n=== VALIDATE ===\nPassed\n" return True async def _dual_plan(self, ctx: SessionCtx) -> None: try: r = await self._planner.dual_plan(ctx.task.title, ctx.task.description[:1500], ctx.wd) ctx.session.update(dual_plan=r.primary_plan[:2000], plan_conflicts=r.conflicts or []) except Exception as exc: logger.warning("DualPlanner failed: %s", exc) ================================================ FILE: maggy/maggy/services/executor_helpers.py ================================================ 
"""Executor helpers — routing, rollback, fatigue, iCPG.""" from __future__ import annotations import asyncio import logging from typing import TYPE_CHECKING from maggy.adapters.pi import RunResult from maggy.mnemos.fatigue import FatigueTracker from maggy.mnemos.signals import SignalLog from maggy.process.model_router import RoutingDecision from maggy.routing import RoutingContext, RoutingService if TYPE_CHECKING: from maggy.checkpoint import CheckpointManager from maggy.config import MaggyConfig from maggy.escalation.protocol import Escalator from maggy.providers.base import Task from maggy.recovery.rollback import RollbackManager logger = logging.getLogger(__name__) def route_model(task: Task, routing: RoutingService) -> RoutingDecision: """Pick the best model for a task via routing rules.""" from maggy.services.stakes import classify_stakes raw = task.raw if isinstance(task.raw, dict) else {} task_type = str(raw.get("task_type") or _task_type(task)) stakes = classify_stakes(task).level return routing.route( RoutingContext( blast_score=int_value(raw.get("blast_score")), task_type=task_type, security_sensitive=_security_flag(raw, task_type), project_key=str(raw.get("project_key") or task.board), stakes=stakes, ), ) def blast_score(task: Task) -> int: """Extract blast score from task metadata.""" raw = task.raw if isinstance(task.raw, dict) else {} return int_value(raw.get("blast_score")) def int_value(value: object) -> int: """Safely convert to int, default 0.""" try: return int(value) except (TypeError, ValueError): return 0 def model_name(primary: object) -> str: """Extract model name string from routing decision.""" if isinstance(primary, str): return primary return str(primary.name) def track_fatigue(fatigue: FatigueTracker, result: RunResult) -> None: """Record context load from result output length.""" load = min(len(result.output) / 50_000, 1.0) fatigue.record("context_load", load) def log_signal(signals: SignalLog, sid: str, label: str, result: RunResult) 
-> None: """Append step signal to log.""" signals.append({ "session_id": sid, "step": label, "model": result.model, "success": result.success, }) def write_checkpoint( checkpoint: "CheckpointManager", task: Task, model: str, ) -> None: """Write execution checkpoint for crash recovery.""" checkpoint.write(task.id.replace("/", "-"), { "goal": task.title, "model_history": [model], "current_subgoal": "executing", }) async def save_rollback( rollback: "RollbackManager", sid: str, wd: str, ) -> None: """Create git savepoint before implementation.""" try: await rollback.create_savepoint(sid, wd) except Exception as exc: logger.warning("Savepoint failed: %s", exc) async def try_rollback( rollback: "RollbackManager", sid: str, wd: str, ) -> None: """Revert to last savepoint on failure.""" try: await rollback.rollback(sid, wd) except Exception as exc: logger.warning("Rollback failed: %s", exc) def maybe_escalate( escalator: "Escalator", session: dict, task: Task, ) -> None: """Escalate after 3+ consecutive failures.""" failures = session.get("_fail_count", 0) + 1 session["_fail_count"] = failures if failures >= 3: escalator.escalate( session["id"], "repeated_failure", {"task_id": task.id, "failures": failures}, ) async def build_icpg_context(cfg: "MaggyConfig", task: Task) -> str: """Query iCPG CLI for code intelligence context.""" bp = cfg.resolve_bootstrap_path() if not bp or not (bp / "scripts" / "icpg" / "__main__.py").exists(): return "" from maggy.services.executor_prompts import extract_keywords kw = extract_keywords(f"{task.title} {task.description}") if not kw: return "" try: proc = await asyncio.create_subprocess_exec( "python3", "-m", "scripts.icpg", "--project", str(bp), "query", "prior", "--text", " ".join(kw[:8]), "--limit", "8", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=str(bp)) stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10) if proc.returncode != 0: return "" text = (stdout or b"").decode("utf-8", 
errors="replace").strip() except (asyncio.TimeoutError, FileNotFoundError, OSError): return "" if not text: return "" return ("## iCPG Code Intelligence\n" "Pre-queried from Maggy's intent code property graph:\n\n" + text[:2000] + "\n\n**Use this to target your file reads.**") def resolve_working_dir(cfg: "MaggyConfig", requested: str | None, task: "Task") -> str: """Resolve working_dir inside configured codebases.""" from pathlib import Path if not cfg.codebases: raise ValueError("No codebases configured") roots = [Path(c.path).expanduser().resolve() for c in cfg.codebases] if requested: candidate = Path(requested).expanduser().resolve() for root in roots: try: candidate.relative_to(root) return str(candidate) except ValueError: continue raise ValueError(f"working_dir {requested!r} not inside codebases") return pick_working_dir(cfg, task) def pick_working_dir(cfg: "MaggyConfig", task: "Task") -> str: """Match task keywords to configured codebases.""" from pathlib import Path cbs = cfg.codebases if len(cbs) == 1: return str(Path(cbs[0].path).expanduser().resolve()) text = f"{task.title} {task.description} {task.board}".lower() best_key, best_score = cbs[0].key, 0 for cb in cbs: score = 5 if cb.key.lower() in text else 0 name = Path(cb.path).name.lower() if name != cb.key.lower() and name in text: score += 3 if score > best_score: best_key, best_score = cb.key, score picked = next(c for c in cbs if c.key == best_key) return str(Path(picked.path).expanduser().resolve()) async def post_plan(provider, task_id: str, output: str) -> None: """Post plan as comment to issue tracker.""" try: await provider.add_comment( task_id, f"## Maggy Plan\n\n{output[:4000]}", ) except Exception as e: logger.warning("Failed to post plan: %s", e) def _task_type(task: "Task") -> str: return task.labels[0] if task.labels else "general" def _security_flag(raw: dict, task_type: str) -> bool: if "security_sensitive" in raw: return bool(raw["security_sensitive"]) return task_type in 
{"security", "auth", "billing"} ================================================ FILE: maggy/maggy/services/executor_prompts.py ================================================ """Executor prompt templates for TDD pipeline steps.""" from __future__ import annotations import re from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.providers.base import Task from maggy.routing import RoutingService from maggy.routing_rules import conventions_for STOP = frozenset({ "the", "and", "for", "to", "in", "of", "a", "is", "with", "on", "from", "be", "as", "by", "an", "or", "not", "all", "that", "this", "are", "can", "should", "would", "when", "how", "what", "where", "which", "we", "need", "also", "been", "has", "have", "it", "its", "new", "add", "fix", "update", "create", "delete", "get", "set", "use", }) def plan_prompt(task: Task, icpg_ctx: str, routing: RoutingService) -> str: conv = _conventions_block(task, routing) return ( "Create an implementation plan for this ticket. " "No code changes — just a plan.\n\n" f"Ticket: {task.title}\n{task.description[:1500]}" f"{_icpg_block(icpg_ctx)}{conv}\n" "Output: numbered steps, files to touch, risks, tests." ) def analysis_prompt(task: Task, icpg_ctx: str, routing: RoutingService) -> str: conv = _conventions_block(task, routing) return ( "Analyze this ticket against the codebase and output " "a concise plan.\nIdentify: files to change, functions " "affected, tests needed, risks.\n\n" f"Ticket: {task.title}\n{task.description[:1500]}" f"{_icpg_block(icpg_ctx)}{conv}" ) def tests_prompt( task: Task, icpg_ctx: str, analysis: str, routing: RoutingService, ) -> str: conv = _conventions_block(task, routing) return ( "Write failing test cases for this ticket " "(TDD — no implementation yet).\n" "Use the project's existing test patterns. 
" "Commit tests separately.\n\n" f"Ticket: {task.title}\n{task.description[:1500]}" f"{_icpg_block(icpg_ctx)}{conv}\n" f"Analysis:\n{analysis[:1000]}" ) def impl_prompt(task: Task, icpg_ctx: str, routing: RoutingService) -> str: conv = _conventions_block(task, routing) return ( "Implement the feature to make the failing tests pass.\n" "Follow existing code patterns. Keep changes minimal.\n\n" f"Ticket: {task.title}\n{task.description[:1500]}" f"{_icpg_block(icpg_ctx)}{conv}\n" "Run tests to verify, then commit with a conventional " "commit message." ) def extract_keywords(text: str) -> list[str]: """Extract unique keywords from text, filtering stop words.""" words = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", text.lower()) seen: set[str] = set() result: list[str] = [] for w in words: if w in STOP or len(w) < 3 or w in seen: continue seen.add(w) result.append(w) return result[:20] def _icpg_block(icpg_ctx: str) -> str: if not icpg_ctx: return "" return f"\n\n{icpg_ctx}\n" def _task_type(task: Task) -> str: if task.labels: return task.labels[0] return "general" def _conventions_block(task: Task, routing: RoutingService) -> str: raw = task.raw if isinstance(task.raw, dict) else {} task_type = str(raw.get("task_type") or _task_type(task)) project_key = str(raw.get("project_key") or "") text = conventions_for(routing.rules, task_type, project_key or None) if not text: return "" return f"\n\n{text}\n" ================================================ FILE: maggy/maggy/services/executor_types.py ================================================ """Executor shared types — context and step descriptors.""" from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.providers.base import Task @dataclass class SessionCtx: """Bundles session state, task, and working dir for executor.""" session: dict task: Task wd: str icpg: str = "" @dataclass class StepSpec: """Describes a single TDD pipeline step.""" label: str 
prompt: str max_turns: int ================================================ FILE: maggy/maggy/services/inbox.py ================================================ """AI-prioritized inbox — ranks tasks by urgency, OKR alignment, and age. Works with any IssueTrackerProvider. Caches ranking for 30 minutes in SQLite. """ from __future__ import annotations import json import logging import sqlite3 from datetime import datetime, timezone from pathlib import Path from maggy.config import MaggyConfig from maggy.services.ai_client import ai_complete from maggy.providers.base import IssueTrackerProvider, Task logger = logging.getLogger(__name__) CACHE_TTL_SECONDS = 30 * 60 # 30 min def _connect_sqlite(path: Path) -> sqlite3.Connection: """Open a SQLite connection with sensible defaults for concurrent use. FastAPI serves requests concurrently, and the heartbeat worker writes from a different thread. WAL lets readers and writers coexist; foreign_keys enforces referential integrity; busy_timeout avoids 'database is locked' errors under contention. Matches the convention used by scripts/icpg/store.py. 
class InboxService:
    """AI-prioritised task inbox with a single-row SQLite cache.

    Open tasks are fetched from ``provider``, ranked by the AI completion
    helper, and the latest ranking is stored in the ``inbox_cache`` table so
    repeated reads do not hit the provider or the model until the cache
    entry is older than ``CACHE_TTL_SECONDS``.
    """

    def __init__(self, cfg: MaggyConfig, provider: IssueTrackerProvider):
        """Remember config/provider and make sure the cache database exists."""
        self.cfg = cfg
        self.provider = provider
        self.db_path = Path(cfg.storage.path).expanduser()
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _init_db(self) -> None:
        """Create the ``inbox_cache`` table if it does not exist yet."""
        db = _connect_sqlite(self.db_path)
        try:
            with db:  # transaction scope: commit on success, rollback on error
                db.execute("""
                    CREATE TABLE IF NOT EXISTS inbox_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        cached_at TEXT NOT NULL,
                        payload TEXT NOT NULL
                    )
                """)
        finally:
            # The sqlite3 context manager only ends the transaction; it does
            # not close the connection, so close it explicitly.
            db.close()

    def _read_cache(self, ignore_ttl: bool = False) -> list[dict] | None:
        """Return the cached ranking, or None when there is nothing usable.

        Args:
            ignore_ttl: When True, return the cached payload even if it is
                older than ``CACHE_TTL_SECONDS`` (used for the stale fallback).

        Returns:
            The decoded payload, or None if the table is empty, the entry has
            expired, or the stored row cannot be decoded.
        """
        db = _connect_sqlite(self.db_path)
        try:
            row = db.execute(
                "SELECT cached_at, payload FROM inbox_cache ORDER BY id DESC LIMIT 1"
            ).fetchone()
        finally:
            db.close()
        if not row:
            return None
        try:
            if not ignore_ttl:
                cached_at = datetime.fromisoformat(row[0])
                age = (datetime.now(timezone.utc) - cached_at).total_seconds()
                if age > CACHE_TTL_SECONDS:
                    return None
            return json.loads(row[1])
        except (TypeError, ValueError) as e:
            # A corrupt timestamp or payload is treated as a cache miss rather
            # than failing the whole request.
            logger.warning("inbox cache row unreadable, ignoring it: %s", e)
            return None

    def _write_cache(self, items: list[dict]) -> None:
        """Replace the cache contents with ``items`` stamped with the current UTC time."""
        db = _connect_sqlite(self.db_path)
        try:
            with db:  # delete + insert happen in one transaction
                db.execute("DELETE FROM inbox_cache")  # keep just latest
                db.execute(
                    "INSERT INTO inbox_cache (cached_at, payload) VALUES (?, ?)",
                    (datetime.now(timezone.utc).isoformat(), json.dumps(items)),
                )
        finally:
            db.close()

    async def get_prioritized(self, force_refresh: bool = False) -> list[dict]:
        """Return AI-ranked tasks, served from cache while the entry is fresh.

        On provider failure (GitHub/Asana down), fall back to the last cached
        ranking — even if stale — rather than failing the whole endpoint.
        Items returned through that fallback carry ``stale: True``.

        Args:
            force_refresh: Skip the fresh-cache lookup and re-fetch/re-rank.

        Returns:
            Task dicts sorted by rank; an empty list when the provider
            reports no open tasks (the cache is left untouched in that case).
        """
        if not force_refresh:
            cached = self._read_cache()
            if cached is not None:
                return cached
        try:
            tasks = await self.provider.list_tasks(state="open", limit=50)
        except Exception as e:
            logger.warning("provider.list_tasks failed, falling back to stale cache: %s", e)
            stale = self._read_cache(ignore_ttl=True) or []
            for item in stale:
                item["stale"] = True
            return stale
        if not tasks:
            return []
        ranked = await self._rank_with_ai(tasks)
        self._write_cache(ranked)
        return ranked

    async def _rank_with_ai(self, tasks: list[Task]) -> list[dict]:
        """Ask the AI to rank ``tasks``; fall back to input order if it cannot.

        Returns:
            One dict per task (see ``_task_to_dict``) sorted by ascending
            rank. Tasks the model did not rank validly keep their
            position-based rank (index + 1) and an empty reason.
        """
        prompt = self._build_rank_prompt(tasks)
        text = await self._call_ai(prompt)
        if not text:
            return [self._task_to_dict(t, rank=i + 1, reason="AI not available; sorted by recency")
                    for i, t in enumerate(tasks)]

        try:
            # Models often wrap the JSON in prose; parse the outermost {...}.
            start = text.find("{")
            end = text.rfind("}")
            data = json.loads(text[start:end + 1]) if start >= 0 else {"rankings": []}
        except Exception as e:
            logger.warning("AI ranking parse failed: %s", e)
            return [self._task_to_dict(t, rank=i + 1, reason="AI ranking unavailable")
                    for i, t in enumerate(tasks)]

        # "rankings" may be missing, null, or not a list at all.
        rows = data.get("rankings") if isinstance(data, dict) else None
        if not isinstance(rows, list):
            rows = []

        # Apply rankings — validate each row before trusting it.
        # LLMs routinely return missing indices, string ranks, or out-of-range values.
        rank_map: dict[int, dict] = {}
        for r in rows:
            if not isinstance(r, dict):
                continue
            idx = r.get("index")
            # bool is a subclass of int; True/False must not address tasks 1/0.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if idx < 0 or idx >= len(tasks):
                continue
            # Coerce rank defensively (OverflowError covers float infinity).
            try:
                rank_int = int(r.get("rank"))
            except (TypeError, ValueError, OverflowError):
                continue
            if rank_int < 1:
                continue
            # First write wins — LLM occasionally emits duplicate indices
            rank_map.setdefault(idx, {"rank": rank_int, "reason": str(r.get("reason", ""))[:300]})

        ranked: list[dict] = []
        for i, t in enumerate(tasks):
            r = rank_map.get(i) or {"rank": i + 1, "reason": ""}
            ranked.append(self._task_to_dict(t, rank=r["rank"], reason=r["reason"]))
        ranked.sort(key=lambda x: x["rank"])
        return ranked

    def _build_rank_prompt(self, tasks: list[Task]) -> str:
        """Build the ranking prompt: optional OKR block plus one entry per task."""
        okr_block = ""
        if self.cfg.okrs.source == "yaml" and self.cfg.okrs.items:
            okr_lines = [f"- {o.id}: {o.title}" for o in self.cfg.okrs.items]
            okr_block = "## Current OKRs\n" + "\n".join(okr_lines) + "\n"

        task_lines = []
        for i, t in enumerate(tasks):
            # Descriptions are clipped and flattened so each task stays compact.
            snippet = (t.description or "")[:200].replace("\n", " ")
            task_lines.append(
                f"[{i}] id={t.id} board={t.board} labels={','.join(t.labels[:3])}\n {t.title}\n {snippet}"
            )

        return f"""You are the AI triage assistant for {self.cfg.org.name}.

{okr_block}
Rank the following {len(tasks)} open tasks by priority.

Consider:
- OKR alignment (if OKRs provided)
- Urgency signals (labels like "bug", "critical", "urgent")
- Age (older + stale = deprioritize, older + active = maybe important)

Respond with STRICT JSON only:
{{"rankings": [{{"index": 0, "rank": 1, "reason": "<20 word explanation>"}}, ...]}}

Tasks:
{chr(10).join(task_lines)}"""

    async def _call_ai(self, prompt: str) -> str | None:
        """Call AI via API key or CLI subscription."""
        return await ai_complete(prompt, self.cfg)

    def _task_to_dict(self, t: Task, rank: int, reason: str) -> dict:
        """Flatten a task plus its ranking into the JSON-serialisable inbox item."""
        return {
            "id": t.id,
            "title": t.title,
            # description may be None (the prompt builder guards it the same way)
            "description": (t.description or "")[:500],
            "status": t.status,
            "assignee": t.assignee,
            "author": t.author,
            "url": t.url,
            "labels": t.labels,
            "board": t.board,
            "created_at": t.created_at,
            "updated_at": t.updated_at,
            "rank": rank,
            "ai_reason": reason,
        }
async def poll(self, cfg: MonitorConfig) -> list[MonitorEvent]:
    """Poll the tracker named by ``cfg.provider`` and return its new events.

    Dispatches to the module-level ``_poll_github`` / ``_poll_monday``
    helpers, passing this service so they can consult the seen-events table.

    Returns:
        The events produced by the provider-specific poller, or an empty
        list when no poller exists for ``cfg.provider``.
    """
    provider = cfg.provider
    if provider == "github":
        return await _poll_github(self, cfg)
    if provider == "monday":
        return await _poll_monday(self, cfg)
    # MonitorConfig documents "asana" as a provider value, but only the two
    # pollers above exist here. Log it so a monitor that can never produce
    # events is visible instead of silently returning nothing.
    logger.warning(
        "no poller implemented for provider %r (project %s)",
        provider, cfg.project_key,
    )
    return []
httpx.AsyncClient(timeout=15) as client: url = f"{GITHUB_API}/repos/{repo}/pulls" resp = await client.get(url, params={"state": "open"}) if resp.status_code != 200: return [] for pr in resp.json(): eid = f"gh-pr-{pr.get('number', '')}" if svc.is_new(eid, cfg.project_key): events.append(MonitorEvent( id=eid, title=pr.get("title", ""), url=pr.get("html_url", ""), provider="github", project_key=cfg.project_key, )) svc.mark_seen(eid, cfg.project_key) return events async def _poll_monday(svc: MonitorService, cfg: MonitorConfig) -> list[MonitorEvent]: board_id = cfg.poll_command or "" if not board_id: return [] events: list[MonitorEvent] = [] query = f'{{ boards(ids: [{board_id}]) {{ items_page(limit: 20) {{ items {{ id name }} }} }} }}' async with httpx.AsyncClient(timeout=15) as client: resp = await client.post( MONDAY_API, json={"query": query}, ) if resp.status_code != 200: return [] boards = resp.json().get("data", {}).get("boards", []) if not boards: return [] items = boards[0].get("items_page", {}).get("items", []) for item in items: eid = f"mon-{item.get('id', '')}" if svc.is_new(eid, cfg.project_key): events.append(MonitorEvent( id=eid, title=item.get("name", ""), url="", provider="monday", project_key=cfg.project_key, )) svc.mark_seen(eid, cfg.project_key) return events ================================================ FILE: maggy/maggy/services/output_reviewer.py ================================================ """Inter-task output quality reviewer. Sends step output to a fast local model for quality scoring. Falls back to pass-through (score=3) on any failure so it never blocks the pipeline. 
""" from __future__ import annotations import logging import re from dataclasses import dataclass from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.adapters.pi import PiAdapter logger = logging.getLogger(__name__) _SCORE_RE = re.compile(r"SCORE:\s*(\d+)", re.IGNORECASE) _REASON_RE = re.compile(r"REASON:\s*(.+)", re.IGNORECASE) REVIEW_MODEL = "local" REVIEW_MAX_TURNS = 1 @dataclass class ReviewResult: score: int reason: str = "" def _parse_review(text: str) -> ReviewResult: """Extract score and reason from reviewer output.""" m = _SCORE_RE.search(text) if not m: return ReviewResult(score=3) score = max(1, min(5, int(m.group(1)))) rm = _REASON_RE.search(text) reason = rm.group(1).strip() if rm else "" return ReviewResult(score=score, reason=reason) def _build_prompt(step_label: str, output: str) -> str: """Build the review prompt for the local model.""" trimmed = output[:3000] return ( f"Review this {step_label} output for quality.\n" "Rate 1-5 (1=wrong, 3=acceptable, 5=excellent).\n" "Reply ONLY in this format:\n" "SCORE: <number>\nREASON: <one sentence>\n\n" f"--- OUTPUT ---\n{trimmed}" ) async def review_output( pi: "PiAdapter", step_label: str, output: str, wd: str, ) -> ReviewResult: """Send step output to local model for quality review.""" prompt = _build_prompt(step_label, output) try: result = await pi.send_prompt( REVIEW_MODEL, prompt, wd, max_turns=REVIEW_MAX_TURNS, timeout=30, ) if not result.success: return ReviewResult(score=3, reason="review unavailable") return _parse_review(result.output) except Exception as exc: logger.debug("Review failed: %s", exc) return ReviewResult(score=3, reason="review error") ================================================ FILE: maggy/maggy/services/planner.py ================================================ """Dual-model planning service.""" from __future__ import annotations from dataclasses import dataclass, field from maggy.adapters.pi import PiAdapter, RunResult @dataclass class PlanResult: primary_plan: 
str counter_check: str conflicts: list[str] = field(default_factory=list) class DualPlanner: def __init__(self, pi: PiAdapter): self._pi = pi async def plan( self, task_title: str, task_desc: str, wd: str, ) -> str: prompt = _plan_prompt(task_title, task_desc) return await self._send("claude", prompt, wd) async def counter_check(self, plan_text: str, wd: str) -> str: prompt = _review_prompt(plan_text) return await self._send("codex", prompt, wd) async def dual_plan( self, task_title: str, task_desc: str, wd: str, ) -> PlanResult: primary = await self.plan(task_title, task_desc, wd) review = await self.counter_check(primary, wd) return PlanResult(primary, review, _conflicts(review)) async def _send(self, model: str, prompt: str, wd: str) -> str: result = await self._pi.send_prompt(model, prompt, wd, 5) return _result_text(result, model) def _plan_prompt(task_title: str, task_desc: str) -> str: return ( "Create an implementation plan.\n" "Return numbered steps, files to touch, risks, and tests.\n\n" f"Title: {task_title}\n" f"Description: {task_desc}" ) def _review_prompt(plan_text: str) -> str: return ( "Review this implementation plan.\n" "Flag conflicts as 'CONFLICT:' and keep the note short.\n" "Call out risky omissions and invalid assumptions.\n\n" f"Plan:\n{plan_text}" ) def _result_text(result: RunResult, model: str) -> str: if result.success: return result.output.strip() message = result.output or result.error raise RuntimeError((message or f"{model} planning failed").strip()) def _conflicts(text: str) -> list[str]: return [ line.partition(":")[2].strip() for line in text.splitlines() if line.upper().startswith("CONFLICT:") ] ================================================ FILE: maggy/maggy/services/session_detect.py ================================================ """Multi-CLI session detection. Scans Claude, Kimi, Codex state directories to find previous sessions for a given working directory. 
def detect_claude(working_dir: str) -> CliSessionInfo | None:
    """Find the latest Claude session recorded for ``working_dir``.

    Scans ``~/.claude/history.jsonl`` from the last line backwards and
    returns the first entry whose ``project`` equals ``working_dir``
    (trailing slashes ignored) and that has a ``sessionId``.

    Returns:
        The matching session, or None when the history file is missing or
        unreadable or contains no matching entry.
    """
    path = _home() / ".claude" / "history.jsonl"
    if not path.exists():
        return None
    try:
        # Decode explicitly instead of relying on the locale default, and
        # never let one undecodable byte abort the scan.
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError as exc:
        logger.debug("Cannot read %s: %s", path, exc)
        return None

    target = working_dir.rstrip("/")
    for line in reversed(lines):
        entry = _parse_json(line)
        # A valid JSON line is not necessarily an object (could be a list,
        # string or number); skip those instead of failing on .get().
        if not isinstance(entry, dict):
            continue
        project = entry.get("project")
        sid = entry.get("sessionId")
        if not isinstance(project, str) or not sid:
            continue
        if project.rstrip("/") == target:
            return CliSessionInfo("claude", sid, target)
    return None
files = sorted( sess_dir.rglob("rollout-*.jsonl"), reverse=True, ) for f in files[:50]: entry = _parse_json(_read_first_line(f)) if not entry: continue payload = entry.get("payload", {}) cwd = payload.get("cwd", "").rstrip("/") sid = payload.get("id", "") if cwd == target and sid: return CliSessionInfo("codex", sid, target) return None def _parse_json(text: str) -> dict | None: """Safe JSON parse, returns None on failure.""" text = text.strip() if not text: return None try: return json.loads(text) except (json.JSONDecodeError, ValueError): return None def _read_first_line(path: Path) -> str: """Read first line of a file safely.""" try: with path.open() as f: return f.readline() except OSError: return "" ================================================ FILE: maggy/maggy/services/stakes.py ================================================ """Stakes classification — HIGH/MEDIUM/LOW from task metadata.""" from __future__ import annotations import re from dataclasses import dataclass, field from typing import TYPE_CHECKING if TYPE_CHECKING: from maggy.providers.base import Task from maggy.routing_rules import StakesLevel, StakesPatterns @dataclass class StakesResult: """Result of stakes classification.""" level: str # "high" | "medium" | "low" reasons: list[str] = field(default_factory=list) def classify_stakes( task: Task, patterns: StakesPatterns | None = None, ) -> StakesResult: """Classify task stakes from metadata and text.""" if patterns is None: from maggy.routing_rules_defaults import default_stakes patterns = default_stakes() text = f"{task.title} {task.description}".lower() raw = task.raw if isinstance(task.raw, dict) else {} task_type = str(raw.get("task_type", "")) reasons: list[str] = [] if _matches(text, task_type, patterns.high, reasons): return StakesResult("high", reasons) if _matches(text, task_type, patterns.medium, reasons): return StakesResult("medium", reasons) return StakesResult("low", ["default"]) def _matches( text: str, task_type: str, level: 
async def verify_tests_fail(wd: str) -> VerifyResult:
    """RED-phase check: the test run must end with at least one failed test.

    Runs ``python3 -m pytest -x --tb=short -q`` in ``wd``. The check does not
    pass when the run exits 0, nor when it exits non-zero without any
    "N failed" count in the output (reported as a non-test error).
    """
    pytest_cmd = ["python3", "-m", "pytest", "-x", "--tb=short", "-q"]
    exit_code, report = await _run_cmd(pytest_cmd, wd)

    if exit_code == 0:
        return VerifyResult(
            passed=False,
            detail="Tests passed — expected failures (RED)",
        )

    n_failed = _count_failures(report)
    if n_failed:
        return VerifyResult(
            passed=True,
            detail=f"{n_failed} tests failed (RED)",
            tests_found=0,
            tests_failed=n_failed,
        )
    # Non-zero exit without a failure count: the run itself broke.
    return VerifyResult(passed=False, detail=f"Non-test error:\n{report[:500]}")
"-x", "--tb=short", "-q"], wd, ) if code != 0: return VerifyResult( False, f"Tests failing:\n{output[:500]}", ) return VerifyResult(True, "All tests pass (GREEN)") async def verify_lint(wd: str) -> VerifyResult: """Run ruff check on the working directory.""" code, output = await _run_cmd( ["python3", "-m", "ruff", "check", "."], wd, ) if code != 0: return VerifyResult(False, f"Lint errors:\n{output[:500]}") return VerifyResult(True, "Lint clean") async def verify_coverage( wd: str, threshold: float = COVERAGE_THRESHOLD, ) -> VerifyResult: """Run pytest with coverage and check threshold.""" code, output = await _run_cmd( ["python3", "-m", "pytest", "--cov", "-q"], wd, ) pct = _parse_coverage(output) if pct < threshold: return VerifyResult( False, f"Coverage {pct:.0f}% < {threshold:.0f}%", ) return VerifyResult(True, f"Coverage {pct:.0f}%") async def _run_cmd( cmd: list[str], cwd: str, ) -> tuple[int, str]: """Run a subprocess, return (exit_code, output).""" try: proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, cwd=cwd, ) stdout, _ = await asyncio.wait_for( proc.communicate(), timeout=DEFAULT_TIMEOUT, ) text = (stdout or b"").decode("utf-8", errors="replace") return proc.returncode or 0, text except asyncio.TimeoutError: return 1, "Command timed out" except FileNotFoundError: return 1, f"Command not found: {cmd[0]}" def _count_collected(output: str) -> int: """Parse 'N tests collected' from pytest output.""" m = re.search(r"(\d+)\s+tests?\s+collected", output) return int(m.group(1)) if m else 0 def _count_failures(output: str) -> int: """Parse 'N failed' from pytest summary.""" m = re.search(r"(\d+)\s+failed", output) return int(m.group(1)) if m else 0 def _parse_coverage(output: str) -> float: """Parse 'TOTAL ... 
NN%' from coverage output.""" m = re.search(r"TOTAL\s+.*?(\d+)%", output) return float(m.group(1)) if m else 0.0 ================================================ FILE: maggy/maggy/services/vision.py ================================================ """Vision analysis via Ollama Qwen3-VL — screenshot review.""" from __future__ import annotations import base64 import json import logging from pathlib import Path from typing import Generator import httpx logger = logging.getLogger(__name__) OLLAMA_URL = "http://localhost:11434" VISION_MODEL = "qwen3-vl:32b" _IMAGE_EXTS = frozenset({ ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", }) _DEFAULT_PROMPT = ( "Analyze this screenshot. Describe what you see, " "identify any UI issues, and suggest improvements." ) def _validate(path: str) -> Path | None: """Check file exists and is an image.""" p = Path(path).expanduser().resolve() if not p.exists(): return None if p.suffix.lower() not in _IMAGE_EXTS: return None return p def _encode(path: Path) -> str: """Base64-encode an image file.""" return base64.b64encode(path.read_bytes()).decode() def analyze_image( path: str, prompt: str | None = None, ) -> Generator[dict, None, None]: """Stream vision analysis from Ollama Qwen3-VL. 
// Escape a value for insertion into HTML text or a quoted attribute value.
// null/undefined become ''; non-strings are converted with String().
// The five characters & < > " ' are replaced by their entities. The
// replacement table must contain the entity text itself: mapping '&' to '&'
// escapes nothing, and an unescaped quote as the last entry is a syntax error.
function esc(s) {
  if (s === null || s === undefined) return '';
  if (typeof s !== 'string') s = String(s);
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return s.replace(/[&<>"']/g, c => entities[c]);
}
// Escape a value for use inside a single-quoted JS string literal that is
// itself embedded in a double-quoted HTML attribute, e.g.
//   onclick="executeTask('${jsStr(id)}', ...)"
// Two parsers see this text: the HTML parser first (it ends the attribute at
// `"` and decodes character references such as &#39;), then the JS parser.
// The result therefore must not contain a raw `"` or `&`, and must not let
// the JS string end early:
//   1. escape the backslash first (so later escapes are not double-encoded)
//   2. escape the single quote that wraps the JS string
//   3. write `"` and `&` as \uXXXX so the HTML parser never sees them
//   4. write angle brackets as \uXXXX as well
//   5. turn every line break (\r\n, lone \r, \n, U+2028, U+2029) into an
//      escape so the statement stays on one line
// null/undefined become ''.
function jsStr(s) {
  if (s === null || s === undefined) return '';
  return String(s)
    .replace(/\\/g, '\\\\')
    .replace(/'/g, "\\'")
    .replace(/"/g, '\\u0022')
    .replace(/&/g, '\\u0026')
    .replace(/</g, '\\u003C')
    .replace(/>/g, '\\u003E')
    .replace(/\r\n|\r|\n/g, '\\n')
    .replace(/\u2028/g, '\\u2028')
    .replace(/\u2029/g, '\\u2029');
}
// Render the Inbox pane: active CLI sessions, recent activity, and the
// AI-ranked issue list. `refresh` adds ?refresh=true to the /inbox request
// (wired to the "Re-rank" button). Both requests fall back to empty data on
// failure, so the pane always renders something.
// Every server-provided value goes through esc() — or jsStr() inside inline
// handlers — before it reaches innerHTML; that includes `pid` and `rank`,
// which used to be interpolated raw.
async function loadInbox(refresh = false) {
  const pane = document.getElementById('pane-inbox');
  pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading…</div>`;

  const [activity, inbox] = await Promise.all([
    api('/activity').catch(() => ({ sessions: [], recent: [] })),
    api(`/inbox${refresh ? '?refresh=true' : ''}`).catch(() => ({ items: [] })),
  ]);
  const sessions = activity.sessions || [];
  const recent = activity.recent || [];
  const items = inbox.items || [];

  let html = '';

  // ── Active sessions ──
  if (sessions.length) {
    html += `<div class="mb-4"><h2 class="text-sm font-bold text-white mb-2"><i class="fas fa-terminal mr-1 text-green-400"></i>Active Sessions (${sessions.length})</h2><div class="space-y-2">`;
    for (const s of sessions) {
      const badge = s.status === 'agent'
        ? '<span class="text-[10px] px-1.5 py-0.5 rounded bg-purple-900 text-purple-300">agent</span>'
        : '<span class="text-[10px] px-1.5 py-0.5 rounded bg-green-900 text-green-300">running</span>';
      const label = s.status === 'agent'
        ? `${esc(s.agent_name)} @ ${esc(s.team_name)}`
        : esc(s.project || 'unknown');
      html += `<div class="card p-3"><div class="flex items-center gap-2">
          <span class="text-[10px] font-mono text-blue-400 uppercase">${esc(s.cli)}</span>
          ${badge}
          <span class="text-sm text-white">${label}</span>
          <span class="text-[10px] text-gray-500 ml-auto">PID ${esc(s.pid)}</span>
        </div>
        ${s.last_prompt ? `<div class="text-[11px] text-gray-400 mt-1 truncate">"${esc(s.last_prompt)}"</div>` : ''}
      </div>`;
    }
    html += `</div></div>`;
  }

  // ── Recent activity (first 10 entries) ──
  if (recent.length) {
    html += `<div class="mb-4"><h2 class="text-sm font-bold text-white mb-2"><i class="fas fa-clock-rotate-left mr-1 text-yellow-400"></i>Recent Activity</h2><div class="space-y-1">`;
    for (const r of recent.slice(0, 10)) {
      html += `<div class="card p-2 flex items-center gap-2">
        <span class="text-[10px] font-mono text-blue-400 uppercase w-10">${esc(r.cli)}</span>
        <span class="text-[11px] text-gray-300 flex-1 truncate">${esc(r.text)}</span>
        <span class="text-[10px] text-gray-500 shrink-0">${r.project ? esc(r.project) + ' · ' : ''}${esc(relDate(r.timestamp))}</span>
      </div>`;
    }
    html += `</div></div>`;
  }

  // ── Ranked issues ──
  if (items.length) {
    html += `<div class="mb-4"><div class="flex items-center gap-3 mb-2">
      <h2 class="text-sm font-bold text-white"><i class="fas fa-inbox mr-1 text-orange-400"></i>Issues (${items.length})</h2>
      <button onclick="loadInbox(true)" class="text-[10px] text-gray-400 hover:text-white"><i class="fas fa-rotate mr-1"></i>Re-rank</button>
    </div><div class="space-y-2">`;
    for (const i of items) {
      const labels = (i.labels || []).slice(0, 4)
        .map(l => `<span class="text-[10px] px-1.5 py-0.5 rounded bg-gray-800 text-gray-400">${esc(l)}</span>`)
        .join(' ');
      html += `<div class="card p-3 hover:bg-gray-900 cursor-pointer" onclick="openTaskDetail('${jsStr(i.id)}')">
        <div class="flex items-start gap-3">
          <div class="text-xs font-mono text-orange-400 mt-0.5">#${esc(i.rank)}</div>
          <div class="flex-1 min-w-0">
            <div class="text-sm text-white">${esc(i.title)}</div>
            <div class="text-[11px] text-gray-500 mt-0.5">
              <span class="text-blue-400">${esc(i.board || '')}</span>
              ${i.assignee ? `· ${esc(i.assignee)}` : ''}
              · ${esc(relDate(i.updated_at))}
              ${labels ? '· ' + labels : ''}
            </div>
            ${i.ai_reason ? `<div class="text-[11px] text-gray-400 mt-1 italic">"${esc(i.ai_reason)}"</div>` : ''}
          </div>
          <div class="flex gap-1 shrink-0" onclick="event.stopPropagation()">
            <button onclick="executeTask('${jsStr(i.id)}', 'plan')" class="text-[10px] px-2 py-1 rounded bg-gray-800 hover:bg-gray-700 text-gray-300">Plan</button>
            <button onclick="executeTask('${jsStr(i.id)}', 'tdd')" class="text-[10px] px-2 py-1 rounded bg-orange-600 hover:bg-orange-700 text-white">Execute</button>
          </div>
        </div>
      </div>`;
    }
    html += `</div></div>`;
  }

  // ── Empty state ──
  if (!sessions.length && !recent.length && !items.length) {
    html = `<div class="card p-4 text-sm text-gray-400">No activity detected. Start a Claude, Codex, or Kimi session to see it here.</div>`;
  }

  pane.innerHTML = html;
}
// Post the text of the #reply-box textarea as a comment on `taskId`, then
// re-open the task drawer so the new comment is shown. Blank input is
// ignored; any failure is reported with alert().
async function postReply(taskId) {
  const replyBox = document.getElementById('reply-box');
  const text = replyBox.value.trim();
  if (text === '') return;

  try {
    // URL and payload are built inside the try so an encoding failure is
    // reported like a request failure.
    const endpoint = `/task/${encodeURIComponent(taskId)}/comment`;
    const request = { method: 'POST', body: JSON.stringify({ text }) };
    await api(endpoint, request);
    openTaskDetail(taskId); // refresh
  } catch (err) {
    alert('Failed to post: ' + err.message);
  }
}
'<button onclick="scanCompetitors()" class="text-[10px] px-3 py-1 rounded bg-gray-700 text-gray-300 hover:bg-gray-600"><i class="fas fa-rotate mr-1"></i>Scan</button>' : '<button onclick="discoverCompetitors()" class="text-[10px] px-3 py-1 rounded bg-purple-600 text-white hover:bg-purple-700"><i class="fas fa-magnifying-glass-plus mr-1"></i>Discover More</button>'} </div>`; if (COMP_VIEW === 'news') { html += `<div id="briefing" class="card p-4 mb-3 border-purple-700/50"><div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading daily briefing…</div></div>`; pane.innerHTML = html + renderNewsFeed(news); loadBriefing(); } else { if (!comps.length) { html += `<div class="card p-4 text-sm text-gray-400">No competitors yet. Click <b>Discover More</b> to have Maggy find competitors in your domain.</div>`; } else { html += `<div class="grid grid-cols-1 md:grid-cols-2 gap-3">`; for (const c of comps) { html += `<div class="card p-3"> <div class="text-sm font-bold text-white">${esc(c.name)}</div> <div class="text-[10px] text-gray-500">${esc(c.category || '')} · ${esc(c.website || '')}</div> <div class="text-xs text-gray-400 mt-2">${esc(c.description || '')}</div> </div>`; } html += `</div>`; } pane.innerHTML = html; } } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } function renderNewsFeed(news) { if (!news.length) return '<div class="card p-4 text-sm text-gray-400">No competitor news yet. 
Click <b>Scan</b> to fetch.</div>'; const typeIcon = { feature_launch: 'fa-rocket text-cyan-400', acquisition: 'fa-handshake text-yellow-400', partnership: 'fa-link text-green-400', pricing_change: 'fa-tag text-orange-400', funding: 'fa-dollar-sign text-green-400', blog_post: 'fa-rss text-blue-400', news: 'fa-newspaper text-gray-400', }; let html = `<div class="space-y-1.5 max-h-[70vh] overflow-y-auto">`; for (const n of news.slice(0, 80)) { const icon = typeIcon[n.event_type] || 'fa-circle text-gray-500'; html += `<div class="card px-3 py-2 flex items-start gap-2"> <i class="fas ${icon} text-[10px] mt-1.5"></i> <div class="flex-1 min-w-0"> <div class="text-xs text-white">${esc(n.title)}</div> <div class="text-[10px] text-gray-500 mt-0.5"> <span class="text-orange-400">${esc(n.competitor_name)}</span> · ${esc(n.source === 'rss' ? 'blog' : 'news')} · ${esc(relDate(n.created_at))} </div> </div> ${safeHref(n.url) ? `<a href="${safeHref(n.url)}" target="_blank" rel="noopener noreferrer" class="text-blue-400 text-[10px]"><i class="fas fa-external-link-alt"></i></a>` : ''} </div>`; } html += `</div>`; return html; } async function loadBriefing() { try { const data = await api('/competitors/news/summary'); document.getElementById('briefing').innerHTML = ` <div class="flex items-center justify-between mb-2"> <div class="text-[10px] text-purple-400 uppercase font-bold"><i class="fas fa-robot mr-1"></i>Daily Briefing — ${esc(data.date || '')}</div> <button onclick="regenerateBriefing()" class="text-[10px] text-gray-500 hover:text-purple-400"><i class="fas fa-sync-alt mr-1"></i>Regenerate</button> </div> <pre class="text-xs text-gray-300">${esc(data.summary || '')}</pre> <div class="text-[10px] text-gray-600 mt-2">${data.total_signals || 0} signals analyzed</div>`; } catch (e) { document.getElementById('briefing').innerHTML = `<div class="text-xs text-red-400">Briefing failed: ${esc(e.message)}</div>`; } } async function regenerateBriefing() { const el = 
document.getElementById('briefing'); if (el) el.innerHTML = '<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Regenerating…</div>'; try { await api('/competitors/news/summary?refresh=true'); loadBriefing(); } catch (e) { if (el) el.innerHTML = `<div class="text-xs text-red-400">Regenerate failed: ${esc(e.message)}</div>`; } } async function discoverCompetitors() { if (!confirm('Ask Maggy to discover competitors for your domain? This calls the AI.')) return; try { const data = await api('/competitors/discover', { method: 'POST' }); alert(`Added ${data.added} new competitors (total: ${data.total})`); loadCompetitors(); } catch (e) { alert('Discovery failed: ' + e.message); } } async function scanCompetitors() { try { const data = await api('/competitors/monitor', { method: 'POST' }); alert(`Found ${data.rss || 0} blog posts + ${data.news || 0} news items across ${data.total_competitors} competitors`); loadCompetitors(); } catch (e) { alert('Scan failed: ' + e.message); } } // ── Chat ──────────────────────────────────────────────────────────────── let CHAT_SESSION_ID = null; let CHAT_SESSIONS_CACHE = []; async function loadChat() { const pane = document.getElementById('pane-chat'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Auto-connecting to active projects…</div>`; try { const result = await api('/chat/auto-connect', { method: 'POST' }); CHAT_SESSIONS_CACHE = result.sessions || []; if (!CHAT_SESSION_ID && CHAT_SESSIONS_CACHE.length) { CHAT_SESSION_ID = CHAT_SESSIONS_CACHE[0].id; } renderChatUI(pane); } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } function renderChatUI(pane) { const sessions = CHAT_SESSIONS_CACHE; let html = `<div class="flex h-[calc(100vh-10rem)]">`; html += renderChatSidebar(sessions); html += renderChatMain(); html += `</div>`; pane.innerHTML = html; if (CHAT_SESSION_ID) 
loadChatMessages(CHAT_SESSION_ID); } function renderChatSidebar(sessions) { let html = `<div class="w-60 shrink-0 border-r border-gray-800 pr-3 overflow-y-auto">`; html += `<div class="flex items-center justify-between mb-2"> <span class="text-[10px] text-gray-500 uppercase font-bold"><i class="fas fa-circle text-green-400 text-[8px] mr-1"></i>Connected Projects</span> <button onclick="newChatSession()" class="text-[10px] px-2 py-1 rounded bg-orange-600 hover:bg-orange-700 text-white"><i class="fas fa-plus mr-1"></i>New</button> </div><div class="space-y-1">`; if (!sessions.length) { html += `<div class="text-[10px] text-gray-500 p-2">No active CLI sessions detected</div>`; } for (const s of sessions) { const active = s.id === CHAT_SESSION_ID ? 'bg-gray-800 border-orange-500' : 'border-transparent hover:bg-gray-900'; const ctx = s.history_context ? ' title="' + esc(s.history_context) + '"' : ''; html += `<div class="card px-2 py-1.5 cursor-pointer border ${active}" onclick="openChatSession('${jsStr(s.id)}')"${ctx}> <div class="flex items-center gap-1"><i class="fas fa-circle text-green-400 text-[6px]"></i><span class="text-xs text-white truncate">${esc(s.project_key)}</span></div> <div class="text-[10px] text-gray-500 truncate">${esc(s.working_dir)}</div> ${s.history_context ? 
'<div class="text-[9px] text-gray-600 mt-0.5 truncate"><i class="fas fa-history mr-0.5"></i>has history</div>' : ''} </div>`; } html += `</div></div>`; return html; } function renderChatMain() { let html = `<div class="flex-1 flex flex-col pl-4">`; if (CHAT_SESSION_ID) { html += `<div id="chat-messages" class="flex-1 overflow-y-auto space-y-3 mb-3"></div>`; html += `<div class="shrink-0 flex gap-2"> <input id="chat-input" type="text" placeholder="Type a message to Claude…" class="flex-1 bg-gray-900 text-sm text-white rounded px-3 py-2 border border-gray-700 focus:border-orange-500 outline-none" onkeydown="if(event.key==='Enter')sendChatMessage()" /> <button onclick="sendChatMessage()" class="px-4 py-2 rounded bg-orange-600 hover:bg-orange-700 text-white text-sm"><i class="fas fa-paper-plane"></i></button> </div>`; } else { html += `<div class="flex-1 flex items-center justify-center"> <div class="text-center"> <i class="fas fa-robot text-4xl text-gray-700 mb-3"></i> <div class="text-sm text-gray-400 mb-2">No active CLI sessions detected</div> <div class="text-xs text-gray-500">Start a Claude Code session in any project and Maggy will auto-connect</div> </div> </div>`; } html += `</div>`; return html; } async function newChatSession() { let projects; try { const [cfg, activity] = await Promise.all([ api('/config').catch(() => ({ codebases: [] })), api('/activity').catch(() => ({ sessions: [] })), ]); const configProjects = (cfg.codebases || []).map(c => ({ key: c.key, path: c.path })); const activeProjects = (activity.sessions || []).map(s => ({ key: s.project, path: s.project_path })); const seen = new Set(); projects = []; for (const p of [...activeProjects, ...configProjects]) { if (p.key && !seen.has(p.key)) { seen.add(p.key); projects.push(p); } } } catch { projects = []; } if (!projects.length) { alert('No codebases found.'); return; } let chosen = projects[0]; if (projects.length > 1) { const name = prompt('Select project:\n' + projects.map((p, i) => `${i+1}. 
${p.key}`).join('\n') + '\n\nEnter name:', projects[0].key); if (!name) return; chosen = projects.find(p => p.key === name) || { key: name, path: '' }; } try { const data = await api('/chat/sessions', { method: 'POST', body: JSON.stringify({ project_key: chosen.key, project_path: chosen.path }) }); CHAT_SESSION_ID = data.id; loadChat(); } catch (e) { alert('Failed: ' + e.message); } } function openChatSession(id) { CHAT_SESSION_ID = id; const pane = document.getElementById('pane-chat'); if (pane) renderChatUI(pane); } async function loadChatMessages(id) { const el = document.getElementById('chat-messages'); if (!el) return; try { const data = await api(`/chat/sessions/${id}`); let html = renderSessionHeader(data); if (data.history_context && !(data.messages || []).length) { html += renderHistoryContext(data.history_context); } for (const m of data.messages || []) { html += m.role === 'user' ? renderUserMsg(m) : renderAssistantMsg(m); } el.innerHTML = html; el.scrollTop = el.scrollHeight; } catch (e) { el.innerHTML = `<div class="text-xs text-red-400">${esc(e.message)}</div>`; } } function renderSessionHeader(data) { return `<div class="text-[10px] text-gray-500 mb-2"><i class="fas fa-folder-open mr-1"></i>${esc(data.project_key)} · <span class="font-mono">${esc(data.working_dir)}</span></div>`; } function renderHistoryContext(ctx) { return `<div class="card px-3 py-2 mb-2 border border-gray-700 bg-gray-900/50"> <div class="text-[10px] text-gray-400 font-bold mb-1"><i class="fas fa-history mr-1"></i>Session History (Maggy knows this)</div> <pre class="text-[10px] text-gray-500 whitespace-pre-wrap">${esc(ctx)}</pre> </div>`; } function renderUserMsg(m) { return `<div class="flex justify-end"><div class="max-w-[80%] bg-orange-600/20 border border-orange-600/30 rounded-lg px-3 py-2"> <div class="text-xs text-white">${esc(m.content)}</div> <div class="text-[10px] text-gray-500 mt-1">${esc(relDate(m.timestamp))}</div> </div></div>`; } function renderAssistantMsg(m) { 
return `<div class="flex justify-start"><div class="max-w-[80%] card px-3 py-2"> <pre class="text-xs text-gray-300 whitespace-pre-wrap">${esc(m.content)}</pre> <div class="text-[10px] text-gray-500 mt-1">${esc(relDate(m.timestamp))}</div> </div></div>`; } async function sendChatMessage() { const input = document.getElementById('chat-input'); if (!input) return; const message = input.value.trim(); if (!message || !CHAT_SESSION_ID) return; input.value = ''; input.disabled = true; const el = document.getElementById('chat-messages'); el.innerHTML += renderUserMsg({ content: message, timestamp: '' }); el.innerHTML += `<div id="stream-response" class="flex justify-start"><div class="max-w-[80%] card px-3 py-2"> <pre id="stream-text" class="text-xs text-gray-300"><i class="fas fa-spinner fa-spin text-orange-400"></i> Claude is thinking…</pre> </div></div>`; el.scrollTop = el.scrollHeight; try { await streamChatResponse(message, el); } catch (e) { const streamEl = document.getElementById('stream-text'); if (streamEl) streamEl.innerHTML = `<span class="text-red-400">Error: ${esc(e.message)}</span>`; } input.disabled = false; input.focus(); } async function streamChatResponse(message, el) { const apiKey = localStorage.getItem('maggy-api-key') || ''; const resp = await fetch(`${API}/chat/sessions/${CHAT_SESSION_ID}/send`, { method: 'POST', headers: { 'Content-Type': 'application/json', ...(apiKey ? 
{ 'X-API-Key': apiKey } : {}) }, body: JSON.stringify({ message }), }); const reader = resp.body.getReader(); const decoder = new TextDecoder(); let responseText = ''; const streamEl = document.getElementById('stream-text'); while (true) { const { done, value } = await reader.read(); if (done) break; const chunk = decoder.decode(value, { stream: true }); for (const line of chunk.split('\n')) { if (!line.startsWith('data: ')) continue; try { const data = JSON.parse(line.slice(6)); if (data.type === 'done') continue; if (data.type === 'error') { streamEl.innerHTML = `<span class="text-red-400">${esc(data.content)}</span>`; continue; } if (data.content) { responseText += data.content; streamEl.textContent = responseText; el.scrollTop = el.scrollHeight; } } catch {} } } if (!responseText) streamEl.textContent = '(no response)'; } // ── Settings ──────────────────────────────────────────────────────────── async function loadSettings() { const pane = document.getElementById('pane-settings'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading settings…</div>`; try { const cfg = await api('/config'); pane.innerHTML = ` <h2 class="text-sm font-bold text-white mb-3">Settings</h2> <div class="card p-4 space-y-3 text-sm text-gray-300"> <div><span class="text-gray-500 text-[10px] uppercase">Org</span> — <b>${esc(cfg.org.name)}</b> ${cfg.org.domain ? 
`(domain: <span class="text-orange-400">${esc(cfg.org.domain)}</span>)` : ''}</div> <div><span class="text-gray-500 text-[10px] uppercase">Issue Tracker</span> — ${esc(cfg.issue_tracker.provider)}</div> <div><span class="text-gray-500 text-[10px] uppercase">Codebases</span> <ul class="ml-4 text-xs">${cfg.codebases.map(c => `<li>${esc(c.key)} → <code class="text-gray-400">${esc(c.path)}</code></li>`).join('')}</ul> </div> <div><span class="text-gray-500 text-[10px] uppercase">Competitors</span> — categories: ${cfg.competitors.categories.map(esc).join(', ') || '—'}</div> <div><span class="text-gray-500 text-[10px] uppercase">OKRs</span> — source: ${esc(cfg.okrs.source)} (${cfg.okrs.count} items)</div> <div><span class="text-gray-500 text-[10px] uppercase">AI</span> — ${esc(cfg.ai.provider)} / ${esc(cfg.ai.model)} · API key ${cfg.ai.has_key ? '<span class="text-green-400">set</span>' : '<span class="text-red-400">MISSING</span>'}</div> </div> <p class="text-[11px] text-gray-500 mt-4">Edit <code>~/.maggy/config.yaml</code> and restart Maggy to apply changes.</p> `; } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } // ── Budget ────────────────────────────────────────────────────────────── async function loadBudget() { const pane = document.getElementById('pane-budget'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading budget…</div>`; try { const [status, byProvider] = await Promise.all([ api('/budget'), api('/budget/by-provider'), ]); const statusColor = status.status === 'ok' ? 'text-green-400' : status.status === 'warning' ? 
'text-yellow-400' : 'text-red-400'; let html = `<h2 class="text-sm font-bold text-white mb-3">Token Budget</h2>`; html += `<div class="grid grid-cols-1 md:grid-cols-3 gap-3 mb-4"> <div class="card p-4 text-center"> <div class="text-2xl font-bold ${statusColor}">$${esc(status.spent_today_usd)}</div> <div class="text-[10px] text-gray-500">Spent Today</div> </div> <div class="card p-4 text-center"> <div class="text-2xl font-bold text-gray-300">$${esc(status.daily_limit_usd)}</div> <div class="text-[10px] text-gray-500">Daily Limit</div> </div> <div class="card p-4 text-center"> <div class="text-2xl font-bold ${statusColor}">${esc(Math.round(status.utilization * 100))}%</div> <div class="text-[10px] text-gray-500">${esc(status.status)}</div> </div> </div>`; const providers = byProvider.providers || byProvider || []; if (providers.length) { html += `<h3 class="text-xs font-bold text-gray-400 mb-2">By Provider</h3><div class="space-y-1">`; for (const p of providers) { html += `<div class="card px-3 py-2 flex justify-between"><span class="text-xs text-white">${esc(p.provider)}</span><span class="text-xs text-orange-400">$${esc(p.spent_usd)}</span></div>`; } html += `</div>`; } pane.innerHTML = html; } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } // ── Model Routing ─────────────────────────────────────────────────────── async function loadRouting() { const pane = document.getElementById('pane-routing'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading model performance…</div>`; try { const data = await api('/routing/heatmap'); const heatmap = data.heatmap || data || []; let html = `<h2 class="text-sm font-bold text-white mb-3">Model Performance Heatmap</h2>`; if (!heatmap.length) { html += `<div class="card p-4 text-sm text-gray-400">No reward data yet. 
Execute some tasks to build the heatmap.</div>`; } else { html += `<div class="overflow-x-auto"><table class="text-xs w-full"><thead><tr class="text-gray-500"> <th class="text-left p-2">Model</th><th class="text-left p-2">Task Type</th><th class="text-left p-2">Blast Tier</th><th class="text-right p-2">Avg Reward</th><th class="text-right p-2">Samples</th> </tr></thead><tbody>`; for (const r of heatmap) { const color = r.avg_reward >= 0.7 ? 'text-green-400' : r.avg_reward >= 0.4 ? 'text-yellow-400' : 'text-red-400'; html += `<tr class="border-t border-gray-800"><td class="p-2 text-white">${esc(r.model)}</td><td class="p-2">${esc(r.task_type)}</td><td class="p-2">${esc(r.blast_tier)}</td><td class="p-2 text-right ${color}">${esc(r.avg_reward)}</td><td class="p-2 text-right text-gray-500">${esc(r.samples)}</td></tr>`; } html += `</tbody></table></div>`; } pane.innerHTML = html; } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } // ── Process Intelligence ──────────────────────────────────────────────── async function loadProcess() { const pane = document.getElementById('pane-process'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading process intelligence…</div>`; try { const [events, history, improve, landscape, activity] = await Promise.all([ api('/events/count').catch(() => ({ count: 0 })), api('/history/report').catch(() => ({ status: 'no_data' })), api('/improve/report').catch(() => ({ report: null })), api('/cikg/landscape').catch(() => ({ technologies: 0 })), api('/activity').catch(() => ({ sessions: [], recent: [] })), ]); let html = `<h2 class="text-sm font-bold text-white mb-3">Process Intelligence</h2>`; html += renderPIStats(events, history, landscape); html += renderPIPatterns(history); html += renderPIHealth(improve); html += renderPIActivity(activity); html += renderPIActions(); pane.innerHTML = html; } catch (e) { pane.innerHTML = `<div 
class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } function renderPIStats(events, history, landscape) { return `<div class="grid grid-cols-2 md:grid-cols-4 gap-3 mb-4"> <div class="card p-3 text-center"><div class="text-xl font-bold text-orange-400">${esc(events.count || 0)}</div><div class="text-[10px] text-gray-500">Events</div></div> <div class="card p-3 text-center"><div class="text-xl font-bold text-blue-400">${esc(history.total_sessions || 0)}</div><div class="text-[10px] text-gray-500">CLI Sessions</div></div> <div class="card p-3 text-center"><div class="text-xl font-bold text-green-400">${esc(history.total_prompts || 0)}</div><div class="text-[10px] text-gray-500">Total Prompts</div></div> <div class="card p-3 text-center"><div class="text-xl font-bold text-purple-400">${esc(landscape.technologies || 0)}</div><div class="text-[10px] text-gray-500">Technologies</div></div> </div>`; } function renderPIPatterns(history) { if (!history.patterns || !history.patterns.length) return ''; let html = `<div class="card p-4 mb-3"><div class="text-[10px] text-gray-500 uppercase mb-2"><i class="fas fa-chart-bar mr-1"></i>Session Patterns</div><div class="space-y-1">`; for (const p of history.patterns.slice(0, 5)) { html += `<div class="text-xs text-gray-300">- ${esc(typeof p === 'string' ? p : JSON.stringify(p))}</div>`; } return html + `</div></div>`; } function renderPIHealth(improve) { const report = improve.report; if (!report) return ''; const health = report.health_summary || {}; const keys = Object.keys(health); if (!keys.length) return ''; let html = `<div class="card p-4 mb-3"><div class="text-[10px] text-gray-500 uppercase mb-2"><i class="fas fa-heartbeat mr-1"></i>Health Signals</div>`; html += `<div class="grid grid-cols-2 md:grid-cols-4 gap-2">`; for (const k of keys) { const val = health[k]; const pct = Math.round(val * 100); const color = pct >= 80 ? 'text-green-400' : pct >= 50 ? 
'text-yellow-400' : 'text-red-400'; html += `<div class="text-center"><div class="text-lg font-bold ${color}">${pct}%</div><div class="text-[10px] text-gray-500 capitalize">${esc(k)}</div></div>`; } html += `</div>`; if (report.top_actions && report.top_actions.length) { html += `<div class="mt-3 space-y-1">`; for (const a of report.top_actions) { html += `<div class="text-xs text-yellow-300"><i class="fas fa-lightbulb mr-1"></i>${esc(a)}</div>`; } html += `</div>`; } return html + `</div>`; } function renderPIActivity(activity) { const sessions = activity.sessions || []; const recent = activity.recent || []; if (!sessions.length && !recent.length) return ''; let html = `<div class="card p-4 mb-3"><div class="text-[10px] text-gray-500 uppercase mb-2"><i class="fas fa-bolt mr-1"></i>Live Activity</div>`; if (sessions.length) { html += `<div class="mb-2"><span class="text-[10px] text-green-400 font-bold">${sessions.length} active session${sessions.length > 1 ? 's' : ''}</span></div>`; html += `<div class="grid grid-cols-2 md:grid-cols-4 gap-2 mb-3">`; const seen = new Set(); for (const s of sessions) { if (seen.has(s.project)) continue; seen.add(s.project); html += `<div class="bg-gray-900 rounded px-2 py-1.5"><div class="text-xs text-white truncate"><i class="fas fa-circle text-green-400 text-[6px] mr-1"></i>${esc(s.project)}</div><div class="text-[9px] text-gray-500">${esc(s.status)}</div></div>`; } html += `</div>`; } if (recent.length) { html += `<div class="text-[10px] text-gray-500 mb-1">Recent prompts:</div><div class="space-y-1">`; for (const p of recent.slice(0, 5)) { html += `<div class="text-[10px] text-gray-400 truncate"><span class="text-gray-600">${esc(p.project)}</span> ${esc(p.text)}</div>`; } html += `</div>`; } return html + `</div>`; } function renderPIActions() { return `<div class="card p-4"><div class="text-[10px] text-gray-500 uppercase mb-2">Quick Actions</div> <div class="flex flex-wrap gap-2"> <button id="btn-history" 
onclick="triggerAnalysis('history')" class="text-[10px] px-3 py-1.5 rounded bg-gray-800 hover:bg-gray-700 text-gray-300"><i class="fas fa-clock-rotate-left mr-1"></i>Analyze History</button> <button id="btn-improve" onclick="triggerAnalysis('improve')" class="text-[10px] px-3 py-1.5 rounded bg-gray-800 hover:bg-gray-700 text-gray-300"><i class="fas fa-brain mr-1"></i>Self-Improve</button> <a href="/api/events?limit=20" target="_blank" class="text-[10px] px-3 py-1.5 rounded bg-gray-800 hover:bg-gray-700 text-blue-400">Events JSON</a> <a href="/api/cikg/landscape" target="_blank" class="text-[10px] px-3 py-1.5 rounded bg-gray-800 hover:bg-gray-700 text-blue-400">CIKG Landscape</a> </div> </div>`; } async function triggerAnalysis(type) { const btn = document.getElementById('btn-' + type); const origText = btn ? btn.innerHTML : ''; if (btn) btn.innerHTML = `<i class="fas fa-spinner fa-spin mr-1"></i>Running…`; if (btn) btn.disabled = true; try { let result; if (type === 'history') result = await api('/history/analyze', { method: 'POST' }); else if (type === 'improve') result = await api('/improve/analyze', { method: 'POST' }); showToast(type === 'history' ? 
`History: ${result.total_sessions || 0} sessions, ${result.total_prompts || 0} prompts` : `Improve: ${(result.report || {}).total_signals || 0} signals collected`); loadProcess(); } catch (e) { alert('Analysis failed: ' + e.message); if (btn) { btn.innerHTML = origText; btn.disabled = false; } } } function showToast(msg) { const el = document.createElement('div'); el.className = 'fixed bottom-4 right-4 bg-green-600 text-white text-xs px-4 py-2 rounded shadow-lg z-50'; el.innerHTML = `<i class="fas fa-check mr-1"></i>${esc(msg)}`; document.body.appendChild(el); setTimeout(() => el.remove(), 3000); } // ── Forge ─────────────────────────────────────────────────────────────── async function loadForge() { const pane = document.getElementById('pane-forge'); pane.innerHTML = `<div class="text-xs text-gray-500"><i class="fas fa-spinner fa-spin mr-1"></i>Loading forge…</div>`; try { const [status, gaps] = await Promise.all([ api('/forge/status'), api('/forge/gaps'), ]); let html = `<h2 class="text-sm font-bold text-white mb-3">MCP Forge</h2>`; html += `<div class="grid grid-cols-1 md:grid-cols-3 gap-3 mb-4"> <div class="card p-4 text-center"> <div class="text-xl font-bold ${status.available ? 'text-green-400' : 'text-red-400'}">${status.available ? 
'Online' : 'Offline'}</div> <div class="text-[10px] text-gray-500">Status</div> </div> <div class="card p-4 text-center"> <div class="text-xl font-bold text-orange-400">${esc(status.registry_count || 0)}</div> <div class="text-[10px] text-gray-500">Tools in Registry</div> </div> <div class="card p-4 text-center"> <div class="text-xl font-bold text-yellow-400">${esc(status.pending_gaps || 0)}</div> <div class="text-[10px] text-gray-500">Detected Gaps</div> </div> </div>`; const gapList = gaps.gaps || []; if (gapList.length) { html += `<h3 class="text-xs font-bold text-gray-400 mb-2">Capability Gaps</h3><div class="space-y-1">`; for (const g of gapList) { html += `<div class="card px-3 py-2 flex justify-between"><span class="text-xs text-white">${esc(g.capability)}</span><span class="text-xs text-gray-400">${esc(g.occurrences)} hits ${g.triggered ? '<span class="text-orange-400">TRIGGERED</span>' : ''}</span></div>`; } html += `</div>`; } pane.innerHTML = html; } catch (e) { pane.innerHTML = `<div class="card p-4 text-sm text-red-400">Failed: ${esc(e.message)}</div>`; } } // ── Setup Wizard ──────────────────────────────────────────────────────── async function checkSetup() { try { const status = await api('/setup/status'); if (status.configured) return true; showSetupWizard(status); return false; } catch { return true; } } function showSetupWizard(status) { const pane = document.getElementById('pane-inbox'); const missing = status.steps.filter(s => s.status === 'missing'); const disc = status.discovery || {}; const clis = disc.clis || {}; const cliAuth = disc.cli_auth || {}; const tokens = disc.tokens || {}; let html = `<div class="max-w-2xl mx-auto mt-4 space-y-4">`; // Header html += `<div class="card p-6"> <div class="flex items-center gap-3 mb-3"> <i class="fas fa-wand-magic-sparkles text-orange-500 text-xl"></i> <h2 class="text-lg font-bold text-white">Welcome to Maggy</h2> <span class="text-[10px] text-gray-500">${esc(status.progress)} configured</span> </div> 
<div class="space-y-2">`; for (const step of status.steps) { const icon = step.status === 'done' ? '<i class="fas fa-check-circle text-green-400"></i>' : '<i class="fas fa-circle-xmark text-red-400/60"></i>'; html += `<div class="flex items-center gap-3 px-3 py-2 rounded ${step.status === 'done' ? 'bg-green-900/20' : 'bg-red-900/10'}"> ${icon} <span class="text-sm ${step.status === 'done' ? 'text-green-300' : 'text-gray-300'}">${esc(step.label)}</span> ${step.status !== 'done' && step.hint ? `<span class="text-[10px] text-gray-500 ml-auto">${esc(step.hint)}</span>` : ''} </div>`; } html += `</div></div>`; // Discovered CLIs const cliNames = Object.keys(clis); if (cliNames.length) { html += `<div class="card p-4"> <div class="text-[10px] text-gray-500 uppercase mb-2"><i class="fas fa-terminal mr-1"></i>Detected CLI Tools</div> <div class="space-y-1">`; for (const name of cliNames) { const auth = cliAuth[name]; html += `<div class="flex items-center gap-2 text-xs"> <i class="fas fa-check text-green-400"></i> <span class="text-white font-mono">${esc(name)}</span> <span class="text-gray-500">${esc(clis[name])}</span> ${auth ? 
'<span class="text-[10px] px-1.5 py-0.5 rounded bg-green-900/40 text-green-400">authenticated</span>' : '<span class="text-[10px] px-1.5 py-0.5 rounded bg-gray-800 text-gray-500">not logged in</span>'} </div>`; } html += `</div></div>`; } // Token sources html += `<div class="card p-4"> <div class="text-[10px] text-gray-500 uppercase mb-2"><i class="fas fa-key mr-1"></i>Credential Sources</div> <div class="space-y-1 text-xs">`; if (tokens.GITHUB_TOKEN) html += `<div class="text-green-400"><i class="fas fa-check mr-1"></i>GITHUB_TOKEN (env var)</div>`; else if (tokens.GIT_CREDENTIAL) html += `<div class="text-green-400"><i class="fas fa-check mr-1"></i>GitHub token (git credential helper)</div>`; else html += `<div class="text-red-400/60"><i class="fas fa-xmark mr-1"></i>No GitHub token found</div>`; if (tokens.ANTHROPIC_API_KEY) html += `<div class="text-green-400"><i class="fas fa-check mr-1"></i>ANTHROPIC_API_KEY (env var)</div>`; else if (cliAuth.claude) html += `<div class="text-green-400"><i class="fas fa-check mr-1"></i>Claude Code subscription (CLI auth)</div>`; else html += `<div class="text-gray-500"><i class="fas fa-info-circle mr-1"></i>No Anthropic API key (Claude CLI can be used instead)</div>`; html += `</div></div>`; // Actions html += `<div class="flex gap-2"> <button onclick="autoConfigureSetup()" class="text-xs px-4 py-2 rounded bg-orange-600 hover:bg-orange-700 text-white"><i class="fas fa-wand-magic mr-1"></i>Auto-Configure</button> <button onclick="reloadConfig()" class="text-xs px-4 py-2 rounded bg-gray-700 hover:bg-gray-600 text-gray-300"><i class="fas fa-rotate mr-1"></i>Reload</button> <button onclick="enterLocalMode()" class="text-xs px-4 py-2 rounded bg-gray-800 hover:bg-gray-700 text-gray-400"><i class="fas fa-laptop mr-1"></i>Local Mode</button> </div>`; html += `</div>`; pane.innerHTML = html; } function enterLocalMode() { const pane = document.getElementById('pane-inbox'); pane.innerHTML = `<div class="card p-6 max-w-2xl mx-auto 
mt-4"> <div class="flex items-center gap-3 mb-3"> <i class="fas fa-laptop text-blue-400 text-lg"></i> <h2 class="text-sm font-bold text-white">Local Mode</h2> </div> <p class="text-xs text-gray-400 mb-3">These features work without provider credentials:</p> <div class="grid grid-cols-2 gap-2"> <button onclick="switchTab('budget')" class="card p-3 text-left hover:bg-gray-900"><div class="text-xs text-white"><i class="fas fa-wallet text-orange-400 mr-1"></i>Budget</div><div class="text-[10px] text-gray-500">Track token spend</div></button> <button onclick="switchTab('routing')" class="card p-3 text-left hover:bg-gray-900"><div class="text-xs text-white"><i class="fas fa-route text-blue-400 mr-1"></i>Model Routing</div><div class="text-[10px] text-gray-500">Performance heatmap</div></button> <button onclick="switchTab('process')" class="card p-3 text-left hover:bg-gray-900"><div class="text-xs text-white"><i class="fas fa-chart-line text-green-400 mr-1"></i>Process</div><div class="text-[10px] text-gray-500">Events + knowledge graph</div></button> <button onclick="switchTab('forge')" class="card p-3 text-left hover:bg-gray-900"><div class="text-xs text-white"><i class="fas fa-hammer text-yellow-400 mr-1"></i>Forge</div><div class="text-[10px] text-gray-500">MCP tool gaps</div></button> </div> <button onclick="loadAll()" class="mt-3 text-[10px] text-gray-500 hover:text-white"><i class="fas fa-arrow-left mr-1"></i>Back to setup</button> </div>`; } async function reloadConfig() { try { const result = await api('/setup/reload', { method: 'POST' }); if (result.mode === 'full') { loadAll(); } else { const status = await api('/setup/status'); showSetupWizard(status); } } catch (e) { alert('Reload failed: ' + e.message); } } async function autoConfigureSetup() { const btn = event.target; btn.innerHTML = '<i class="fas fa-spinner fa-spin mr-1"></i>Discovering...'; btn.disabled = true; try { const result = await api('/setup/auto-configure', { method: 'POST' }); if (result.mode 
=== 'full') { loadAll(); } else { const status = await api('/setup/status'); showSetupWizard(status); } } catch (e) { alert('Auto-configure failed: ' + e.message); btn.innerHTML = '<i class="fas fa-wand-magic mr-1"></i>Auto-Configure'; btn.disabled = false; } } // ── Init ──────────────────────────────────────────────────────────────── async function loadAll() { try { const h = await api('/health'); document.getElementById('org-badge').textContent = `${h.org} · ${h.provider} · ${h.codebases} codebases`; } catch {} const ready = await checkSetup(); if (ready) switchTab(CURRENT_TAB); } loadAll(); ================================================ FILE: maggy/maggy/static/index.html ================================================ <!doctype html> <html lang="en"> <head> <meta charset="utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <title>Maggy

Maggy

v0.1.0

Task

================================================ FILE: maggy/pyproject.toml ================================================ [build-system] requires = ["setuptools>=68", "wheel"] build-backend = "setuptools.build_meta" [project] name = "maggy" version = "0.1.0" description = "Generic AI engineering command center — part of the Maggy platform" readme = "README.md" requires-python = ">=3.11" license = { text = "MIT" } authors = [{ name = "Maggy Contributors" }] dependencies = [ "fastapi>=0.115", "uvicorn[standard]>=0.30", "httpx>=0.27", "anthropic>=0.40", "bcrypt>=4.1", "email-validator>=2.0", "pyyaml>=6.0", "feedparser>=6.0", "pydantic>=2.6", "typer>=0.12", "rich>=13.0", ] [project.scripts] maggy = "maggy.cli:app" [tool.setuptools.packages.find] where = ["."] include = ["maggy*"] ================================================ FILE: maggy/tests/conftest.py ================================================ """Shared test fixtures for Maggy test suite.""" from __future__ import annotations import tempfile from pathlib import Path from unittest.mock import MagicMock import pytest from maggy.config import ( BudgetConfig, DashboardConfig, MaggyConfig, MeshConfig, OrgConfig, RoutingConfig, StorageConfig, ) @pytest.fixture def tmp_dir(tmp_path: Path) -> Path: return tmp_path @pytest.fixture def mock_cfg(tmp_path: Path) -> MaggyConfig: """Minimal MaggyConfig pointing to tmp storage.""" return MaggyConfig( org=OrgConfig(name="test-org"), storage=StorageConfig(path=str(tmp_path / "store.db")), dashboard=DashboardConfig(), budget=BudgetConfig(daily_limit_usd=10.0), routing=RoutingConfig(), mesh=MeshConfig(), ) ================================================ FILE: maggy/tests/integration/__init__.py ================================================ """Integration tests for cross-module flows.""" ================================================ FILE: maggy/tests/integration/test_full_task_flow.py ================================================ """Integration test: Ticket -> 
Route -> Execute -> Reward. Tests the full lifecycle of a task through routing, event emission, and reward recording. """ from __future__ import annotations from pathlib import Path from maggy.event_spine.emitter import EventEmitter from maggy.event_spine.events import ( ExecutionEvent, IntentEvent, OutcomeEvent, ) from maggy.event_spine.store import EventStore from maggy.routing import RoutingContext, RoutingService from maggy.scores import MIN_SAMPLES class TestFullTaskFlow: def test_route_emit_reward(self, mock_cfg, tmp_path: Path): """Full flow: route task, emit events, record reward.""" # 1. Route the task router = RoutingService(mock_cfg) ctx = RoutingContext( blast_score=5, task_type="feature", ) decision = router.route(ctx) name = ( decision.primary if isinstance(decision.primary, str) else decision.primary.name ) assert name # Got a routing decision # 2. Emit events through the spine store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) intent = IntentEvent( intent_text="Add user dashboard", decomposed_steps=["create component", "add api"], ) intent.header.task_id = "task-123" emitter.emit(intent) exec_evt = ExecutionEvent( tool_name="code_edit", duration_ms=500, success=True, ) exec_evt.header.task_id = "task-123" emitter.emit(exec_evt) outcome = OutcomeEvent(success=True, reward=0.85) outcome.header.task_id = "task-123" emitter.emit(outcome) # 3. Verify trace trace = emitter.trace("task-123") assert len(trace) == 3 # 4. 
Record reward for learning router.record_outcome(name, "feature", 5, 0.85) heatmap = router.get_heatmap() assert len(heatmap) >= 1 def test_multi_task_routing(self, mock_cfg): """Route multiple tasks, verify different tiers.""" router = RoutingService(mock_cfg) low = router.route(RoutingContext(blast_score=1)) high = router.route(RoutingContext(blast_score=9)) low_name = ( low.primary if isinstance(low.primary, str) else low.primary.name ) high_name = ( high.primary if isinstance(high.primary, str) else high.primary.name ) # Low should be cheaper, high should be premium assert low_name != high_name or low_name == "claude" ================================================ FILE: maggy/tests/integration/test_model_fallback.py ================================================ """Integration test: Quota -> Checkpoint -> Switch -> Continue. Tests fatigue-based checkpointing and model switching. """ from __future__ import annotations from maggy.fatigue import create_profile from maggy.services.checkpoint import Checkpoint, create_checkpoint class TestModelFallback: def test_fatigue_triggers_checkpoint(self): """When fatigue is high, checkpoint and switch.""" profile = create_profile("claude") profile.tokens_used = 170_000 profile.turns = 40 assert profile.should_checkpoint() # Create checkpoint cp = create_checkpoint( goal="Refactor auth module", progress=["Extracted interface", "Updated tests"], model="claude", working_state="Mid-refactor, 3 files changed", files=["auth.py", "test_auth.py"], ) # Serialize for handoff data = cp.serialize() restored = Checkpoint.deserialize(data) assert restored.goal == "Refactor auth module" assert restored.source_model == "claude" # Generate prompt for next model prompt = restored.to_prompt() assert "Refactor auth module" in prompt assert "Mid-refactor" in prompt def test_cross_model_checkpoint_round_trip(self): """Checkpoint survives serialization across models.""" cp = create_checkpoint( goal="Fix API pagination", progress=["Found bug in 
offset calc"], model="gpt", constraints=["Don't break existing tests"], files=["api/routes.py"], ) # Simulate model switch: serialize -> transfer -> restore serialized = cp.serialize() new_model_cp = Checkpoint.deserialize(serialized) assert new_model_cp.source_model == "gpt" prompt = new_model_cp.to_prompt() assert "Don't break existing tests" in prompt def test_fresh_model_low_fatigue(self): """A fresh model should not be fatigued.""" profile = create_profile("kimi") assert not profile.should_checkpoint() assert profile.fatigue_score == 0.0 ================================================ FILE: maggy/tests/integration/test_process_loop.py ================================================ """Integration test: CI fail -> Signal -> Pattern -> Fix. Tests the process intelligence pipeline with CIKG and Engram. """ from __future__ import annotations from pathlib import Path from maggy.cikg.graph import KnowledgeGraphService from maggy.cikg.models import Edge, Node from maggy.cikg.queries import find_gaps, get_landscape from maggy.engram.diagnostics import diagnose from maggy.engram.record import EngramRecord from maggy.engram.retrieval import EngramRetrieval from maggy.engram.store import EngramStore from maggy.lexon.router import LexonRouter class TestProcessLoop: def test_cikg_gap_to_engram(self, tmp_path: Path): """Detect feature gap in CIKG, store in Engram.""" # 1. Build competitive landscape g = KnowledgeGraphService(tmp_path / "cikg.db") for i in range(3): g.add_node(Node( id=f"c{i}", node_type="competitor", name=f"Competitor{i}", )) g.add_node(Node( id="f1", node_type="feature", name="SSO", )) g.add_edge(Edge("c0", "f1", "has_feature")) # 2. Detect gap score = find_gaps(g, "SSO") assert score.gap_count == 2 # 3. Store insight in Engram store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="gap-sso", namespace="process", memory_type="decision", content=f"Gap detected: {score.recommendation}", tags=["cikg", "gap", "sso"], )) # 4. 
Verify retrieval retrieval = EngramRetrieval(store) results = retrieval.by_tag("cikg") assert len(results) == 1 assert "Gap detected" in results[0].content def test_lexon_to_engram(self, tmp_path: Path): """Parse intent with Lexon, store in Engram.""" # 1. Parse user intent router = LexonRouter() record = router.route("deploy the app to production") assert record.confidence > 0.5 # 2. Store the resolution in Engram store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="intent-deploy", namespace="session-1", memory_type="fact", content=f"User said '{record.phrase}' -> " f"{record.resolved_tool}", tags=["lexon", "intent"], )) # 3. Verify result = store.get("intent-deploy") assert result is not None assert "deploy" in result.content def test_full_diagnostics(self, tmp_path: Path): """Memory diagnostics across diverse types.""" store = EngramStore(tmp_path / "engram.db") types = ["fact", "decision", "code_ref", "handoff"] for i, mt in enumerate(types): store.write(EngramRecord( engram_id=f"e{i}", namespace="test", memory_type=mt, content=f"Content for {mt}", )) profile = diagnose(store) assert profile.total_memories == 4 assert profile.health_score > 0.8 ================================================ FILE: maggy/tests/test_account_guide.py ================================================ """Tests for account switching guidance.""" from __future__ import annotations from maggy.services.account_guide import ( AccountProfile, detect_accounts, suggest_switch, ) def test_account_profile_dataclass(): """AccountProfile stores provider and auth command.""" p = AccountProfile( name="claude-work", provider="anthropic", auth_command="claude auth login", ) assert p.provider == "anthropic" assert "login" in p.auth_command def test_detect_accounts_finds_claude(tmp_path): """Detects Claude accounts from ~/.claude directory.""" (tmp_path / ".claude").mkdir() (tmp_path / ".claude" / "credentials.json").write_text("{}") accounts = 
detect_accounts(home=tmp_path) providers = [a.provider for a in accounts] assert "anthropic" in providers def test_detect_accounts_finds_codex(tmp_path): """Detects Codex accounts from ~/.codex directory.""" (tmp_path / ".codex").mkdir() accounts = detect_accounts(home=tmp_path) providers = [a.provider for a in accounts] assert "openai" in providers def test_suggest_switch_anthropic(): """Suggests claude auth login for anthropic quota hit.""" guide = suggest_switch("anthropic") assert "claude" in guide.lower() assert "login" in guide.lower() or "auth" in guide.lower() def test_suggest_switch_openai(): """Suggests codex auth for openai quota hit.""" guide = suggest_switch("openai") assert "codex" in guide.lower() or "openai" in guide.lower() ================================================ FILE: maggy/tests/test_activity.py ================================================ """Tests for CLI activity scanner.""" from __future__ import annotations import json from pathlib import Path from unittest.mock import patch import pytest from maggy.services.activity import ( ActiveSession, ActivityService, RecentPrompt, _parse_claude_processes, _recent_prompts, ) class TestParseClaudeProcesses: def test_detects_running_session(self): lines = [ "user 1234 0.0 0.1 claude --dangerously-skip-permissions --continue", ] with patch( "maggy.services.activity._get_cwd", return_value="/Users/me/proj-a", ): sessions = _parse_claude_processes(lines) assert len(sessions) == 1 assert sessions[0].cli == "claude" assert sessions[0].pid == 1234 assert sessions[0].status == "running" assert sessions[0].project == "proj-a" def test_detects_agent_subprocess(self): lines = [ "user 5678 0.1 0.3 /path/to/claude " "--agent-id be-schema@maia-demo " "--agent-name be-schema " "--team-name maia-demo " "--parent-session-id abc-123", ] with patch( "maggy.services.activity._get_cwd", return_value="/Users/me/proj-b", ): sessions = _parse_claude_processes(lines) assert len(sessions) == 1 s = sessions[0] assert 
s.status == "agent" assert s.agent_name == "be-schema" assert s.team_name == "maia-demo" def test_ignores_non_cli_processes(self): lines = [ "user 9999 0.0 0.0 /Applications/Claude.app/Contents/MacOS/Claude", "user 8888 0.0 0.0 grep claude", ] sessions = _parse_claude_processes(lines) assert sessions == [] def test_empty_input(self): assert _parse_claude_processes([]) == [] class TestRecentPrompts: def test_reads_claude_history(self, tmp_path: Path): history = tmp_path / "history.jsonl" entries = [ {"display": "fix the bug", "timestamp": 1000, "project": "/Users/me/app", "sessionId": "s1"}, {"display": "run tests", "timestamp": 2000, "project": "/Users/me/app", "sessionId": "s1"}, ] history.write_text( "\n".join(json.dumps(e) for e in entries) + "\n", ) prompts = _recent_prompts( claude_dir=tmp_path, codex_dir=tmp_path / "none", kimi_dir=tmp_path / "none2", limit=5, ) assert len(prompts) == 2 assert prompts[0].text == "run tests" assert prompts[0].cli == "claude" assert prompts[0].project == "app" def test_reads_codex_history(self, tmp_path: Path): history = tmp_path / "history.jsonl" entries = [ {"session_id": "c1", "ts": 3000, "text": "deploy it"}, ] history.write_text( "\n".join(json.dumps(e) for e in entries) + "\n", ) prompts = _recent_prompts( claude_dir=tmp_path / "none", codex_dir=tmp_path, kimi_dir=tmp_path / "none2", limit=5, ) assert len(prompts) == 1 assert prompts[0].cli == "codex" assert prompts[0].text == "deploy it" def test_merges_and_sorts_by_time(self, tmp_path: Path): claude_dir = tmp_path / "claude" codex_dir = tmp_path / "codex" claude_dir.mkdir() codex_dir.mkdir() (claude_dir / "history.jsonl").write_text( json.dumps({"display": "old", "timestamp": 1000, "project": "/p", "sessionId": "s"}) + "\n", ) (codex_dir / "history.jsonl").write_text( json.dumps({"session_id": "c1", "ts": 5000, "text": "new"}) + "\n", ) prompts = _recent_prompts( claude_dir=claude_dir, codex_dir=codex_dir, kimi_dir=tmp_path / "none", limit=5, ) assert prompts[0].text == 
"new" assert prompts[1].text == "old" def test_limits_output(self, tmp_path: Path): history = tmp_path / "history.jsonl" lines = [] for i in range(20): lines.append(json.dumps({ "display": f"msg-{i}", "timestamp": i * 1000, "project": "/p", "sessionId": "s", })) history.write_text("\n".join(lines) + "\n") prompts = _recent_prompts( claude_dir=tmp_path, codex_dir=tmp_path / "x", kimi_dir=tmp_path / "y", limit=5, ) assert len(prompts) == 5 def test_no_history_files(self, tmp_path: Path): prompts = _recent_prompts( claude_dir=tmp_path / "a", codex_dir=tmp_path / "b", kimi_dir=tmp_path / "c", limit=5, ) assert prompts == [] def test_malformed_json_skipped(self, tmp_path: Path): history = tmp_path / "history.jsonl" history.write_text( "not-json\n" + json.dumps({"display": "ok", "timestamp": 1000, "project": "/p", "sessionId": "s"}) + "\n", ) prompts = _recent_prompts( claude_dir=tmp_path, codex_dir=tmp_path / "x", kimi_dir=tmp_path / "y", limit=5, ) assert len(prompts) == 1 assert prompts[0].text == "ok" class TestActivityService: def test_get_activity_shape(self): svc = ActivityService() with patch( "maggy.services.activity._scan_processes", return_value=[], ), patch( "maggy.services.activity._recent_prompts", return_value=[], ): result = svc.get_activity() assert "sessions" in result assert "recent" in result def test_serializes_sessions(self): session = ActiveSession( cli="claude", session_id="", project="myapp", project_path="/Users/me/myapp", status="running", last_prompt="fix bug", agent_name="", team_name="", pid=1234, ) svc = ActivityService() with patch( "maggy.services.activity._scan_processes", return_value=[session], ), patch( "maggy.services.activity._recent_prompts", return_value=[], ): result = svc.get_activity() assert len(result["sessions"]) == 1 s = result["sessions"][0] assert s["cli"] == "claude" assert s["project"] == "myapp" assert s["pid"] == 1234 def test_serializes_prompts(self): prompt = RecentPrompt( cli="codex", text="deploy", project="api", 
timestamp="2026-05-10T12:00:00", session_id="c1", ) svc = ActivityService() with patch( "maggy.services.activity._scan_processes", return_value=[], ), patch( "maggy.services.activity._recent_prompts", return_value=[prompt], ): result = svc.get_activity() assert len(result["recent"]) == 1 assert result["recent"][0]["text"] == "deploy" ================================================ FILE: maggy/tests/test_api_endpoints.py ================================================ """Full API endpoint validation tests. Creates a real FastAPI app with all services wired in (using tmp directories for SQLite) and validates every endpoint from all 14 phases. """ from __future__ import annotations from pathlib import Path from types import SimpleNamespace import pytest from fastapi import FastAPI from fastapi.testclient import TestClient from maggy.budget import BudgetManager from maggy.cikg.graph import KnowledgeGraphService from maggy.cikg.models import Edge, Node from maggy.config import ( BudgetConfig, DashboardConfig, MaggyConfig, MeshConfig, OrgConfig, RoutingConfig, StorageConfig, ) from maggy.deploy import DeployService from maggy.engram.record import EngramRecord from maggy.engram.store import EngramStore from maggy.event_spine.emitter import EventEmitter from maggy.event_spine.events import IntentEvent from maggy.event_spine.store import EventStore from maggy.forge.connector import ForgeConnector from maggy.lexon.router import LexonRouter from maggy.mesh.manager import MeshManager from maggy.mesh.store import MeshStore from maggy.planning import PlanningService from maggy.history.service import HistoryService from maggy.improve.service import Introspector from maggy.routing import RoutingService @pytest.fixture def app_with_services(tmp_path: Path) -> FastAPI: """Build a FastAPI app with all services wired.""" cfg = MaggyConfig( org=OrgConfig(name="test-org", domain="devtools"), storage=StorageConfig(path=str(tmp_path / "store.db")), 
dashboard=DashboardConfig(auth_mode="local"), budget=BudgetConfig(daily_limit_usd=10.0), routing=RoutingConfig(), mesh=MeshConfig(enabled=True), ) app = FastAPI() app.state.cfg = cfg app.state.configured = True app.state.mode = "local" # Wire all services app.state.budget = BudgetManager(cfg) app.state.routing = RoutingService(cfg) app.state.events = EventEmitter( EventStore(tmp_path / "events.db"), ) app.state.cikg = KnowledgeGraphService( tmp_path / "cikg.db", ) app.state.planning = PlanningService(cfg) app.state.deploy = DeployService() app.state.forge = ForgeConnector( forge_path=tmp_path / "fake-forge", ) app.state.engram = EngramStore(tmp_path / "engram.db") app.state.lexon = LexonRouter() mesh_store = MeshStore(tmp_path / "mesh.db") mesh_cfg = SimpleNamespace( peer_id="test-peer", org_key_secret="secret", port=8080, tunnel_url="", git_discovery=False, ) mgr = MeshManager(mesh_cfg, mesh_store) mgr.add_network("test-org") app.state.mesh = mgr app.state.history = HistoryService( db_path=tmp_path / "history.db", cli_dirs={ "claude": tmp_path / "no_claude", "codex": tmp_path / "no_codex", "kimi": tmp_path / "no_kimi", }, ) app.state.introspector = Introspector(app.state) app.state.heartbeat = None # Register all routers from maggy.api.routes import router as r_api from maggy.api.routes_budget import router as r_budget from maggy.api.routes_cikg import router as r_cikg from maggy.api.routes_deploy import router as r_deploy from maggy.api.routes_engram import router as r_engram from maggy.api.routes_events import router as r_events from maggy.api.routes_forge import router as r_forge from maggy.api.routes_heartbeat import router as r_heartbeat from maggy.api.routes_history import router as r_history from maggy.api.routes_improve import router as r_improve from maggy.api.routes_lexon import router as r_lexon from maggy.api.routes_mesh import router as r_mesh from maggy.api.routes_planning import router as r_plan from maggy.api.routes_routing import router as 
r_routing from maggy.api.routes_setup import router as r_setup from maggy.api.routes_users import router as r_users for r in ( r_api, r_budget, r_cikg, r_deploy, r_engram, r_events, r_forge, r_heartbeat, r_history, r_improve, r_lexon, r_mesh, r_plan, r_routing, r_setup, r_users, ): app.include_router(r) return app @pytest.fixture def client(app_with_services: FastAPI) -> TestClient: return TestClient(app_with_services) # ── Phase 1: Budget ───────────────────────────────────── class TestBudgetAPI: def test_get_budget_empty(self, client: TestClient): resp = client.get("/api/budget") assert resp.status_code == 200 data = resp.json() assert "daily_limit_usd" in data assert "spent_today_usd" in data assert data["spent_today_usd"] == 0.0 def test_budget_by_provider_empty(self, client: TestClient): resp = client.get("/api/budget/by-provider") assert resp.status_code == 200 assert resp.json() == [] def test_budget_with_spend( self, app_with_services: FastAPI, ): mgr = app_with_services.state.budget mgr.record_spend("anthropic", "claude", 2.5) mgr.record_spend("openai", "gpt-4", 1.0) c = TestClient(app_with_services) resp = c.get("/api/budget") data = resp.json() assert data["spent_today_usd"] == 3.5 resp = c.get("/api/budget/by-provider") providers = { r["provider"]: r["spent_usd"] for r in resp.json() } assert providers["anthropic"] == 2.5 assert providers["openai"] == 1.0 # ── Phase 2: Routing ──────────────────────────────────── class TestRoutingAPI: def test_heatmap_empty(self, client: TestClient): resp = client.get("/api/routing/heatmap") assert resp.status_code == 200 assert resp.json() == [] def test_decide_low_blast(self, client: TestClient): resp = client.get( "/api/routing/decide?blast=1&task_type=bugfix", ) assert resp.status_code == 200 data = resp.json() assert "primary" in data assert "reason" in data def test_decide_high_blast(self, client: TestClient): resp = client.get( "/api/routing/decide?blast=9&task_type=feature", ) data = resp.json() assert 
data["primary"] is not None def test_heatmap_after_recording( self, app_with_services: FastAPI, ): svc = app_with_services.state.routing svc.record_outcome("claude", "feature", 5, 0.9) c = TestClient(app_with_services) resp = c.get("/api/routing/heatmap") assert len(resp.json()) >= 1 class TestUsersAPI: def test_create_user(self, client: TestClient): resp = client.post( "/api/users", json={"email": "user@example.com", "password": "secret123"}, ) assert resp.status_code == 201 data = resp.json() assert data["email"] == "user@example.com" assert "password_hash" not in data # ── Phase 14: Event Spine ─────────────────────────────── class TestEventsAPI: def test_events_empty(self, client: TestClient): resp = client.get("/api/events") assert resp.status_code == 200 assert resp.json() == [] def test_event_count_empty(self, client: TestClient): resp = client.get("/api/events/count") assert resp.status_code == 200 assert resp.json()["count"] == 0 def test_trace_empty(self, client: TestClient): resp = client.get("/api/events/trace/nope") assert resp.status_code == 200 assert resp.json() == [] def test_events_after_emit( self, app_with_services: FastAPI, ): emitter = app_with_services.state.events evt = IntentEvent( intent_text="Add login", decomposed_steps=["create form", "add auth"], ) evt.header.task_id = "t-1" emitter.emit(evt) c = TestClient(app_with_services) resp = c.get("/api/events?task_id=t-1") assert len(resp.json()) == 1 resp = c.get("/api/events/trace/t-1") assert len(resp.json()) == 1 resp = c.get("/api/events/count") assert resp.json()["count"] == 1 # ── Phase 4: CIKG ─────────────────────────────────────── class TestCIKGAPI: def test_landscape_empty(self, client: TestClient): resp = client.get("/api/cikg/landscape") assert resp.status_code == 200 data = resp.json() assert data["competitors"] == 0 def test_gaps_no_feature(self, client: TestClient): resp = client.get("/api/cikg/gaps/SSO") assert resp.status_code == 200 data = resp.json() assert "gap_count" in 
data def test_landscape_with_data( self, app_with_services: FastAPI, ): graph = app_with_services.state.cikg graph.add_node(Node( id="c1", node_type="competitor", name="Rival", )) graph.add_node(Node( id="f1", node_type="feature", name="SSO", )) graph.add_edge(Edge("c1", "f1", "has_feature")) c = TestClient(app_with_services) resp = c.get("/api/cikg/landscape") data = resp.json() assert data["competitors"] == 1 assert data["features_tracked"] == 1 resp = c.get("/api/cikg/gaps/SSO") data = resp.json() assert data["feature"] == "SSO" # ── Phase 6: Planning ─────────────────────────────────── class TestPlanningAPI: def test_single_plan(self, client: TestClient): resp = client.post( "/api/planning/generate", json={"task": "Add auth", "blast_score": 2}, ) assert resp.status_code == 200 data = resp.json() assert data["mode"] == "single" assert len(data["plan"]["steps"]) == 3 def test_dual_plan(self, client: TestClient): resp = client.post( "/api/planning/generate", json={"task": "Refactor core", "blast_score": 7}, ) data = resp.json() assert data["mode"] == "dual" assert "diff" in data # ── Phase 7: Deploy ───────────────────────────────────── class TestDeployAPI: def test_sessions_empty(self, client: TestClient): resp = client.get("/api/deploy/sessions") assert resp.status_code == 200 assert resp.json()["sessions"] == [] def test_create_and_get(self, client: TestClient): resp = client.post( "/api/deploy/sessions", json={"project": "web", "branch": "feat-x"}, ) assert resp.status_code == 200 data = resp.json() sid = data["session_id"] assert data["status"] == "building" resp = client.get(f"/api/deploy/sessions/{sid}") assert resp.json()["project"] == "web" def test_missing_session(self, client: TestClient): resp = client.get("/api/deploy/sessions/nope") data = resp.json() assert data.get("error") == "session not found" # ── Phase 9: Forge ────────────────────────────────────── class TestForgeAPI: def test_forge_status(self, client: TestClient): resp = 
client.get("/api/forge/status") assert resp.status_code == 200 data = resp.json() assert "available" in data assert "registry_count" in data def test_forge_search(self, client: TestClient): resp = client.get("/api/forge/search?q=test") assert resp.status_code == 200 assert "results" in resp.json() def test_forge_gaps_empty(self, client: TestClient): resp = client.get("/api/forge/gaps") assert resp.status_code == 200 assert resp.json()["gaps"] == [] def test_report_gap(self, client: TestClient): resp = client.post( "/api/forge/gaps", json={"capability": "slack-notify"}, ) assert resp.status_code == 200 data = resp.json() assert data["capability"] == "slack-notify" resp = client.get("/api/forge/gaps") gaps = resp.json()["gaps"] assert len(gaps) == 1 # ── Phase 12: Engram ──────────────────────────────────── class TestEngramAPI: def test_query_empty(self, client: TestClient): resp = client.get("/api/engram/query") assert resp.status_code == 200 assert resp.json()["records"] == [] def test_diagnostics_empty(self, client: TestClient): resp = client.get("/api/engram/diagnostics") assert resp.status_code == 200 data = resp.json() assert "total_memories" in data def test_query_with_data( self, app_with_services: FastAPI, ): store = app_with_services.state.engram store.write(EngramRecord( engram_id="e1", namespace="test", memory_type="fact", content="Test memory", tags=["test"], )) c = TestClient(app_with_services) resp = c.get("/api/engram/query?namespace=test") records = resp.json()["records"] assert len(records) == 1 assert records[0]["content"] == "Test memory" def test_diagnostics_with_data( self, app_with_services: FastAPI, ): store = app_with_services.state.engram store.write(EngramRecord( engram_id="e2", namespace="test", memory_type="decision", content="Chose X over Y", )) c = TestClient(app_with_services) resp = c.get("/api/engram/diagnostics") data = resp.json() assert data["total_memories"] >= 1 # ── Phase 13: Lexon ───────────────────────────────────── class 
TestLexonAPI: def test_parse_known(self, client: TestClient): resp = client.get("/api/lexon/parse?q=deploy") assert resp.status_code == 200 data = resp.json() assert "resolved_tool" in data assert data["confidence"] > 0 def test_parse_unknown(self, client: TestClient): resp = client.get( "/api/lexon/parse?q=xyzzy_unknown_phrase", ) data = resp.json() assert data["resolved_tool"] == "" def test_learn(self, client: TestClient): resp = client.post( "/api/lexon/learn", json={"phrase": "ship it", "tool": "deploy"}, ) assert resp.status_code == 200 assert resp.json()["status"] == "learned" resp = client.get("/api/lexon/parse?q=ship+it") data = resp.json() assert data["resolved_tool"] == "deploy" # ── Phase 11: Mesh ────────────────────────────────────── class TestMeshAPI: def test_mesh_status_enabled(self, client: TestClient): resp = client.get("/api/mesh/status") assert resp.status_code == 200 data = resp.json() assert data["enabled"] is True assert data["peers"] == 0 assert "networks" in data def test_mesh_peers_empty(self, client: TestClient): resp = client.get("/api/mesh/peers") assert resp.status_code == 200 assert resp.json()["peers"] == [] def test_mesh_networks(self, client: TestClient): resp = client.get("/api/mesh/networks") assert resp.status_code == 200 nets = resp.json()["networks"] assert len(nets) == 1 assert nets[0]["org"] == "test-org" def test_mesh_quarantine_requires_org( self, client: TestClient, ): resp = client.get("/api/mesh/quarantine") assert resp.status_code == 422 assert "error" in resp.json() def test_mesh_quarantine_with_org( self, client: TestClient, ): resp = client.get("/api/mesh/quarantine?org=test-org") assert resp.status_code == 200 assert resp.json()["items"] == [] def test_mesh_add_peer(self, client: TestClient): resp = client.post( "/api/mesh/peers", json={ "org": "test-org", "peer_id": "p1", "name": "remote", "address": "ws://x", }, ) assert resp.json()["status"] == "added" resp = client.get("/api/mesh/peers?org=test-org") assert 
len(resp.json()["peers"]) == 1 # ── Unconfigured state ────────────────────────────────── class TestUnconfiguredState: """Verify graceful behavior when services are None.""" @pytest.fixture def unconfigured_client(self) -> TestClient: app = FastAPI() app.state.cfg = MaggyConfig() app.state.configured = False app.state.budget = None app.state.routing = None app.state.events = None app.state.cikg = None app.state.planning = None app.state.deploy = None app.state.forge = None app.state.engram = None app.state.lexon = None app.state.mesh = None from maggy.api.routes_budget import router as r1 from maggy.api.routes_cikg import router as r2 from maggy.api.routes_deploy import router as r3 from maggy.api.routes_engram import router as r4 from maggy.api.routes_events import router as r5 from maggy.api.routes_forge import router as r6 from maggy.api.routes_lexon import router as r7 from maggy.api.routes_mesh import router as r8 from maggy.api.routes_planning import router as r9 from maggy.api.routes_routing import router as r0 for r in (r1, r2, r3, r4, r5, r6, r7, r8, r9, r0): app.include_router(r) return TestClient(app) def test_budget_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/budget") assert resp.status_code == 200 assert resp.json()["status"] == "unconfigured" def test_routing_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/routing/heatmap") assert resp.status_code == 200 assert resp.json() == [] def test_events_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/events") assert resp.json() == [] def test_mesh_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/mesh/status") data = resp.json() assert data["enabled"] is False def test_engram_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/engram/query") assert "error" in resp.json() def 
test_lexon_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/lexon/parse?q=hi") assert "error" in resp.json() def test_deploy_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/deploy/sessions") assert "error" in resp.json() def test_forge_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/forge/status") assert "error" in resp.json() def test_planning_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.post( "/api/planning/generate", json={"task": "test"}, ) assert "error" in resp.json() def test_cikg_unconfigured( self, unconfigured_client: TestClient, ): resp = unconfigured_client.get("/api/cikg/landscape") assert "error" in resp.json() # --- History Endpoint Tests --- class TestHistoryEndpoints: """Tests for /api/history/* endpoints.""" def test_providers(self, client: TestClient): resp = client.get("/api/history/providers") assert resp.status_code == 200 assert "providers" in resp.json() def test_analyze(self, client: TestClient): resp = client.post("/api/history/analyze") assert resp.status_code == 200 data = resp.json() assert "total_sessions" in data assert "total_prompts" in data def test_report_empty(self, client: TestClient): resp = client.get("/api/history/report") assert resp.status_code == 200 def test_sessions(self, client: TestClient): # First analyze to populate client.post("/api/history/analyze") resp = client.get("/api/history/sessions") assert resp.status_code == 200 assert "sessions" in resp.json() def test_sessions_filter(self, client: TestClient): resp = client.get( "/api/history/sessions?provider=claude", ) assert resp.status_code == 200 # --- Discovery + Enhanced Health --- class TestDiscoveryEndpoint: def test_discovery_returns_data( self, client: TestClient, ): resp = client.get("/api/discovery") assert resp.status_code == 200 data = resp.json() assert "clis" in data assert "repos" in 
data assert "tokens" in data def test_health_has_mode( self, client: TestClient, ): resp = client.get("/api/health") assert resp.status_code == 200 data = resp.json() assert "mode" in data assert data["mode"] in ("full", "local") # --- Heartbeat Endpoint Tests --- class TestHeartbeatEndpoints: def test_status_no_scheduler(self, client: TestClient): resp = client.get("/api/heartbeat/status") assert resp.status_code == 200 assert resp.json() == [] def test_trigger_no_scheduler(self, client: TestClient): resp = client.post("/api/heartbeat/trigger/nope") assert resp.status_code == 503 def test_status_with_scheduler( self, app_with_services: FastAPI, ): from maggy.heartbeat.scheduler import HeartbeatScheduler from unittest.mock import AsyncMock sched = HeartbeatScheduler() sched.register("test_job", AsyncMock(), 60) app_with_services.state.heartbeat = sched c = TestClient(app_with_services) resp = c.get("/api/heartbeat/status") assert resp.status_code == 200 data = resp.json() assert len(data) == 1 assert data[0]["name"] == "test_job" # --- Self-Improvement Endpoint Tests --- class TestImproveEndpoints: def test_report_empty(self, client: TestClient): resp = client.get("/api/improve/report") assert resp.status_code == 200 assert resp.json()["report"] is None def test_analyze_returns_report( self, client: TestClient, ): resp = client.post("/api/improve/analyze") assert resp.status_code == 200 data = resp.json() assert "report" in data report = data["report"] assert "generated_at" in report assert "recommendations" in report def test_report_after_analyze( self, client: TestClient, ): client.post("/api/improve/analyze") resp = client.get("/api/improve/report") data = resp.json() assert data["report"] is not None ================================================ FILE: maggy/tests/test_benchmark_scenario.py ================================================ """Benchmark scenario — simulate a 10-task sprint across 3 models. Measures Maggy's effectiveness at: 1. 
Routing accuracy — correct model for each complexity tier 2. Budget efficiency — spend distribution across providers 3. Fallback resilience — recovery when models hit quota 4. Fatigue awareness — detects and reacts to context overload 5. Lock safety — prevents file clobbering between agents 6. Escalation — auto-escalates repeated failures 7. Checkpoint continuity — survives model handoff 8. Calibration learning — penalizes bad models over time 9. Dual planning — counter-checks high-blast tasks 10. Observability — signals recorded for all activity """ from __future__ import annotations from unittest.mock import AsyncMock import pytest from maggy.adapters.pi import PiAdapter, RunResult from maggy.budget import BudgetManager from maggy.calibration.tracker import CalibrationTracker from maggy.checkpoint import CheckpointManager from maggy.config import ( CodebaseConfig, MaggyConfig, OrgConfig, ProjectConfig, StorageConfig, ) from maggy.coordination.lock_manager import LockManager from maggy.escalation.protocol import Escalator from maggy.mnemos.fatigue import FatigueTracker from maggy.mnemos.signals import SignalLog from maggy.observability.collector import ObservabilityCollector from maggy.providers.base import Task from maggy.registry import ProjectRegistry from maggy.routing import RoutingContext, RoutingService from maggy.services.executor import ExecutorService from maggy.services.executor_types import SessionCtx from maggy.services.planner import DualPlanner # -- fixtures ---------------------------------------------------------------- def _cfg(tmp_path) -> MaggyConfig: return MaggyConfig( org=OrgConfig(name="benchmark-org"), storage=StorageConfig(path=str(tmp_path / "store.db")), codebases=[ CodebaseConfig(path=str(tmp_path / "repo"), key="app"), ], projects=[ ProjectConfig( name="app", repo="bench/app", path=str(tmp_path / "repo"), default_branch="main", ), ], ) SPRINT_TASKS = [ Task(id="T-1", title="Fix README typo", description="Typo fix", raw={"blast_score": 
1, "task_type": "docs"}), Task(id="T-2", title="Lint cleanup", description="Format files", raw={"blast_score": 1, "task_type": "formatting"}), Task(id="T-3", title="Add health endpoint", description="GET /health", raw={"blast_score": 3, "task_type": "feature"}), Task(id="T-4", title="Pagination for /users", description="Cursor pagination", raw={"blast_score": 5, "task_type": "feature"}), Task(id="T-5", title="Refactor auth service", description="Extract middleware", raw={"blast_score": 6, "task_type": "refactor"}), Task(id="T-6", title="Add rate limiter", description="Redis rate limit", raw={"blast_score": 7, "task_type": "feature"}), Task(id="T-7", title="Migrate to v2 API", description="Breaking change", raw={"blast_score": 8, "task_type": "refactor"}), Task(id="T-8", title="Fix XSS in comments", description="Sanitize HTML", raw={"blast_score": 9, "task_type": "security", "security_sensitive": True}), Task(id="T-9", title="OAuth2 PKCE flow", description="Full OAuth impl", raw={"blast_score": 10, "task_type": "security", "security_sensitive": True}), Task(id="T-10", title="Performance audit", description="Profile + optimize", raw={"blast_score": 7, "task_type": "performance"}), ] # -- 1. 
# Routing accuracy -----------------------------------------------------


class TestRoutingAccuracy:
    """Every task lands on the right model tier."""

    @staticmethod
    def _primary_name(svc, task) -> str:
        """Route one sprint task and return the primary model's name.

        The routing context is built from the task's ``raw`` dict with the
        same defaults for missing keys that the tests rely on.
        """
        raw = task.raw or {}
        ctx = RoutingContext(
            blast_score=raw.get("blast_score", 0),
            task_type=raw.get("task_type", "general"),
            security_sensitive=raw.get("security_sensitive", False),
        )
        decision = svc.route(ctx)
        if isinstance(decision.primary, str):
            return decision.primary
        return decision.primary.name

    def test_all_10_tasks_route_correctly(self, tmp_path):
        """Each sprint task routes to one of the models accepted for it."""
        svc = RoutingService(_cfg(tmp_path))
        results: dict[str, str] = {}
        for task in SPRINT_TASKS:
            results[task.id] = self._primary_name(svc, task)

        # Low blast (1-3) → cheap tier unless rules override
        # T-1 is docs → rules force claude
        assert results["T-1"] == "claude"
        assert results["T-2"] in ("local", "kimi")
        assert results["T-3"] in ("local", "kimi")
        # Blast 5 → local(0-5) cheapest, codex(4-10), claude(5-10)
        assert results["T-4"] in ("local", "codex")
        # Blast 6 → codex(4-10) cheapest, claude(5-10)
        assert results["T-5"] in ("codex", "claude")
        # Blast 7 → codex or claude
        assert results["T-6"] in ("codex", "claude")
        # Blast 8+ → codex or claude (security→claude)
        assert results["T-7"] in ("codex", "claude")
        # Security always premium (claude)
        assert results["T-8"] == "claude"
        assert results["T-9"] == "claude"
        assert results["T-10"] in ("codex", "claude")

    def test_routing_accuracy_score(self, tmp_path):
        """Compute accuracy as % of correct routing decisions."""
        svc = RoutingService(_cfg(tmp_path))
        expected_tiers = {
            "T-1": "premium",
            "T-2": "cheap",
            "T-3": "cheap",
            "T-4": "cheap",  # local covers 0-5
            "T-5": "mid",  # codex covers 4-10
            "T-6": "mid",  # codex covers 4-10
            "T-7": "mid",  # codex (no security override)
            "T-8": "premium",
            "T-9": "premium",
            "T-10": "mid",  # codex covers 4-10
        }
        tier_map = {
            "local": "cheap",
            "kimi": "cheap",
            "codex": "mid",
            "claude": "premium",
        }
        correct = 0
        for task in SPRINT_TASKS:
            model_name = self._primary_name(svc, task)
            actual_tier = tier_map.get(model_name, "unknown")
            if actual_tier == expected_tiers[task.id]:
                correct += 1
        accuracy = correct / len(SPRINT_TASKS)
        assert accuracy >= 0.9, f"Routing accuracy {accuracy:.0%} < 90%"


# -- 2. Budget efficiency ----------------------------------------------------


class TestBudgetEfficiency:
    """Spend shares per provider after a simulated sprint."""

    def test_spend_distribution(self, tmp_path):
        """Moonshot stays under 5% of total spend; Anthropic exceeds 70%."""
        bm = BudgetManager(_cfg(tmp_path))
        # Simulate spend from a 10-task sprint
        sprint_spend = [
            ("moonshot", "kimi-k2", 0.03),
            ("moonshot", "kimi-k2", 0.03),
            ("moonshot", "kimi-k2", 0.02),
            ("openai", "gpt-4o", 0.30),
            ("openai", "gpt-4o", 0.25),
            ("anthropic", "claude-sonnet-4", 1.20),
            ("anthropic", "claude-sonnet-4", 1.50),
            ("anthropic", "claude-sonnet-4", 1.80),
            ("anthropic", "claude-sonnet-4", 1.60),
            ("anthropic", "claude-sonnet-4", 1.40),
        ]
        for provider, model, cost in sprint_spend:
            bm.record_spend(provider, model, cost)

        by_name = {}
        for row in bm.by_provider():
            by_name[row["provider"]] = row["spent_usd"]
        total = sum(by_name.values())

        # Cheap tasks should be < 5% of total
        cheap_pct = by_name.get("moonshot", 0) / total
        assert cheap_pct < 0.05, f"Cheap tier {cheap_pct:.0%} >= 5%"
        # Premium should be > 70% (complex tasks dominate)
        premium_pct = by_name.get("anthropic", 0) / total
        assert premium_pct > 0.70, f"Premium {premium_pct:.0%} <= 70%"


# -- 3.
# Fallback resilience --------------------------------------------------


class TestFallbackResilience:
    """``PiAdapter.send_with_fallback`` when the stubbed send fails."""

    @pytest.mark.asyncio
    async def test_quota_recovery(self):
        """Quota failures on kimi/deepseek end in success on another model."""
        pi = PiAdapter()
        tried: list[str] = []
        quota_limited = ("kimi", "deepseek")

        async def fake_send(model, prompt, wd, max_turns=20, timeout=600):
            tried.append(model)
            if model not in quota_limited:
                return RunResult(model=model, success=True, output="recovered")
            return RunResult(
                model=model, success=False, error="quota", quota_hit=True,
            )

        pi.send_prompt = fake_send
        outcome = await pi.send_with_fallback("kimi", "test", "/tmp")
        assert outcome.success
        assert len(tried) >= 3, "Should try multiple models"
        assert tried[0] == "kimi"
        assert outcome.model not in ("kimi", "deepseek")

    @pytest.mark.asyncio
    async def test_full_chain_failure(self):
        """When every stubbed send fails, the overall result is a failure."""
        pi = PiAdapter()

        async def all_fail(model, prompt, wd, max_turns=20, timeout=600):
            return RunResult(model=model, success=False, error="down")

        pi.send_prompt = all_fail
        outcome = await pi.send_with_fallback("kimi", "test", "/tmp")
        assert not outcome.success


# -- 4. Fatigue awareness ----------------------------------------------------


class TestFatigueAwareness:
    """``FatigueTracker`` state, composite score and model-switch handling."""

    def test_progressive_fatigue(self):
        """Five rounds of growing load push the composite above 0.3."""
        tracker = FatigueTracker(context_window=200_000)
        assert tracker.state() == "ok"
        # Simulate 5 steps of increasing context
        for step in range(1, 6):
            tracker.record("context_load", 0.15 * step)
            tracker.record("turn_pressure", 0.1 * step)
        assert tracker.composite() > 0.3

    def test_model_switch_degrades_fatigue(self):
        """Each model switch raises reread_ratio and sets the new window."""
        tracker = FatigueTracker(context_window=200_000)
        tracker.record("reread_ratio", 0.2)

        tracker.on_model_switch(128_000)
        assert tracker.dimensions["reread_ratio"] == pytest.approx(0.35)
        assert tracker.context_window == 128_000

        tracker.on_model_switch(128_000)
        assert tracker.dimensions["reread_ratio"] == pytest.approx(0.50)

    def test_critical_state_detection(self):
        """All four dimensions at 0.85 yield the ``critical`` state."""
        tracker = FatigueTracker()
        dimensions = (
            "context_load",
            "turn_pressure",
            "reread_ratio",
            "handoff_risk",
        )
        for dimension in dimensions:
            tracker.record(dimension, 0.85)
        assert tracker.state() == "critical"


# -- 5.
# Lock safety ----------------------------------------------------------


class TestLockSafety:
    """File-lock behaviour of ``LockManager`` across agents."""

    def test_concurrent_agent_protection(self, tmp_path):
        """A held file cannot be re-acquired by another agent; other files can."""
        manager = LockManager(tmp_path / "bench-locks.db")
        assert manager.acquire("src/auth.py", "kimi-agent")
        assert not manager.acquire("src/auth.py", "claude-agent")
        assert manager.acquire("src/api.py", "claude-agent")
        clashing = manager.conflicts(["src/auth.py", "src/api.py"])
        assert len(clashing) == 2

    def test_release_allows_reacquire(self, tmp_path):
        """After a release, a different agent can take the same file."""
        manager = LockManager(tmp_path / "bench-locks.db")
        manager.acquire("src/main.py", "agent-a")
        manager.release("src/main.py", "agent-a")
        assert manager.acquire("src/main.py", "agent-b")

    def test_release_all_by_session(self, tmp_path):
        """``release_all`` reports how many locks the session held."""
        manager = LockManager(tmp_path / "bench-locks.db")
        for path in ("f1.py", "f2.py", "f3.py"):
            manager.acquire(path, "sess-1")
        released = manager.release_all("sess-1")
        assert released == 3


# -- 6. Escalation -----------------------------------------------------------


class TestEscalation:
    """Pending-escalation bookkeeping of ``Escalator``."""

    def test_auto_escalate_after_failures(self, tmp_path):
        """An escalation shows up in the pending list with its reason."""
        escalator = Escalator(tmp_path / "bench-esc.db")
        assert len(escalator.list_pending()) == 0
        escalator.escalate("sess-1", "repeated_failure", {"failures": 3})
        waiting = escalator.list_pending()
        assert len(waiting) == 1
        assert waiting[0].reason == "repeated_failure"

    def test_resolve_clears_pending(self, tmp_path):
        """Resolving the only escalation leaves nothing pending."""
        escalator = Escalator(tmp_path / "bench-esc.db")
        packet = escalator.escalate("sess-2", "stuck", {})
        escalator.resolve(packet.id, "retry with claude")
        assert len(escalator.list_pending()) == 0


# -- 7.
# Checkpoint continuity ------------------------------------------------


class TestCheckpointContinuity:
    """Write, read and delete round-trips through ``CheckpointManager``."""

    def test_model_handoff_preserves_state(self, tmp_path):
        """A written checkpoint reads back with the same fields."""
        checkpoints = CheckpointManager(tmp_path / "bench-cp")
        snapshot = {
            "goal": "Add OAuth2",
            "model_history": ["kimi", "gpt", "claude"],
            "progress": ["Step 1 by kimi", "Step 2 by gpt"],
            "current_subgoal": "Write tests",
            "fatigue_score": 0.45,
        }
        checkpoints.write("session-x", snapshot)
        restored = checkpoints.read("session-x")
        assert restored["goal"] == "Add OAuth2"
        assert len(restored["model_history"]) == 3
        assert restored["fatigue_score"] == 0.45

    def test_checkpoint_cleanup(self, tmp_path):
        """A deleted checkpoint reads back as None."""
        checkpoints = CheckpointManager(tmp_path / "bench-cp")
        checkpoints.write("temp-sess", {"goal": "temp"})
        assert checkpoints.read("temp-sess") is not None
        checkpoints.delete("temp-sess")
        assert checkpoints.read("temp-sess") is None


# -- 8. Calibration learning -------------------------------------------------


class TestCalibrationLearning:
    """Accuracy tracking per model and its interaction with routing."""

    def test_bad_model_gets_penalized(self, tmp_path):
        """Badly calibrated records score below 0.5; close ones above 0.9."""
        tracker = CalibrationTracker(tmp_path / "bench-cal.db")
        # Record consistently bad predictions for "kimi"
        for _ in range(10):
            tracker.record("kimi", "feature", 0.9, 0.1)
        # Record good predictions for "claude"
        for _ in range(10):
            tracker.record("claude", "feature", 0.8, 0.85)

        kimi_acc = tracker.accuracy("kimi")
        claude_acc = tracker.accuracy("claude")
        assert kimi_acc < 0.5, f"Bad model accuracy {kimi_acc} >= 0.5"
        assert claude_acc > 0.9, f"Good model accuracy {claude_acc} <= 0.9"

    def test_routing_penalizes_uncalibrated(self, tmp_path):
        """Routing still yields a primary model after kimi's records are poisoned."""
        svc = RoutingService(_cfg(tmp_path))
        # Poison kimi's calibration
        for _ in range(10):
            svc.calibration.record("kimi", "feature", 0.9, 0.1)

        decision = svc.route(RoutingContext(blast_score=1, task_type="feature"))
        if isinstance(decision.primary, str):
            name = decision.primary
        else:
            name = decision.primary.name
        # kimi should be penalized — routing skips it
        # (only applies if kimi was the primary)
        assert name is not None  # routing still works


# -- 9.
# Dual planning -------------------------------------------------------


class TestDualPlanning:
    """``DualPlanner.dual_plan`` with a stubbed ``send_prompt``."""

    @pytest.mark.asyncio
    async def test_counter_check_runs(self):
        """Both claude and codex are prompted and codex's conflict is reported."""
        models_used: list[str] = []

        async def fake_send(model, prompt, wd, turns=5, timeout=600):
            models_used.append(model)
            if model == "codex":
                text = "CONFLICT: Missing error handling"
            else:
                text = "Step 1: implement"
            return RunResult(model=model, success=True, output=text)

        pi = PiAdapter()
        pi.send_prompt = fake_send
        planner = DualPlanner(pi)
        plan = await planner.dual_plan("Add OAuth", "Implement OAuth2", "/tmp")

        assert "claude" in models_used
        assert "codex" in models_used
        assert len(plan.conflicts) >= 1
        assert "Missing error handling" in plan.conflicts[0]


# -- 10. Observability -------------------------------------------------------


class TestObservability:
    """Signal recording through the collector and the JSONL signal log."""

    def test_signal_recording(self, tmp_path):
        """Signals come back grouped by the name they were recorded under."""
        collector = ObservabilityCollector(tmp_path / "bench-obs.db")
        collector.record_signal("app", "deploy_status", 1.0)
        collector.record_signal("app", "test_coverage", 0.87)
        collector.record_signal("api", "latency_p99", 0.250)

        app_signals = collector.recent_signals("app", limit=10)
        assert len(app_signals) == 2

        api_signals = collector.recent_signals("api", limit=10)
        assert len(api_signals) == 1
        assert api_signals[0]["signal_type"] == "latency_p99"

    def test_signal_log_jsonl(self, tmp_path):
        """``recent(3)`` after five appends starts at the entry with step 2."""
        log = SignalLog(tmp_path / "bench-signals.jsonl")
        for step in range(5):
            log.append({"step": step, "model": "claude"})
        latest = log.recent(3)
        assert len(latest) == 3
        assert latest[0]["step"] == 2


# -- 11.
Full executor pipeline (E2E) ---------------------------------------- class TestFullExecutorPipeline: @pytest.mark.asyncio async def test_10_task_sprint(self, tmp_path): """Simulate a full 10-task sprint through the executor.""" cfg = _cfg(tmp_path) (tmp_path / "repo").mkdir() provider = AsyncMock() executor = ExecutorService(cfg, provider) models_used: list[str] = [] async def fake_send(model, prompt, wd, max_turns=20, timeout=600): models_used.append(model) return RunResult(model=model, success=True, output="done", cost_usd=0.10) async def fake_ctx(cfg, task): return "" executor._pi.send_prompt = fake_send from maggy.services import executor_helpers _orig_icpg = executor_helpers.build_icpg_context executor_helpers.build_icpg_context = fake_ctx for task in SPRINT_TASKS: sid = f"s-{task.id}" session = { "id": sid, "task_id": task.id, "task_title": task.title, "mode": "plan", "working_dir": str(tmp_path / "repo"), "status": "running", "started_at": "", "output": "", } executor._sessions[sid] = session ctx = SessionCtx(session, task, str(tmp_path / "repo")) await executor._run(ctx, "plan") # Verify multi-model distribution unique_models = set(models_used) assert len(unique_models) >= 3, f"Only {unique_models} used" assert "claude" in unique_models assert "codex" in unique_models cheap = {"kimi", "local"} assert cheap & unique_models, "No cheap model used" # Verify fatigue was tracked assert executor._fatigue.dimensions["context_load"] > 0 # Verify signals were logged (plan mode uses _run_model directly) # Checkpoints were written and cleaned up for task in SPRINT_TASKS: clean_id = task.id.replace("/", "-") assert executor._checkpoint.read(clean_id) is None @pytest.mark.asyncio async def test_sprint_budget_summary(self, tmp_path): """After a sprint, budget tracks all providers.""" cfg = _cfg(tmp_path) (tmp_path / "repo").mkdir() provider = AsyncMock() executor = ExecutorService(cfg, provider) cost_map = {"kimi": 0.01, "local": 0.0, "claude": 0.80, "codex": 0.10} async 
def fake_send(model, prompt, wd, max_turns=20, timeout=600): return RunResult(model=model, success=True, output="ok", cost_usd=cost_map.get(model, 0.05)) async def fake_ctx(cfg, task): return "" executor._pi.send_prompt = fake_send from maggy.services import executor_helpers _orig_icpg = executor_helpers.build_icpg_context executor_helpers.build_icpg_context = fake_ctx for task in SPRINT_TASKS: sid = f"s-{task.id}" session = { "id": sid, "task_id": task.id, "task_title": task.title, "mode": "plan", "working_dir": str(tmp_path / "repo"), "status": "running", "started_at": "", "output": "", } executor._sessions[sid] = session ctx = SessionCtx(session, task, str(tmp_path / "repo")) await executor._run(ctx, "plan") breakdown = executor._budget.by_provider() providers = {r["provider"] for r in breakdown} assert len(providers) >= 2, f"Only {providers}" # -- 12. Project Registry CRUD ----------------------------------------------- class TestProjectRegistry: def test_full_lifecycle(self, tmp_path): cfg = _cfg(tmp_path) reg = ProjectRegistry(cfg) assert len(reg.list()) == 1 reg.add(ProjectConfig( name="api", repo="bench/api", path="/tmp/api", default_branch="main", )) assert len(reg.list()) == 2 assert reg.get("api") is not None reg.remove("api") assert reg.get("api") is None assert len(reg.list()) == 1 ================================================ FILE: maggy/tests/test_bootstrap.py ================================================ """Tests for startup bootstrap — auto-populate services.""" from __future__ import annotations from pathlib import Path from unittest.mock import MagicMock, patch import pytest def _make_cfg(tmp_path: Path): """Build a minimal MaggyConfig with codebases.""" from maggy.config import CodebaseConfig, MaggyConfig # Create fake codebase dirs repo_a = tmp_path / "repo-a" repo_a.mkdir() (repo_a / "main.py").write_text("print('hello')") (repo_a / "utils.ts").write_text("export const x = 1;") repo_b = tmp_path / "repo-b" repo_b.mkdir() (repo_b / 
"app.go").write_text("package main") return MaggyConfig( codebases=[ CodebaseConfig(path=str(repo_a), key="repo-a"), CodebaseConfig(path=str(repo_b), key="repo-b"), ], ) class TestSeedCIKG: """Test CIKG seeding from codebases.""" def test_creates_codebase_nodes(self, tmp_path): from maggy.main import _seed_cikg from maggy.cikg.graph import KnowledgeGraphService cfg = _make_cfg(tmp_path) cikg = KnowledgeGraphService(tmp_path / "cikg.db") _seed_cikg(cikg, cfg) nodes = cikg.list_nodes("codebase") assert len(nodes) == 2 names = {n.name for n in nodes} assert names == {"repo-a", "repo-b"} def test_creates_language_nodes(self, tmp_path): from maggy.main import _seed_cikg from maggy.cikg.graph import KnowledgeGraphService cfg = _make_cfg(tmp_path) cikg = KnowledgeGraphService(tmp_path / "cikg.db") _seed_cikg(cikg, cfg) langs = cikg.list_nodes("technology") lang_names = {n.name for n in langs} assert "python" in lang_names assert "typescript" in lang_names assert "go" in lang_names def test_creates_edges(self, tmp_path): from maggy.main import _seed_cikg from maggy.cikg.graph import KnowledgeGraphService cfg = _make_cfg(tmp_path) cikg = KnowledgeGraphService(tmp_path / "cikg.db") _seed_cikg(cikg, cfg) edges = cikg.get_edges("codebase:repo-a", "out") edge_types = {e.edge_type for e in edges} assert "uses_technology" in edge_types def test_skips_missing_dirs(self, tmp_path): from maggy.config import CodebaseConfig, MaggyConfig from maggy.main import _seed_cikg from maggy.cikg.graph import KnowledgeGraphService cfg = MaggyConfig(codebases=[ CodebaseConfig(path="/nonexistent/path", key="missing"), ]) cikg = KnowledgeGraphService(tmp_path / "cikg.db") _seed_cikg(cikg, cfg) assert cikg.list_nodes("codebase") == [] def test_idempotent(self, tmp_path): from maggy.main import _seed_cikg from maggy.cikg.graph import KnowledgeGraphService cfg = _make_cfg(tmp_path) cikg = KnowledgeGraphService(tmp_path / "cikg.db") _seed_cikg(cikg, cfg) _seed_cikg(cikg, cfg) # run again nodes = 
cikg.list_nodes("codebase") assert len(nodes) == 2 # no duplicates class TestBootstrap: """Test the full _bootstrap function.""" @pytest.mark.asyncio async def test_calls_services(self): from maggy.main import _bootstrap app = MagicMock() app.state.history = MagicMock() app.state.introspector = MagicMock() app.state.cikg = None app.state.cfg = MagicMock() await _bootstrap(app) app.state.history.analyze.assert_called_once() app.state.introspector.analyze.assert_called_once() @pytest.mark.asyncio async def test_handles_missing_services(self): from maggy.main import _bootstrap app = MagicMock() app.state.history = None app.state.introspector = None app.state.cikg = None app.state.cfg = None await _bootstrap(app) # should not raise @pytest.mark.asyncio async def test_handles_analyze_error(self): from maggy.main import _bootstrap app = MagicMock() app.state.history = MagicMock() app.state.history.analyze.side_effect = RuntimeError("db locked") app.state.introspector = None app.state.cikg = None app.state.cfg = None await _bootstrap(app) # should not raise ================================================ FILE: maggy/tests/test_budget.py ================================================ """Tests for BudgetManager — spend tracking and status.""" from __future__ import annotations from maggy.budget import ProviderBudget, TaskSpendTracker from maggy.config import BudgetConfig from maggy.budget import BudgetManager class TestBudgetTracking: def test_initial_spend_is_zero(self, mock_cfg): bm = BudgetManager(mock_cfg) assert bm.today_spend() == 0.0 def test_record_and_read(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.5) assert bm.today_spend() >= 0.5 def test_multiple_records_sum(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.3) bm.record_spend("openai", "gpt-4o", 0.2) assert bm.today_spend() >= 0.5 class TestBudgetStatus: def test_ok_status(self, mock_cfg): bm = BudgetManager(mock_cfg) 
bm.record_spend("anthropic", "claude", 1.0) status = bm.budget_status() assert status["status"] == "ok" def test_warning_status(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 8.5) status = bm.budget_status() assert status["status"] == "warning" def test_exhausted_status(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 10.0) status = bm.budget_status() assert status["status"] == "exhausted" class TestByProvider: def test_breakdown(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.5) bm.record_spend("openai", "gpt-4o", 0.3) breakdown = bm.by_provider() assert len(breakdown) == 2 providers = {r["provider"] for r in breakdown} assert "anthropic" in providers assert "openai" in providers class TestIsExhausted: def test_not_exhausted(self, mock_cfg): bm = BudgetManager(mock_cfg) assert not bm.is_exhausted() def test_exhausted(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 11.0) assert bm.is_exhausted() class TestProviderBudgets: def test_provider_exhaustion_uses_provider_limit(self, mock_cfg): mock_cfg.budget = BudgetConfig( daily_limit_usd=20.0, providers=[ ProviderBudget("moonshot", 1.0, "kimi"), ProviderBudget("openai", 5.0, "gpt"), ], ) bm = BudgetManager(mock_cfg) bm.record_spend("moonshot", "kimi", 1.1) assert bm.is_provider_exhausted("moonshot") assert not bm.is_provider_exhausted("openai") def test_cheapest_available_skips_exhausted_provider(self, mock_cfg): mock_cfg.budget = BudgetConfig( providers=[ ProviderBudget("moonshot", 1.0, "kimi"), ProviderBudget("openai", 5.0, "gpt"), ], ) bm = BudgetManager(mock_cfg) bm.record_spend("moonshot", "kimi", 1.0) assert bm.cheapest_available() == "gpt" class TestTokenTracking: def test_initial_tokens_zero(self, mock_cfg): bm = BudgetManager(mock_cfg) tokens = bm.today_tokens() assert tokens == {"input": 0, "output": 0} def test_record_and_read_tokens(self, mock_cfg): bm = 
BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.5, 1000, 500) bm.record_spend("openai", "gpt-4o", 0.3, 2000, 800) tokens = bm.today_tokens() assert tokens["input"] == 3000 assert tokens["output"] == 1300 def test_tokens_by_provider(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.5, 1000, 500) bm.record_spend("openai", "gpt", 0.3, 2000, 800) tokens = bm.today_tokens("anthropic") assert tokens["input"] == 1000 def test_budget_status_includes_tokens(self, mock_cfg): bm = BudgetManager(mock_cfg) bm.record_spend("anthropic", "claude", 0.5, 1500, 600) status = bm.budget_status() assert status["input_tokens"] == 1500 assert status["output_tokens"] == 600 class TestTaskSpendTracker: def test_records_total_cost(self) -> None: tracker = TaskSpendTracker(5.0) tracker.record(1.5) tracker.record(0.5) assert tracker.total() == 2.0 def test_detects_exceeded_spend(self) -> None: tracker = TaskSpendTracker(2.0) tracker.record(2.0) assert tracker.is_exceeded() def test_tracks_edit_loops(self) -> None: tracker = TaskSpendTracker(5.0) for _ in range(4): tracker.record_edit("maggy/services/planner.py") tracker.record_edit("maggy/budget.py") assert tracker.detect_loop() == ["maggy/services/planner.py"] def test_budget_config_has_task_limit(self) -> None: cfg = BudgetConfig(max_spend_per_task=3.5) assert cfg.max_spend_per_task == 3.5 ================================================ FILE: maggy/tests/test_calibration.py ================================================ """Tests for calibration tracking.""" from __future__ import annotations import pytest from maggy.calibration import CalibrationTracker def test_records_accuracy_and_error(tmp_path) -> None: tracker = CalibrationTracker(tmp_path / "calibration.db") tracker.record("claude", "planning", 0.8, 0.7) tracker.record("claude", "planning", 0.4, 0.5) assert tracker.accuracy("claude") == pytest.approx(0.9) assert tracker.calibration_error("claude") == pytest.approx(0.1) def 
test_unknown_model_returns_zero(tmp_path) -> None: tracker = CalibrationTracker(tmp_path / "calibration.db") assert tracker.accuracy("codex") == 0.0 assert tracker.calibration_error("codex") == 0.0 def test_accuracy_clamps_at_zero_for_large_errors(tmp_path) -> None: tracker = CalibrationTracker(tmp_path / "calibration.db") tracker.record("claude", "planning", 0.0, 2.0) assert tracker.accuracy("claude") >= 0.0 ================================================ FILE: maggy/tests/test_cascade.py ================================================ """Tests for cascade execution — quality-gate-based model escalation.""" from __future__ import annotations import pytest from maggy.adapters.pi import PiAdapter, RunResult from maggy.services.cascade import cascade_execute class TestCascadeNoEscalation: @pytest.mark.asyncio async def test_first_model_passes(self): pi = PiAdapter() calls: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) return RunResult(model=model_name, success=True, output="good") pi.send_prompt = fake_send async def good_gate(output: str) -> int: return 4 result = await cascade_execute( pi, ["local", "gpt", "claude"], "test", "/tmp", good_gate, ) assert result.model == "local" assert not result.escalated assert len(calls) == 1 class TestCascadeEscalation: @pytest.mark.asyncio async def test_low_quality_escalates(self): pi = PiAdapter() calls: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) return RunResult(model=model_name, success=True, output="ok") pi.send_prompt = fake_send scores = iter([2, 4]) async def improving_gate(output: str) -> int: return next(scores) result = await cascade_execute( pi, ["local", "gpt", "claude"], "test", "/tmp", improving_gate, ) assert result.model == "gpt" assert result.escalated assert len(calls) == 2 @pytest.mark.asyncio async def test_max_3_attempts(self): pi = PiAdapter() calls: list[str] = [] 
async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) return RunResult(model=model_name, success=True, output="bad") pi.send_prompt = fake_send async def always_bad(output: str) -> int: return 1 result = await cascade_execute( pi, ["local", "gpt", "claude"], "test", "/tmp", always_bad, ) assert len(result.attempts) == 3 # All scored equally — returns best (first with highest score) assert len(calls) == 3 class TestCascadeFailure: @pytest.mark.asyncio async def test_send_failure_escalates(self): pi = PiAdapter() calls: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) if model_name == "local": return RunResult( model=model_name, success=False, error="crash", ) return RunResult(model=model_name, success=True, output="ok") pi.send_prompt = fake_send async def ok_gate(output: str) -> int: return 4 result = await cascade_execute( pi, ["local", "gpt"], "test", "/tmp", ok_gate, ) assert result.model == "gpt" assert result.escalated @pytest.mark.asyncio async def test_single_model_no_escalation(self): pi = PiAdapter() async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): return RunResult(model=model_name, success=True, output="ok") pi.send_prompt = fake_send async def low_gate(output: str) -> int: return 2 result = await cascade_execute( pi, ["claude"], "test", "/tmp", low_gate, ) assert result.model == "claude" assert len(result.attempts) == 1 class TestCascadeAttemptTracking: @pytest.mark.asyncio async def test_attempts_recorded(self): pi = PiAdapter() async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): return RunResult(model=model_name, success=True, output="ok") pi.send_prompt = fake_send scores = iter([1, 4]) async def gate(output: str) -> int: return next(scores) result = await cascade_execute( pi, ["local", "gpt"], "test", "/tmp", gate, ) assert len(result.attempts) == 2 assert result.attempts[0].model == "local" 
assert result.attempts[0].score == 1 assert result.attempts[1].model == "gpt" assert result.attempts[1].score == 4 ================================================ FILE: maggy/tests/test_chat.py ================================================ """Tests for ChatManager — interactive Claude sessions.""" from __future__ import annotations from pathlib import Path import pytest from maggy.config import CodebaseConfig, MaggyConfig def _make_cfg(tmp_path: Path) -> MaggyConfig: repo = tmp_path / "my-project" repo.mkdir() return MaggyConfig(codebases=[ CodebaseConfig(path=str(repo), key="my-project"), ]) class TestChatManager: """Test ChatManager session lifecycle.""" def test_create_session(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) session = mgr.create_session("my-project") assert session.project_key == "my-project" assert session.status == "idle" assert session.working_dir == str( tmp_path / "my-project" ) assert session.messages == [] def test_create_session_invalid_project(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) with pytest.raises(ValueError, match="not found"): mgr.create_session("nonexistent") def test_create_with_project_path(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) # Subdirectory of configured codebase is allowed sub = tmp_path / "my-project" / "src" sub.mkdir() s = mgr.create_session("my-project", str(sub)) assert s.project_key == "my-project" assert s.working_dir == str(sub) def test_create_rejects_outside_path(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) outside = tmp_path / "other-repo" outside.mkdir() with pytest.raises(ValueError, match="not inside"): mgr.create_session("other", str(outside)) def test_list_sessions(self, tmp_path): from maggy.services.chat import ChatManager cfg = 
_make_cfg(tmp_path) mgr = ChatManager(cfg) mgr.create_session("my-project") mgr.create_session("my-project") sessions = mgr.list_sessions() assert len(sessions) == 2 def test_get_session(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") got = mgr.get_session(s.id) assert got is not None assert got.id == s.id def test_get_missing_session(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) assert mgr.get_session("missing") is None def test_build_cmd_new_session(self, tmp_path): from maggy.services.chat import ChatManager from maggy.services.chat_stream import build_cmd cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") cmd = build_cmd(s, "fix the bug") assert "claude" in cmd[0] assert "-p" in cmd assert "fix the bug" in cmd assert "--output-format" in cmd assert "--resume" not in cmd def test_build_cmd_resume(self, tmp_path): from maggy.services.chat import ChatManager from maggy.services.chat_stream import build_cmd cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") s.claude_session_id = "abc123" cmd = build_cmd(s, "continue working") assert "--resume" in cmd idx = cmd.index("--resume") assert cmd[idx + 1] == "abc123" def test_delete_session(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") assert mgr.delete_session(s.id) is True assert mgr.get_session(s.id) is None def test_delete_missing(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) assert mgr.delete_session("nope") is False def test_working_dir_security_bad_key(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) with pytest.raises(ValueError, match="not found"): 
mgr.create_session("hacker-repo") def test_working_dir_security_bad_path(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) with pytest.raises(ValueError, match="not inside"): mgr.create_session("x", "/etc") class TestAutoConnect: """Test auto-connect to active projects.""" def test_auto_connect_creates_sessions(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) repo = tmp_path / "my-project" active = [ {"project": "my-project", "project_path": str(repo)}, ] result = mgr.auto_connect(active) assert len(result) == 1 assert result[0].project_key == "my-project" def test_auto_connect_deduplicates(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) repo = tmp_path / "my-project" active = [ {"project": "my-project", "project_path": str(repo)}, {"project": "my-project", "project_path": str(repo)}, ] result = mgr.auto_connect(active) assert len(result) == 1 def test_auto_connect_multiple_projects(self, tmp_path): from maggy.services.chat import ChatManager r1 = tmp_path / "proj-a" r2 = tmp_path / "proj-b" r1.mkdir() r2.mkdir() cfg = MaggyConfig(codebases=[ CodebaseConfig(path=str(r1), key="proj-a"), CodebaseConfig(path=str(r2), key="proj-b"), ]) mgr = ChatManager(cfg) active = [ {"project": "proj-a", "project_path": str(r1)}, {"project": "proj-b", "project_path": str(r2)}, ] result = mgr.auto_connect(active) assert len(result) == 2 keys = {s.project_key for s in result} assert keys == {"proj-a", "proj-b"} def test_auto_connect_skips_empty(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) repo = tmp_path / "my-project" active = [ {"project": "", "project_path": ""}, {"project": "my-project", "project_path": str(repo)}, ] result = mgr.auto_connect(active) assert len(result) == 1 def test_find_by_project(self, tmp_path): from 
maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") found = mgr.find_by_project("my-project") assert found is not None assert found.id == s.id def test_find_by_project_missing(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) assert mgr.find_by_project("nope") is None class TestMessageQueue: """Message queuing when session is busy.""" def test_enqueue_returns_position(self, tmp_path): from maggy.services.chat import ChatManager, enqueue_msg cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") assert enqueue_msg(s, "msg 1") == 1 assert enqueue_msg(s, "msg 2") == 2 def test_enqueue_full_returns_negative(self, tmp_path): from maggy.services.chat import ChatManager, enqueue_msg cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") for i in range(5): enqueue_msg(s, f"msg {i}") assert enqueue_msg(s, "overflow") == -1 def test_session_has_pending_queue(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") assert hasattr(s, "pending_queue") assert len(s.pending_queue) == 0 @pytest.mark.asyncio async def test_send_while_locked_enqueues(self, tmp_path): from maggy.services.chat import ChatManager cfg = _make_cfg(tmp_path) mgr = ChatManager(cfg) s = mgr.create_session("my-project") lock = mgr._locks[s.id] async with lock: chunks = [c async for c in mgr.send(s.id, "queued")] assert any(c.get("type") == "queued" for c in chunks) assert len(s.pending_queue) == 1 ================================================ FILE: maggy/tests/test_chat_context.py ================================================ """Tests for chat context builder.""" from __future__ import annotations import json from unittest.mock import MagicMock import pytest from maggy.services.chat_context import ( _format_recent_prompts, 
_match_from_report, _match_history, _path_candidates, build_project_context, resolve_claude_session_id, ) class TestPathCandidates: """Test path candidate generation.""" def test_basic_path(self): result = _path_candidates( "/Users/ali/Documents/protaige", "protaige" ) assert "protaige" in result assert "Documents" not in result # skipped assert "Users" not in result # skipped assert "ali" in result def test_nested_path(self): result = _path_candidates( "/Users/ali/Documents/AI-Playground/" "claude-skills-package", "claude-skills-package", ) assert "claude-skills-package" in result assert "AI-Playground" in result def test_empty_path(self): result = _path_candidates("", "my-project") assert "my-project" in result class TestMatchFromReport: """Test matching via aggregated report data.""" def test_exact_project_match(self): report = { "projects": [ { "project": "protaige", "total_sessions": 22, "total_prompts": 2369, "providers_used": ["claude"], "top_topics": ["maia", "api", "auth"], }, ], } result = _match_from_report( report, "/Users/ali/protaige", "protaige" ) assert "22 sessions" in result assert "2369 prompts" in result assert "maia" in result def test_parent_dir_match(self): """Match claude-skills-package via AI-Playground.""" report = { "projects": [ { "project": "AI-Playground", "total_sessions": 5, "total_prompts": 51, "providers_used": ["claude"], "top_topics": ["setup", "config"], }, ], } result = _match_from_report( report, "/Users/ali/Documents/AI-Playground/" "claude-skills-package", "claude-skills-package", ) assert "5 sessions" in result assert "51 prompts" in result def test_multiple_matches(self): """Match both direct and parent entries.""" report = { "projects": [ { "project": "plugins", "total_sessions": 22, "total_prompts": 990, "providers_used": ["claude"], "top_topics": ["plugin"], }, { "project": "edubites", "total_sessions": 10, "total_prompts": 200, "providers_used": ["claude"], "top_topics": ["platform"], }, ], } result = 
_match_from_report( report, "/Users/ali/edubites/plugins", "plugins", ) assert "plugins" in result or "22 sessions" in result assert "edubites" in result or "10 sessions" in result def test_no_match(self): report = { "projects": [ {"project": "unrelated", "total_sessions": 1, "total_prompts": 5, "providers_used": [], "top_topics": []}, ], } result = _match_from_report( report, "/Users/ali/my-project", "my-project" ) assert result == "" class TestMatchHistory: """Test the main matching function.""" def test_uses_report_when_available(self): history = MagicMock() history.get_report.return_value = { "projects": [ { "project": "myapp", "total_sessions": 5, "total_prompts": 100, "providers_used": ["claude"], "top_topics": ["api"], }, ], } result = _match_history( history, "/Users/ali/myapp", "myapp" ) assert "5 sessions" in result def test_returns_empty_when_no_history(self): result = _match_history( None, "/some/path", "proj" ) assert result == "" def test_returns_empty_when_no_report(self): history = MagicMock() history.get_report.return_value = None result = _match_history( history, "/some/path", "proj" ) assert result == "" class TestFormatRecentPrompts: """Test recent prompt formatting.""" def test_matching_prompts(self): prompts = [ {"project": "protaige", "text": "fix the auth bug", "timestamp": "2026-05-10T14:00:00"}, {"project": "other", "text": "unrelated", "timestamp": "2026-05-10T13:00:00"}, ] result = _format_recent_prompts(prompts, "protaige") assert "fix the auth bug" in result assert "unrelated" not in result def test_no_matching_prompts(self): prompts = [ {"project": "other", "text": "something", "timestamp": "2026-05-10T14:00:00"}, ] result = _format_recent_prompts(prompts, "protaige") assert result == "" def test_limits_to_five(self): prompts = [ {"project": "x", "text": f"msg {i}", "timestamp": f"2026-05-10T1{i}:00:00"} for i in range(10) ] result = _format_recent_prompts(prompts, "x") assert result.count("- [") == 5 class TestResolveSessionId: 
"""Test Claude session ID resolution.""" def test_finds_session_id(self, tmp_path): history = tmp_path / ".claude" / "history.jsonl" history.parent.mkdir(parents=True) entries = [ json.dumps({ "project": "/Users/ali/protaige", "sessionId": "abc-123", "timestamp": 1715000000000, }), json.dumps({ "project": "/Users/ali/protaige", "sessionId": "def-456", "timestamp": 1715100000000, }), ] history.write_text("\n".join(entries)) from unittest.mock import patch with patch( "maggy.services.chat_context.Path.home", return_value=tmp_path, ): result = resolve_claude_session_id( "/Users/ali/protaige" ) assert result == "def-456" def test_no_match(self, tmp_path): history = tmp_path / ".claude" / "history.jsonl" history.parent.mkdir(parents=True) history.write_text(json.dumps({ "project": "/Users/ali/other", "sessionId": "xyz", "timestamp": 1715000000000, })) from unittest.mock import patch with patch( "maggy.services.chat_context.Path.home", return_value=tmp_path, ): result = resolve_claude_session_id( "/Users/ali/protaige" ) assert result == "" def test_missing_file(self, tmp_path): from unittest.mock import patch with patch( "maggy.services.chat_context.Path.home", return_value=tmp_path, ): result = resolve_claude_session_id("/some/path") assert result == "" class TestBuildProjectContext: """Test full context assembly.""" def test_combines_history_and_prompts(self): history = MagicMock() history.get_report.return_value = { "projects": [ { "project": "myapp", "total_sessions": 8, "total_prompts": 200, "providers_used": ["claude"], "top_topics": ["api", "auth"], }, ], } prompts = [ {"project": "myapp", "text": "add endpoint", "timestamp": "2026-05-10T14:00:00"}, ] result = build_project_context( history, "/Users/ali/myapp", "myapp", prompts, ) assert "8 sessions" in result assert "add endpoint" in result def test_empty_when_nothing(self): history = MagicMock() history.get_report.return_value = {"projects": []} result = build_project_context( history, "/some/path", "proj", [], 
) assert result == "" ================================================ FILE: maggy/tests/test_chat_routed.py ================================================ """Tests for routed chat — multi-model routing in ChatManager.""" from __future__ import annotations from unittest.mock import MagicMock import pytest from maggy.services.chat_router import estimate_blast, estimate_type class TestBlastEstimation: """Blast score estimation from message keywords.""" def test_low_blast_simple_fix(self): assert estimate_blast("fix the typo in README") <= 3 def test_high_blast_security(self): assert estimate_blast("design auth system with OAuth") >= 7 def test_high_blast_architecture(self): assert estimate_blast("refactor database schema") >= 5 def test_medium_blast_feature(self): score = estimate_blast("add pagination to the API") assert 3 <= score <= 6 def test_empty_returns_default(self): assert estimate_blast("") == 5 # --- Intent-based scoring --- def test_retrieval_find_key_low_blast(self): """'find the API key' is retrieval, not mid-complexity.""" assert estimate_blast("find the API key in ~/Documents") <= 3 def test_retrieval_show_config(self): assert estimate_blast("show me the current config") <= 3 def test_retrieval_check_env(self): assert estimate_blast("check the env variables") <= 3 def test_retrieval_where_is_file(self): assert estimate_blast("where is the routes file") <= 3 def test_retrieval_list_endpoints(self): assert estimate_blast("list all API endpoints") <= 3 def test_retrieval_read_file(self): assert estimate_blast("read the package.json") <= 3 def test_creation_still_mid(self): """create/implement should stay in 4-6 range.""" score = estimate_blast("create a new user service") assert 4 <= score <= 6 def test_multi_step_high(self): """refactor + migrate = high blast.""" assert estimate_blast("refactor and migrate the database") >= 7 def test_retrieval_with_action_not_capped(self): """'find the bug and fix it' has both retrieval and mutation.""" score = 
estimate_blast("find the bug and fix the auth") assert score >= 4 class TestTypeEstimation: """Task type estimation from message keywords.""" def test_security_type(self): assert estimate_type("fix authentication bug") == "security" def test_docs_type(self): assert estimate_type("write documentation for API") == "docs" def test_test_type(self): assert estimate_type("add unit tests with mock fixtures") == "tests" def test_general_default(self): assert estimate_type("make it faster") == "general" class TestRoutedEndpoint: """API endpoint /send-routed returns routing metadata.""" @pytest.mark.asyncio async def test_send_routed_yields_routing_chunk(self): """First SSE chunk should be routing decision.""" from maggy.services.chat_router import RoutedChat mock_routing = MagicMock() mock_routing.route.return_value = MagicMock( primary=MagicMock(name="claude"), reason="blast 8 → claude", ) mock_budget = MagicMock() mock_budget.check.return_value = True rc = RoutedChat(mock_routing, mock_budget) # We only test the routing decision, not the full send decision = rc.decide("design auth system", None, None) assert decision is not None mock_routing.route.assert_called_once() class TestRewardRecording: """Reward recording after routed chat completes.""" def test_success_records_reward(self): """Successful chat records reward=1.0.""" from maggy.api.routes_chat import _record_routing_outcome routing = MagicMock() decision = MagicMock( model="local", task_type="general", blast=5, ) _record_routing_outcome(routing, decision, had_error=False) routing.record_outcome.assert_called_once_with( "local", "general", 5, 1.0, ) def test_error_records_zero_reward(self): """Chat with error records reward=0.0.""" from maggy.api.routes_chat import _record_routing_outcome routing = MagicMock() decision = MagicMock( model="claude", task_type="security", blast=8, ) _record_routing_outcome(routing, decision, had_error=True) routing.record_outcome.assert_called_once_with( "claude", "security", 8, 0.0, 
) def test_no_routing_service_noop(self): """No routing service → no crash.""" from maggy.api.routes_chat import _record_routing_outcome _record_routing_outcome(None, None, had_error=False) ================================================ FILE: maggy/tests/test_chat_router.py ================================================ """Tests for blast-score estimation and task-type detection.""" from __future__ import annotations from maggy.services.chat_router import ( DEFAULT_BLAST, estimate_blast, estimate_type, ) def test_blast_hi_scores_low(): """Trivial greeting should score 1, not 5.""" assert estimate_blast("hi") == 1 def test_blast_exit_scores_low(): """Exit-like messages should score 1.""" assert estimate_blast("exit") == 1 def test_blast_empty_returns_default(): """Empty string uses DEFAULT_BLAST.""" assert estimate_blast("") == DEFAULT_BLAST def test_blast_security_audit_scores_high(): """Multiple high-tier keywords → blast >= 7.""" score = estimate_blast("security audit migration") assert score >= 7 def test_blast_fix_typo_scores_low(): """Low-tier keywords → blast <= 3.""" score = estimate_blast("fix typo in readme") assert score <= 3 def test_type_security_detected(): """Security keywords map to security type.""" assert estimate_type("fix auth vulnerability") == "security" def test_type_general_default(): """No keyword matches → general.""" assert estimate_type("hello world") == "general" def test_type_search_detected(): """Search queries map to search type.""" assert estimate_type("find the utils module") == "search" def test_type_search_grep(): """grep-like queries map to search type.""" assert estimate_type("grep for config files") == "search" def test_blast_search_scores_low(): """Search queries should score low blast (cheap model).""" score = estimate_blast("find the utils module") assert score <= 3 ================================================ FILE: maggy/tests/test_chat_stream.py ================================================ """Tests for chat 
streaming JSON parser and usage extraction.""" from __future__ import annotations import json from maggy.services.chat_stream import parse_chunk class _FakeSession: def __init__(self): self.claude_session_id = "" def test_parse_result_extracts_usage(): session = _FakeSession() data = json.dumps({ "type": "result", "result": "Done", "cost_usd": 0.05, "usage": {"input_tokens": 1500, "output_tokens": 800}, }) chunk = parse_chunk(data, session) assert chunk["type"] == "result" assert chunk["content"] == "Done" assert chunk["cost_usd"] == 0.05 assert chunk["input_tokens"] == 1500 assert chunk["output_tokens"] == 800 def test_parse_result_without_usage(): session = _FakeSession() data = json.dumps({"type": "result", "result": "Done"}) chunk = parse_chunk(data, session) assert chunk["type"] == "result" assert chunk["content"] == "Done" assert "cost_usd" not in chunk def test_parse_assistant_text(): session = _FakeSession() data = json.dumps({ "type": "assistant", "message": {"content": [{"type": "text", "text": "Hello"}]}, }) chunk = parse_chunk(data, session) assert chunk["type"] == "text" assert chunk["content"] == "Hello" def test_parse_captures_session_id(): session = _FakeSession() data = json.dumps({"session_id": "abc123", "type": "system"}) parse_chunk(data, session) assert session.claude_session_id == "abc123" def test_parse_result_zero_cost_preserved(): """cost_usd=0.0 must appear in chunk, not be dropped.""" session = _FakeSession() data = json.dumps({ "type": "result", "result": "Done", "cost_usd": 0.0, "usage": {"input_tokens": 0, "output_tokens": 0}, }) chunk = parse_chunk(data, session) assert chunk["cost_usd"] == 0.0 assert chunk["input_tokens"] == 0 assert chunk["output_tokens"] == 0 def test_parse_invalid_json(): session = _FakeSession() chunk = parse_chunk("not json {{", session) assert chunk["type"] == "text" ================================================ FILE: maggy/tests/test_checkpoint.py ================================================ """Tests 
for cross-model checkpoint serializer.""" from __future__ import annotations from maggy.services.checkpoint import ( Checkpoint, create_checkpoint, ) class TestCheckpoint: def test_serialize_round_trip(self): cp = Checkpoint( goal="Fix auth bug", progress=["Found root cause"], source_model="claude", ) data = cp.serialize() restored = Checkpoint.deserialize(data) assert restored.goal == "Fix auth bug" assert restored.source_model == "claude" assert len(restored.progress) == 1 def test_serialize_sets_timestamp(self): cp = Checkpoint(goal="test") data = cp.serialize() restored = Checkpoint.deserialize(data) assert restored.created_at != "" def test_to_prompt_format(self): cp = Checkpoint( goal="Add logout button", constraints=["No breaking changes"], progress=["Created component"], working_state="Testing phase", file_context=["src/auth.ts"], ) prompt = cp.to_prompt() assert "Add logout button" in prompt assert "No breaking changes" in prompt assert "Created component" in prompt assert "Testing phase" in prompt assert "src/auth.ts" in prompt def test_to_prompt_minimal(self): cp = Checkpoint(goal="Simple task") prompt = cp.to_prompt() assert "Simple task" in prompt assert "confirm you understand" in prompt class TestCreateCheckpoint: def test_helper_function(self): cp = create_checkpoint( goal="Refactor DB layer", progress=["Extracted interface"], model="gpt", working_state="mid-refactor", files=["db.py", "models.py"], constraints=["Keep API stable"], ) assert cp.goal == "Refactor DB layer" assert cp.source_model == "gpt" assert len(cp.file_context) == 2 def test_defaults(self): cp = create_checkpoint( goal="Test", progress=[], model="claude", ) assert cp.constraints == [] assert cp.file_context == [] ================================================ FILE: maggy/tests/test_checkpoint_mgr.py ================================================ """Tests for CheckpointManager persistence.""" from __future__ import annotations from maggy.checkpoint import CheckpointManager def 
_checkpoint() -> dict: return { "goal": "Ship Phase 2", "constraints": ["Keep tests green"], "progress": ["Planner added"], "model_history": ["claude"], "current_subgoal": "Add checkpoints", "fatigue_score": 0.2, } class TestCheckpointManager: def test_write_and_read(self, tmp_path) -> None: mgr = CheckpointManager(tmp_path) mgr.write("session-1", _checkpoint()) assert mgr.read("session-1") == _checkpoint() def test_read_missing_returns_none(self, tmp_path) -> None: mgr = CheckpointManager(tmp_path) assert mgr.read("missing") is None def test_delete_returns_true_when_removed(self, tmp_path) -> None: mgr = CheckpointManager(tmp_path) mgr.write("session-1", _checkpoint()) assert mgr.delete("session-1") is True assert mgr.read("session-1") is None def test_list_checkpoints_returns_session_ids(self, tmp_path) -> None: mgr = CheckpointManager(tmp_path) mgr.write("b", _checkpoint()) mgr.write("a", _checkpoint()) assert mgr.list_checkpoints() == ["a", "b"] def test_path_traversal_rejected(self, tmp_path) -> None: import pytest mgr = CheckpointManager(tmp_path) with pytest.raises(ValueError, match="Invalid session id"): mgr.write("../../etc/passwd", _checkpoint()) def test_read_corrupt_json_returns_none(self, tmp_path) -> None: mgr = CheckpointManager(tmp_path) mgr.write("sess-1", _checkpoint()) path = tmp_path / "sess-1.json" path.write_text("{corrupt") assert mgr.read("sess-1") is None ================================================ FILE: maggy/tests/test_cikg.py ================================================ """Tests for CIKG — knowledge graph, queries, market scoring.""" from __future__ import annotations from pathlib import Path import pytest from maggy.cikg.graph import KnowledgeGraphService from maggy.cikg.models import Edge, Node from maggy.cikg.queries import ( compare_entities, find_gaps, find_gaps_raw, get_landscape, get_segment_landscape, ) class TestKnowledgeGraph: def test_add_and_get_node(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / 
"cikg.db") node = Node( id="c1", node_type="competitor", name="Acme", ) g.add_node(node) result = g.get_node("c1") assert result is not None assert result.name == "Acme" def test_get_missing_node(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") assert g.get_node("nonexistent") is None def test_list_nodes_by_type(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="B")) comps = g.list_nodes("competitor") assert len(comps) == 1 assert comps[0].name == "A" def test_list_all_nodes(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="B")) assert len(g.list_nodes()) == 2 class TestEdges: def test_add_and_get_edge(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="SSO")) g.add_edge(Edge( source_id="c1", target_id="f1", edge_type="has_feature", )) edges = g.get_edges("c1", "out") assert len(edges) == 1 assert edges[0].target_id == "f1" def test_inbound_edges(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="SSO")) g.add_edge(Edge( source_id="c1", target_id="f1", edge_type="has_feature", )) edges = g.get_edges("f1", "in") assert len(edges) == 1 assert edges[0].source_id == "c1" def test_neighbors(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="SSO")) g.add_edge(Edge( source_id="c1", target_id="f1", edge_type="has_feature", )) neighbors = g.neighbors("c1") assert len(neighbors) == 1 assert neighbors[0].id 
== "f1" class TestDeleteNode: def test_delete_removes_node_and_edges(self, tmp_path: Path): g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c1", node_type="competitor", name="A")) g.add_node(Node(id="f1", node_type="feature", name="SSO")) g.add_edge(Edge( source_id="c1", target_id="f1", edge_type="has_feature", )) g.delete_node("c1") assert g.get_node("c1") is None assert g.get_edges("c1", "out") == [] class TestQueries: def _seed_graph(self, tmp_path: Path) -> KnowledgeGraphService: g = KnowledgeGraphService(tmp_path / "cikg.db") for i in range(3): g.add_node(Node( id=f"c{i}", node_type="competitor", name=f"Comp{i}", )) g.add_node(Node( id="f1", node_type="feature", name="SSO", )) g.add_node(Node( id="t1", node_type="technology", name="React", )) # 2 out of 3 competitors have SSO g.add_edge(Edge("c0", "f1", "has_feature")) g.add_edge(Edge("c1", "f1", "has_feature")) return g def test_find_gaps_existing(self, tmp_path: Path): g = self._seed_graph(tmp_path) score = find_gaps(g, "SSO") assert score.feature == "SSO" assert score.gap_count == 1 assert score.threat_level == "medium" def test_find_gaps_unknown(self, tmp_path: Path): g = self._seed_graph(tmp_path) score = find_gaps(g, "AI Chat") assert score.gap_count == 3 assert score.threat_level == "low" assert "differentiator" in score.recommendation.lower() def test_get_landscape(self, tmp_path: Path): g = self._seed_graph(tmp_path) ls = get_landscape(g) assert ls["competitors"] == 3 assert ls["features_tracked"] == 1 assert ls["technologies"] == 1 def test_compare_entities(self, tmp_path: Path): g = self._seed_graph(tmp_path) result = compare_entities(g, "c0", "c1") assert "f1" in result["shared"] class TestServiceQueries: def _seed_graph(self, tmp_path: Path) -> KnowledgeGraphService: g = KnowledgeGraphService(tmp_path / "cikg.db") g.add_node(Node(id="c0", node_type="competitor", name="Alpha")) g.add_node(Node(id="c1", node_type="competitor", name="Bravo")) g.add_node(Node(id="c2", 
node_type="competitor", name="Charlie")) g.add_node(Node(id="f1", node_type="feature", name="SSO")) g.add_node(Node(id="f2", node_type="feature", name="AI Chat")) g.add_node(Node(id="t1", node_type="technology", name="React")) g.add_node(Node(id="s1", node_type="market_segment", name="SMB")) g.add_node(Node(id="s2", node_type="market_segment", name="Enterprise")) g.add_edge(Edge("c0", "f1", "has_feature")) g.add_edge(Edge("c1", "f1", "has_feature")) g.add_edge(Edge("c1", "f2", "has_feature")) g.add_edge(Edge("c0", "c1", "competes_with")) g.add_edge(Edge("c0", "t1", "uses_technology")) g.add_edge(Edge("c1", "t1", "uses_technology")) g.add_edge(Edge("c0", "s1", "targets_market")) g.add_edge(Edge("c1", "s1", "targets_market")) g.add_edge(Edge("c2", "s2", "targets_market")) g.add_edge(Edge("c1", "c0", "threatens")) return g def test_find_gaps_raw(self, tmp_path: Path): g = self._seed_graph(tmp_path) result = find_gaps_raw(g, "SSO") assert {item["entity"] for item in result} == { "Alpha", "Bravo", "Charlie", } status = {item["entity"]: item["status"] for item in result} assert status == { "Alpha": "has", "Bravo": "has", "Charlie": "lacks", } def test_compare_entities(self, tmp_path: Path): g = self._seed_graph(tmp_path) result = compare_entities(g, "c0", "c1") assert result["shared"] == ["f1"] assert result["only_a"] == [] assert result["only_b"] == ["f2"] assert result["relationships"][0]["edge_type"] == "competes_with" def test_segment_landscape(self, tmp_path: Path): g = self._seed_graph(tmp_path) result = get_segment_landscape(g, "SMB") assert result["segment"] == "SMB" assert result["competitors"] == 2 assert result["features_tracked"] == 2 assert result["technologies"] == 1 assert result["threat_count"] == 1 class TestTypeValidation: def test_valid_node_type_accepted(self): node = Node(id="c1", node_type="competitor", name="Test") assert node.node_type == "competitor" def test_invalid_node_type_rejected(self): with pytest.raises(ValueError, match="Invalid 
node_type"): Node(id="c1", node_type="bogus", name="Test") def test_valid_edge_type_accepted(self): edge = Edge(source_id="a", target_id="b", edge_type="has_feature") assert edge.edge_type == "has_feature" def test_invalid_edge_type_rejected(self): with pytest.raises(ValueError, match="Invalid edge_type"): Edge(source_id="a", target_id="b", edge_type="bogus") ================================================ FILE: maggy/tests/test_cli.py ================================================ """Tests for Maggy CLI — thin client over REST API.""" from __future__ import annotations import json import subprocess from unittest.mock import AsyncMock, MagicMock, patch import pytest from typer.testing import CliRunner from maggy.cli import app runner = CliRunner() # ── Fixtures ──────────────────────────────────────────────────────── @pytest.fixture(autouse=True) def _mock_server_running(monkeypatch): """Pretend server is always up.""" monkeypatch.setattr( "maggy.cli_client.MaggyClient._check_health", lambda self: True, ) def _mock_get(response_json: dict | list): """Return a mock httpx response.""" resp = MagicMock() resp.status_code = 200 resp.json.return_value = response_json resp.raise_for_status = MagicMock() return resp # ── Status ────────────────────────────────────────────────────────── def test_status_shows_health(): health = { "status": "ok", "mode": "full", "org": "Protaige", "codebases": 5, "provider": "github", } with patch("maggy.cli_client.httpx.get", return_value=_mock_get(health)): result = runner.invoke(app, ["status"]) assert result.exit_code == 0 assert "Protaige" in result.output def test_status_json_flag(): health = {"status": "ok", "mode": "full", "org": "X", "codebases": 1} with patch("maggy.cli_client.httpx.get", return_value=_mock_get(health)): result = runner.invoke(app, ["status", "--json"]) assert result.exit_code == 0 parsed = json.loads(result.output) assert parsed["status"] == "ok" # ── Inbox 
─────────────────────────────────────────────────────────── def test_inbox_renders_table(): items = { "items": [ {"rank": 1, "title": "Fix auth bug", "labels": ["bug"], "ai_reason": "critical", "id": "1", "board": "repo"}, {"rank": 2, "title": "Add tests", "labels": ["test"], "ai_reason": "coverage", "id": "2", "board": "repo"}, ], "total": 2, } with patch("maggy.cli_client.httpx.get", return_value=_mock_get(items)): result = runner.invoke(app, ["inbox"]) assert result.exit_code == 0 assert "Fix auth bug" in result.output def test_inbox_empty(): with patch("maggy.cli_client.httpx.get", return_value=_mock_get({"items": [], "total": 0})): result = runner.invoke(app, ["inbox"]) assert result.exit_code == 0 assert "No tasks" in result.output # ── Sessions ──────────────────────────────────────────────────────── def test_sessions_renders(): data = { "sessions": [ {"pid": 1234, "tool": "claude", "project": "myapp", "prompts": 42, "duration": "1h 20m"}, ], "total": 1, } with patch("maggy.cli_client.httpx.get", return_value=_mock_get(data)): result = runner.invoke(app, ["sessions"]) assert result.exit_code == 0 assert "claude" in result.output # ── Route ─────────────────────────────────────────────────────────── def test_route_decision(): decision = { "primary": "claude", "validator": "codex", "fallback": ["kimi", "ollama"], "reason": "blast 8 → premium tier", } with patch("maggy.cli_client.httpx.get", return_value=_mock_get(decision)): result = runner.invoke(app, ["route", "8"]) assert result.exit_code == 0 assert "claude" in result.output # ── Budget ────────────────────────────────────────────────────────── def test_budget_renders(): data = { "daily_limit_usd": 10.0, "used_today_usd": 3.50, "providers": [ {"name": "anthropic", "used": 2.50, "limit": 5.0}, {"name": "openai", "used": 1.00, "limit": 3.0}, ], } with patch("maggy.cli_client.httpx.get", return_value=_mock_get(data)): result = runner.invoke(app, ["budget"]) assert result.exit_code == 0 assert "anthropic" in 
result.output # ── Competitors ───────────────────────────────────────────────────── def test_competitors_news(): news = [ {"date": "2026-05-11", "source": "TechCrunch", "event_type": "funding", "headline": "Rival raises $50M"}, ] with patch("maggy.cli_client.httpx.get", return_value=_mock_get(news)): result = runner.invoke(app, ["competitors"]) assert result.exit_code == 0 assert "Rival" in result.output # ── Models ────────────────────────────────────────────────────────── def test_models_heatmap(): heatmap = [ {"model": "claude", "task_type": "security", "reward": 0.92}, {"model": "codex", "task_type": "crud", "reward": 0.85}, ] with patch("maggy.cli_client.httpx.get", return_value=_mock_get(heatmap)): result = runner.invoke(app, ["models"]) assert result.exit_code == 0 assert "claude" in result.output # ── Server auto-start ─────────────────────────────────────────────── def test_server_not_running_starts_it(monkeypatch): """If health check fails, CLI should attempt to start server.""" monkeypatch.undo() # remove autouse mock call_count = {"n": 0} def fake_check(self): call_count["n"] += 1 if call_count["n"] <= 1: return False return True monkeypatch.setattr( "maggy.cli_client.MaggyClient._check_health", fake_check, ) monkeypatch.setattr( "maggy.cli_client.MaggyClient._start_server", lambda self: None, ) health = {"status": "ok", "mode": "local", "org": "Test", "codebases": 0} with patch("maggy.cli_client.httpx.get", return_value=_mock_get(health)): result = runner.invoke(app, ["status"]) assert result.exit_code == 0 def test_stale_port_killed_before_start(monkeypatch): """Stale port holder is killed before spawning server.""" monkeypatch.undo() calls = {"health": 0, "kill": 0} def fake_check(self): calls["health"] += 1 return calls["health"] > 2 monkeypatch.setattr( "maggy.cli_client.MaggyClient._check_health", fake_check, ) monkeypatch.setattr( "maggy.cli_client.MaggyClient._start_server", lambda self: None, ) monkeypatch.setattr( 
"maggy.cli_client.MaggyClient._kill_stale_port", lambda self: calls.__setitem__("kill", 1), ) health = { "status": "ok", "mode": "local", "org": "T", "codebases": 0, } with patch( "maggy.cli_client.httpx.get", return_value=_mock_get(health), ): result = runner.invoke(app, ["status"]) assert result.exit_code == 0 assert calls["kill"] == 1 def test_server_log_written_to_file(monkeypatch, tmp_path): """Server stdout/stderr go to ~/.maggy/server.log.""" monkeypatch.setattr("maggy.cli_client.CONFIG_DIR", tmp_path) captured = {} def fake_popen(cmd, **kw): captured.update(kw) monkeypatch.setattr( "maggy.cli_client.subprocess.Popen", fake_popen, ) from maggy.cli_client import MaggyClient MaggyClient()._start_server() assert captured.get("stdout") is not subprocess.DEVNULL assert (tmp_path / "server.log").exists() ================================================ FILE: maggy/tests/test_cli_chat.py ================================================ """Tests for maggy chat CLI — interactive REPL.""" from __future__ import annotations from unittest.mock import patch import pytest from typer.testing import CliRunner from maggy.cli import app runner = CliRunner() SESSION = { "id": "abc123", "project_key": "my-proj", "working_dir": "/tmp/my-proj", "status": "idle", "messages": 0, } RESUMED = { "id": "abc123", "project_key": "my-proj", "working_dir": "/tmp/my-proj", "status": "idle", "messages": 5, } HISTORY = { "id": "abc123", "messages": [ {"role": "user", "content": "hello"}, {"role": "assistant", "content": "hi"}, ], } @pytest.fixture(autouse=True) def _no_detect(monkeypatch): """Prevent real CLI detection in tests.""" from maggy.services import session_detect monkeypatch.setattr( session_detect, "detect_all", lambda wd: session_detect.DetectedSessions(), ) def _setup_new(mock_client): """Configure client mocks for new session flow.""" mock_client.ensure_server.return_value = True mock_client.chat_sessions.return_value = [] mock_client.chat_create.return_value = SESSION 
mock_client.chat_history.return_value = {"messages": []} mock_client.budget_summary.return_value = { "spent_today_usd": 0, "daily_limit_usd": 10, "status": "ok", } mock_client.models_heatmap.return_value = [] @patch("maggy.cli._client") def test_chat_creates_session(mock_client): """Creates new session when none exist for project.""" _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 assert "my-proj" in result.output mock_client.chat_create.assert_called_once_with("my-proj") @patch("maggy.cli._client") def test_chat_resumes_existing(mock_client): """Resumes existing session instead of creating new.""" mock_client.ensure_server.return_value = True mock_client.chat_sessions.return_value = [RESUMED] mock_client.chat_history.return_value = HISTORY mock_client.budget_summary.return_value = { "spent_today_usd": 0, "daily_limit_usd": 10, "status": "ok", } mock_client.models_heatmap.return_value = [] with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 assert "Resuming" in result.output mock_client.chat_create.assert_not_called() @patch("maggy.cli._client") def test_chat_routed_streams(mock_client): """Routed chat sends via send_routed and shows model.""" _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "routing", "model": "kimi", "blast": 3, "reason": "low blast"}, {"type": "text", "content": "Hello"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["say hi", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 mock_client.chat_send_routed.assert_called_once_with( "abc123", "say hi", blast=None, allowed_models=None, ) @patch("maggy.cli._client") def test_chat_direct_mode(mock_client): """--direct flag uses send_stream instead of routed.""" 
_setup_new(mock_client) mock_client.chat_send_stream.return_value = iter([ {"type": "text", "content": "Hi"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["say hi", "/quit"] result = runner.invoke( app, ["chat", "my-proj", "--direct"], ) assert result.exit_code == 0 mock_client.chat_send_stream.assert_called_once_with( "abc123", "say hi", ) @patch("maggy.cli._client") def test_chat_history_command(mock_client): _setup_new(mock_client) mock_client.chat_history.return_value = HISTORY with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["/history", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 @patch("maggy.cli._client") def test_chat_blast_override(mock_client): """'/blast 8' sets override for next message.""" _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "routing", "model": "claude", "blast": 8, "reason": "override"}, {"type": "text", "content": "Done"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["/blast 8", "do it", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 mock_client.chat_send_routed.assert_called_once_with( "abc123", "do it", blast=8, allowed_models=None, ) @patch("maggy.cli._client") def test_chat_ctrl_c_exits(mock_client): _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = KeyboardInterrupt result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 @patch("maggy.cli._client") def test_chat_empty_input_ignored(mock_client): _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["", " ", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 mock_client.chat_send_routed.assert_not_called() @patch("maggy.cli._client") def test_chat_error_displayed(mock_client): _setup_new(mock_client) mock_client.chat_send_routed.return_value = 
iter([ {"type": "error", "content": "CLI not found"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["test", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 @patch("maggy.cli._client") def test_chat_shows_queued_status(mock_client): _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "queued", "position": 2}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["test", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 @patch("maggy.cli._client") def test_chat_shows_warning(mock_client): _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "warning", "content": "Context: ~25000 tokens"}, {"type": "text", "content": "Hi"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["test", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 @patch("maggy.cli._client") def test_chat_exit_word_quits(mock_client): """Typing 'exit' terminates the REPL (not routed to LLM).""" _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["exit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 mock_client.chat_send_routed.assert_not_called() @patch("maggy.cli._client") def test_chat_agent_status_rendered(mock_client): """Agent status chunks render @model> step status.""" _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "agent_status", "agent": "local", "step": "ANALYZE", "status": "running"}, {"type": "text", "content": "Done"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["test", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 assert "running" in result.output @patch("maggy.cli._client") def test_chat_quota_error_shows_guide(mock_client): """Quota error 
triggers account switch guidance.""" _setup_new(mock_client) mock_client.chat_send_routed.return_value = iter([ {"type": "error", "content": "rate_limit_exceeded: quota hit"}, {"type": "done"}, ]) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["test", "/quit"] result = runner.invoke(app, ["chat", "my-proj"]) assert result.exit_code == 0 out = result.output.lower() assert "switch" in out or "login" in out or "account" in out @patch("maggy.cli._client") def test_chat_prompt_uses_angle_bracket(mock_client): """Prompt uses '>' character, not 'maggy:'.""" _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp: mp.ask.side_effect = ["/quit"] runner.invoke(app, ["chat", "my-proj"]) call_args = mp.ask.call_args[0][0] assert ">" in call_args assert "maggy" not in call_args.lower() @patch("maggy.cli._client") def test_screenshot_command_dispatches(mock_client): """'/screenshot path.png' calls vision handler.""" _setup_new(mock_client) with patch("maggy.cli_chat.Prompt") as mp, \ patch("maggy.cli_chat._handle_screenshot") as mh: mp.ask.side_effect = ["/screenshot test.png", "/quit"] runner.invoke(app, ["chat", "my-proj"]) mh.assert_called_once() assert "test.png" in mh.call_args[0][0] ================================================ FILE: maggy/tests/test_cli_discovery.py ================================================ """Tests for CLI auto-discovery and command building.""" from __future__ import annotations from maggy.adapters.cli_discovery import ( CliProfile, discover_all, discover_cli, ) def test_discover_all_returns_profiles(): result = discover_all() assert "claude" in result.profiles assert "codex" in result.profiles assert "kimi" in result.profiles def test_claude_discovered(): p = discover_cli("claude") assert p.installed is True assert p.prompt_is_positional is True assert p.prompt_flag == "-p" assert "skip-permissions" in p.auto_approve_flag assert p.output_format_flag == "--output-format" assert p.work_dir_flag == "" def 
test_codex_discovered(): p = discover_cli("codex") assert p.installed is True assert p.uses_exec_subcommand is True assert p.prompt_is_positional is True assert "bypass" in p.auto_approve_flag assert p.work_dir_flag == "-C" def test_kimi_discovered(): p = discover_cli("kimi") assert p.installed is True assert p.prompt_flag == "-p" assert p.auto_approve_flag == "--yolo" assert p.afk_flag == "--afk" assert p.work_dir_flag == "-w" def test_missing_cli(): p = discover_cli("nonexistent_xyz") assert p.installed is False def test_claude_build_command(): p = CliProfile( name="claude", binary="claude", installed=True, prompt_flag="-p", prompt_is_positional=True, auto_approve_flag="--dangerously-skip-permissions", output_format_flag="--output-format", ) cmd = p.build_command("do stuff", "/tmp/repo", 20) assert cmd[:3] == ["claude", "-p", "do stuff"] assert "--dangerously-skip-permissions" in cmd assert "--output-format" in cmd assert "json" in cmd def test_codex_build_command(): p = CliProfile( name="codex", binary="codex", installed=True, uses_exec_subcommand=True, prompt_is_positional=True, work_dir_flag="-C", auto_approve_flag="--dangerously-bypass-approvals-and-sandbox", ) cmd = p.build_command("do stuff", "/tmp/repo", 10) assert cmd[:3] == ["codex", "exec", "do stuff"] assert "-C" in cmd assert "/tmp/repo" in cmd def test_kimi_build_command(): p = CliProfile( name="kimi", binary="kimi", installed=True, prompt_flag="-p", work_dir_flag="-w", auto_approve_flag="--yolo", afk_flag="--afk", ) cmd = p.build_command("do stuff", "/tmp/repo", 10) assert cmd[:3] == ["kimi", "-p", "do stuff"] assert "-w" in cmd assert "--yolo" in cmd assert "--afk" in cmd def test_ollama_discovered(): p = discover_cli("ollama") assert p.installed is True assert p.uses_run_subcommand is True assert p.prompt_is_positional is True assert "qwen" in p.run_model and "coder" in p.run_model def test_ollama_build_command(): p = CliProfile( name="ollama", binary="ollama", installed=True, 
uses_run_subcommand=True, run_model="qwen3-coder:30b-a3b-q8_0", prompt_is_positional=True, ) cmd = p.build_command("do stuff", "/tmp/repo", 5) assert cmd[:4] == ["ollama", "run", "qwen3-coder:30b-a3b-q8_0", "do stuff"] assert "--output-format" not in cmd def test_pi_adapter_uses_discovery(): from maggy.adapters.pi import PiAdapter pi = PiAdapter() profiles = pi.discovered_profiles assert "claude" in profiles assert profiles["claude"].installed is True assert "ollama" in profiles assert profiles["ollama"].installed is True ================================================ FILE: maggy/tests/test_cli_sessions.py ================================================ """Tests for CLI session management commands.""" from __future__ import annotations from unittest.mock import patch from typer.testing import CliRunner from maggy.cli import app runner = CliRunner() @patch("maggy.cli._client") def test_spawn_creates_session(mock_client): """maggy spawn posts to execute endpoint.""" mock_client.ensure_server.return_value = True mock_client.spawn.return_value = { "session_id": "abc123", } result = runner.invoke( app, ["spawn", "add unit tests"], ) assert result.exit_code == 0 assert "abc123" in result.output mock_client.spawn.assert_called_once() @patch("maggy.cli._client") def test_ps_lists_sessions(mock_client): """maggy ps shows all sessions.""" mock_client.ensure_server.return_value = True mock_client.all_sessions.return_value = [ { "id": "abc", "project": "edubites-core", "model": "claude", "status": "running", "type": "chat", }, ] result = runner.invoke(app, ["ps"]) assert result.exit_code == 0 assert "edubites-core" in result.output @patch("maggy.cli._client") def test_kill_stops_session(mock_client): """maggy kill sends delete to session.""" mock_client.ensure_server.return_value = True mock_client.kill_session.return_value = {"ok": True} result = runner.invoke(app, ["kill", "abc123"]) assert result.exit_code == 0 mock_client.kill_session.assert_called_once_with("abc123") 
================================================ FILE: maggy/tests/test_cli_welcome.py ================================================ """Tests for the CLI welcome banner.""" from __future__ import annotations from unittest.mock import MagicMock from maggy.cli_welcome import render_welcome def _mock_client(): c = MagicMock() c.budget_summary.return_value = { "spent_today_usd": 1.50, "daily_limit_usd": 10.0, "status": "ok", } c.models_heatmap.return_value = [ {"model": "claude"}, {"model": "kimi"}, ] return c SESSION = { "id": "abc123", "project_key": "edubites", "working_dir": "/tmp/edubites", "status": "idle", "messages": 5, } def test_render_welcome_shows_project(capsys): render_welcome("edubites", SESSION, _mock_client()) out = capsys.readouterr().out assert "edubites" in out def test_render_welcome_shows_budget(capsys): render_welcome("edubites", SESSION, _mock_client()) out = capsys.readouterr().out assert "1.50" in out or "$1.50" in out def test_render_welcome_shows_models(capsys): render_welcome("edubites", SESSION, _mock_client()) out = capsys.readouterr().out assert "2" in out def test_render_welcome_shows_health(capsys): """Welcome banner displays memory health score.""" c = _mock_client() c.engram_diagnostics.return_value = {"health_score": 0.85} render_welcome("edubites", SESSION, c) out = capsys.readouterr().out assert "85%" in out or "0.85" in out def test_render_welcome_shows_session_history(capsys): """Welcome banner shows previous session message count.""" session = {**SESSION, "messages": 12} render_welcome("edubites", session, _mock_client()) out = capsys.readouterr().out assert "12" in out def test_dir_shows_cwd_fallback(capsys): """Dir row uses os.getcwd() when working_dir missing.""" import os session = {**SESSION, "working_dir": ""} render_welcome("edubites", session, _mock_client()) out = capsys.readouterr().out # Should contain part of the actual cwd, not empty string cwd_tail = os.path.basename(os.getcwd()) assert cwd_tail in out def 
test_models_shows_available_count(capsys): """Empty heatmap shows available model count.""" c = _mock_client() c.models_heatmap.return_value = [] render_welcome("edubites", SESSION, c) out = capsys.readouterr().out assert "5 available" in out or "available" in out def test_budget_subscription_welcome(capsys): """Subscription plan shows Subscription in welcome.""" c = _mock_client() c.budget_summary.return_value = { "spent_today_usd": 0, "daily_limit_usd": 10.0, "status": "ok", "plan": "subscription", } render_welcome("edubites", SESSION, c) out = capsys.readouterr().out assert "subscription" in out.lower() ================================================ FILE: maggy/tests/test_context_compactor.py ================================================ """Tests for context compactor — message summarization.""" from __future__ import annotations import pytest from maggy.services.context_compactor import ( CompactionResult, estimate_tokens, should_compact, ) class TestEstimateTokens: def test_empty_list(self): assert estimate_tokens([]) == 0 def test_single_message(self): msgs = [{"role": "user", "content": "hello world"}] assert estimate_tokens(msgs) > 0 def test_approximation(self): text = "a" * 400 msgs = [{"role": "user", "content": text}] assert estimate_tokens(msgs) == pytest.approx(100, abs=10) class TestShouldCompact: def test_below_threshold_no_compact(self): msgs = [{"role": "user", "content": "short"}] assert not should_compact(msgs, context_window=200_000) def test_above_threshold_compact(self): big = "x" * 160_000 msgs = [{"role": "user", "content": big}] assert should_compact(msgs, context_window=40_000) def test_threshold_at_80_pct(self): content = "a" * 32_800 msgs = [{"role": "user", "content": content}] assert should_compact(msgs, context_window=10_000) class TestCompact: @pytest.mark.asyncio async def test_keeps_recent_messages(self): from maggy.services.context_compactor import compact msgs = [ {"role": "user", "content": f"msg {i}"} for i in range(10) ] 
async def fake_summarize(text): return "summary of old messages" result = await compact(msgs, keep_recent=4, summarizer=fake_summarize) assert isinstance(result, CompactionResult) assert len(result.messages) == 5 assert result.messages[0]["role"] == "system" assert "summary" in result.messages[0]["content"] assert result.messages[-1]["content"] == "msg 9" @pytest.mark.asyncio async def test_nothing_to_compact(self): from maggy.services.context_compactor import compact msgs = [{"role": "user", "content": "hi"}] async def fake_summarize(text): return "summary" result = await compact(msgs, keep_recent=6, summarizer=fake_summarize) assert result.messages == msgs assert result.tokens_saved == 0 @pytest.mark.asyncio async def test_summarizer_failure_passthrough(self): from maggy.services.context_compactor import compact msgs = [ {"role": "user", "content": f"msg {i}"} for i in range(10) ] async def broken_summarize(text): raise RuntimeError("model down") result = await compact(msgs, keep_recent=4, summarizer=broken_summarize) assert result.messages == msgs assert result.tokens_saved == 0 ================================================ FILE: maggy/tests/test_contracts.py ================================================ """Tests for contract generation.""" from __future__ import annotations from maggy.contracts import ContractGenerator def test_generates_test_code_from_postcondition() -> None: generator = ContractGenerator() code = generator.from_postcondition( "returns sorted results", "maggy.services.planner.DualPlanner.plan", ) assert "returns sorted results" in code assert "DualPlanner.plan" in code assert "def test_dualplanner_plan_contract()" in code ================================================ FILE: maggy/tests/test_convention_inferrer.py ================================================ """Tests for LLM-based dynamic convention inference.""" from __future__ import annotations from pathlib import Path import pytest from maggy.adapters.pi import PiAdapter, 
RunResult from maggy.routing_rules import Convention, RoutingRules from maggy.services.convention_inferrer import ( collect_fingerprint, ensure_inferred, infer_conventions, parse_conventions, ) def test_collect_fingerprint_includes_files(tmp_path: Path): (tmp_path / "src").mkdir() (tmp_path / "src" / "main.py").write_text("print('hi')") (tmp_path / "README.md").write_text("# Hello") fp = collect_fingerprint(str(tmp_path)) assert "src" in fp assert "README.md" in fp def test_collect_fingerprint_excludes_noise(tmp_path: Path): (tmp_path / "node_modules" / "pkg").mkdir(parents=True) (tmp_path / ".git" / "objects").mkdir(parents=True) (tmp_path / "__pycache__").mkdir() (tmp_path / "src").mkdir() fp = collect_fingerprint(str(tmp_path)) assert "node_modules" not in fp assert ".git" not in fp assert "__pycache__" not in fp assert "src" in fp def test_collect_fingerprint_includes_config(tmp_path: Path): (tmp_path / "pyproject.toml").write_text("[tool.ruff]\nline-length = 88\n") fp = collect_fingerprint(str(tmp_path)) assert "tool.ruff" in fp def test_collect_fingerprint_includes_git_log(tmp_path: Path): import subprocess subprocess.run(["git", "init"], cwd=tmp_path, capture_output=True) subprocess.run(["git", "config", "user.email", "t@t.com"], cwd=tmp_path, capture_output=True) subprocess.run(["git", "config", "user.name", "T"], cwd=tmp_path, capture_output=True) (tmp_path / "f.txt").write_text("x") subprocess.run(["git", "add", "."], cwd=tmp_path, capture_output=True) subprocess.run(["git", "commit", "-m", "chore: run prisma migrate"], cwd=tmp_path, capture_output=True) fp = collect_fingerprint(str(tmp_path)) assert "prisma" in fp def test_parse_conventions_from_llm_output(): text = "Here are the conventions:\n- Use prisma migrate\n- Use turbo build\n" convs = parse_conventions(text) assert len(convs) == 2 assert "prisma" in convs[0].text.lower() assert "turbo" in convs[1].text.lower() def test_parse_ignores_non_convention_lines(): text = "Analysis:\nThe project uses 
X.\n- Use X for builds\nEnd." convs = parse_conventions(text) assert len(convs) == 1 assert "Use X" in convs[0].text def test_parse_caps_at_10(): lines = "\n".join(f"- Convention {i}" for i in range(15)) assert len(parse_conventions(lines)) == 10 def test_parse_empty_response(): assert parse_conventions("") == [] assert parse_conventions("No conventions found.") == [] def _seed_project(tmp_path: Path) -> None: """Add a config file so fingerprint exceeds the 20-char minimum.""" (tmp_path / "pyproject.toml").write_text("[tool.ruff]\nline-length=88\n") @pytest.mark.asyncio async def test_infer_calls_local_model(tmp_path: Path): _seed_project(tmp_path) pi, models_called = PiAdapter(), [] async def fake_send(model, prompt, wd, **kw): models_called.append(model) return RunResult(model=model, success=True, output="- Use custom deploy\n") pi.send_prompt = fake_send convs = await infer_conventions(pi, str(tmp_path)) assert models_called[0] == "local" assert len(convs) >= 1 assert "custom deploy" in convs[0].text.lower() @pytest.mark.asyncio async def test_infer_falls_back_on_local_failure(tmp_path: Path): _seed_project(tmp_path) pi, models_called = PiAdapter(), [] async def fake_send(model, prompt, wd, **kw): models_called.append(model) if model == "local": return RunResult(model=model, success=False, error="offline") return RunResult(model=model, success=True, output="- Use yarn\n") pi.send_prompt = fake_send convs = await infer_conventions(pi, str(tmp_path)) assert "local" in models_called assert "kimi" in models_called assert len(convs) >= 1 @pytest.mark.asyncio async def test_infer_returns_empty_on_all_failures(tmp_path: Path): _seed_project(tmp_path) pi = PiAdapter() async def fail_send(model, prompt, wd, **kw): return RunResult(model=model, success=False, error="down") pi.send_prompt = fail_send assert await infer_conventions(pi, str(tmp_path)) == [] @pytest.mark.asyncio async def test_ensure_inferred_caches(tmp_path: Path): _seed_project(tmp_path) pi, call_count = 
PiAdapter(), [0] async def counting_send(model, prompt, wd, **kw): call_count[0] += 1 return RunResult(model=model, success=True, output="- Use X\n") pi.send_prompt = counting_send rules = RoutingRules() await ensure_inferred(rules, "proj", str(tmp_path), pi) first_count = call_count[0] await ensure_inferred(rules, "proj", str(tmp_path), pi) assert call_count[0] == first_count @pytest.mark.asyncio async def test_ensure_inferred_deduplicates(tmp_path: Path): _seed_project(tmp_path) pi = PiAdapter() async def fake_send(model, prompt, wd, **kw): return RunResult(model=model, success=True, output="- Use npm install\n- Use custom script\n") pi.send_prompt = fake_send rules = RoutingRules(project_conventions={ "proj": [Convention("Use npm install", ["all"], "auto-detected")], }) await ensure_inferred(rules, "proj", str(tmp_path), pi) texts = [c.text for c in rules.project_conventions["proj"]] assert texts.count("Use npm install") == 1 assert "Use custom script" in texts @pytest.mark.asyncio async def test_all_inferred_have_llm_source(tmp_path: Path): _seed_project(tmp_path) pi = PiAdapter() async def fake_send(model, prompt, wd, **kw): return RunResult(model=model, success=True, output="- Use X\n") pi.send_prompt = fake_send rules = RoutingRules() await ensure_inferred(rules, "proj", str(tmp_path), pi) llm_convs = [c for c in rules.project_conventions.get("proj", []) if c.source == "llm-inferred"] assert len(llm_convs) >= 1 ================================================ FILE: maggy/tests/test_convention_scanner.py ================================================ """Tests for project-specific convention detection from filesystem.""" from __future__ import annotations from pathlib import Path from maggy.routing_rules import Convention, RoutingRules from maggy.services.convention_scanner import ( ensure_scanned, scan_project, ) def test_detects_supabase_migrations(tmp_path: Path): """supabase/migrations/ dir -> supabase convention.""" (tmp_path / "supabase" / 
"migrations").mkdir(parents=True) convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "supabase" in texts.lower() def test_detects_alembic(tmp_path: Path): """alembic.ini -> alembic convention.""" (tmp_path / "alembic.ini").write_text("[alembic]\n") convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "alembic" in texts.lower() def test_detects_npm(tmp_path: Path): """package-lock.json -> npm convention.""" (tmp_path / "package-lock.json").write_text("{}") convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "npm" in texts.lower() def test_detects_pnpm(tmp_path: Path): """pnpm-lock.yaml -> pnpm convention.""" (tmp_path / "pnpm-lock.yaml").write_text("") convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "pnpm" in texts.lower() def test_detects_pytest_in_pyproject(tmp_path: Path): """pyproject.toml with [tool.pytest] -> pytest convention.""" (tmp_path / "pyproject.toml").write_text( "[tool.pytest.ini_options]\ntestpaths = ['tests']\n" ) convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "pytest" in texts.lower() def test_detects_ruff_in_pyproject(tmp_path: Path): """pyproject.toml with [tool.ruff] -> ruff convention.""" (tmp_path / "pyproject.toml").write_text("[tool.ruff]\nline-length=88\n") convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "ruff" in texts.lower() def test_empty_dir_no_conventions(tmp_path: Path): """Empty directory produces no conventions.""" convs = scan_project(str(tmp_path)) assert convs == [] def test_all_conventions_have_auto_source(tmp_path: Path): """Detected conventions have source='auto-detected'.""" (tmp_path / "Makefile").write_text("all:\n\techo hi\n") convs = scan_project(str(tmp_path)) assert len(convs) >= 1 assert all(c.source == "auto-detected" for c in convs) def test_conventions_for_merges_project(): """conventions_for includes project-specific 
conventions.""" from maggy.routing_rules import conventions_for rules = RoutingRules( conventions=[Convention("Global rule", ["all"], "manual")], project_conventions={ "protaige": [ Convention("Use supabase db push", ["all"], "auto"), ], }, ) text = conventions_for(rules, "feature", "protaige") assert "Global rule" in text assert "supabase" in text def test_conventions_for_without_project(): """conventions_for without project_key returns only global.""" from maggy.routing_rules import conventions_for rules = RoutingRules( conventions=[Convention("Global rule", ["all"], "manual")], project_conventions={ "protaige": [ Convention("Use supabase db push", ["all"], "auto"), ], }, ) text = conventions_for(rules, "feature") assert "Global rule" in text assert "supabase" not in text def test_ensure_scanned_caches(tmp_path: Path): """ensure_scanned only scans once per project_key.""" (tmp_path / "alembic.ini").write_text("[alembic]\n") rules = RoutingRules() ensure_scanned(rules, "my-proj", str(tmp_path)) assert "my-proj" in rules.project_conventions count = len(rules.project_conventions["my-proj"]) ensure_scanned(rules, "my-proj", str(tmp_path)) assert len(rules.project_conventions["my-proj"]) == count def test_yaml_roundtrip_project_conventions(tmp_path: Path): """Project conventions survive YAML save/load cycle.""" from maggy.routing_rules_io import load, save rules = RoutingRules( project_conventions={ "protaige": [ Convention("Use supabase", ["all"], "auto-detected"), ], "edubites": [ Convention("Use alembic", ["all"], "auto-detected"), ], }, ) yaml_path = tmp_path / "rules.yaml" save(rules, yaml_path) loaded = load(yaml_path) assert "protaige" in loaded.project_conventions assert "edubites" in loaded.project_conventions assert "supabase" in loaded.project_conventions["protaige"][0].text def test_detects_docker_compose(tmp_path: Path): """docker-compose.yml -> docker convention.""" (tmp_path / "docker-compose.yml").write_text("version: '3'\n") convs = 
scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "docker" in texts.lower() def test_detects_github_actions(tmp_path: Path): """.github/workflows/ -> CI convention.""" (tmp_path / ".github" / "workflows").mkdir(parents=True) convs = scan_project(str(tmp_path)) texts = " ".join(c.text for c in convs) assert "github actions" in texts.lower() ================================================ FILE: maggy/tests/test_coordination.py ================================================ """Tests for multi-agent coordination locks.""" from __future__ import annotations import sqlite3 from datetime import datetime, timedelta, timezone from maggy.coordination.lock_manager import LockManager class TestLockManager: def test_acquire_and_release(self, tmp_path): manager = LockManager(tmp_path / "locks.db") assert manager.acquire("maggy/a.py", "agent-1") is True assert manager.release("maggy/a.py", "agent-1") is True assert manager.release("maggy/a.py", "agent-1") is False def test_blocks_other_agent(self, tmp_path): manager = LockManager(tmp_path / "locks.db") assert manager.acquire("maggy/a.py", "agent-1") is True assert manager.acquire("maggy/a.py", "agent-2") is False def test_release_all_returns_count(self, tmp_path): manager = LockManager(tmp_path / "locks.db") manager.acquire("maggy/a.py", "agent-1") manager.acquire("maggy/b.py", "agent-1") manager.acquire("maggy/c.py", "agent-2") assert manager.release_all("agent-1") == 2 assert manager.conflicts(["maggy/a.py", "maggy/c.py"]) == ["maggy/c.py"] def test_conflicts_returns_locked_paths(self, tmp_path): manager = LockManager(tmp_path / "locks.db") manager.acquire("maggy/a.py", "agent-1") manager.acquire("maggy/c.py", "agent-2") conflicts = manager.conflicts(["maggy/a.py", "maggy/b.py", "maggy/c.py"]) assert conflicts == ["maggy/a.py", "maggy/c.py"] def test_expired_locks_are_removed(self, tmp_path): db_path = tmp_path / "locks.db" manager = LockManager(db_path) expired_at = datetime.now(timezone.utc) - 
timedelta(minutes=31) with sqlite3.connect(db_path) as conn: conn.execute( "INSERT INTO locks(file_path, agent_id, acquired_at, expires_at) " "VALUES (?, ?, ?, ?)", ( "maggy/a.py", "agent-1", expired_at.isoformat(), expired_at.isoformat(), ), ) conn.commit() assert manager.acquire("maggy/a.py", "agent-2") is True ================================================ FILE: maggy/tests/test_deploy.py ================================================ """Tests for deploy service — session management.""" from __future__ import annotations from maggy.deploy import DeployService, DeploySession class TestDeployService: def test_create_session(self): svc = DeployService() session = svc.create_session("myapp", "main") assert session.project == "myapp" assert session.branch == "main" assert session.status == "building" def test_get_session(self): svc = DeployService() session = svc.create_session("myapp", "feat") result = svc.get_session(session.session_id) assert result is not None assert result.branch == "feat" def test_get_missing_session(self): svc = DeployService() assert svc.get_session("nonexistent") is None def test_list_sessions(self): svc = DeployService() svc.create_session("app1", "main") svc.create_session("app2", "dev") sessions = svc.list_sessions() assert len(sessions) == 2 def test_update_status(self): svc = DeployService() session = svc.create_session("myapp", "main") updated = svc.update_status( session.session_id, "live", url="https://preview.vercel.app", ) assert updated.status == "live" assert updated.url == "https://preview.vercel.app" def test_update_missing_returns_none(self): svc = DeployService() assert svc.update_status("nope", "live") is None def test_teardown(self): svc = DeployService() session = svc.create_session("myapp", "main") assert svc.teardown(session.session_id) assert svc.get_session(session.session_id) is None def test_teardown_missing(self): svc = DeployService() assert not svc.teardown("nonexistent") 
================================================ FILE: maggy/tests/test_discovery.py ================================================ """Tests for environment auto-discovery.""" from __future__ import annotations import json from pathlib import Path from unittest.mock import patch import pytest from maggy.discovery import ( DiscoveryResult, _parse_org_from_url, discover_active_projects, discover_clis, discover_env_tokens, discover_repos, full_discovery, ) from maggy.process.discovery import discover_local class TestDiscoverLocal: def test_empty_project(self, tmp_path: Path): result = discover_local(tmp_path) assert result["ci"] == [] assert result["quality"] == [] assert result["review"] == [] assert result["deps"] == [] def test_detects_github_actions(self, tmp_path: Path): (tmp_path / ".github" / "workflows").mkdir(parents=True) result = discover_local(tmp_path) assert "github_actions" in result["ci"] def test_detects_jenkins(self, tmp_path: Path): (tmp_path / "Jenkinsfile").touch() result = discover_local(tmp_path) assert "jenkins" in result["ci"] def test_detects_circleci(self, tmp_path: Path): (tmp_path / ".circleci").mkdir() result = discover_local(tmp_path) assert "circleci" in result["ci"] def test_detects_gitlab_ci(self, tmp_path: Path): (tmp_path / ".gitlab-ci.yml").touch() result = discover_local(tmp_path) assert "gitlab_ci" in result["ci"] def test_detects_eslint(self, tmp_path: Path): (tmp_path / ".eslintrc.json").touch() result = discover_local(tmp_path) assert "eslint" in result["quality"] def test_detects_ruff_in_pyproject(self, tmp_path: Path): pyproject = tmp_path / "pyproject.toml" pyproject.write_text("[tool.ruff]\nline-length = 88\n") result = discover_local(tmp_path) assert "ruff" in result["quality"] def test_detects_pre_commit(self, tmp_path: Path): (tmp_path / ".pre-commit-config.yaml").touch() result = discover_local(tmp_path) assert "pre-commit" in result["quality"] def test_detects_codeowners(self, tmp_path: Path): (tmp_path / 
"CODEOWNERS").touch() result = discover_local(tmp_path) assert "codeowners" in result["review"] def test_detects_dependabot(self, tmp_path: Path): (tmp_path / ".github").mkdir(parents=True) (tmp_path / ".github" / "dependabot.yml").touch() result = discover_local(tmp_path) assert "dependabot" in result["deps"] def test_detects_renovate(self, tmp_path: Path): (tmp_path / "renovate.json").touch() result = discover_local(tmp_path) assert "renovate" in result["deps"] # --- CLI Discovery --- class TestDiscoverClis: def test_finds_installed(self): def _which(n): return f"/usr/bin/{n}" if n == "claude" else None with patch("shutil.which", side_effect=_which): result = discover_clis() assert result == {"claude": "/usr/bin/claude"} def test_finds_none(self): with patch("shutil.which", return_value=None): result = discover_clis() assert result == {} def test_finds_all(self): with patch("shutil.which", side_effect=lambda n: f"/usr/bin/{n}"): result = discover_clis() assert len(result) == 3 assert "claude" in result # --- Repo Discovery --- class TestDiscoverRepos: def test_finds_git_repos(self, tmp_path: Path): docs = tmp_path / "Documents" docs.mkdir() repo = docs / "my-proj" repo.mkdir() (repo / ".git").mkdir() repos = discover_repos(home=tmp_path) assert len(repos) == 1 assert repos[0]["key"] == "my-proj" def test_skips_hidden_dirs(self, tmp_path: Path): docs = tmp_path / "Documents" docs.mkdir() hidden = docs / ".secret" hidden.mkdir() (hidden / ".git").mkdir() repos = discover_repos(home=tmp_path) assert repos == [] def test_depth_limited(self, tmp_path: Path): dev = tmp_path / "dev" deep = dev / "a" / "b" / "c" / "d" / "e" deep.mkdir(parents=True) (deep / ".git").mkdir() repos = discover_repos(home=tmp_path) assert repos == [] def test_max_30_repos(self, tmp_path: Path): dev = tmp_path / "dev" dev.mkdir() for i in range(35): r = dev / f"repo-{i:02d}" r.mkdir() (r / ".git").mkdir() repos = discover_repos(home=tmp_path) assert len(repos) == 30 def test_no_scan_dirs(self, 
tmp_path: Path): repos = discover_repos(home=tmp_path) assert repos == [] # --- Active Projects --- class TestDiscoverActiveProjects: def test_parses_history(self, tmp_path: Path): lines = [ json.dumps({"project": "/Users/me/proj-a"}), json.dumps({"project": "/Users/me/proj-a"}), json.dumps({"project": "/Users/me/proj-b"}), ] (tmp_path / "history.jsonl").write_text( "\n".join(lines) + "\n", ) projects = discover_active_projects(tmp_path) assert projects[0] == "proj-a" assert "proj-b" in projects def test_no_history_file(self, tmp_path: Path): result = discover_active_projects(tmp_path) assert result == [] def test_malformed_json(self, tmp_path: Path): content = "not-json\n{\"project\":\"/p\"}\n" (tmp_path / "history.jsonl").write_text(content) projects = discover_active_projects(tmp_path) assert projects == ["p"] # --- Env Tokens --- class TestDiscoverEnvTokens: def test_detects_tokens(self): env = {"GITHUB_TOKEN": "ghp_abc"} with patch.dict("os.environ", env, clear=True): result = discover_env_tokens() assert result["GITHUB_TOKEN"] is True assert result["ANTHROPIC_API_KEY"] is False def test_no_env_tokens(self): with patch.dict("os.environ", {}, clear=True): with patch("maggy.discovery.discover_git_token", return_value=""): result = discover_env_tokens() assert result["GITHUB_TOKEN"] is False assert result["ANTHROPIC_API_KEY"] is False assert result["ASANA_API_KEY"] is False # --- URL Parsing --- class TestParseOrgFromUrl: def test_ssh_url(self): url = "git@github.com:acme/webapp.git" assert _parse_org_from_url(url) == "acme" def test_https_url(self): url = "https://github.com/acme/webapp.git" assert _parse_org_from_url(url) == "acme" def test_non_github(self): url = "https://gitlab.com/acme/webapp.git" assert _parse_org_from_url(url) == "" # --- Full Discovery --- class TestFullDiscovery: def test_returns_result(self, tmp_path: Path): with patch("shutil.which", return_value=None): result = full_discovery(home=tmp_path) assert isinstance(result, DiscoveryResult) 
assert result.timestamp != "" def test_populates_repos(self, tmp_path: Path): dev = tmp_path / "dev" dev.mkdir() repo = dev / "my-app" repo.mkdir() (repo / ".git").mkdir() with patch("shutil.which", return_value=None): result = full_discovery(home=tmp_path) assert len(result.repos) == 1 assert result.repos[0]["key"] == "my-app" ================================================ FILE: maggy/tests/test_dual_planner.py ================================================ """Tests for DualPlanner orchestration.""" from __future__ import annotations from unittest.mock import AsyncMock, MagicMock import pytest from maggy.adapters.pi import RunResult from maggy.services.planner import DualPlanner def _result(output: str) -> RunResult: return RunResult(model="test", success=True, output=output) @pytest.mark.asyncio async def test_plan_uses_claude_prompt() -> None: pi = MagicMock() pi.send_prompt = AsyncMock(return_value=_result("Primary plan")) planner = DualPlanner(pi) plan = await planner.plan("Fix auth", "Add logout flow", "/tmp/work") assert plan == "Primary plan" pi.send_prompt.assert_awaited_once() args = pi.send_prompt.await_args.args assert args[0] == "claude" assert args[2] == "/tmp/work" assert args[3] == 5 assert "Fix auth" in args[1] assert "Add logout flow" in args[1] @pytest.mark.asyncio async def test_counter_check_uses_codex_prompt() -> None: pi = MagicMock() pi.send_prompt = AsyncMock(return_value=_result("Looks good")) planner = DualPlanner(pi) review = await planner.counter_check("1. Update auth\n2. Add tests", "/tmp/work") assert review == "Looks good" args = pi.send_prompt.await_args.args assert args[0] == "codex" assert args[2] == "/tmp/work" assert args[3] == 5 assert "1. Update auth" in args[1] assert "Flag conflicts as 'CONFLICT:'" in args[1] @pytest.mark.asyncio async def test_dual_plan_collects_conflicts() -> None: pi = MagicMock() pi.send_prompt = AsyncMock( side_effect=[ _result("1. Update auth\n2. 
Add tests"), _result("CONFLICT: use middleware\nkeep step 2"), ] ) planner = DualPlanner(pi) result = await planner.dual_plan("Fix auth", "Add logout flow", "/tmp/work") assert result.primary_plan.startswith("1. Update auth") assert result.counter_check.startswith("CONFLICT:") assert result.conflicts == ["use middleware"] ================================================ FILE: maggy/tests/test_engram.py ================================================ """Tests for Engram — record, store, retrieval, diagnostics.""" from __future__ import annotations from pathlib import Path from maggy.engram.diagnostics import AmnesiaProfile, diagnose from maggy.engram.record import EngramRecord, Origin, Validity from maggy.engram.retrieval import EngramRetrieval from maggy.engram.store import EngramStore class TestEngramRecord: def test_defaults(self): r = EngramRecord( engram_id="e1", namespace="proj-1", memory_type="fact", content="Python 3.11", ) assert r.is_active assert r.origin == Origin.EXPLICIT def test_supersede(self): r = EngramRecord( engram_id="e1", namespace="proj-1", memory_type="fact", content="test", ) r.supersede() assert not r.is_active assert r.validity == Validity.SUPERSEDED class TestEngramStore: def test_write_and_get(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") r = EngramRecord( engram_id="e1", namespace="proj-1", memory_type="fact", content="Uses FastAPI", ) store.write(r) result = store.get("e1") assert result is not None assert result.content == "Uses FastAPI" def test_get_missing(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") assert store.get("nope") is None def test_query_by_namespace(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="e1", namespace="proj-1", memory_type="fact", content="A", )) store.write(EngramRecord( engram_id="e2", namespace="proj-2", memory_type="fact", content="B", )) results = store.query(namespace="proj-1") assert len(results) == 1 
assert results[0].namespace == "proj-1" def test_query_by_type(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="e1", namespace="p", memory_type="fact", content="A", )) store.write(EngramRecord( engram_id="e2", namespace="p", memory_type="decision", content="B", )) results = store.query(memory_type="decision") assert len(results) == 1 def test_count(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="e1", namespace="p", memory_type="fact", content="A", )) assert store.count() == 1 assert store.count(namespace="p") == 1 assert store.count(namespace="x") == 0 class TestRetrieval: def _seed(self, tmp_path: Path) -> EngramStore: store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="e1", namespace="proj", memory_type="fact", content="Uses FastAPI", tags=["backend", "python"], )) store.write(EngramRecord( engram_id="e2", namespace="proj", memory_type="decision", content="Chose SQLite", tags=["database"], )) return store def test_by_keyword(self, tmp_path: Path): store = self._seed(tmp_path) r = EngramRetrieval(store) results = r.by_keyword("FastAPI") assert len(results) == 1 def test_by_tag(self, tmp_path: Path): store = self._seed(tmp_path) r = EngramRetrieval(store) results = r.by_tag("backend") assert len(results) == 1 def test_by_type(self, tmp_path: Path): store = self._seed(tmp_path) r = EngramRetrieval(store) results = r.by_type("decision") assert len(results) == 1 def test_recent(self, tmp_path: Path): store = self._seed(tmp_path) r = EngramRetrieval(store) results = r.recent() assert len(results) == 2 class TestDiagnostics: def test_empty_store(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") profile = diagnose(store) assert profile.health_score == 0.0 def test_healthy_store(self, tmp_path: Path): store = EngramStore(tmp_path / "engram.db") for i, mt in enumerate( ["fact", "decision", "code_ref", "handoff"] ): 
store.write(EngramRecord( engram_id=f"e{i}", namespace="p", memory_type=mt, content=f"content {i}", )) profile = diagnose(store) assert profile.total_memories == 4 assert profile.active_count == 4 assert profile.health_score > 0.8 class TestEngramSeed: """Seed engrams on first boot for non-zero health.""" def test_seed_writes_all_types(self, tmp_path: Path): from maggy.engram.seed import seed_if_empty store = EngramStore(tmp_path / "engram.db") seed_if_empty(store) profile = diagnose(store) assert profile.facts > 0 assert profile.decisions > 0 assert profile.code_refs > 0 assert profile.handoffs > 0 def test_seed_gives_healthy_score(self, tmp_path: Path): from maggy.engram.seed import seed_if_empty store = EngramStore(tmp_path / "engram.db") seed_if_empty(store) profile = diagnose(store) assert profile.health_score >= 0.8 def test_seed_fills_missing_types(self, tmp_path: Path): from maggy.engram.seed import seed_if_empty store = EngramStore(tmp_path / "engram.db") store.write(EngramRecord( engram_id="existing", namespace="p", memory_type="fact", content="already here", )) seed_if_empty(store) profile = diagnose(store) # Original fact kept, missing types seeded assert profile.facts >= 1 assert profile.decisions > 0 assert profile.code_refs > 0 assert profile.handoffs > 0 def test_seed_skips_when_all_types_present(self, tmp_path: Path): from maggy.engram.seed import seed_if_empty store = EngramStore(tmp_path / "engram.db") for i, mt in enumerate( ["fact", "decision", "code_ref", "handoff"], ): store.write(EngramRecord( engram_id=f"e{i}", namespace="p", memory_type=mt, content=f"c{i}", )) seed_if_empty(store) assert store.count() == 4 ================================================ FILE: maggy/tests/test_escalation.py ================================================ """Tests for human escalation packets.""" from __future__ import annotations from maggy.escalation.protocol import Escalator class TestEscalator: def test_escalate_and_get(self, tmp_path): escalator = 
Escalator(tmp_path / "escalations.db") packet = escalator.escalate( "session-1", "blocked on merge conflict", { "agent_state": {"task": "coordination"}, "suggested_actions": ["review lock owner"], }, ) loaded = escalator.get(packet.id) assert loaded is not None assert loaded.session_id == "session-1" assert loaded.agent_state == {"task": "coordination"} assert loaded.suggested_actions == ["review lock owner"] def test_list_pending_returns_unresolved(self, tmp_path): escalator = Escalator(tmp_path / "escalations.db") first = escalator.escalate("session-1", "needs input", {}) escalator.escalate("session-2", "waiting on human", {}) escalator.resolve(first.id, "continue with fallback") pending = escalator.list_pending() assert [packet.session_id for packet in pending] == ["session-2"] def test_resolve_marks_packet(self, tmp_path): escalator = Escalator(tmp_path / "escalations.db") packet = escalator.escalate("session-1", "needs approval", {}) resolved = escalator.resolve(packet.id, "approved") assert resolved.resolved is True assert resolved.resolution == "approved" ================================================ FILE: maggy/tests/test_event_spine.py ================================================ """Tests for Event Spine — header, typed events, emitter, store.""" from __future__ import annotations from pathlib import Path from maggy.event_spine.emitter import EventEmitter from maggy.event_spine.events import ( EVENT_TYPES, ExecutionEvent, IntentEvent, MeshEvent, OutcomeEvent, ) from maggy.event_spine.header import EventHeader from maggy.event_spine.store import EventStore class TestEventHeader: def test_defaults(self): h = EventHeader(event_type="intent") assert h.event_type == "intent" assert h.event_id # uuid generated assert h.timestamp # iso time generated assert h.schema_version == 1 assert h.confidence == 1.0 def test_custom_fields(self): h = EventHeader( event_type="execution", task_id="t1", project_id="p1", agent_id="a1", ) assert h.task_id == "t1" assert 
h.project_id == "p1" class TestTypedEvents: def test_all_eight_types(self): assert len(EVENT_TYPES) == 8 def test_intent_event(self): e = IntentEvent( intent_text="Add login button", decomposed_steps=["create component", "add route"], ) assert e.header.event_type == "intent" assert len(e.decomposed_steps) == 2 def test_execution_event(self): e = ExecutionEvent( tool_name="grep", duration_ms=150, success=True, ) assert e.header.event_type == "execution" assert e.duration_ms == 150 def test_outcome_event(self): e = OutcomeEvent(success=True, reward=0.9) assert e.header.event_type == "outcome" assert e.reward == 0.9 class TestEventStore: def test_write_and_query(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") h = EventHeader(event_type="intent", task_id="t1") store.write(h, {"header": {"event_type": "intent"}, "text": "hi"}) results = store.query(task_id="t1") assert len(results) == 1 def test_query_by_type(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") h1 = EventHeader(event_type="intent", task_id="t1") h2 = EventHeader(event_type="execution", task_id="t1") store.write(h1, {"type": "intent"}) store.write(h2, {"type": "execution"}) results = store.query(event_type="intent") assert len(results) == 1 def test_count(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") for i in range(5): h = EventHeader( event_type="execution", task_id=f"t{i}", ) store.write(h, {"i": i}) assert store.count(event_type="execution") == 5 assert store.count(event_type="intent") == 0 def test_limit(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") for i in range(10): h = EventHeader(event_type="intent", task_id="t1") store.write(h, {"i": i}) results = store.query(task_id="t1", limit=3) assert len(results) == 3 class TestEventEmitter: def test_emit_returns_id(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) event = IntentEvent(intent_text="test") eid = emitter.emit(event) assert eid 
== event.header.event_id def test_emit_invalid_raises(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) import pytest with pytest.raises(ValueError): emitter.emit({"not": "an event"}) def test_trace(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) e1 = IntentEvent(intent_text="step 1") e1.header.task_id = "task-abc" e2 = ExecutionEvent(tool_name="grep") e2.header.task_id = "task-abc" emitter.emit(e1) emitter.emit(e2) trace = emitter.trace("task-abc") assert len(trace) == 2 def test_count(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) for _ in range(3): emitter.emit(IntentEvent(intent_text="x")) assert emitter.count(event_type="intent") == 3 def test_query_by_project(self, tmp_path: Path): store = EventStore(tmp_path / "events.db") emitter = EventEmitter(store) e = IntentEvent(intent_text="x") e.header.project_id = "proj-1" emitter.emit(e) results = emitter.query(project_id="proj-1") assert len(results) == 1 ================================================ FILE: maggy/tests/test_executor_routing.py ================================================ """Tests for executor model routing and spend recording.""" from __future__ import annotations from unittest.mock import AsyncMock import pytest from maggy.adapters.pi import RunResult from maggy.providers.base import Task from maggy.services import executor_helpers from maggy.services import output_reviewer as reviewer_mod from maggy.services.executor import ExecutorService from maggy.services.executor_types import SessionCtx def _session() -> dict[str, str]: return { "id": "session-1", "task_id": "task-1", "task_title": "Test task", "mode": "plan", "working_dir": ".", "status": "running", "started_at": "", "output": "", } def _task(blast_score: int, task_type: str) -> Task: return Task( id="task-1", title="Route this task", description="Use task metadata for routing.", raw={ 
"blast_score": blast_score, "task_type": task_type, "security_sensitive": task_type == "security", }, ) def _ctx(session: dict, task: Task, wd: str) -> SessionCtx: return SessionCtx(session=session, task=task, wd=wd) def _patch_executor(executor, monkeypatch): """Wire fake send_prompt and context builder.""" async def fake_context(cfg, task): return "" async def fake_send( model_name: str, prompt: str, working_dir: str, max_turns: int = 20, timeout: int = 600, ) -> RunResult: return RunResult( model=model_name, success=True, output="ok", ) monkeypatch.setattr( executor_helpers, "build_icpg_context", fake_context, ) monkeypatch.setattr(executor._pi, "send_prompt", fake_send) @pytest.mark.asyncio async def test_plan_mode_routes_high_blast_to_claude( mock_cfg, tmp_path, monkeypatch, ): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session models: list[str] = [] async def fake_context(cfg, task): return "" async def tracking_send( model_name: str, prompt: str, working_dir: str, max_turns: int = 20, timeout: int = 600, ) -> RunResult: models.append(model_name) return RunResult(model=model_name, success=True, output="ok") monkeypatch.setattr(executor_helpers, "build_icpg_context", fake_context) monkeypatch.setattr(executor._pi, "send_prompt", tracking_send) task = _task(9, "general") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") # Blast 9 general → codex (cost_rank=3, covers 4-10) assert models[0] == "codex" @pytest.mark.asyncio async def test_plan_records_spend(mock_cfg, tmp_path, monkeypatch): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session async def fake_context(cfg, task): return "" async def fake_send( model_name: str, prompt: str, working_dir: str, max_turns: int = 20, timeout: int = 600, ) -> RunResult: return RunResult( model=model_name, success=True, output="plan", 
cost_usd=1.25, ) monkeypatch.setattr(executor_helpers, "build_icpg_context", fake_context) monkeypatch.setattr(executor._pi, "send_prompt", fake_send) task = _task(3, "security") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") assert executor._budget.today_spend("anthropic") == pytest.approx(1.25) @pytest.mark.asyncio async def test_tdd_high_blast_calls_dual_planner( mock_cfg, tmp_path, monkeypatch, ): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) planner_called = [] async def track_dual(ctx): planner_called.append(True) monkeypatch.setattr(executor, "_dual_plan", track_dual) task = _task(9, "feature") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "tdd") assert planner_called @pytest.mark.asyncio async def test_locks_released_after_run( mock_cfg, tmp_path, monkeypatch, ): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) wd = str(tmp_path) executor._locks.acquire(wd, "session-1") task = _task(3, "docs") ctx = _ctx(session, task, wd) await executor._run(ctx, "plan") assert executor._locks.acquire(wd, "other-agent") @pytest.mark.asyncio async def test_fatigue_tracked(mock_cfg, tmp_path, monkeypatch): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) task = _task(3, "docs") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") assert executor._fatigue.dimensions["context_load"] > 0 @pytest.mark.asyncio async def test_conventions_in_prompts( mock_cfg, tmp_path, monkeypatch, ): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session prompts: list[str] = [] async def 
fake_context(cfg, task): return "" async def fake_send( model_name: str, prompt: str, working_dir: str, max_turns: int = 20, timeout: int = 600, ) -> RunResult: prompts.append(prompt) return RunResult(model=model_name, success=True, output="ok") monkeypatch.setattr(executor_helpers, "build_icpg_context", fake_context) monkeypatch.setattr(executor._pi, "send_prompt", fake_send) task = _task(5, "feature") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") assert prompts assert "Team Conventions" in prompts[0] assert "minimum wowable product" in prompts[0] @pytest.mark.asyncio async def test_tdd_calls_reviewer(mock_cfg, tmp_path, monkeypatch): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) reviews: list[str] = [] async def fake_review(pi, label, output, wd): reviews.append(label) from maggy.services.output_reviewer import ReviewResult return ReviewResult(score=4, reason="ok") monkeypatch.setattr(reviewer_mod, "review_output", fake_review) task = _task(3, "feature") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "tdd") assert "ANALYZE" in reviews assert "WRITE TESTS" in reviews @pytest.mark.asyncio async def test_review_retry_on_low_score( mock_cfg, tmp_path, monkeypatch, ): provider = AsyncMock() executor = ExecutorService(mock_cfg, provider) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) call_count = [0] async def fake_review(pi, label, output, wd): call_count[0] += 1 from maggy.services.output_reviewer import ReviewResult if call_count[0] == 1: return ReviewResult(score=2, reason="poor") return ReviewResult(score=4, reason="ok") monkeypatch.setattr(reviewer_mod, "review_output", fake_review) task = _task(3, "feature") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "tdd") assert call_count[0] >= 2 assert "RETRY" in 
session["output"] @pytest.mark.asyncio async def test_status_callback_fires( mock_cfg, tmp_path, monkeypatch, ): """Status callback receives running/done events.""" provider = AsyncMock() statuses: list[dict] = [] executor = ExecutorService( mock_cfg, provider, status_cb=statuses.append, ) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) task = _task(3, "docs") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") assert any(s["status"] == "running" for s in statuses) assert any(s["status"] == "done" for s in statuses) @pytest.mark.asyncio async def test_status_shows_model_name( mock_cfg, tmp_path, monkeypatch, ): """Status events include the routed model name.""" provider = AsyncMock() statuses: list[dict] = [] executor = ExecutorService( mock_cfg, provider, status_cb=statuses.append, ) session = _session() executor._sessions["session-1"] = session _patch_executor(executor, monkeypatch) task = _task(9, "general") ctx = _ctx(session, task, str(tmp_path)) await executor._run(ctx, "plan") agents = {s.get("agent") for s in statuses} assert "codex" in agents ================================================ FILE: maggy/tests/test_fatigue.py ================================================ """Tests for fatigue tracking — profiles and model comparison.""" from __future__ import annotations from maggy.fatigue import ( FatigueProfile, MODEL_CONTEXT_WINDOWS, compare_fatigue, create_profile, ) class TestFatigueProfile: def test_zero_usage_no_fatigue(self): p = FatigueProfile(model="claude", context_window=200_000) assert p.fatigue_score == 0.0 assert p.raw_utilization == 0.0 def test_full_context_high_fatigue(self): p = FatigueProfile( model="claude", context_window=200_000, tokens_used=200_000, turns=50, ) assert p.fatigue_score == 1.0 def test_half_context_moderate_fatigue(self): p = FatigueProfile( model="gpt", context_window=128_000, tokens_used=64_000, turns=10, ) score = p.fatigue_score assert 0.3 
def _local_forge_or_skip() -> Path:
    """Return ``~/Documents/protaige/mcp-forge`` or skip the calling test.

    The tests that need this directory used to ``return`` early when it was
    absent, which pytest reports as a *pass* even though nothing was
    asserted. Raising a skip makes the missing precondition visible in the
    test report instead.

    Returns:
        The existing forge directory.

    Raises:
        pytest's skip exception when the directory does not exist.
    """
    # Function-scope import: this module does not import pytest at top level.
    import pytest

    forge = Path.home() / "Documents" / "protaige" / "mcp-forge"
    if not forge.exists():
        pytest.skip(f"mcp-forge not available at {forge}")
    return forge


class TestForgeRegistry:
    """ForgeRegistry with and without a forge directory."""

    def test_empty_without_forge(self):
        """With ``forge_path=None`` the registry reports a count of 0."""
        reg = ForgeRegistry(forge_path=None)
        assert reg.count == 0

    def test_loads_from_forge_path(self):
        """A real forge directory yields a non-zero count (skipped if absent)."""
        reg = ForgeRegistry(forge_path=_local_forge_or_skip())
        assert reg.count > 0

    def test_search(self):
        """Searching "stripe" returns an entry whose slug is "stripe" (skipped if absent)."""
        reg = ForgeRegistry(forge_path=_local_forge_or_skip())
        results = reg.search("stripe")
        assert any(t.slug == "stripe" for t in results)

    def test_get_missing(self):
        """``get`` returns None for a slug that was never registered."""
        reg = ForgeRegistry(forge_path=None)
        assert reg.get("nonexistent") is None

    def test_set_enabled(self):
        """``set_enabled`` is truthy for a known slug and falsy for an unknown one."""
        reg = ForgeRegistry(forge_path=None)
        # Seed the private tool table directly; no forge directory is involved.
        reg._tools["test"] = ToolInfo(slug="test")
        assert reg.set_enabled("test", False)
        assert not reg._tools["test"].enabled
        assert not reg.set_enabled("nope", False)


class TestGapDetector:
    """GapDetector bookkeeping: recording, triggering, listing and reset."""

    def test_first_record_no_trigger(self):
        """The first report of a capability does not trigger."""
        det = GapDetector()
        assert not det.record_gap("email sending")

    def test_trigger_at_threshold(self):
        """With ``threshold=3`` the third report returns a truthy value."""
        det = GapDetector(threshold=3)
        det.record_gap("email sending")
        det.record_gap("email sending")
        assert det.record_gap("email sending")

    def test_no_double_trigger(self):
        """A report after the triggering one returns a falsy value."""
        det = GapDetector(threshold=2)
        det.record_gap("x")
        det.record_gap("x")  # triggers
        assert not det.record_gap("x")  # no re-trigger

    def test_list_gaps(self):
        """Gaps are listed per capability; "email" (2 reports) comes first."""
        det = GapDetector()
        det.record_gap("email")
        det.record_gap("email")
        det.record_gap("sms")
        gaps = det.list_gaps()
        assert len(gaps) == 2
        assert gaps[0].capability == "email"
        assert gaps[0].occurrences == 2

    def test_reset(self):
        """After ``reset`` the capability no longer appears in ``list_gaps``."""
        det = GapDetector()
        det.record_gap("x")
        det.record_gap("x")
        det.reset("x")
        gaps = det.list_gaps()
        assert len(gaps) == 0


class TestForgeConnector:
    """ForgeConnector against a missing path and against a real forge."""

    def test_status(self):
        """A non-existent path reports unavailable with zero registry entries."""
        conn = ForgeConnector(forge_path=Path("/nonexistent"))
        s = conn.status()
        assert not s.available
        assert s.registry_count == 0

    def test_report_gap(self):
        """A first gap report comes back with a falsy "triggered" entry."""
        conn = ForgeConnector(forge_path=Path("/nonexistent"))
        r1 = conn.report_gap("payment processing")
        assert not r1["triggered"]

    def test_search_tools_empty(self):
        """Searching without a forge yields an empty list."""
        conn = ForgeConnector(forge_path=Path("/nonexistent"))
        assert conn.search_tools("stripe") == []

    def test_with_real_forge(self):
        """With a real forge the connector is available and finds "github" (skipped if absent)."""
        conn = ForgeConnector(forge_path=_local_forge_or_skip())
        assert conn.available
        assert conn.status().registry_count > 0
        results = conn.search_tools("github")
        assert len(results) > 0
class TestJobs:
    """Heartbeat job coroutines invoked with a stand-in ``app`` object."""

    @staticmethod
    def _fake_app(**state_attrs):
        """Build an object whose ``.state`` exposes the given attributes."""
        from types import SimpleNamespace

        return SimpleNamespace(state=SimpleNamespace(**state_attrs))

    @pytest.mark.asyncio
    async def test_refresh_history_calls_analyze(self):
        """``refresh_history`` calls ``analyze()`` once on ``app.state.history``."""
        from unittest.mock import MagicMock

        from maggy.heartbeat.jobs import refresh_history

        history_svc = MagicMock()
        await refresh_history(self._fake_app(history=history_svc))
        history_svc.analyze.assert_called_once()

    @pytest.mark.asyncio
    async def test_refresh_history_skips_none(self):
        """``refresh_history`` completes when ``app.state.history`` is None."""
        from maggy.heartbeat.jobs import refresh_history

        # Passing means the await below raised nothing.
        await refresh_history(self._fake_app(history=None))

    @pytest.mark.asyncio
    async def test_self_improve_calls_analyze(self):
        """``self_improve`` calls ``analyze()`` once on ``app.state.introspector``."""
        from unittest.mock import MagicMock

        from maggy.heartbeat.jobs import self_improve

        introspector = MagicMock()
        await self_improve(self._fake_app(introspector=introspector))
        introspector.analyze.assert_called_once()

    @pytest.mark.asyncio
    async def test_self_improve_skips_none(self):
        """``self_improve`` completes when ``app.state.introspector`` is None."""
        from maggy.heartbeat.jobs import self_improve

        # Passing means the await below raised nothing.
        await self_improve(self._fake_app(introspector=None))
@pytest.fixture
def sample_sessions() -> list[SessionEntry]:
    """Four sessions across three providers and two projects.

    Columns follow ``_make_session``'s positional order:
    sid, provider, project, prompts, tools, started, ended.
    Prompt counts sum to 26; the two claude sessions account for 18.
    """
    rows = (
        ("s1", "claude", "proj-a", 10, 5,
         "2024-01-15T10:00:00+00:00", "2024-01-15T10:45:00+00:00"),
        ("s2", "claude", "proj-a", 8, 3,
         "2024-01-15T14:00:00+00:00", "2024-01-15T14:20:00+00:00"),
        ("s3", "codex", "proj-b", 5, 2,
         "2024-01-16T09:00:00+00:00", "2024-01-16T09:15:00+00:00"),
        ("s4", "kimi", "proj-a", 3, 1,
         "2024-01-16T22:00:00+00:00", "2024-01-16T22:10:00+00:00"),
    )
    return [_make_session(*row) for row in rows]


# --- Analyzer Tests ---


class TestAnalyzer:
    """Checks for the functions in history/analyzer.py."""

    def test_build_report_empty(self):
        """An empty session list gives zero totals and no providers."""
        from maggy.history.analyzer import build_report

        empty = build_report([])
        assert empty.total_sessions == 0
        assert empty.total_prompts == 0
        assert empty.providers == []

    def test_build_report_with_data(self, sample_sessions):
        """The fixture yields 4 sessions, 26 prompts and 3 providers."""
        from maggy.history.analyzer import build_report

        summary = build_report(sample_sessions)
        assert summary.total_sessions == 4
        assert summary.total_prompts == 26
        assert len(summary.providers) == 3

    def test_aggregate_by_provider(self, sample_sessions):
        """Claude accounts for 2 sessions and 18 prompts of the fixture."""
        from maggy.history.analyzer import aggregate_by_provider

        per_provider = aggregate_by_provider(sample_sessions)
        assert len(per_provider) == 3
        claude_row = next(
            row for row in per_provider if row.provider == "claude"
        )
        assert claude_row.session_count == 2
        assert claude_row.prompt_count == 18

    def test_aggregate_by_project(self, sample_sessions):
        """proj-a collects 3 sessions and lists claude among its providers."""
        from maggy.history.analyzer import aggregate_by_project

        per_project = aggregate_by_project(sample_sessions)
        proj_a_row = next(
            row for row in per_project if row.project == "proj-a"
        )
        assert proj_a_row.total_sessions == 3
        assert "claude" in proj_a_row.providers_used

    def test_compute_time_distribution(self, sample_sessions):
        """The result is a TimeDistribution keyed by start hour."""
        from maggy.history.analyzer import compute_time_distribution

        distribution = compute_time_distribution(sample_sessions)
        assert isinstance(distribution, TimeDistribution)
        # Fixture session s1 starts at 10:00 and s4 at 22:00.
        for hour in (10, 22):
            assert hour in distribution.by_hour

    def test_detect_patterns(self, sample_sessions):
        """Pattern detection returns a non-empty list of strings."""
        from maggy.history.analyzer import detect_patterns

        found = detect_patterns(sample_sessions)
        assert isinstance(found, list)
        assert len(found) > 0
        # Every entry is expected to be a human-readable string.
        assert all(isinstance(item, str) for item in found)

    def test_extract_top_topics(self, sample_sessions):
        """"auth" is among the extracted topics."""
        from maggy.history.analyzer import extract_top_topics

        top = extract_top_topics(sample_sessions)
        assert isinstance(top, list)
        assert "auth" in top
def test_analyze_with_claude(self, tmp_path: Path):
    """Two prompts sharing sessionId "s1" are analysed as one session.

    Only the "claude" CLI directory exists; the other entries come from
    ``_isolated_dirs`` and point at paths that were never created.
    """
    from maggy.history.service import HistoryService

    home = tmp_path / ".claude"
    home.mkdir()
    prompts = [
        {"display": "fix", "project": "/p", "sessionId": "s1",
         "timestamp": 1700000000000},
        {"display": "test", "project": "/p", "sessionId": "s1",
         "timestamp": 1700000300000},
    ]
    payload = "\n".join(json.dumps(row) for row in prompts) + "\n"
    (home / "history.jsonl").write_text(payload)

    # Same mapping as before, with the claude entry redirected to the real dir.
    cli_dirs = {**self._isolated_dirs(tmp_path), "claude": home}
    service = HistoryService(
        db_path=tmp_path / "history.db",
        cli_dirs=cli_dirs,
    )
    outcome = service.analyze()

    assert outcome.total_sessions == 1
    assert outcome.total_prompts == 2
def _write_jsonl(path: Path, records: list[dict]) -> None:
    """Write one JSON document per line, with a trailing newline."""
    path.write_text("\n".join(json.dumps(rec) for rec in records) + "\n")


# --- Claude Parser ---


class TestClaudeParser:
    """Checks for ClaudeHistoryParser."""

    @staticmethod
    def _home(tmp_path: Path) -> Path:
        """Create and return ``<tmp_path>/.claude``."""
        home = tmp_path / ".claude"
        home.mkdir()
        return home

    def test_not_available_missing_dir(self, tmp_path: Path):
        """A directory that was never created is reported as unavailable."""
        parser = ClaudeHistoryParser(tmp_path / ".claude")
        assert parser.is_available() is False

    def test_available_with_history(self, tmp_path: Path):
        """An (empty) history.jsonl is enough to be available."""
        home = self._home(tmp_path)
        (home / "history.jsonl").write_text("")
        assert ClaudeHistoryParser(home).is_available() is True

    def test_session_count_empty(self, tmp_path: Path):
        """An empty history.jsonl counts zero sessions."""
        home = self._home(tmp_path)
        (home / "history.jsonl").write_text("")
        assert ClaudeHistoryParser(home).session_count() == 0

    def test_session_count(self, tmp_path: Path):
        """Three prompts over sessionIds s1/s1/s2 count as two sessions."""
        home = self._home(tmp_path)
        _write_jsonl(home / "history.jsonl", [
            {"display": "fix bug", "project": "/p", "sessionId": "s1",
             "timestamp": 1700000000000},
            {"display": "add test", "project": "/p", "sessionId": "s1",
             "timestamp": 1700000100000},
            {"display": "deploy", "project": "/q", "sessionId": "s2",
             "timestamp": 1700001000000},
        ])
        assert ClaudeHistoryParser(home).session_count() == 2

    def test_parse_sessions(self, tmp_path: Path):
        """Session s1 has 2 prompts and is summarised by its first display."""
        home = self._home(tmp_path)
        _write_jsonl(home / "history.jsonl", [
            {"display": "fix auth", "project": "/Users/test/proj",
             "sessionId": "s1", "timestamp": 1700000000000},
            {"display": "add tests", "project": "/Users/test/proj",
             "sessionId": "s1", "timestamp": 1700000300000},
            {"display": "deploy app", "project": "/Users/test/other",
             "sessionId": "s2", "timestamp": 1700001000000},
        ])

        parsed = ClaudeHistoryParser(home).parse_sessions(limit=10)
        assert len(parsed) == 2

        first = next(entry for entry in parsed if entry.session_id == "s1")
        assert first.provider == "claude"
        assert first.prompt_count == 2
        assert first.summary == "fix auth"
        assert "proj" in first.project

    def test_parse_empty_history(self, tmp_path: Path):
        """An empty history.jsonl parses to an empty list."""
        home = self._home(tmp_path)
        (home / "history.jsonl").write_text("")
        assert ClaudeHistoryParser(home).parse_sessions() == []

    def test_parse_with_transcript(self, tmp_path: Path):
        """Tool use, model and git branch are taken from the transcript file."""
        home = self._home(tmp_path)
        _write_jsonl(home / "history.jsonl", [
            {"display": "task1", "project": "/Users/test/proj",
             "sessionId": "s1", "timestamp": 1700000000000},
        ])

        # Transcript lives under projects/<dir>/<sessionId>.jsonl.
        transcript_dir = home / "projects" / "-Users-test-proj"
        transcript_dir.mkdir(parents=True)
        _write_jsonl(transcript_dir / "s1.jsonl", [
            {"type": "user",
             "message": {"role": "user", "content": "fix the bug"},
             "sessionId": "s1", "timestamp": 1700000000000,
             "gitBranch": "feat/auth"},
            {"type": "assistant",
             "message": {"role": "assistant",
                         "content": [{"type": "text", "text": "ok"},
                                     {"type": "tool_use", "name": "read"}]},
             "model": "claude-sonnet-4", "timestamp": 1700000010000},
        ])

        parsed = ClaudeHistoryParser(home).parse_sessions()
        assert len(parsed) == 1
        only = parsed[0]
        assert only.tool_use_count >= 1
        assert "claude-sonnet-4" in only.models_used
        assert only.git_branch == "feat/auth"


# --- Codex Parser ---


class TestCodexParser:
    """Checks for CodexHistoryParser."""

    @staticmethod
    def _home(tmp_path: Path) -> Path:
        """Create and return ``<tmp_path>/.codex``."""
        home = tmp_path / ".codex"
        home.mkdir()
        return home

    def test_not_available_missing_dir(self, tmp_path: Path):
        """A directory that was never created is reported as unavailable."""
        parser = CodexHistoryParser(tmp_path / ".codex")
        assert parser.is_available() is False

    def test_available_with_index(self, tmp_path: Path):
        """An (empty) session_index.jsonl is enough to be available."""
        home = self._home(tmp_path)
        (home / "session_index.jsonl").write_text("")
        assert CodexHistoryParser(home).is_available() is True

    def test_session_count(self, tmp_path: Path):
        """Two index entries count as two sessions."""
        home = self._home(tmp_path)
        _write_jsonl(home / "session_index.jsonl", [
            {"id": "s1", "thread_name": "fix bug",
             "updated_at": "2024-01-01T00:00:00Z"},
            {"id": "s2", "thread_name": "add feature",
             "updated_at": "2024-01-02T00:00:00Z"},
        ])
        assert CodexHistoryParser(home).session_count() == 2

    def test_parse_sessions(self, tmp_path: Path):
        """Index thread name becomes the summary; history rows become prompts."""
        home = self._home(tmp_path)
        _write_jsonl(home / "session_index.jsonl", [
            {"id": "s1", "thread_name": "fix auth bug",
             "updated_at": "2024-01-01T10:00:00Z"},
        ])
        _write_jsonl(home / "history.jsonl", [
            {"session_id": "s1", "ts": 1704100000,
             "text": "fix the auth bug"},
            {"session_id": "s1", "ts": 1704100300,
             "text": "now add tests"},
        ])

        parsed = CodexHistoryParser(home).parse_sessions()
        assert len(parsed) == 1
        only = parsed[0]
        assert only.provider == "codex"
        assert only.prompt_count == 2
        assert only.summary == "fix auth bug"

    def test_parse_empty(self, tmp_path: Path):
        """Empty index and history files parse to an empty list."""
        home = self._home(tmp_path)
        (home / "session_index.jsonl").write_text("")
        (home / "history.jsonl").write_text("")
        assert CodexHistoryParser(home).parse_sessions() == []


# --- Kimi Parser ---


class TestKimiParser:
    """Checks for KimiHistoryParser."""

    @staticmethod
    def _session_dir(kimi_home: Path, name: str) -> Path:
        """Create and return ``<kimi_home>/sessions/abc/<name>``."""
        target = kimi_home / "sessions" / "abc" / name
        target.mkdir(parents=True)
        return target

    def test_not_available_missing_dir(self, tmp_path: Path):
        """A directory that was never created is reported as unavailable."""
        parser = KimiHistoryParser(tmp_path / ".kimi")
        assert parser.is_available() is False

    def test_available_with_sessions(self, tmp_path: Path):
        """An existing ``sessions`` directory is enough to be available."""
        kimi_home = tmp_path / ".kimi"
        (kimi_home / "sessions").mkdir(parents=True)
        assert KimiHistoryParser(kimi_home).is_available() is True

    def test_session_count(self, tmp_path: Path):
        """Two session directories with a context.jsonl count as two."""
        kimi_home = tmp_path / ".kimi"
        for name in ("uuid1", "uuid2"):
            session_dir = self._session_dir(kimi_home, name)
            (session_dir / "context.jsonl").write_text("")
        assert KimiHistoryParser(kimi_home).session_count() == 2

    def test_parse_sessions(self, tmp_path: Path):
        """User turns become prompts; the first user message is the summary."""
        kimi_home = tmp_path / ".kimi"
        session_dir = self._session_dir(kimi_home, "uuid1")
        _write_jsonl(session_dir / "context.jsonl", [
            {"role": "user", "content": "fix the deploy"},
            {"role": "assistant", "content": "sure"},
            {"role": "user", "content": "now test it"},
        ])
        _write_jsonl(session_dir / "wire.jsonl", [
            {"timestamp": 1700000000.0, "message": '{"type":"TurnBegin"}'},
            {"timestamp": 1700000010.0, "message": '{"type":"StepBegin"}'},
            {"timestamp": 1700000300.0, "message": '{"type":"TurnBegin"}'},
        ])

        parsed = KimiHistoryParser(kimi_home).parse_sessions()
        assert len(parsed) == 1
        only = parsed[0]
        assert only.provider == "kimi"
        assert only.prompt_count == 2
        assert only.tool_use_count >= 1
        assert only.summary == "fix the deploy"

    def test_parse_empty(self, tmp_path: Path):
        """A ``sessions`` directory with no sessions parses to an empty list."""
        kimi_home = tmp_path / ".kimi"
        (kimi_home / "sessions").mkdir(parents=True)
        assert KimiHistoryParser(kimi_home).parse_sessions() == []

    def test_parse_missing_wire(self, tmp_path: Path):
        """A session without wire.jsonl is still parsed from context.jsonl."""
        kimi_home = tmp_path / ".kimi"
        session_dir = self._session_dir(kimi_home, "uuid1")
        _write_jsonl(session_dir / "context.jsonl", [
            {"role": "user", "content": "hello"},
        ])

        parsed = KimiHistoryParser(kimi_home).parse_sessions()
        assert len(parsed) == 1
        assert parsed[0].prompt_count == 1
class TestCollectHistory:
    """``collect_history`` against a mocked history service."""

    @staticmethod
    def _service_returning(report):
        """Mock whose ``get_report()`` returns ``report``."""
        service = MagicMock()
        service.get_report.return_value = report
        return service

    def test_returns_patterns(self):
        """The report's "total_sessions" is surfaced under "sessions"."""
        from maggy.improve.signals import collect_history

        service = self._service_returning({
            "total_sessions": 50,
            "patterns": ["dominance"],
            "by_provider": {"claude": 40, "codex": 10},
        })
        collected = collect_history(service)
        assert collected["sessions"] == 50

    def test_no_report(self):
        """A missing report (None) is collected as zero sessions."""
        from maggy.improve.signals import collect_history

        collected = collect_history(self._service_returning(None))
        assert collected["sessions"] == 0
class TestAnalyzeMemory:
    """``analyze_memory`` driven by the engram ``health_score`` signal."""

    @staticmethod
    def _recommendations_for(score):
        """Run ``analyze_memory`` on a bundle carrying only ``health_score``."""
        from maggy.improve.analyzer import analyze_memory

        bundle = SignalBundle(engram={"health_score": score})
        return analyze_memory(bundle)

    def test_flags_low_health(self):
        """A score of 0.3 produces exactly one "memory" recommendation."""
        found = self._recommendations_for(0.3)
        assert len(found) == 1
        assert found[0].category == "memory"

    def test_healthy(self):
        """A score of 0.8 produces no recommendations."""
        assert self._recommendations_for(0.8) == []
history=None, forge=None, engram=None, budget=None, ) intro = Introspector(state) intro.analyze() report = intro.get_report() assert report is not None assert report.generated_at != "" def test_health_summary_populated(self): from maggy.improve.service import Introspector routing = MagicMock() routing.get_heatmap.return_value = [] events = MagicMock() events.query.return_value = [ {"success": True}, {"success": True}, ] budget = MagicMock() budget.budget_status.return_value = { "utilization": 0.5, "status": "ok", } state = SimpleNamespace( routing=routing, events=events, history=None, forge=None, engram=None, budget=budget, ) intro = Introspector(state) report = intro.analyze() assert "routing" in report.health_summary assert "reliability" in report.health_summary assert "cost" in report.health_summary ================================================ FILE: maggy/tests/test_lexon.py ================================================ """Tests for Lexon — routing, terminology, disambiguation.""" from __future__ import annotations from maggy.lexon.disambiguate import disambiguate from maggy.lexon.personalization import PersonalizationEngine from maggy.lexon.record import LexonRecord from maggy.lexon.router import LexonRouter from maggy.lexon.terminology import TermEntry, TerminologyMap class TestTerminology: def test_resolve_canonical(self): tm = TerminologyMap() assert tm.resolve("deploy") == "deploy" def test_resolve_synonym(self): tm = TerminologyMap() assert tm.resolve("ship") == "deploy" def test_resolve_unknown(self): tm = TerminologyMap() assert tm.resolve("xyzzy") is None def test_add_alias(self): tm = TerminologyMap() assert tm.add_alias("deploy", "yeet") assert tm.resolve("yeet") == "deploy" def test_add_alias_unknown_canonical(self): tm = TerminologyMap() assert not tm.add_alias("nonexistent", "alias") class TestDisambiguate: def test_high_confidence_resolves(self): result = disambiguate(0.9, ["grep"]) assert result.resolved assert result.tool == "grep" assert 
class TestPersonalization:
    """PersonalizationEngine: usage counts, preferred aliases, corrections."""

    def test_record_and_top(self):
        """The tool recorded most often is first in top_tools."""
        engine = PersonalizationEngine()
        for tool in ("grep", "grep", "glob"):
            engine.record_use(tool)
        ranking = engine.top_tools(2)
        assert ranking[0] == "grep"

    def test_preferred_alias(self):
        """A recorded alias is returned by get_preferred for the same phrase."""
        engine = PersonalizationEngine()
        phrase = "find stuff"
        engine.record_alias(phrase, "grep")
        assert engine.get_preferred(phrase) == "grep"

    def test_correction(self):
        """One recorded correction yields exactly one stored correction pair."""
        engine = PersonalizationEngine()
        engine.record_correction("test", "pytest")
        pairs = engine.signals.correction_pairs
        assert len(pairs) == 1
class TestPeerDiscovery:
    """PeerRegistry register / unregister / update_seen with a single peer."""

    def test_register_and_list(self):
        """A registered peer is counted and retrievable by its id."""
        reg = PeerRegistry()
        reg.register(PeerInfo(
            peer_id="p1", name="Alice",
            address="192.168.1.1",
        ))
        assert reg.count == 1
        assert reg.get("p1").name == "Alice"

    def test_unregister(self):
        """unregister returns a truthy result and empties the registry."""
        reg = PeerRegistry()
        reg.register(PeerInfo(
            peer_id="p1", name="Alice",
            address="192.168.1.1",
        ))
        assert reg.unregister("p1")
        assert reg.count == 0

    def test_update_seen(self):
        """After update_seen the peer still carries a last_seen value."""
        reg = PeerRegistry()
        reg.register(PeerInfo(
            peer_id="p1", name="Alice",
            address="192.168.1.1",
        ))
        reg.update_seen("p1")
        # The value may or may not change within the same ms, so the test
        # only checks that it is set rather than comparing old and new.
        assert reg.get("p1").last_seen is not None
class TestTransport:
    """compute_hmac / verify_hmac agree only when called with matching inputs."""

    def test_hmac_round_trip(self):
        """A signature verifies against the same two inputs that produced it."""
        signature = compute_hmac("hello", "secret")
        accepted = verify_hmac("hello", "secret", signature)
        assert accepted

    def test_hmac_mismatch(self):
        """Verification fails when the second argument differs ("wrong")."""
        signature = compute_hmac("hello", "secret")
        accepted = verify_hmac("hello", "wrong", signature)
        assert not accepted
class TestDeriveOrgKey:
    """derive_org_key: per-org separation, determinism, and output format."""

    def test_different_orgs_produce_different_keys(self):
        """Two org names with the same secret give different keys."""
        from maggy.mesh.transport import derive_org_key

        k1 = derive_org_key("protaige", "secret")
        k2 = derive_org_key("edubites", "secret")
        assert k1 != k2

    def test_deterministic(self):
        """The same org and secret always give the same key."""
        from maggy.mesh.transport import derive_org_key

        k1 = derive_org_key("protaige", "secret")
        k2 = derive_org_key("protaige", "secret")
        assert k1 == k2

    def test_returns_hex_string(self):
        """The key is a 64-character string made only of hex digits."""
        import string

        from maggy.mesh.transport import derive_org_key

        key = derive_org_key("org", "secret")
        assert isinstance(key, str)
        assert len(key) == 64  # SHA-256 hex
        # A length check alone would accept any 64-character value; also
        # verify every character is a hex digit, as the test name promises.
        assert all(ch in string.hexdigits for ch in key)
sign_message(msg, "correct-key") result = verify_message(signed, "wrong-key") assert result is None def test_invalid_json_fails(self): from maggy.mesh.transport import verify_message result = verify_message("not-json", "key") assert result is None # ── Network ───────────────────────────────────────────── class TestBuildNetwork: def test_creates_network(self, tmp_path: Path): from maggy.mesh.network import build_network from maggy.mesh.store import MeshStore store = MeshStore(tmp_path / "mesh.db") net = build_network("protaige", "secret", store) assert net.org == "protaige" assert net.org_key != "" def test_isolated_org_keys(self, tmp_path: Path): from maggy.mesh.network import build_network from maggy.mesh.store import MeshStore store = MeshStore(tmp_path / "mesh.db") n1 = build_network("protaige", "secret", store) n2 = build_network("edubites", "secret", store) assert n1.org_key != n2.org_key def test_status_returns_counts(self, tmp_path: Path): from maggy.mesh.network import build_network from maggy.mesh.store import MeshStore store = MeshStore(tmp_path / "mesh.db") net = build_network("test-org", "secret", store) status = net.status() assert status["org"] == "test-org" assert status["peers"] == 0 assert status["memories"] == 0 assert status["quarantined"] == 0 # ── Manager ───────────────────────────────────────────── def _make_cfg(**overrides): """Build a minimal MeshConfig-like SimpleNamespace.""" defaults = { "peer_id": "test-peer", "org_key_secret": "secret", "port": 8080, "tunnel_url": "", "git_discovery": True, } defaults.update(overrides) return SimpleNamespace(**defaults) class TestMeshManager: def test_add_and_get_network(self, tmp_path: Path): from maggy.mesh.manager import MeshManager from maggy.mesh.store import MeshStore store = MeshStore(tmp_path / "mesh.db") mgr = MeshManager(_make_cfg(), store) net = mgr.add_network("protaige") assert net.org == "protaige" assert mgr.get_network("protaige") is net def test_missing_network_returns_none(self, 
class TestPublisher:
    """Publisher collectors, driven by SimpleNamespace stand-ins for services."""

    @staticmethod
    def _routing_with_count(count):
        """Build a fake routing object whose heatmap has one m1/fix row."""
        def get_heatmap():
            return [
                {"model": "m1", "task_type": "fix", "count": count},
            ]
        return SimpleNamespace(get_heatmap=get_heatmap)

    def test_collect_scores_skips_low_count(self):
        """A heatmap row with count 2 produces no score shares."""
        from maggy.mesh.publisher import collect_scores

        shares = collect_scores(self._routing_with_count(2), "peer-1")
        assert len(shares) == 0

    def test_collect_scores_includes_high_count(self):
        """A heatmap row with count 10 produces one share of type "score"."""
        from maggy.mesh.publisher import collect_scores

        shares = collect_scores(self._routing_with_count(10), "peer-1")
        assert len(shares) == 1
        assert shares[0].memory_type == "score"

    def test_collect_gaps(self):
        """A single forge gap becomes one share keyed "gap:<name>"."""
        from maggy.mesh.publisher import collect_gaps

        def get_gaps():
            return [{"name": "slack-notify"}]

        shares = collect_gaps(SimpleNamespace(get_gaps=get_gaps), "peer-1")
        assert len(shares) == 1
        assert shares[0].key == "gap:slack-notify"

    def test_collect_policies_filters_severity(self):
        """Of an "action" and an "info" recommendation, one share results."""
        from maggy.mesh.publisher import collect_policies

        actionable = SimpleNamespace(
            severity="action", category="routing",
            message="Fix it", suggestion="Do this",
        )
        informational = SimpleNamespace(
            severity="info", category="mem",
            message="FYI", suggestion="N/A",
        )
        report = SimpleNamespace(
            recommendations=[actionable, informational],
        )

        def get_report():
            return report

        shares = collect_policies(
            SimpleNamespace(get_report=get_report), "peer-1",
        )
        assert len(shares) == 1  # only action severity

    def test_collect_all_none_services(self):
        """A state object with no services attached yields an empty list."""
        from maggy.mesh.publisher import collect_all_shares

        empty_state = SimpleNamespace()
        shares = collect_all_shares(empty_state, "peer-1")
        assert shares == []
@pytest.mark.asyncio
async def test_remove_announcement(self):
    """remove_announcement returns True when GET and DELETE both answer 200."""
    from maggy.mesh.git_discovery import remove_announcement

    # GET reply: status 200 with a synchronous json() exposing a "sha".
    found = AsyncMock()
    found.status_code = 200
    found.json = lambda: {"sha": "abc123"}

    # DELETE reply: status 200.
    deleted = AsyncMock()
    deleted.status_code = 200

    mock_client = AsyncMock()
    mock_client.get = AsyncMock(return_value=found)
    mock_client.delete = AsyncMock(return_value=deleted)
    mock_client.__aenter__ = AsyncMock(
        return_value=mock_client,
    )
    # __aexit__ must return a falsy value: a bare AsyncMock() resolves to a
    # truthy mock, which would make `async with` swallow any exception
    # raised by the code under test and hide real failures.
    # NOTE(review): presumably the client is used via `async with` — confirm.
    mock_client.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=mock_client):
        result = await remove_announcement(
            "org", "peer-1", "tok",
        )
    assert result is True
class TestReplayProtection:
    """verify_message must reject an envelope whose timestamp is old."""

    def test_stale_message_rejected(self):
        """An envelope with "ts" moved 600 seconds into the past is rejected."""
        import time

        from maggy.mesh.protocol import create_hello
        from maggy.mesh.transport import (
            sign_message,
            verify_message,
        )

        msg = create_hello("peer-1", "tester")
        signed = sign_message(msg, "key")

        # Tamper timestamp to make it old
        envelope = json.loads(signed)
        assert "sig" in envelope
        envelope["ts"] = time.time() - 600
        tampered = json.dumps(envelope)

        # NOTE(review): the envelope is not re-signed after editing "ts".
        # If the signature covers the timestamp, this rejection may come
        # from the signature check rather than the staleness check — confirm
        # against verify_message.
        result = verify_message(tampered, "key")
        assert result is None
@pytest.fixture
def store(tmp_path: Path) -> MeshStore:
    """Provide a MeshStore whose database file is ``mesh.db`` under tmp_path."""
    db_file = tmp_path / "mesh.db"
    return MeshStore(db_file)
test_upsert_updates(self, store: MeshStore): store.upsert_peer("p1", "A", "1.1.1.1", 8080, "acme") store.upsert_peer("p1", "A-new", "9.9.9.9", 8080, "acme") peer = store.get_peer("p1", "acme") assert peer["name"] == "A-new" assert peer["address"] == "9.9.9.9" class TestMemoryCRUD: def test_write_and_list(self, store: MeshStore): store.write_memory("acme", "k1", "score", {"x": 1}, "p1") mems = store.list_memories("acme") assert len(mems) == 1 assert mems[0]["key"] == "k1" def test_scoped_by_org(self, store: MeshStore): store.write_memory("acme", "k1", "score", {}, "p1") store.write_memory("other", "k2", "gap", {}, "p2") assert len(store.list_memories("acme")) == 1 assert len(store.list_memories("other")) == 1 def test_upsert_memory(self, store: MeshStore): store.write_memory("acme", "k1", "score", {"v": 1}, "p1") store.write_memory("acme", "k1", "score", {"v": 2}, "p1") mems = store.list_memories("acme") assert len(mems) == 1 assert mems[0]["content"]["v"] == 2 class TestQuarantineCRUD: def test_quarantine_and_list(self, store: MeshStore): store.quarantine_item("acme", "k1", "p1", "low conf", {"x": 1}) items = store.list_quarantined("acme") assert len(items) == 1 assert items[0]["reason"] == "low conf" def test_promote(self, store: MeshStore): store.quarantine_item("acme", "k1", "p1", "test", {}) assert store.promote_item("acme", "k1") assert len(store.list_quarantined("acme")) == 0 def test_promote_missing(self, store: MeshStore): assert not store.promote_item("acme", "nope") def test_scoped_by_org(self, store: MeshStore): store.quarantine_item("acme", "k1", "p1", "r", {}) store.quarantine_item("other", "k2", "p2", "r", {}) assert len(store.list_quarantined("acme")) == 1 ================================================ FILE: maggy/tests/test_mesh_ws.py ================================================ """Tests for WebSocket server and client.""" from __future__ import annotations import json from pathlib import Path from types import SimpleNamespace from 
class TestWsServerNoMesh:
    """Behaviour of the /ws/mesh endpoint when app.state.mesh is None."""

    def test_no_mesh_closes_connection(self):
        """Reading from the socket must raise because the server disconnects."""
        app = FastAPI()
        app.state.mesh = None
        app.include_router(router)
        client = TestClient(app)
        with client.websocket_connect("/ws/mesh") as ws:
            # Server should close immediately with 1008.
            # pytest.raises replaces a try / `assert False` / `except
            # Exception: pass` pattern: AssertionError is itself an
            # Exception, so the old form swallowed its own failure and
            # the test could never fail.
            with pytest.raises(Exception):
                ws.receive_text()
class TestMeshClient:
    """MeshClient behaviour when no peer connection has been opened."""

    @staticmethod
    def _fresh_client():
        """Import MeshClient lazily and build one for "peer-1"."""
        from maggy.mesh.ws_client import MeshClient

        return MeshClient("peer-1")

    def test_init(self):
        """A new client reports zero connections."""
        assert self._fresh_client().connected_count == 0

    def test_is_connected_false(self):
        """is_connected is False for a peer id that was never connected."""
        connected = self._fresh_client().is_connected("nope")
        assert connected is False

    @pytest.mark.asyncio
    async def test_send_no_connection(self):
        """send to an unconnected peer returns False."""
        mesh_client = self._fresh_client()
        hello = create_hello("peer-1", "test")
        sent = await mesh_client.send("nope", hello, "key")
        assert sent is False

    @pytest.mark.asyncio
    async def test_broadcast_empty(self):
        """broadcast to an empty peer list reports a count of 0."""
        mesh_client = self._fresh_client()
        hello = create_hello("peer-1", "test")
        delivered = await mesh_client.broadcast([], hello, "key")
        assert delivered == 0

    @pytest.mark.asyncio
    async def test_close_all_empty(self):
        """close_all with nothing open leaves the connection count at 0."""
        mesh_client = self._fresh_client()
        await mesh_client.close_all()
        assert mesh_client.connected_count == 0
assert round(tracker.composite(), 2) == 0.15 assert tracker.state() == "ok" def test_rejects_invalid_dimension(self): tracker = FatigueTracker() with pytest.raises(ValueError, match="Unknown dimension"): tracker.record("bogus", 0.5) def test_model_switch_increases_reread_ratio(self): tracker = FatigueTracker() tracker.record("reread_ratio", 0.2) tracker.on_model_switch(128_000) assert tracker.context_window == 128_000 assert tracker.dimensions["reread_ratio"] == 0.35 def test_state_thresholds(self): tracker = FatigueTracker() for name in tracker.dimensions: tracker.record(name, 0.6) assert tracker.state() == "compress" for name in tracker.dimensions: tracker.record(name, 0.9) assert tracker.state() == "critical" class TestSignalLog: def test_append_and_recent(self, tmp_path: Path): log = SignalLog(tmp_path / "signals.jsonl") log.append({"kind": "fatigue", "value": 0.4}) log.append({"kind": "switch", "value": 1}) assert log.recent(1) == [{"kind": "switch", "value": 1}] assert log.recent(2)[0]["kind"] == "fatigue" ================================================ FILE: maggy/tests/test_monday_provider.py ================================================ """Tests for Monday.com provider — IssueTrackerProvider impl.""" from __future__ import annotations import pytest from maggy.providers.monday import MondayProvider @pytest.fixture() def provider(): return MondayProvider( api_token="test-token", board_id="18391076058", ) def test_provider_name(provider): assert provider.provider_name() == "monday" def test_to_task_maps_fields(provider): """Monday item dict maps to Task dataclass.""" item = { "id": "123", "name": "Fix login", "column_values": [ {"id": "status", "text": "Working on it"}, {"id": "person", "text": "Ali"}, ], "url": "https://monday.com/123", "created_at": "2025-01-01", "updated_at": "2025-01-02", } task = provider._to_task(item) assert task.id == "123" assert task.title == "Fix login" assert task.status == "Working on it" assert task.assignee == "Ali" 
@pytest.mark.asyncio() async def test_list_tasks_parses_items(provider, monkeypatch): """list_tasks returns Task objects from API response.""" import httpx class FakeResp: status_code = 200 def json(self): return {"data": {"boards": [{"items_page": { "items": [ {"id": "1", "name": "Task A", "column_values": [], "url": "", "created_at": "", "updated_at": ""}, ], }}]}} async def fake_post(self, url, **kw): return FakeResp() monkeypatch.setattr(httpx.AsyncClient, "post", fake_post) tasks = await provider.list_tasks() assert len(tasks) == 1 assert tasks[0].title == "Task A" @pytest.mark.asyncio() async def test_list_tasks_empty_board(provider, monkeypatch): """Empty board returns empty list.""" import httpx class FakeResp: status_code = 200 def json(self): return {"data": {"boards": [{"items_page": { "items": [], }}]}} async def fake_post(self, url, **kw): return FakeResp() monkeypatch.setattr(httpx.AsyncClient, "post", fake_post) tasks = await provider.list_tasks() assert tasks == [] @pytest.mark.asyncio() async def test_get_task_by_id(provider, monkeypatch): """get_task fetches single item by ID.""" import httpx class FakeResp: status_code = 200 def json(self): return {"data": {"items": [ {"id": "42", "name": "Deploy", "column_values": [], "url": "", "created_at": "", "updated_at": ""}, ]}} async def fake_post(self, url, **kw): return FakeResp() monkeypatch.setattr(httpx.AsyncClient, "post", fake_post) task = await provider.get_task("42") assert task is not None assert task.id == "42" @pytest.mark.asyncio() async def test_get_task_not_found(provider, monkeypatch): """get_task returns None for missing item.""" import httpx class FakeResp: status_code = 200 def json(self): return {"data": {"items": []}} async def fake_post(self, url, **kw): return FakeResp() monkeypatch.setattr(httpx.AsyncClient, "post", fake_post) task = await provider.get_task("999") assert task is None ================================================ FILE: maggy/tests/test_monitor.py 
================================================ """Tests for MonitorService — background tracker polling.""" from __future__ import annotations import pytest from maggy.services.monitor import ( MonitorConfig, MonitorService, ) @pytest.fixture() def svc(tmp_path): return MonitorService(tmp_path / "monitors.db") def test_add_and_list(svc): """Adding a monitor config makes it listable.""" cfg = MonitorConfig(project_key="protaige", provider="github") svc.add(cfg) active = svc.list_active() assert len(active) == 1 assert active[0].project_key == "protaige" def test_remove(svc): """Removing a monitor clears it from active list.""" svc.add(MonitorConfig(project_key="zenloop", provider="asana")) svc.remove("zenloop") assert svc.list_active() == [] def test_is_new_unseen(svc): """Unseen event IDs are detected as new.""" assert svc.is_new("PR-42", "protaige") is True def test_mark_seen_not_new(svc): """After marking seen, event is no longer new.""" svc.mark_seen("PR-42", "protaige") assert svc.is_new("PR-42", "protaige") is False def test_add_duplicate_updates(svc): """Adding same project_key twice updates, not duplicates.""" svc.add(MonitorConfig(project_key="x", provider="github")) svc.add(MonitorConfig(project_key="x", provider="asana")) active = svc.list_active() assert len(active) == 1 assert active[0].provider == "asana" def test_default_interval(svc): """Default poll interval is 300 seconds.""" cfg = MonitorConfig(project_key="p", provider="github") svc.add(cfg) assert svc.list_active()[0].interval_seconds == 300 def test_status_summary(svc): """Status returns dict with counts.""" svc.add(MonitorConfig(project_key="a", provider="github")) svc.add(MonitorConfig(project_key="b", provider="asana")) status = svc.status() assert status["active"] == 2 @pytest.mark.asyncio() async def test_poll_github_prs(svc, monkeypatch): """Poll detects new GitHub PRs via httpx mock.""" import httpx cfg = MonitorConfig( project_key="protaige", provider="github", 
poll_command="alinaqi/AI-Playground", ) class FakeResp: status_code = 200 def json(self): return [ {"number": 1, "title": "Add auth", "html_url": "https://github.com/x/1"}, ] async def fake_get(self, url, **kw): return FakeResp() monkeypatch.setattr(httpx.AsyncClient, "get", fake_get) events = await svc.poll(cfg) assert len(events) == 1 assert events[0].title == "Add auth" ================================================ FILE: maggy/tests/test_multimodel_integration.py ================================================ """Integration test — small project with tasks across kimi, gpt, claude. Simulates Maggy routing a batch of tasks with varying complexity through the full executor pipeline, verifying each lands on the correct model and that budget/fallback/checkpoint systems work end-to-end. """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock import pytest from maggy.adapters.pi import PiAdapter, RunResult from maggy.budget import BudgetManager, TaskSpendTracker from maggy.checkpoint import CheckpointManager from maggy.config import ( CodebaseConfig, MaggyConfig, OrgConfig, ProjectConfig, StorageConfig, ) from maggy.coordination.lock_manager import LockManager from maggy.mnemos.fatigue import FatigueTracker from maggy.providers.base import Task from maggy.routing import RoutingContext, RoutingService from maggy.services.executor import ExecutorService from maggy.services.executor_types import SessionCtx from maggy.services.planner import DualPlanner # -- helpers --------------------------------------------------------------- def _project_cfg(tmp_path) -> MaggyConfig: return MaggyConfig( org=OrgConfig(name="acme"), storage=StorageConfig(path=str(tmp_path / "store.db")), codebases=[ CodebaseConfig(path=str(tmp_path / "repo"), key="webapp"), ], projects=[ ProjectConfig( name="webapp", repo="acme/webapp", path=str(tmp_path / "repo"), default_branch="main", ), ], ) def _task(blast: int, ttype: str, title: str) -> Task: return Task( 
id=f"TASK-{blast}", title=title, description=f"A {ttype} task with blast={blast}.", raw={ "blast_score": blast, "task_type": ttype, "security_sensitive": ttype == "security", }, ) TASKS = [ _task(1, "docs", "Update README typo"), _task(2, "formatting", "Fix lint warnings"), _task(5, "feature", "Add pagination to API"), _task(7, "refactor", "Extract auth middleware"), _task(9, "security", "Patch XSS in comments"), ] # -- 1. Routing decisions -------------------------------------------------- class TestRoutingDecisions: """Verify correct model selection per complexity.""" def test_low_blast_routes_to_cheap_tier(self, tmp_path): cfg = _project_cfg(tmp_path) svc = RoutingService(cfg) for blast in (1, 2): # Use "formatting" — "docs" is now rules-overridden ctx = RoutingContext(blast_score=blast, task_type="formatting") decision = svc.route(ctx) assert decision.primary.cost_rank <= 2, ( f"blast={blast} should route to cheap tier" ) assert decision.primary.name in ("local", "kimi") def test_mid_blast_routes_to_cheapest_capable(self, tmp_path): cfg = _project_cfg(tmp_path) svc = RoutingService(cfg) ctx = RoutingContext(blast_score=5, task_type="feature") decision = svc.route(ctx) assert decision.primary.name in ("local", "codex") def test_blast_6_routes_to_codex(self, tmp_path): cfg = _project_cfg(tmp_path) svc = RoutingService(cfg) ctx = RoutingContext(blast_score=6, task_type="feature") decision = svc.route(ctx) assert decision.primary.name == "codex" def test_high_blast_routes_to_codex_or_claude(self, tmp_path): cfg = _project_cfg(tmp_path) svc = RoutingService(cfg) ctx = RoutingContext(blast_score=9, task_type="refactor") decision = svc.route(ctx) assert decision.primary.name in ("codex", "claude") def test_security_routes_to_claude(self, tmp_path): cfg = _project_cfg(tmp_path) svc = RoutingService(cfg) ctx = RoutingContext( blast_score=3, task_type="security", security_sensitive=True, ) decision = svc.route(ctx) # Security rule override → claude name = 
decision.primary if isinstance( decision.primary, str, ) else decision.primary.name assert name == "claude" # -- 2. Full executor pipeline with mocked models ------------------------- class TestExecutorPipeline: """End-to-end executor routing with fake model responses.""" @pytest.mark.asyncio async def test_distributes_across_models(self, tmp_path): cfg = _project_cfg(tmp_path) (tmp_path / "repo").mkdir() provider = AsyncMock() executor = ExecutorService(cfg, provider) calls: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) return RunResult( model=model_name, success=True, output="done", cost_usd=0.10, ) async def fake_ctx(cfg, task): return "" executor._pi.send_prompt = fake_send from maggy.services import executor_helpers executor_helpers.build_icpg_context = fake_ctx for task in TASKS: sid = f"s-{task.id}" session = { "id": sid, "task_id": task.id, "task_title": task.title, "mode": "plan", "working_dir": str(tmp_path / "repo"), "status": "running", "started_at": "", "output": "", } executor._sessions[sid] = session ctx = SessionCtx(session, task, str(tmp_path / "repo")) await executor._run(ctx, "plan") # Verify each complexity tier used a different model cheap = {"local", "kimi"} assert cheap & set(calls), "Low-blast should use cheap tier" assert "codex" in calls, "Mid-blast should use codex" assert "claude" in calls, "Security should use claude" assert len(set(calls)) >= 3, ( f"Expected >= 3 distinct models, got {set(calls)}" ) # -- 3. 
Budget tracking across providers ---------------------------------- class TestCrossProviderBudget: def test_spend_tracked_per_provider(self, tmp_path): cfg = _project_cfg(tmp_path) bm = BudgetManager(cfg) bm.record_spend("moonshot", "kimi-k2", 0.05) bm.record_spend("openai", "gpt-4o", 0.30) bm.record_spend("anthropic", "claude-sonnet-4", 1.20) breakdown = bm.by_provider() providers = {r["provider"] for r in breakdown} assert providers == {"moonshot", "openai", "anthropic"} def test_task_spend_halts_at_limit(self): tracker = TaskSpendTracker(max_spend=1.0) tracker.record(0.3) tracker.record(0.3) tracker.record(0.5) assert tracker.is_exceeded() assert tracker.total() == pytest.approx(1.1) # -- 4. Fallback chain on quota ------------------------------------------- class TestFallbackChain: @pytest.mark.asyncio async def test_falls_back_on_failure(self): pi = PiAdapter() calls: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): calls.append(model_name) if model_name in ("kimi", "deepseek"): return RunResult( model=model_name, success=False, error="quota", quota_hit=True, ) return RunResult( model=model_name, success=True, output="ok", ) pi.send_prompt = fake_send result = await pi.send_with_fallback( "kimi", "test prompt", "/tmp", ) assert result.success assert result.model != "kimi" assert len(calls) > 1 # -- 5. Checkpoint survives model switch ----------------------------------- class TestCheckpointHandoff: def test_checkpoint_roundtrip(self, tmp_path): mgr = CheckpointManager(tmp_path / "checkpoints") mgr.write("session-abc", { "goal": "Ship auth feature", "constraints": ["Keep tests green"], "progress": ["Step 1 done by kimi"], "model_history": ["kimi", "claude"], "current_subgoal": "Write integration tests", "fatigue_score": 0.35, }) data = mgr.read("session-abc") assert data is not None assert data["goal"] == "Ship auth feature" assert data["model_history"] == ["kimi", "claude"] assert data["fatigue_score"] == 0.35 # -- 6. 
Dual planning uses different models -------------------------------- class TestDualPlanning: @pytest.mark.asyncio async def test_plan_and_review_use_separate_models(self): models_used: list[str] = [] pi = MagicMock() async def fake_send(model, prompt, wd, turns=5): models_used.append(model) return RunResult( model=model, success=True, output="plan output", ) pi.send_prompt = fake_send planner = DualPlanner(pi) result = await planner.dual_plan( "Add OAuth", "Implement OAuth2 flow", "/tmp", ) assert "claude" in models_used assert "codex" in models_used assert result.primary_plan == "plan output" # -- 7. Fatigue tracks model switches -------------------------------------- class TestFatigueAcrossModels: def test_model_switch_increases_fatigue(self): tracker = FatigueTracker(context_window=200_000) tracker.record("context_load", 0.3) tracker.record("reread_ratio", 0.2) assert tracker.state() == "ok" tracker.on_model_switch(128_000) assert tracker.context_window == 128_000 assert tracker.dimensions["reread_ratio"] == 0.35 tracker.on_model_switch(128_000) assert tracker.dimensions["reread_ratio"] == 0.50 # -- 8. 
Lock coordination between agents ----------------------------------- class TestLockCoordination: def test_agents_cant_clobber_each_other(self, tmp_path): locks = LockManager(tmp_path / "locks.db") assert locks.acquire("src/auth.py", "kimi-agent") assert not locks.acquire("src/auth.py", "claude-agent") assert locks.acquire("src/api.py", "claude-agent") conflicts = locks.conflicts(["src/auth.py", "src/api.py"]) assert "src/auth.py" in conflicts assert "src/api.py" in conflicts locks.release("src/auth.py", "kimi-agent") assert locks.acquire("src/auth.py", "claude-agent") ================================================ FILE: maggy/tests/test_observability.py ================================================ """Tests for observability signal collection.""" from __future__ import annotations from maggy.observability import ObservabilityCollector def test_records_and_reads_recent_signals(tmp_path) -> None: collector = ObservabilityCollector(tmp_path / "signals.db") collector.record_signal("maggy", "fatigue", 0.4) collector.record_signal("maggy", "budget", 0.9) rows = collector.recent_signals("maggy") assert len(rows) == 2 assert rows[0]["signal_type"] == "budget" assert rows[1]["signal_type"] == "fatigue" def test_limits_recent_signals(tmp_path) -> None: collector = ObservabilityCollector(tmp_path / "signals.db") collector.record_signal("maggy", "fatigue", 0.2) collector.record_signal("maggy", "fatigue", 0.5) rows = collector.recent_signals("maggy", limit=1) assert len(rows) == 1 assert rows[0]["value"] == 0.5 ================================================ FILE: maggy/tests/test_output_reviewer.py ================================================ """Tests for inter-task output reviewer.""" from __future__ import annotations import pytest from maggy.services.output_reviewer import ( _parse_review, review_output, ) class TestParseReview: def test_parses_score_and_reason(self): text = "SCORE: 4\nREASON: Clean implementation" result = _parse_review(text) assert result.score 
== 4 assert result.reason == "Clean implementation" def test_parses_score_only(self): result = _parse_review("SCORE: 2") assert result.score == 2 assert result.reason == "" def test_no_score_returns_default(self): result = _parse_review("No structured output here") assert result.score == 3 assert result.reason == "" def test_score_out_of_range_clamped(self): assert _parse_review("SCORE: 0").score == 1 assert _parse_review("SCORE: 8").score == 5 def test_score_from_inline_text(self): result = _parse_review("The output is fine. SCORE: 5") assert result.score == 5 class TestReviewOutput: @pytest.mark.asyncio async def test_returns_review_result(self): async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): from maggy.adapters.pi import RunResult return RunResult( model=model_name, success=True, output="SCORE: 4\nREASON: Looks good", ) from maggy.adapters.pi import PiAdapter pi = PiAdapter() pi.send_prompt = fake_send result = await review_output(pi, "ANALYZE", "some output", "/tmp") assert result.score == 4 assert "Looks good" in result.reason @pytest.mark.asyncio async def test_failure_returns_passthrough(self): async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): from maggy.adapters.pi import RunResult return RunResult( model=model_name, success=False, error="model unavailable", ) from maggy.adapters.pi import PiAdapter pi = PiAdapter() pi.send_prompt = fake_send result = await review_output(pi, "IMPLEMENT", "output", "/tmp") assert result.score == 3 assert result.reason == "review unavailable" @pytest.mark.asyncio async def test_exception_returns_passthrough(self): async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): raise OSError("connection failed") from maggy.adapters.pi import PiAdapter pi = PiAdapter() pi.send_prompt = fake_send result = await review_output(pi, "ANALYZE", "output", "/tmp") assert result.score == 3 @pytest.mark.asyncio async def test_uses_local_model(self): models_used: list[str] = 
[] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): models_used.append(model_name) from maggy.adapters.pi import RunResult return RunResult( model=model_name, success=True, output="SCORE: 4\nREASON: ok", ) from maggy.adapters.pi import PiAdapter pi = PiAdapter() pi.send_prompt = fake_send await review_output(pi, "ANALYZE", "output", "/tmp") assert models_used == ["local"] @pytest.mark.asyncio async def test_prompt_contains_step_and_output(self): prompts: list[str] = [] async def fake_send( model_name, prompt, wd, max_turns=20, timeout=600, ): prompts.append(prompt) from maggy.adapters.pi import RunResult return RunResult( model=model_name, success=True, output="SCORE: 3", ) from maggy.adapters.pi import PiAdapter pi = PiAdapter() pi.send_prompt = fake_send await review_output( pi, "WRITE TESTS", "test_add_user passed", "/tmp", ) assert "WRITE TESTS" in prompts[0] assert "test_add_user passed" in prompts[0] ================================================ FILE: maggy/tests/test_pi_adapter.py ================================================ """Tests for PiAdapter — model registry, fallback, quota detection.""" from __future__ import annotations import json from unittest.mock import MagicMock, patch import pytest from maggy.adapters.pi import ( ModelEntry, PiAdapter, ) class TestModelRegistry: def test_default_models_loaded(self): adapter = PiAdapter() assert len(adapter.list_models()) == 6 def test_get_known_model(self): adapter = PiAdapter() m = adapter.get_model("claude") assert m is not None assert m.provider == "anthropic" def test_get_unknown_returns_none(self): adapter = PiAdapter() assert adapter.get_model("nonexistent") is None def test_custom_models(self): custom = [ ModelEntry("test", "local", "t1", "cheap", 0.0), ] adapter = PiAdapter(models=custom) assert len(adapter.list_models()) == 1 assert adapter.get_model("test") is not None class TestFallbackChain: def test_chain_excludes_start(self): adapter = PiAdapter() chain = 
adapter.fallback_chain("kimi") assert "kimi" not in chain def test_chain_ordered_by_cost(self): adapter = PiAdapter() chain = adapter.fallback_chain("kimi") assert len(chain) > 0 def test_unknown_start_returns_all(self): adapter = PiAdapter() chain = adapter.fallback_chain("nonexistent") assert len(chain) == 6 class TestQuotaDetection: def test_detects_rate_limit(self): adapter = PiAdapter() assert adapter._detect_quota("Error: rate limit exceeded") def test_detects_429(self): adapter = PiAdapter() assert adapter._detect_quota("HTTP 429 Too Many Requests") def test_clean_output_no_quota(self): adapter = PiAdapter() assert not adapter._detect_quota("Task completed.") class TestBuildCommand: def test_claude_command_format(self): adapter = PiAdapter() model = adapter.get_model("claude") cmd = adapter._build_command(model, "hello", 5, "/tmp") assert "claude" in cmd[0] assert "-p" in cmd assert "--dangerously-skip-permissions" in cmd def test_non_claude_command(self): entry = ModelEntry( "test", "local", "m1", "cheap", cli_command="kimi", ) adapter = PiAdapter(models=[entry]) cmd = adapter._build_command(entry, "hello", 5, "/tmp") assert "kimi" in cmd[0] assert "--dangerously-skip-permissions" not in cmd class _FakeStream: def __init__(self, lines: list[str]): self._lines = list(lines) self.writes: list[str] = [] def readline(self) -> str: if self._lines: return self._lines.pop(0) return "" def write(self, text: str) -> None: self.writes.append(text) def flush(self) -> None: return None class _FakeProcess: def __init__(self, stdout_lines: list[str]): self.stdin = _FakeStream([]) self.stdout = _FakeStream(stdout_lines) class TestRpcMode: def test_detect_pi_uses_path_lookup(self): adapter = PiAdapter() with patch("maggy.adapters.pi.shutil.which", return_value="/bin/pi"): assert adapter._detect_pi() is True def test_send_rpc_serializes_command(self): adapter = PiAdapter() proc = _FakeProcess(['{"ok": true}\n']) with patch("maggy.adapters.pi.subprocess.Popen", 
return_value=proc): result = adapter.send_rpc({"command": "ping"}) assert result == {"ok": True} assert proc.stdin.writes == ['{"command":"ping"}\n'] def test_switch_model_uses_rpc(self): adapter = PiAdapter() adapter.send_rpc = MagicMock(return_value={"ok": True}) changed = adapter.switch_model("anthropic", "claude-sonnet-4") assert changed is True adapter.send_rpc.assert_called_once_with( { "command": "set_model", "provider": "anthropic", "model": "claude-sonnet-4", } ) class TestPromptResult: def test_parses_json_output(self): adapter = PiAdapter() payload = json.dumps({ "result": "All tests pass", "cost_usd": 0.05, "usage": {"input_tokens": 1500, "output_tokens": 800}, }) r = adapter._prompt_result("claude", 0, payload.encode()) assert r.success is True assert r.output == "All tests pass" assert r.cost_usd == 0.05 assert r.input_tokens == 1500 assert r.output_tokens == 800 def test_plain_text_fallback(self): adapter = PiAdapter() r = adapter._prompt_result("local", 0, b"Just text output") assert r.success is True assert r.output == "Just text output" assert r.cost_usd == 0.0 assert r.input_tokens == 0 def test_json_error_preserves_usage(self): adapter = PiAdapter() payload = json.dumps({ "result": "Error occurred", "cost_usd": 0.01, "usage": {"input_tokens": 500, "output_tokens": 100}, }) r = adapter._prompt_result("claude", 1, payload.encode()) assert r.success is False assert r.cost_usd == 0.01 assert r.input_tokens == 500 class TestStreaming: @pytest.mark.asyncio async def test_stream_events_reads_jsonl(self): adapter = PiAdapter() adapter._rpc_process = _FakeProcess( ['{"type":"start"}\n', '{"type":"done"}\n', ""] ) events = [] async for event in adapter.stream_events(): events.append(event) assert events == [{"type": "start"}, {"type": "done"}] ================================================ FILE: maggy/tests/test_planning.py ================================================ """Tests for dual-model planning orchestrator.""" from __future__ import 
annotations from maggy.models.plan import Plan, PlanDiff, PlanStep from maggy.planning import ( DUAL_PLAN_THRESHOLD, PlanRequest, PlanningService, _similar, ) class TestPlanModels: def test_plan_step_count(self): p = Plan( task="test", model="claude", steps=[ PlanStep(description="step 1"), PlanStep(description="step 2"), ], ) assert p.step_count == 2 def test_plan_diff_agreement_ratio(self): d = PlanDiff( agreed=["a", "b"], conflicts=[], primary_only=["c"], counter_only=[], ) assert d.agreement_ratio == 2 / 3 def test_plan_diff_empty(self): d = PlanDiff() assert d.agreement_ratio == 1.0 assert d.conflict_count == 0 class TestPlanningService: def test_below_threshold_single_plan(self, mock_cfg): svc = PlanningService(mock_cfg) req = PlanRequest(task="fix typo", blast_score=2) result = svc.plan_task(req) assert result["mode"] == "single" assert result["diff"] is None def test_above_threshold_dual_plan(self, mock_cfg): svc = PlanningService(mock_cfg) req = PlanRequest( task="refactor auth", blast_score=6, ) result = svc.plan_task(req) assert result["mode"] == "dual" assert result["diff"] is not None def test_generate_plan(self, mock_cfg): svc = PlanningService(mock_cfg) plan = svc.generate_plan("add feature", "claude") assert plan.task == "add feature" assert plan.model == "claude" assert plan.step_count >= 1 def test_diff_plans_identical(self, mock_cfg): svc = PlanningService(mock_cfg) p1 = svc.generate_plan("task", "claude") p2 = svc.generate_plan("task", "codex") diff = svc.diff_plans(p1, p2) assert len(diff.agreed) == 3 def test_should_dual_plan_boundary(self, mock_cfg): svc = PlanningService(mock_cfg) assert not svc.should_dual_plan(3) assert svc.should_dual_plan(4) assert svc.should_dual_plan(10) class TestSimilarity: def test_similar_strings(self): assert _similar( "Implement auth module", "Implement auth service", ) def test_dissimilar_strings(self): assert not _similar( "Add login button", "Fix database query", ) def test_empty_string(self): assert not 
_similar("", "hello") ================================================ FILE: maggy/tests/test_registry.py ================================================ """Tests for project registry and project config parsing.""" from __future__ import annotations from maggy.config import MaggyConfig, ProjectConfig, _from_dict from maggy.registry import ProjectRegistry class TestProjectConfigParsing: def test_from_dict_parses_projects(self): cfg = _from_dict({ "projects": [ { "name": "alpha", "repo": "acme/alpha", "path": "~/code/alpha", "default_branch": "main", }, { "name": "beta", "repo": "acme/beta", "path": "~/code/beta", "default_branch": "develop", "icpg": False, "cikg": True, }, ], }) assert [project.name for project in cfg.projects] == ["alpha", "beta"] assert cfg.projects[0].icpg is True assert cfg.projects[0].cikg is False assert cfg.projects[1].default_branch == "develop" assert cfg.projects[1].icpg is False assert cfg.projects[1].cikg is True class TestProjectRegistry: def test_registry_crud(self): alpha = ProjectConfig( name="alpha", repo="acme/alpha", path="/tmp/alpha", default_branch="main", ) beta = ProjectConfig( name="beta", repo="acme/beta", path="/tmp/beta", default_branch="develop", ) registry = ProjectRegistry(MaggyConfig(projects=[alpha])) assert registry.list() == [alpha] assert registry.get("alpha") == alpha registry.add(beta) assert registry.get("beta") == beta assert registry.remove("alpha") is True assert registry.get("alpha") is None assert registry.remove("alpha") is False def test_add_duplicate_raises(self): import pytest alpha = ProjectConfig( name="alpha", repo="acme/alpha", path="/tmp/alpha", default_branch="main", ) registry = ProjectRegistry(MaggyConfig(projects=[alpha])) with pytest.raises(ValueError, match="already exists"): registry.add(alpha) ================================================ FILE: maggy/tests/test_repl_cmds.py ================================================ """Tests for REPL slash command handlers.""" from __future__ 
import annotations

from dataclasses import dataclass, field
from unittest.mock import MagicMock

from maggy.cli_repl_cmds import (
    cmd_budget,
    cmd_claude_md,
    cmd_help,
    cmd_models,
    cmd_route,
    cmd_stats,
    cmd_use,
    dispatch,
)


# Minimal stand-in for the REPL state object; it carries only the three
# attributes these tests set or read.
@dataclass
class FakeState:
    working_dir: str = "/tmp/proj"
    session_id: str = "s1"
    allowed_models: list[str] = field(default_factory=list)


# Builds a MagicMock client with canned responses for budget_summary,
# budget_by_provider, models_heatmap, routing_rules and config.
# Tests that need other endpoints (e.g. health_dashboard) configure them
# on the returned mock themselves.
def _mock_client():
    c = MagicMock()
    c.budget_summary.return_value = {
        "spent_today_usd": 1.5,
        "daily_limit_usd": 10.0,
        "status": "ok",
        "input_tokens": 12500,
        "output_tokens": 3400,
    }
    c.budget_by_provider.return_value = [
        {"provider": "anthropic", "spent_usd": 1.2},
        {"provider": "openai", "spent_usd": 0.3},
    ]
    c.models_heatmap.return_value = [
        {"model": "claude", "task_type": "security",
         "avg_reward": 0.95, "samples": 10},
    ]
    c.routing_rules.return_value = {
        "mode": "dynamic",
        "task_type_overrides": {
            "security": {"model": "claude", "reason": "deep"},
        },
        "model_performance": {
            "claude": {"success_rate": 1.0, "strengths": ["security"]},
        },
    }
    c.config.return_value = {
        "codebases": [{"key": "proj", "path": "/tmp/proj"}],
        "routing": {"mode": "dynamic"},
        "budget": {"daily_limit_usd": 10.0},
    }
    return c


def test_dispatch_stats(capsys):
    """'/stats' dispatches to stats handler."""
    client = _mock_client()
    state = FakeState()
    handled = dispatch("/stats", client, state)
    assert handled is True


def test_dispatch_unknown():
    """Unknown commands return False."""
    handled = dispatch("/xyz123", MagicMock(), FakeState())
    assert handled is False


def test_cmd_stats(capsys):
    """Stats shows budget and model perf."""
    cmd_stats(_mock_client())
    out = capsys.readouterr().out
    # Passes if either the mocked spend (1.5) or the word "budget" is printed.
    assert "1.5" in out or "budget" in out.lower()


def test_cmd_budget(capsys):
    """Budget shows per-provider breakdown."""
    cmd_budget(_mock_client())
    out = capsys.readouterr().out
    assert "anthropic" in out or "1.2" in out


def test_cmd_route(capsys):
    """Route shows task type overrides."""
    cmd_route(_mock_client())
    out = capsys.readouterr().out
    assert "security" in out or "claude" in out


def test_cmd_models(capsys):
    """Models shows reward heatmap."""
    cmd_models(_mock_client())
    out = capsys.readouterr().out
    assert "claude" in out or "0.95" in out


def test_cmd_use_sets_models():
    """'/use claude,codex' sets allowed_models."""
    state = FakeState()
    cmd_use("claude,codex", state)
    assert state.allowed_models == ["claude", "codex"]


def test_cmd_use_reset():
    """'/use all' clears allowed_models."""
    state = FakeState(allowed_models=["claude"])
    cmd_use("all", state)
    assert state.allowed_models == []


def test_cmd_claude_md_missing(capsys):
    """Shows message when CLAUDE.md not found."""
    state = FakeState(working_dir="/nonexistent_xyz_dir")
    cmd_claude_md(state)
    out = capsys.readouterr().out
    # NOTE(review): "no" is a substring of "not found", so the second operand
    # alone decides this assertion, and it also matches any output that merely
    # contains "no" (e.g. "unknown"). Consider asserting on the exact message
    # once it has been confirmed against cmd_claude_md.
    assert "not found" in out.lower() or "no" in out.lower()


def test_cmd_stats_shows_tokens(capsys):
    """Stats displays token counts when available."""
    cmd_stats(_mock_client())
    out = capsys.readouterr().out
    # The mocked counts are 12500 / 3400; the output is expected to render
    # them with thousands separators.
    assert "12,500" in out
    assert "3,400" in out


def test_cmd_route_shows_tiers(capsys):
    """Route displays blast tier reference."""
    cmd_route(_mock_client())
    out = capsys.readouterr().out
    assert "cheap" in out.lower()
    assert "premium" in out.lower()


def test_cmd_help(capsys):
    """Help lists all commands."""
    cmd_help()
    out = capsys.readouterr().out
    assert "/stats" in out
    assert "/use" in out
    assert "/help" in out


def test_cmd_health(capsys):
    """Health shows engram and mnemos status."""
    from maggy.cli_repl_cmds import cmd_health

    client = _mock_client()
    client.health_dashboard.return_value = {
        "engram": {"health_score": 0.85, "active": 42, "total": 50},
        "mnemos": {"state": "ok", "composite": 0.3},
    }
    cmd_health(client)
    out = capsys.readouterr().out
    # The score may be rendered either as a percentage or as the raw fraction.
    assert "85%" in out or "0.85" in out
    assert "ok" in out.lower()


def test_dispatch_health(capsys):
    """/health dispatches to health handler."""
    client = _mock_client()
    client.health_dashboard.return_value = {
        "engram": {"health_score": 0.9, "active": 10, "total": 12},
        "mnemos": {"state": "ok", "composite": 0.2},
    }
    state =
FakeState() handled = dispatch("/health", client, state) assert handled is True def test_help_lists_health(capsys): """/help mentions /health command.""" cmd_help() out = capsys.readouterr().out assert "/health" in out def test_models_empty_shows_known(capsys): """Empty heatmap shows known model names.""" from maggy.cli_repl_cmds import cmd_models client = _mock_client() client.models_heatmap.return_value = [] cmd_models(client) out = capsys.readouterr().out assert "local" in out assert "claude" in out def test_use_warns_unknown_model(capsys): """/use with unknown model name prints warning.""" state = FakeState() cmd_use("badmodel,claude", state) out = capsys.readouterr().out assert "unknown" in out.lower() or "Unknown" in out def test_budget_subscription_plan(capsys): """Subscription plan shows 'Subscription' instead of dollar amounts.""" client = _mock_client() client.budget_summary.return_value = { "spent_today_usd": 0, "daily_limit_usd": 10.0, "status": "ok", "plan": "subscription", } client.budget_by_provider.return_value = [] cmd_budget(client) out = capsys.readouterr().out assert "subscription" in out.lower() def test_health_graceful_failure(capsys): """Health command handles server failure gracefully.""" from maggy.cli_repl_cmds import cmd_health client = _mock_client() client.health_dashboard.side_effect = Exception("unreachable") cmd_health(client) out = capsys.readouterr().out assert "health" in out.lower() or out == "" def test_stats_server_down(capsys): """Stats handles server failure gracefully.""" client = _mock_client() client.budget_summary.side_effect = Exception("unreachable") cmd_stats(client) # Should not crash — may show empty or partial data ================================================ FILE: maggy/tests/test_rollback.py ================================================ """Tests for rollback and savepoint recovery.""" from __future__ import annotations import subprocess import pytest from maggy.recovery.rollback import RollbackManager def 
_git(repo, *args: str) -> None: subprocess.run(["git", *args], cwd=repo, check=True) def _init_repo(repo) -> None: _git(repo, "init") _git(repo, "config", "user.email", "maggy@example.com") _git(repo, "config", "user.name", "Maggy Tests") (repo / "tracked.txt").write_text("v1\n") _git(repo, "add", "tracked.txt") _git(repo, "commit", "-m", "init") class TestRollbackManager: @pytest.mark.asyncio async def test_create_and_list_savepoints(self, tmp_path): _init_repo(tmp_path) manager = RollbackManager() tag = await manager.create_savepoint("session-1", str(tmp_path)) assert tag == "maggy-save-session-1" assert await manager.list_savepoints(str(tmp_path)) == [tag] @pytest.mark.asyncio async def test_rollback_resets_worktree(self, tmp_path): _init_repo(tmp_path) manager = RollbackManager() await manager.create_savepoint("session-1", str(tmp_path)) (tmp_path / "tracked.txt").write_text("changed\n") assert await manager.rollback("session-1", str(tmp_path)) is True assert (tmp_path / "tracked.txt").read_text() == "v1\n" @pytest.mark.asyncio async def test_delete_savepoint(self, tmp_path): _init_repo(tmp_path) manager = RollbackManager() await manager.create_savepoint("session-1", str(tmp_path)) assert await manager.delete_savepoint("session-1", str(tmp_path)) is True assert await manager.list_savepoints(str(tmp_path)) == [] ================================================ FILE: maggy/tests/test_routes_escalation.py ================================================ """Tests for /api/escalations endpoints.""" from __future__ import annotations from fastapi.testclient import TestClient def _app(tmp_path): """Build a minimal FastAPI app with escalation router.""" from fastapi import FastAPI from maggy.api.routes_escalation import router from maggy.config import DashboardConfig, MaggyConfig, OrgConfig, StorageConfig from maggy.escalation.protocol import Escalator cfg = MaggyConfig( org=OrgConfig(name="test"), storage=StorageConfig(path=str(tmp_path / "store.db")), 
        dashboard=DashboardConfig(),
    )
    app = FastAPI()
    # The router's handlers are expected to read these from app.state.
    app.state.cfg = cfg
    app.state.escalator = Escalator(tmp_path / "esc.db")
    app.include_router(router)
    return app


def test_list_pending_empty(tmp_path):
    # A fresh escalation store lists nothing.
    client = TestClient(_app(tmp_path))
    resp = client.get("/api/escalations")
    assert resp.status_code == 200
    assert resp.json() == []


def test_create_and_list(tmp_path):
    # A created escalation (201) shows up in the subsequent listing.
    client = TestClient(_app(tmp_path))
    body = {
        "session_id": "sess-1",
        "reason": "test failure",
        "context": {"task_id": "T-1"},
    }
    resp = client.post("/api/escalations", json=body)
    assert resp.status_code == 201
    esc_id = resp.json()["id"]

    resp = client.get("/api/escalations")
    ids = [e["id"] for e in resp.json()]
    assert esc_id in ids


def test_resolve_escalation(tmp_path):
    # Resolving marks the escalation "resolved" and removes it from the listing.
    client = TestClient(_app(tmp_path))
    body = {
        "session_id": "sess-2",
        "reason": "stuck",
        "context": {},
    }
    resp = client.post("/api/escalations", json=body)
    esc_id = resp.json()["id"]

    resp = client.post(
        f"/api/escalations/{esc_id}/resolve",
        json={"guidance": "retry with claude"},
    )
    assert resp.status_code == 200
    assert resp.json()["status"] == "resolved"

    resp = client.get("/api/escalations")
    assert resp.json() == []


def test_resolve_not_found(tmp_path):
    # Resolving an id that was never created yields 404.
    client = TestClient(_app(tmp_path))
    resp = client.post(
        "/api/escalations/bad-id/resolve",
        json={"guidance": "n/a"},
    )
    assert resp.status_code == 404


================================================
FILE: maggy/tests/test_routes_observability.py
================================================
"""Tests for /api/observability endpoints."""

from __future__ import annotations

from fastapi.testclient import TestClient


def _app(tmp_path):
    """Build a minimal FastAPI app with observability router."""
    from fastapi import FastAPI

    from maggy.api.routes_observability import router
    from maggy.config import DashboardConfig, MaggyConfig, OrgConfig, StorageConfig
    from maggy.observability.collector import ObservabilityCollector

    cfg = MaggyConfig(
        org=OrgConfig(name="test"),
        storage=StorageConfig(path=str(tmp_path / "store.db")),
        dashboard=DashboardConfig(),
    )
    app = FastAPI()
    # The collector gets its own database file under the per-test tmp dir.
    app.state.cfg = cfg
    app.state.observability = ObservabilityCollector(tmp_path / "obs.db")
    app.include_router(router)
    return app


def test_get_signals_empty(tmp_path):
    # A project with no recorded signals returns an empty list, not an error.
    client = TestClient(_app(tmp_path))
    resp = client.get("/api/observability/signals/myproject")
    assert resp.status_code == 200
    assert resp.json() == []


def test_record_and_read(tmp_path):
    # A recorded signal (201) can be read back for the same project.
    client = TestClient(_app(tmp_path))
    body = {
        "project": "webapp",
        "signal_type": "deploy_status",
        "value": 1.0,
    }
    resp = client.post("/api/observability/record", json=body)
    assert resp.status_code == 201

    resp = client.get("/api/observability/signals/webapp")
    signals = resp.json()
    assert len(signals) == 1
    assert signals[0]["signal_type"] == "deploy_status"


================================================
FILE: maggy/tests/test_routes_projects.py
================================================
"""Tests for /api/projects endpoints."""

from __future__ import annotations

from fastapi.testclient import TestClient

from maggy.registry import ProjectRegistry


def _app(mock_cfg):
    """Build a minimal FastAPI app with projects router."""
    from fastapi import FastAPI

    from maggy.api.routes_projects import router

    app = FastAPI()
    app.state.cfg = mock_cfg
    # The registry is built from the same config object handed to the app.
    app.state.registry = ProjectRegistry(mock_cfg)
    app.include_router(router)
    return app


def test_list_projects_empty(mock_cfg):
    client = TestClient(_app(mock_cfg))
    resp = client.get("/api/projects")
    assert resp.status_code == 200
    assert resp.json() == []


def test_add_and_list_project(mock_cfg):
    # Creating a project returns 201 / "created" and it appears in the listing.
    client = TestClient(_app(mock_cfg))
    body = {
        "name": "webapp",
        "repo": "acme/webapp",
        "path": "/tmp/webapp",
    }
    resp = client.post("/api/projects", json=body)
    assert resp.status_code == 201
    assert resp.json()["status"] == "created"

    resp = client.get("/api/projects")
    names = [p["name"] for p in resp.json()]
    assert "webapp" in names


def test_get_project_not_found(mock_cfg):
    client = TestClient(_app(mock_cfg))
    resp =
client.get("/api/projects/nonexistent")
    assert resp.status_code == 404


def test_add_duplicate_project(mock_cfg):
    # Posting the same project twice: the second request is rejected with 409.
    client = TestClient(_app(mock_cfg))
    body = {
        "name": "dup",
        "repo": "acme/dup",
        "path": "/tmp/dup",
    }
    client.post("/api/projects", json=body)
    resp = client.post("/api/projects", json=body)
    assert resp.status_code == 409


def test_delete_project(mock_cfg):
    # Delete returns 200 / "removed"; a follow-up GET for the name yields 404.
    client = TestClient(_app(mock_cfg))
    body = {
        "name": "to-delete",
        "repo": "acme/td",
        "path": "/tmp/td",
    }
    client.post("/api/projects", json=body)
    resp = client.delete("/api/projects/to-delete")
    assert resp.status_code == 200
    assert resp.json()["status"] == "removed"

    resp = client.get("/api/projects/to-delete")
    assert resp.status_code == 404


================================================
FILE: maggy/tests/test_routing_config.py
================================================
"""Tests for routing config — stakes patterns, cascade policy, YAML roundtrip."""

from __future__ import annotations

from pathlib import Path

import yaml

from maggy.routing_rules import CascadePolicy
from maggy.routing_rules_defaults import default_rules
from maggy.routing_rules_io import load, save, to_dict


# Spot checks on the stakes patterns shipped by default_rules().
class TestStakesPatterns:
    def test_default_has_high_patterns(self):
        rules = default_rules()
        assert "auth" in rules.stakes.high.file_patterns
        assert "security" in rules.stakes.high.task_types

    def test_default_has_medium_patterns(self):
        rules = default_rules()
        assert "api" in rules.stakes.medium.file_patterns
        assert "feature" in rules.stakes.medium.task_types

    def test_default_low_has_empty_patterns(self):
        # The low tier ships with no file patterns at all.
        rules = default_rules()
        assert rules.stakes.low.file_patterns == []


# Default and explicitly supplied field values of CascadePolicy.
class TestCascadePolicy:
    def test_defaults(self):
        # Pins every default of a no-argument CascadePolicy.
        policy = CascadePolicy()
        assert policy.enabled is True
        assert policy.min_blast == 5
        assert policy.min_stakes == "medium"
        assert policy.max_attempts == 3
        assert policy.quality_threshold == 3

    def test_custom_values(self):
        policy = CascadePolicy(
            enabled=False,
            min_blast=3,
            min_stakes="low",
            max_attempts=5,
        )
        assert
policy.enabled is False
        assert policy.min_blast == 3


# save() followed by load() must preserve rule contents, including edits made
# directly to the YAML file in between.
class TestYamlRoundtrip:
    def test_roundtrip_preserves_stakes(self, tmp_path: Path):
        rules = default_rules()
        rules.stakes.high.file_patterns.append("custom_critical")
        save(rules, tmp_path / "rules.yaml")
        loaded = load(tmp_path / "rules.yaml")
        assert "custom_critical" in loaded.stakes.high.file_patterns

    def test_roundtrip_preserves_cascade(self, tmp_path: Path):
        rules = default_rules()
        rules.cascade.min_blast = 7
        save(rules, tmp_path / "rules.yaml")
        loaded = load(tmp_path / "rules.yaml")
        assert loaded.cascade.min_blast == 7

    def test_roundtrip_preserves_conventions(self, tmp_path: Path):
        # Only the number of conventions is compared, not their contents.
        rules = default_rules()
        save(rules, tmp_path / "rules.yaml")
        loaded = load(tmp_path / "rules.yaml")
        assert len(loaded.conventions) == len(rules.conventions)

    def test_user_edits_preserved(self, tmp_path: Path):
        """Write, manually edit YAML, reload — edits survive."""
        rules = default_rules()
        path = tmp_path / "rules.yaml"
        save(rules, path)

        # Edit the serialized file directly; note the on-disk key is
        # "cascade_policy" while the loaded attribute is `cascade`.
        data = yaml.safe_load(path.read_text())
        data["cascade_policy"]["min_blast"] = 2
        path.write_text(yaml.safe_dump(data, sort_keys=False))

        loaded = load(path)
        assert loaded.cascade.min_blast == 2

    def test_missing_file_seeds_defaults(self, tmp_path: Path):
        # Loading a path that does not exist returns default rules.
        loaded = load(tmp_path / "nonexistent.yaml")
        assert loaded.version == 1
        assert loaded.cascade.enabled is True
        assert "auth" in loaded.stakes.high.file_patterns


# Keys present in the dict form produced by to_dict().
class TestToDict:
    def test_stakes_in_output(self):
        rules = default_rules()
        d = to_dict(rules)
        assert "stakes_patterns" in d
        assert "high" in d["stakes_patterns"]

    def test_cascade_in_output(self):
        rules = default_rules()
        d = to_dict(rules)
        assert "cascade_policy" in d
        assert d["cascade_policy"]["enabled"] is True


class TestDefaultTiers:
    """Default model tiers: no GPT, codex is primary."""

    def test_no_gpt_in_defaults(self):
        from maggy.process.model_router import DEFAULT_TIERS

        names = [t.name for t in DEFAULT_TIERS]
        assert "gpt" not in names

    def test_codex_is_primary(self):
        from
maggy.process.model_router import DEFAULT_TIERS

        # Exactly one tier is named "codex" and its role is "primary".
        codex = [t for t in DEFAULT_TIERS if t.name == "codex"]
        assert len(codex) == 1
        assert codex[0].role == "primary"

    def test_codex_handles_complex(self):
        from maggy.process.model_router import DEFAULT_TIERS

        # Indexing [0] raises IndexError if no "codex" tier exists, which
        # fails the test just as an assertion would.
        codex = [t for t in DEFAULT_TIERS if t.name == "codex"][0]
        assert codex.complexity_max >= 8

    def test_local_kimi_handle_simple(self):
        from maggy.process.model_router import DEFAULT_TIERS

        local = [t for t in DEFAULT_TIERS if t.name == "local"][0]
        kimi = [t for t in DEFAULT_TIERS if t.name == "kimi"][0]
        assert local.complexity_max <= 5
        assert kimi.complexity_max <= 5


================================================
FILE: maggy/tests/test_routing_rules.py
================================================
"""Tests for routing rules — load, save, apply, learn."""

from __future__ import annotations

from pathlib import Path

import pytest

from maggy.routing_rules import (
    ModelOverride,
    PerformanceRecord,
    RoutingRules,
    apply_override,
    learn_override,
    record_outcome,
)
from maggy.routing_rules_defaults import default_rules
from maggy.routing_rules_io import load, save


# Per-test location for the rules file; the file itself is not created here.
@pytest.fixture()
def rules_path(tmp_path: Path) -> Path:
    return tmp_path / "routing-rules.yaml"


# Contents seeded by default_rules().
class TestDefaultRules:
    def test_seeds_task_type_overrides(self):
        rules = default_rules()
        assert "docs" in rules.task_type_overrides
        assert "security" in rules.task_type_overrides
        assert "tests" in rules.task_type_overrides

    def test_seeds_pipeline_phases(self):
        rules = default_rules()
        assert "spec" in rules.pipeline_phases
        assert "tdd_red" in rules.pipeline_phases
        # The tdd_green phase defaults to the "auto" model.
        assert rules.pipeline_phases["tdd_green"].model == "auto"

    def test_seeds_model_performance(self):
        rules = default_rules()
        assert "claude" in rules.model_performance
        assert "local" in rules.model_performance


# load()/save() behaviour against a rules file on disk.
class TestLoadSave:
    def test_load_creates_default(self, rules_path: Path):
        # Loading a missing file both returns defaults and writes the file.
        rules = load(rules_path)
        assert rules_path.exists()
        assert "docs" in rules.task_type_overrides

    def test_roundtrip(self, rules_path:
Path):
        original = default_rules()
        save(original, rules_path)
        loaded = load(rules_path)
        assert loaded.version == original.version
        # Only the override keys are compared, not the override values.
        assert set(loaded.task_type_overrides) == set(
            original.task_type_overrides,
        )

    def test_load_existing(self, rules_path: Path):
        save(default_rules(), rules_path)
        rules = load(rules_path)
        assert rules.task_type_overrides["security"].model == "claude"


# apply_override(rules, task_type[, phase]) returns a model name or None.
class TestApplyOverride:
    def test_phase_takes_priority(self):
        # "feature" has no task-type override (see test_no_override_returns_none),
        # so "claude" here comes from the "spec" phase.
        rules = default_rules()
        result = apply_override(rules, "feature", "spec")
        assert result == "claude"

    def test_auto_phase_returns_none(self):
        # tdd_green is seeded with model "auto", which yields no override.
        rules = default_rules()
        result = apply_override(rules, "feature", "tdd_green")
        assert result is None

    def test_task_type_override(self):
        rules = default_rules()
        result = apply_override(rules, "security")
        assert result == "claude"

    def test_no_override_returns_none(self):
        rules = default_rules()
        result = apply_override(rules, "feature")
        assert result is None

    def test_low_confidence_ignored(self):
        # The third positional argument (0.3) is presumably the confidence —
        # TODO confirm against ModelOverride; such an override is not applied.
        rules = RoutingRules(
            task_type_overrides={
                "test": ModelOverride("kimi", "weak", 0.3),
            },
        )
        result = apply_override(rules, "test")
        assert result is None


# record_outcome(rules, model, task_type, success, path) updates performance.
class TestRecordOutcome:
    def test_updates_success_rate(self, rules_path: Path):
        rules = default_rules()
        record_outcome(rules, "claude", "feature", True, rules_path)
        perf = rules.model_performance["claude"]
        # NOTE(review): 7 presumes the seeded default for "claude" is 6
        # completed tasks — verify against default_rules().
        assert perf.tasks_completed == 7
        assert perf.success_rate > 0.9

    def test_creates_new_model(self, rules_path: Path):
        # A model absent from the defaults gets a fresh record on first outcome.
        rules = default_rules()
        record_outcome(rules, "gemini", "feature", True, rules_path)
        assert "gemini" in rules.model_performance
        assert rules.model_performance["gemini"].success_rate == 1.0

    def test_records_failure(self, rules_path: Path):
        rules = RoutingRules(
            model_performance={
                "test": PerformanceRecord(
                    tasks_completed=1,
                    success_rate=1.0,
                ),
            },
        )
        record_outcome(rules, "test", "security", False, rules_path)
        # One prior success plus this failure: the rate drops to 0.5 and the
        # failed task type is listed as a weakness.
        assert rules.model_performance["test"].success_rate == 0.5
        assert "security" in rules.model_performance["test"].weaknesses


class
TestLearnOverride: def test_adds_new_override(self, rules_path: Path): rules = default_rules() learn_override( rules, "frontend", "claude", "Codex too slow for frontend (280s vs 122s)", 0.8, rules_path, ) assert rules.task_type_overrides["frontend"].model == "claude" assert rules.task_type_overrides["frontend"].source == "learned" def test_persists_to_disk(self, rules_path: Path): rules = default_rules() save(rules, rules_path) learn_override( rules, "frontend", "claude", "test", 0.9, rules_path, ) reloaded = load(rules_path) assert "frontend" in reloaded.task_type_overrides ================================================ FILE: maggy/tests/test_routing_service.py ================================================ """Tests for RoutingService — routing decisions and learning.""" from __future__ import annotations from maggy.routing import RoutingContext, RoutingService from maggy.scores import MIN_SAMPLES class TestRoutingDecisions: def test_low_complexity_routes_cheap(self, mock_cfg): rs = RoutingService(mock_cfg) ctx = RoutingContext(blast_score=1, task_type="general") decision = rs.route(ctx) name = ( decision.primary if isinstance(decision.primary, str) else decision.primary.name ) assert name in ("kimi", "local", "deepseek") def test_high_complexity_routes_premium(self, mock_cfg): rs = RoutingService(mock_cfg) ctx = RoutingContext(blast_score=9, task_type="general") decision = rs.route(ctx) name = ( decision.primary if isinstance(decision.primary, str) else decision.primary.name ) assert name in ("codex", "claude") def test_security_sensitive_avoids_cheap(self, mock_cfg): rs = RoutingService(mock_cfg) ctx = RoutingContext( blast_score=3, task_type="security", security_sensitive=True, ) decision = rs.route(ctx) name = ( decision.primary if isinstance(decision.primary, str) else decision.primary.name ) assert name in ("codex", "claude") class TestRoutingLearning: def test_record_outcome(self, mock_cfg): rs = RoutingService(mock_cfg) rs.record_outcome("claude", 
"bug", 8, 0.95) hm = rs.get_heatmap() assert len(hm) == 1 def test_learned_override(self, mock_cfg): rs = RoutingService(mock_cfg) # Seed enough data for learning for _ in range(MIN_SAMPLES + 1): rs.record_outcome("codex", "bug", 2, 0.99) ctx = RoutingContext(blast_score=2, task_type="bug") decision = rs.route(ctx) name = ( decision.primary if isinstance(decision.primary, str) else decision.primary.name ) assert name == "codex" def test_blast_tier_mapping(self, mock_cfg): rs = RoutingService(mock_cfg) assert rs._blast_tier(0) == "low" assert rs._blast_tier(3) == "low" assert rs._blast_tier(5) == "medium" assert rs._blast_tier(8) == "high" ================================================ FILE: maggy/tests/test_scores.py ================================================ """Tests for RewardTable — record, query, best_model, heatmap.""" from __future__ import annotations from maggy.scores import MIN_SAMPLES, RewardTable class TestRewardRecord: def test_record_and_heatmap(self, mock_cfg): rt = RewardTable(mock_cfg) rt.record("claude", "bug", "high", 0.9) hm = rt.heatmap() assert len(hm) == 1 assert hm[0]["model"] == "claude" def test_multiple_records(self, mock_cfg): rt = RewardTable(mock_cfg) rt.record("claude", "bug", "high", 0.9) rt.record("gpt", "bug", "high", 0.7) hm = rt.heatmap() assert len(hm) == 2 class TestBestModel: def test_no_data_returns_none(self, mock_cfg): rt = RewardTable(mock_cfg) assert rt.best_model("bug", "high") is None def test_insufficient_samples_returns_none(self, mock_cfg): rt = RewardTable(mock_cfg) for _ in range(MIN_SAMPLES - 1): rt.record("claude", "bug", "high", 0.9) assert rt.best_model("bug", "high") is None def test_sufficient_samples_returns_best(self, mock_cfg): rt = RewardTable(mock_cfg) for _ in range(MIN_SAMPLES): rt.record("claude", "bug", "high", 0.9) for _ in range(MIN_SAMPLES): rt.record("gpt", "bug", "high", 0.5) best = rt.best_model("bug", "high") assert best == "claude" class TestHeatmap: def test_empty_heatmap(self, 
mock_cfg):
        rt = RewardTable(mock_cfg)
        assert rt.heatmap() == []

    def test_heatmap_groups_correctly(self, mock_cfg):
        # Same model, different task type and tier: two separate rows.
        rt = RewardTable(mock_cfg)
        rt.record("claude", "bug", "high", 0.9)
        rt.record("claude", "feature", "low", 0.8)
        hm = rt.heatmap()
        assert len(hm) == 2


================================================
FILE: maggy/tests/test_setup_routes.py
================================================
"""Tests for setup and onboarding routes."""

from __future__ import annotations

from pathlib import Path
from unittest.mock import patch

import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient

from maggy.api.routes_setup import router as setup_router
from maggy.config import (
    DashboardConfig,
    MaggyConfig,
    StorageConfig,
)


@pytest.fixture
def setup_app(tmp_path: Path) -> FastAPI:
    """App with setup router only."""
    cfg = MaggyConfig(
        storage=StorageConfig(path=str(tmp_path / "s.db")),
        dashboard=DashboardConfig(auth_mode="local"),
    )
    app = FastAPI()
    app.state.cfg = cfg
    # NOTE(review): app.state.configured is set True here, yet
    # test_configured_false_in_local expects the status endpoint to report
    # False — presumably the endpoint derives the flag itself; confirm.
    app.state.configured = True
    app.state.mode = "local"
    app.include_router(setup_router)
    return app


# HTTP client bound to the setup-only app above.
@pytest.fixture
def client(setup_app: FastAPI) -> TestClient:
    return TestClient(setup_app)


# GET /api/setup/status response shape.
class TestSetupStatus:
    def test_returns_steps(self, client: TestClient):
        resp = client.get("/api/setup/status")
        assert resp.status_code == 200
        data = resp.json()
        assert "steps" in data
        assert len(data["steps"]) == 5
        assert data["mode"] == "local"

    def test_missing_token_detected(
        self, client: TestClient,
    ):
        resp = client.get("/api/setup/status")
        data = resp.json()
        # The first step is expected to be the GitHub token check.
        # NOTE(review): this presumably relies on no GitHub token being
        # available in the test environment — confirm and isolate if needed.
        token_step = data["steps"][0]
        assert token_step["label"] == "GitHub token"
        assert token_step["status"] == "missing"

    def test_progress_format(self, client: TestClient):
        # Only checks that progress contains a "/" separator.
        resp = client.get("/api/setup/status")
        data = resp.json()
        assert "/" in data["progress"]

    def test_configured_false_in_local(
        self, client: TestClient,
    ):
        resp = client.get("/api/setup/status")
        assert resp.json()["configured"] is False


class TestSetupConfigure:
@patch("maggy.config.save")
    def test_updates_org(self, mock_save, client):
        # maggy.config.save is replaced by a mock for the duration of the test;
        # this is the only test here that asserts it was actually called.
        resp = client.post(
            "/api/setup/configure",
            json={"org_name": "Protaige"},
        )
        assert resp.status_code == 200
        assert resp.json()["saved"] is True
        mock_save.assert_called_once()

    @patch("maggy.config.save")
    def test_updates_github_repos(
        self, mock_save, client,
    ):
        resp = client.post(
            "/api/setup/configure",
            json={
                "github_org": "protaige",
                "github_repos": ["api", "web"],
            },
        )
        assert resp.json()["saved"] is True

    @patch("maggy.config.save")
    def test_empty_body_is_noop(
        self, mock_save, client,
    ):
        # An empty payload is still answered with saved == True.
        resp = client.post(
            "/api/setup/configure",
            json={},
        )
        assert resp.json()["saved"] is True


# GET /api/setup/discover-repos response shape.
class TestDiscoverRepos:
    def test_returns_repos(self, client: TestClient):
        # NOTE(review): nothing is patched here; if the endpoint queries GitHub
        # this test may depend on network access — verify.
        resp = client.get("/api/setup/discover-repos")
        assert resp.status_code == 200
        data = resp.json()
        assert "repos" in data
        assert isinstance(data["repos"], list)


================================================
FILE: maggy/tests/test_stakes.py
================================================
"""Tests for stakes classification — HIGH/MEDIUM/LOW from task metadata."""

from __future__ import annotations

from maggy.providers.base import Task
from maggy.routing_rules import StakesLevel, StakesPatterns
from maggy.services.stakes import classify_stakes


# Builds a Task with a fixed id; `raw` defaults to an empty dict when omitted.
def _task(title: str, desc: str = "", raw: dict | None = None) -> Task:
    return Task(id="T-1", title=title, description=desc, raw=raw or {})


# Inputs expected to classify as "high" with the default patterns.
class TestHighStakes:
    def test_auth_file_in_title(self):
        result = classify_stakes(_task("Fix auth.py login bug"))
        assert result.level == "high"

    def test_billing_task_type(self):
        # The task type is supplied through the raw payload's "task_type" key.
        task = _task("Update billing", raw={"task_type": "billing"})
        result = classify_stakes(task)
        assert result.level == "high"

    def test_security_task_type(self):
        task = _task("Patch XSS", raw={"task_type": "security"})
        result = classify_stakes(task)
        assert result.level == "high"

    def test_production_keyword_in_desc(self):
        # The trigger word appears only in the description, not the title.
        task = _task("Deploy fix", "Affects production data")
        result =
classify_stakes(task)
        assert result.level == "high"

    def test_env_file_pattern(self):
        result = classify_stakes(_task("Update .env variables"))
        assert result.level == "high"

    def test_migration_in_title(self):
        # "migration" lifts a database task to high; compare
        # test_database_schema_change below, which stays medium.
        result = classify_stakes(_task("Run database migration"))
        assert result.level == "high"


# Inputs expected to classify as "medium" with the default patterns.
class TestMediumStakes:
    def test_api_route_file(self):
        result = classify_stakes(_task("Fix API routes handler"))
        assert result.level == "medium"

    def test_feature_task_type(self):
        task = _task("Add pagination", raw={"task_type": "feature"})
        result = classify_stakes(task)
        assert result.level == "medium"

    def test_database_schema_change(self):
        result = classify_stakes(_task("Update database schema"))
        assert result.level == "medium"


# Inputs expected to classify as "low" with the default patterns.
class TestLowStakes:
    def test_readme_update(self):
        result = classify_stakes(_task("Update README typo"))
        assert result.level == "low"

    def test_docs_task_type(self):
        task = _task("Fix docs", raw={"task_type": "docs"})
        result = classify_stakes(task)
        assert result.level == "low"

    def test_formatting_task(self):
        task = _task("Fix lint", raw={"task_type": "formatting"})
        result = classify_stakes(task)
        assert result.level == "low"


# Shape of the classification result and use of caller-supplied patterns.
class TestStakesResult:
    def test_reasons_populated(self):
        # A match must come with at least one recorded reason.
        result = classify_stakes(_task("Fix auth.py login"))
        assert len(result.reasons) > 0

    def test_custom_patterns(self):
        """classify_stakes with explicit patterns overrides defaults."""
        patterns = StakesPatterns(
            high=StakesLevel(
                file_patterns=["critical"],
                task_types=["emergency"],
                keywords=["urgent"],
            ),
            medium=StakesLevel(),
            low=StakesLevel(),
        )
        # The title contains the custom pattern "critical".
        task = _task("Fix critical module", raw={})
        result = classify_stakes(task, patterns)
        assert result.level == "high"


================================================
FILE: maggy/tests/test_tdd_verifier.py
================================================
"""Tests for TDD verification gates."""

from __future__ import annotations

import pytest

from maggy.services.tdd_verifier import (
    _count_collected,
    _count_failures,
    _parse_coverage,
)
class TestParsers:
    """Parse pytest and coverage output."""

    # --- _count_collected ---

    def test_count_collected_normal(self):
        collected = _count_collected("12 tests collected")
        assert collected == 12

    def test_count_collected_singular(self):
        collected = _count_collected("1 test collected")
        assert collected == 1

    def test_count_collected_missing(self):
        collected = _count_collected("no tests ran")
        assert collected == 0

    # --- _count_failures ---

    def test_count_failures_normal(self):
        failed = _count_failures("3 failed, 7 passed")
        assert failed == 3

    def test_count_failures_none(self):
        failed = _count_failures("10 passed")
        assert failed == 0

    # --- _parse_coverage ---

    def test_parse_coverage_normal(self):
        report = "TOTAL 500 50 90%"
        assert _parse_coverage(report) == 90.0

    def test_parse_coverage_missing(self):
        report = "no coverage data"
        assert _parse_coverage(report) == 0.0
def test_analyze_missing_file():
    """A path that does not exist produces at least one error chunk."""
    chunk_types = [chunk["type"] for chunk in analyze_image("/no/such/file.png")]
    assert "error" in chunk_types
def test_analyze_streams_response(png_file: Path):
    """Streamed message lines become text chunks, followed by a done chunk."""
    payloads = [
        {"message": {"content": "A "}},
        {"message": {"content": "button"}},
        {"done": True},
    ]

    # Stand-in for the context manager returned by httpx.stream.
    fake_resp = MagicMock()
    fake_resp.status_code = 200
    fake_resp.iter_lines.return_value = iter([json.dumps(p) for p in payloads])
    fake_resp.__enter__ = lambda s: s
    fake_resp.__exit__ = MagicMock(return_value=False)

    with patch("maggy.services.vision.httpx.stream", return_value=fake_resp):
        chunks = list(analyze_image(str(png_file)))

    text_parts = [c["content"] for c in chunks if c["type"] == "text"]
    seen_types = [c["type"] for c in chunks]
    assert "A " in text_parts
    assert "button" in text_parts
    assert "done" in seen_types
def test_analyze_ollama_down(png_file: Path):
    """A connection failure surfaces as an error chunk naming the problem."""
    import httpx

    refused = httpx.ConnectError("refused")
    with patch("maggy.services.vision.httpx.stream", side_effect=refused):
        chunks = list(analyze_image(str(png_file)))

    errors = [c for c in chunks if c["type"] == "error"]
    assert errors

    # Only the first error chunk is inspected for the failure wording.
    message = errors[0]["content"].lower()
    assert "refused" in message or "connect" in message
class TestAutoConfigure:
    """auto_configure run against a temporary home with shutil.which patched to return None."""

    def test_builds_config(self, tmp_path: Path):
        """An empty home still yields a MaggyConfig instance."""
        from maggy.config import auto_configure

        with patch("shutil.which", return_value=None):
            result = auto_configure(home=tmp_path, persist=False)

        assert isinstance(result, MaggyConfig)

    def test_populates_codebases(self, tmp_path: Path):
        """A directory containing .git under <home>/dev is listed as a codebase."""
        from maggy.config import auto_configure

        # Creates <home>/dev/webapp/.git in one call.
        (tmp_path / "dev" / "webapp" / ".git").mkdir(parents=True)

        with patch("shutil.which", return_value=None):
            result = auto_configure(home=tmp_path, persist=False)

        assert len(result.codebases) == 1
        assert result.codebases[0].key == "webapp"

    def test_persist_writes_file(self, tmp_path: Path):
        """persist=True leaves a file at the patched CONFIG_PATH."""
        from maggy.config import auto_configure

        target = tmp_path / "config.yaml"
        with patch("shutil.which", return_value=None):
            with patch("maggy.config.CONFIG_DIR", tmp_path):
                with patch("maggy.config.CONFIG_PATH", target):
                    auto_configure(home=tmp_path, persist=True)

        assert target.exists()
return_value=True): result = is_configured() assert result is True ================================================ FILE: rules/nodejs-backend.md ================================================ --- description: Node.js backend conventions paths: ["src/api/**", "src/routes/**", "src/server/**", "src/middleware/**", "server/**", "api/**"] --- ## Node.js Backend Conventions - Use Express or Fastify with typed route handlers - Repository pattern for data access - Validate request bodies with Zod at the route level - Use proper HTTP status codes (201 for creation, 404 for missing, etc.) - Add rate limiting to auth endpoints - Use structured logging (pino/winston) - Handle async errors with middleware, not try/catch in every route ================================================ FILE: rules/python.md ================================================ --- description: Python-specific conventions paths: ["**/*.py"] --- ## Python Conventions - Use type hints on all function signatures - Use Pydantic for data validation and serialization - Use pytest for testing (not unittest) - Use ruff for linting and formatting - Use mypy for type checking - Prefer dataclasses or Pydantic models over plain dicts - Use pathlib over os.path ================================================ FILE: rules/quality-gates.md ================================================ --- description: Code quality constraints enforced on all files --- ## Quality Gates | Constraint | Limit | |------------|-------| | Lines per function | 20 max | | Parameters per function | 3 max | | Nesting depth | 2 levels max | | Lines per file | 200 max | | Functions per file | 10 max | | Test coverage | 80% minimum | Before completing any file: count lines, count functions, check parameter counts. If limits exceeded, split or decompose immediately. 
================================================ FILE: rules/react.md ================================================ --- description: React-specific conventions paths: ["src/components/**", "src/pages/**", "src/app/**", "**/*.tsx", "**/*.jsx"] --- ## React Conventions - Prefer functional components with hooks - Use React Query / TanStack Query for server state - Use Zustand or context for client state - Colocate component tests (ComponentName.test.tsx) - Extract custom hooks when logic is reused across components - Avoid prop drilling beyond 2 levels - use context or composition ================================================ FILE: rules/security.md ================================================ --- description: Security rules enforced on all code --- ## Security Rules - No secrets in code - use environment variables - No secrets in client-exposed env vars (VITE_*, NEXT_PUBLIC_*, REACT_APP_*) - `.env` files always in `.gitignore` - Parameterized queries only - no string concatenation for SQL - Hash passwords with bcrypt (12+ rounds) or argon2 - Validate all input at API boundaries (Zod/Pydantic) - `.env.example` with all required vars (no values) ================================================ FILE: rules/tdd-workflow.md ================================================ --- description: TDD workflow enforced for all implementation tasks --- ## TDD Workflow Every feature and bug fix follows RED-GREEN-VALIDATE: 1. **RED** - Write tests based on acceptance criteria. Run them. All must FAIL. 2. **GREEN** - Write minimum code to pass tests. Run them. All must PASS. 3. **VALIDATE** - Run linter, type checker, full test suite with coverage >= 80%. Tests must fail first to prove they validate the requirement. No code ships without a test that failed first. For bugs: identify test gap, write failing test that reproduces bug, then fix. 
#######################################
# Emit one [[hooks]] TOML block for every hook configured under an event.
# Arguments:
#   $1 - path to the settings.json file
#   $2 - event name (key under .hooks)
# Outputs:
#   TOML blocks on stdout, produced by extract_hook. Prints nothing when
#   the event is absent from the file or has no entries.
# Returns:
#   0 when the event is absent, empty, or jq cannot read the file.
#######################################
convert_event() {
  local input="$1"
  local event="$2"

  local entries
  entries=$(jq -c ".hooks.${event}[]?" "$input" 2>/dev/null) || return 0

  # An event with no entries yields an empty string. Feeding that into a
  # `read` loop would still iterate once on a blank line and emit a bogus
  # [[hooks]] block with an empty command and timeout, so stop here.
  [ -n "$entries" ] || return 0

  local entry matcher hooks_array hook cmd timeout
  while IFS= read -r entry; do
    [ -n "$entry" ] || continue

    matcher=$(jq -r '.matcher // ""' <<<"$entry")
    hooks_array=$(jq -c '.hooks[]' <<<"$entry")
    [ -n "$hooks_array" ] || continue

    while IFS= read -r hook; do
      [ -n "$hook" ] || continue
      cmd=$(jq -r '.command' <<<"$hook")
      timeout=$(jq -r '.timeout // 30' <<<"$hook")
      extract_hook "$event" "$matcher" "$cmd" "$timeout"
    done <<<"$hooks_array"
  done <<<"$entries"
}
# Print the one-line description used in a skill's YAML frontmatter.
# Arguments: $1 - skill name (the flat file's basename without .md)
# Outputs:   the description on stdout; unknown names get "Skill for <name>"
get_description() {
  local name="$1"
  local summary

  case "$name" in
    aeo-optimization)      summary="AI Engine Optimization - semantic triples, page templates, content clusters for AI citations" ;;
    agentic-development)   summary="Build AI agents with Pydantic AI (Python) and Claude SDK (Node.js)" ;;
    ai-models)             summary="Latest AI models reference - Claude, OpenAI, Gemini, Eleven Labs, Replicate" ;;
    base)                  summary="Universal coding patterns, constraints, TDD workflow, atomic todos" ;;
    code-deduplication)    summary="Prevent semantic code duplication with capability index and check-before-write" ;;
    code-review)           summary="Mandatory code reviews via /code-review before commits and deploys" ;;
    commit-hygiene)        summary="Atomic commits, PR size limits, commit thresholds, stacked PRs" ;;
    credentials)           summary="Centralized API key management from Access.txt" ;;
    database-schema)       summary="Schema awareness - read before coding, type generation, prevent column errors" ;;
    iterative-development) summary="Ralph Wiggum loops - self-referential TDD iteration until tests pass" ;;
    klaviyo)               summary="Klaviyo email/SMS marketing - profiles, events, flows, segmentation" ;;
    llm-patterns)          summary="AI-first application patterns, LLM testing, prompt management" ;;
    medusa)                summary="Medusa headless commerce - modules, workflows, API routes, admin UI" ;;
    ms-teams-apps)         summary="Microsoft Teams bots and AI agents - Claude/OpenAI, Adaptive Cards, Graph API" ;;
    nodejs-backend)        summary="Node.js backend patterns with Express/Fastify, repositories" ;;
    playwright-testing)    summary="E2E testing with Playwright - Page Objects, cross-browser, CI/CD" ;;
    posthog-analytics)     summary="PostHog analytics, event tracking, feature flags, dashboards" ;;
    project-tooling)       summary="gh, vercel, supabase, render CLI and deployment platform setup" ;;
    pwa-development)       summary="Progressive Web Apps - service workers, caching strategies, offline, Workbox" ;;
    python)                summary="Python development with ruff, mypy, pytest - TDD and type safety" ;;
    react-native)          summary="React Native mobile patterns, platform-specific code" ;;
    react-web)             summary="React web development with hooks, React Query, Zustand" ;;
    reddit-ads)            summary="Reddit Ads API - campaigns, targeting, conversions, agentic optimization" ;;
    reddit-api)            summary="Reddit API with PRAW (Python) and Snoowrap (Node.js)" ;;
    security)              summary="OWASP security patterns, secrets management, security testing" ;;
    session-management)    summary="Context preservation, tiered summarization, resumability" ;;
    shopify-apps)          summary="Shopify app development - Remix, Admin API, checkout extensions" ;;
    site-architecture)     summary="Technical SEO - robots.txt, sitemap, meta tags, Core Web Vitals" ;;
    supabase)              summary="Core Supabase CLI, migrations, RLS, Edge Functions" ;;
    supabase-nextjs)       summary="Next.js with Supabase and Drizzle ORM" ;;
    supabase-node)         summary="Express/Hono with Supabase and Drizzle ORM" ;;
    supabase-python)       summary="FastAPI with Supabase and SQLAlchemy/SQLModel" ;;
    team-coordination)     summary="Multi-person projects - shared state, todo claiming, handoffs" ;;
    typescript)            summary="TypeScript strict mode with eslint and jest" ;;
    ui-mobile)             summary="Mobile UI patterns - React Native, iOS/Android, touch targets" ;;
    ui-testing)            summary="Visual testing - catch invisible buttons, broken layouts, contrast" ;;
    ui-web)                summary="Web UI - glassmorphism, Tailwind, dark mode, accessibility" ;;
    user-journeys)         summary="User experience flows - journey mapping, UX validation, error recovery" ;;
    web-content)           summary="SEO and AI discovery (GEO) - schema, ChatGPT/Perplexity optimization" ;;
    web-payments)          summary="Stripe Checkout, subscriptions, webhooks, customer portal" ;;
    woocommerce)           summary="WooCommerce REST API - products, orders, customers, webhooks" ;;
    *)                     summary="Skill for $name" ;;
  esac

  printf '%s\n' "$summary"
}
# Print a tool's name when it looks installed.
# Arguments: $1 - name to print, $2 - binary to look up, $3 - config directory
# Outputs:   the name on stdout if the binary is on PATH or the directory exists
detect_tool() {
  local name="$1" binary="$2" config_dir="$3"

  # The binary lookup is tried first; the config directory is the fallback.
  if command -v "$binary" >/dev/null 2>&1 || [ -d "$config_dir" ]; then
    echo "$name"
  fi
}
Tracks WHY code exists by linking tasks/goals to code symbols with typed edges for traceability, blast radius, and drift detection. """ __version__ = '0.1.0' ================================================ FILE: scripts/icpg/__main__.py ================================================ """CLI entry point for iCPG — Intent-Augmented Code Property Graph.""" from __future__ import annotations import argparse import json import subprocess import sys from pathlib import Path from . import __version__ from .bootstrap import bootstrap_from_git from .contracts import format_contracts, infer_contracts from .drift import check_all_drift, check_file_drift from .models import Edge, ReasonNode, _now, _uuid from .store import ICPGStore from .symbols import extract_symbols, extract_symbols_from_files from .vectors import VectorStore def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( prog='icpg', description='iCPG — Intent-Augmented Code Property Graph' ) parser.add_argument( '--version', action='version', version=f'icpg {__version__}' ) parser.add_argument( '--project', default='.', help='Project directory (default: .)' ) sub = parser.add_subparsers(dest='command') # --- init --- sub.add_parser('init', help='Initialize .icpg/ directory and database') # --- create --- p_create = sub.add_parser('create', help='Create a ReasonNode') p_create.add_argument('goal', help='Stated purpose (one sentence)') p_create.add_argument( '--scope', nargs='+', default=[], help='File paths in scope' ) p_create.add_argument('--owner', default='user', help='Owner name') p_create.add_argument('--agent', help='Agent identity') p_create.add_argument( '--type', dest='decision_type', default='task', choices=[ 'business_goal', 'arch_decision', 'task', 'workaround', 'constraint', 'patch' ] ) p_create.add_argument('--task-id', help='External task tracker ID') p_create.add_argument('--parent', help='Parent ReasonNode ID') p_create.add_argument( '--infer-contracts', 
action='store_true', help='Use LLM to infer contracts' ) # --- record --- p_record = sub.add_parser( 'record', help='Record symbols from git diff to a ReasonNode' ) p_record.add_argument('--reason', required=True, help='ReasonNode ID') p_record.add_argument( '--base', default='main', help='Base branch for diff' ) p_record.add_argument( '--edge-type', default='CREATES', choices=['CREATES', 'MODIFIES'], help='Edge type (default: CREATES)' ) # --- query --- p_query = sub.add_parser('query', help='Query the reason graph') q_sub = p_query.add_subparsers(dest='query_type') q_ctx = q_sub.add_parser( 'context', help='Get ReasonNodes for symbols in a file' ) q_ctx.add_argument('file', help='File path') q_blast = q_sub.add_parser( 'blast', help='Blast radius for a ReasonNode' ) q_blast.add_argument('reason_id', help='ReasonNode ID') q_const = q_sub.add_parser( 'constraints', help='Get invariants/contracts for file' ) q_const.add_argument('file', help='File path') q_risk = q_sub.add_parser( 'risk', help='Risk profile for a symbol' ) q_risk.add_argument('symbol', help='Symbol name') q_prior = q_sub.add_parser( 'prior', help='Search for duplicate/prior intents' ) q_prior.add_argument('goal', help='Goal text to search') q_prior.add_argument( '--threshold', type=float, default=0.75, help='Similarity threshold (0-1, default: 0.75)' ) # --- drift --- p_drift = sub.add_parser('drift', help='Drift detection') d_sub = p_drift.add_subparsers(dest='drift_action') d_sub.add_parser('check', help='Run full drift scan') d_file = d_sub.add_parser('file', help='Check drift for a single file (fast)') d_file.add_argument('file_path', help='File path to check') d_resolve = d_sub.add_parser('resolve', help='Resolve a drift event') d_resolve.add_argument('event_id', help='Drift event ID') # --- bootstrap --- p_boot = sub.add_parser( 'bootstrap', help='Infer ReasonNodes from git history' ) p_boot.add_argument( '--days', type=int, default=90, help='Days of history (default: 90)' ) 
def cmd_record(store: ICPGStore, args) -> int:
    """Link symbols from files changed against ``args.base`` to a ReasonNode.

    Runs ``git diff --name-only <base>`` in the project directory, extracts
    symbols from each changed file that exists on disk, upserts every symbol
    and creates an edge of type ``args.edge_type`` from the ReasonNode to it.
    The ReasonNode's status is then set to ``'executing'``.

    Args:
        store: Graph store for the project.
        args: Parsed CLI namespace; reads ``reason``, ``base`` and
            ``edge_type``.

    Returns:
        0 on success (including when no files changed); 1 when the store
        does not exist, the ReasonNode is unknown, or git could not produce
        a diff.
    """
    if not store.exists():
        print('Error: No .icpg/ directory. Run `icpg init` first.',
              file=sys.stderr)
        return 1

    reason = store.get_reason(args.reason)
    if not reason:
        print(f'Error: ReasonNode {args.reason} not found.', file=sys.stderr)
        return 1

    # Get changed files from git diff
    try:
        result = subprocess.run(
            ['git', 'diff', '--name-only', args.base],
            capture_output=True, text=True, timeout=10,
            cwd=str(store.project_dir)
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        print('Error: git diff failed.', file=sys.stderr)
        return 1

    # A non-zero exit (e.g. unknown base branch, not a git repository) leaves
    # stdout empty; without this check it would be reported as "no changes".
    if result.returncode != 0:
        detail = (result.stderr or '').strip()
        message = 'Error: git diff failed.'
        if detail:
            message = f'{message} {detail}'
        print(message, file=sys.stderr)
        return 1

    files = [
        line.strip() for line in result.stdout.splitlines() if line.strip()
    ]
    if not files:
        print('No changed files found.')
        return 0

    count = 0
    for fp in files:
        full_path = store.project_dir / fp
        if not full_path.exists():
            # Deleted or renamed-away files have no symbols to record.
            continue

        for sym in extract_symbols(str(full_path)):
            store.upsert_symbol(sym)
            edge = Edge(
                from_id=reason.id,
                to_id=sym.id,
                edge_type=args.edge_type,
                confidence=1.0
            )
            store.create_edge(edge)
            count += 1

    # Update reason status
    store.update_reason_status(reason.id, 'executing')

    print(f'Recorded {count} symbols → ReasonNode {args.reason}')
    print(f' Files: {len(files)}')
    print(f' Edge type: {args.edge_type}')
    return 0
_resolve_path(store, file_path) reasons = store.get_reasons_for_file(resolved) if not reasons: return 0 print(f'INTENTS for {file_path}:') for r in reasons: status_icon = { 'proposed': '?', 'executing': '>', 'fulfilled': '+', 'drifted': '!', 'rejected': 'x', 'abandoned': '-' }.get(r.status, ' ') print(f' [{status_icon}] {r.id[:8]} — {r.goal}') print(f' Owner: {r.owner} | Status: {r.status}') if r.invariants: print(f' Invariants: {len(r.invariants)}') return 0 def _query_blast(store: ICPGStore, reason_id: str) -> int: blast = store.get_blast_radius(reason_id) reason = blast.get('reason') if not reason: print(f'ReasonNode {reason_id} not found.', file=sys.stderr) return 1 print(f'BLAST RADIUS for {reason.goal}:') print(f' Symbols: {blast["symbol_count"]}') for sym in blast['symbols']: print(f' {sym.symbol_type} {sym.name} ({sym.file_path})') print(f' Dependent intents: {blast["dependent_count"]}') for dep in blast['dependent_reasons']: print(f' {dep.id[:8]} — {dep.goal}') if reason.invariants: print(f' Invariants:') for inv in reason.invariants: print(f' - {inv}') return 0 def _query_constraints(store: ICPGStore, file_path: str) -> int: resolved = _resolve_path(store, file_path) constraints = store.get_constraints_for_scope([resolved]) if not constraints: return 0 print(f'CONSTRAINTS for {file_path}:') for c in constraints: print(f' From intent: {c["goal"][:60]}') for inv in c['invariants']: print(f' INV: {inv}') for post in c['postconditions']: print(f' POST: {post}') for pre in c['preconditions']: print(f' PRE: {pre}') return 0 def _query_risk(store: ICPGStore, symbol_name: str) -> int: profile = store.get_risk_profile(symbol_name) if not profile.get('found'): return 0 sym = profile['symbol'] print(f'RISK PROFILE for {symbol_name}:') print(f' File: {sym.file_path}') print(f' Type: {sym.symbol_type}') print(f' Owners: {", ".join(profile["owners"])}') print(f' Modifications: {profile["modify_count"]}') print(f' Active drift: {"YES" if profile["active_drift"] else 
def _query_prior(store: ICPGStore, args) -> int:
    """Print stored intents whose goal is similar to ``args.goal``.

    Searches the project's vector store using ``args.threshold`` and prints
    each hit that the graph store can still resolve. Always returns 0.
    """
    matches = VectorStore(args.project).search_similar(
        args.goal, threshold=args.threshold
    )
    if not matches:
        print('No similar prior intents found.')
        return 0

    print(f'SIMILAR INTENTS (threshold: {args.threshold}):')
    for reason_id, similarity in matches:
        hit = store.get_reason(reason_id)
        if not hit:
            # The vector store returned an id the graph store cannot resolve.
            continue
        print(f' [{similarity:.2f}] {hit.id[:8]} — {hit.goal}')
        print(f' Status: {hit.status} | Owner: {hit.owner}')
    return 0
dims = ', '.join( f'{d}({s:.2f})' for d, s in zip(e.drift_dimensions, _drift_scores(e)) ) print(f' [{e.severity:.2f}] {name} — {dims}') return 0 elif args.drift_action == 'resolve': store.resolve_drift(args.event_id) print(f'Resolved drift event {args.event_id}') return 0 else: print('Specify: drift check, drift file , or drift resolve ') return 1 def _drift_scores(event) -> list[float]: """Extract per-dimension scores from drift event description.""" import re scores = [] for match in re.finditer(r'\w+\((\d+\.\d+)\)', event.description): scores.append(float(match.group(1))) if not scores: scores = [event.severity] * len(event.drift_dimensions) return scores def cmd_bootstrap(store: ICPGStore, args) -> int: if not store.exists(): store.init_db() print(f'Bootstrapping iCPG from last {args.days} days of git history...') stats = bootstrap_from_git( store, days=args.days, use_llm=not args.no_llm, verbose=args.verbose ) print(f'\nBootstrap complete:') print(f' Commit clusters: {stats["clusters"]}') print(f' ReasonNodes created: {stats["reasons_created"]}') print(f' Symbols linked: {stats["symbols_linked"]}') if stats.get('skipped'): print(f' Skipped (duplicates): {stats["skipped"]}') return 0 def cmd_status(store: ICPGStore) -> int: if not store.exists(): print('No iCPG database found. 
def bootstrap_from_git(
    store: ICPGStore,
    days: int = 90,
    use_llm: bool = True,
    verbose: bool = False
) -> dict:
    """Infer ReasonNodes from git commit history.

    Commits from the last ``days`` days are clustered, each cluster is turned
    into a ReasonNode (via LLM when ``use_llm`` is true, otherwise from the
    commit messages), and every symbol found in the cluster's changed files
    is linked to that node with a CREATES edge.

    Args:
        store: Reason graph to populate; its ``project_dir`` is the git repo.
        days: How far back in history to look.
        use_llm: Use LLM inference for goals and contracts.
        verbose: Print progress to stdout.

    Returns:
        Stats dict with the keys ``clusters``, ``reasons_created``,
        ``symbols_linked`` and ``skipped``. All four keys are always present,
        including when no commits were found.
    """
    stats = {
        'clusters': 0,
        'reasons_created': 0,
        'symbols_linked': 0,
        'skipped': 0
    }

    vectors = VectorStore(str(store.project_dir))
    since = (
        datetime.now(timezone.utc) - timedelta(days=days)
    ).strftime('%Y-%m-%d')

    # Step 1: Get commits
    commits = _get_commits(store.project_dir, since)
    if verbose:
        print(f'Found {len(commits)} commits in last {days} days')
    if not commits:
        return stats

    # Step 2: Cluster commits
    clusters = _cluster_commits(commits)
    stats['clusters'] = len(clusters)
    if verbose:
        print(f'Clustered into {len(clusters)} groups')

    for cluster in clusters:
        # Step 3: Extract info from cluster
        combined_message = '\n'.join(c['message'] for c in cluster)
        # Sorted so that the prompt, the stored scope (truncated to 20
        # entries downstream) and the linking order are reproducible.
        files_changed = sorted(
            {path for c in cluster for path in c.get('files', [])}
        )

        # Step 4: Check for duplicates
        similar = vectors.search_similar(combined_message, threshold=0.8)
        if similar:
            stats['skipped'] += 1
            if verbose:
                print(f' Skipping cluster (duplicate of {similar[0][0]})')
            continue

        # Step 5: Infer ReasonNode
        if use_llm:
            reason = _infer_via_llm(combined_message, files_changed)
        else:
            reason = _infer_from_messages(combined_message, files_changed)

        # A node without a goal carries no intent; treat it as a failed
        # inference rather than persisting it.
        if not reason or not reason.goal:
            stats['skipped'] += 1
            continue

        # Step 6: Create reason and index
        store.create_reason(reason)
        vectors.add_reason(reason.id, reason.goal, reason.scope)
        stats['reasons_created'] += 1
        if verbose:
            print(f' Created: {reason.goal[:60]}...')

        # Step 7: Link symbols
        for fp in files_changed:
            full_path = store.project_dir / fp
            if not full_path.exists():
                # File was deleted or renamed since the commit.
                continue
            for sym in extract_symbols(str(full_path)):
                store.upsert_symbol(sym)
                store.create_edge(Edge(
                    from_id=reason.id,
                    to_id=sym.id,
                    edge_type='CREATES',
                    confidence=0.6
                ))
                stats['symbols_linked'] += 1

        # Step 8: Infer contracts (if LLM available)
        if use_llm and not reason.postconditions:
            contracts = infer_contracts(
                reason, project_dir=str(store.project_dir)
            )
            if any(contracts.values()):
                reason.preconditions = contracts['preconditions']
                reason.postconditions = contracts['postconditions']
                reason.invariants = contracts['invariants']
                # Update in DB. `with conn:` only commits or rolls back, so
                # the connection is closed explicitly afterwards.
                conn = store._conn()
                try:
                    with conn:
                        conn.execute(
                            """UPDATE reasons
                               SET preconditions = ?,
                                   postconditions = ?,
                                   invariants = ?
                               WHERE id = ?""",
                            (
                                json.dumps(reason.preconditions),
                                json.dumps(reason.postconditions),
                                json.dumps(reason.invariants),
                                reason.id
                            )
                        )
                finally:
                    conn.close()

    return stats
def _infer_via_llm(
    messages: str,
    files: list[str]
) -> ReasonNode | None:
    """Use an LLM to infer a ReasonNode from commit messages.

    Backends are tried in order: the ``claude`` CLI, then the OpenAI API,
    then the non-LLM heuristic ``_infer_from_messages``. A backend that is
    unavailable, fails, or answers with something that cannot be parsed
    into a node with a goal is skipped and the next one is tried.

    Args:
        messages: Commit messages of one cluster, newline-joined.
        files: Paths changed by the cluster; at most 20 are put in the prompt.

    Returns:
        The inferred node, or None when even the heuristic yields nothing.
    """
    scope_str = ', '.join(files[:20])
    prompt = f"""Given these git commit messages, infer the intent/goal.

COMMITS:
{messages[:2000]}

FILES CHANGED: {scope_str}

Return ONLY a JSON object:
{{
  "goal": "one-sentence description of what this change was trying to achieve",
  "decision_type": "task|business_goal|arch_decision|workaround|constraint|patch",
  "scope": ["file1", "file2"]
}}"""

    # Try Claude CLI
    try:
        result = subprocess.run(
            ['claude', '--print', '-p', prompt],
            capture_output=True, text=True, timeout=30
        )
        if result.returncode == 0:
            reason = _parse_reason_response(result.stdout, files)
            # Only stop here on a usable answer; an unparsable reply must
            # not prevent the remaining backends from being tried.
            if reason and reason.goal:
                return reason
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass

    # Try OpenAI
    try:
        import openai
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[{'role': 'user', 'content': prompt}],
            temperature=0.2
        )
        content = response.choices[0].message.content or ''
        reason = _parse_reason_response(content, files)
        if reason and reason.goal:
            return reason
    except Exception:
        # Best effort: a missing package, missing credentials or an API
        # error all mean "backend unavailable".
        pass

    # Fallback
    return _infer_from_messages(messages, files)
first_line) return ReasonNode( id=_uuid(), goal=goal or first_line, decision_type=dtype, scope=files[:20], owner='git-history', source='inferred', status='fulfilled', created_at=_now() ) def _parse_reason_response( response: str, fallback_files: list[str] ) -> ReasonNode | None: """Parse LLM response into a ReasonNode.""" try: start = response.find('{') end = response.rfind('}') + 1 if start >= 0 and end > start: data = json.loads(response[start:end]) return ReasonNode( id=_uuid(), goal=data.get('goal', ''), decision_type=data.get('decision_type', 'task'), scope=data.get('scope', fallback_files[:20]), owner='git-history', source='inferred', status='fulfilled', created_at=_now() ) except (json.JSONDecodeError, KeyError): pass return None ================================================ FILE: scripts/icpg/contracts.py ================================================ """Design by Contract layer for ReasonNodes. Handles inference, evaluation, and formatting of preconditions, postconditions, and invariants. """ from __future__ import annotations import json import os import subprocess from pathlib import Path from .models import ReasonNode def infer_contracts( reason: ReasonNode, code_context: str = '', project_dir: str = '.' ) -> dict[str, list[str]]: """Use LLM to infer contracts from stated purpose + code context. Returns dict with 'preconditions', 'postconditions', 'invariants'. Falls back to heuristic extraction if no LLM available. 
""" # Try Claude CLI first api_key = os.environ.get('ANTHROPIC_API_KEY') if api_key: return _infer_via_claude(reason, code_context) # Try OpenAI openai_key = os.environ.get('OPENAI_API_KEY') if openai_key: return _infer_via_openai(reason, code_context) # Fallback: heuristic extraction return _infer_heuristic(reason, project_dir) def _infer_via_claude( reason: ReasonNode, code_context: str ) -> dict[str, list[str]]: """Call Claude API to infer contracts.""" prompt = _build_inference_prompt(reason, code_context) try: result = subprocess.run( ['claude', '--print', '-p', prompt], capture_output=True, text=True, timeout=30 ) if result.returncode == 0: return _parse_contract_response(result.stdout) except (FileNotFoundError, subprocess.TimeoutExpired): pass return _empty_contracts() def _infer_via_openai( reason: ReasonNode, code_context: str ) -> dict[str, list[str]]: """Call OpenAI API to infer contracts.""" try: import openai client = openai.OpenAI() prompt = _build_inference_prompt(reason, code_context) response = client.chat.completions.create( model='gpt-4o-mini', messages=[{'role': 'user', 'content': prompt}], temperature=0.2 ) return _parse_contract_response( response.choices[0].message.content or '' ) except Exception: return _empty_contracts() def _infer_heuristic( reason: ReasonNode, project_dir: str ) -> dict[str, list[str]]: """Basic heuristic contract extraction — no LLM needed.""" pre = [] post = [] inv = [] # Scope-based invariants for scope_path in reason.scope: inv.append(f'file_exists("{scope_path}")') # If goal mentions "test" or "validation" goal_lower = reason.goal.lower() if 'test' in goal_lower: for sp in reason.scope: if 'test' not in sp: test_path = _guess_test_path(sp) if test_path: post.append(f'test_exists("{test_path}")') return { 'preconditions': pre, 'postconditions': post, 'invariants': inv } def _build_inference_prompt( reason: ReasonNode, code_context: str ) -> str: scope_str = ', '.join(reason.scope) if reason.scope else 'unspecified' 
return f"""Given this intent for a code change, infer formal contracts. INTENT: Goal: {reason.goal} Decision type: {reason.decision_type} Scope: {scope_str} {f'CODE CONTEXT:{chr(10)}{code_context[:2000]}' if code_context else ''} Return ONLY a JSON object with three arrays: {{ "preconditions": ["predicate1", "predicate2"], "postconditions": ["predicate1", "predicate2"], "invariants": ["predicate1", "predicate2"] }} Predicate format examples: file_exists("src/auth/middleware.ts") test_exists("src/auth/__tests__/middleware.test.ts") symbol_count("src/auth/") <= 15 function_signature("validateToken") == "(token: string) => Promise" Rules: - Preconditions: what must exist before this change - Postconditions: what must be true after this change is complete - Invariants: what must NOT change during or after this change - Be specific. Use file paths from the scope. - 2-5 predicates per category max.""" def _parse_contract_response(response: str) -> dict[str, list[str]]: """Parse LLM response into contract dict.""" # Try to extract JSON try: # Find JSON block start = response.find('{') end = response.rfind('}') + 1 if start >= 0 and end > start: data = json.loads(response[start:end]) return { 'preconditions': data.get('preconditions', []), 'postconditions': data.get('postconditions', []), 'invariants': data.get('invariants', []) } except (json.JSONDecodeError, KeyError): pass return _empty_contracts() def _empty_contracts() -> dict[str, list[str]]: return {'preconditions': [], 'postconditions': [], 'invariants': []} def _guess_test_path(source_path: str) -> str | None: """Guess test file path from source path.""" p = Path(source_path) stem = p.stem suffix = p.suffix # Python: test_foo.py if suffix == '.py': test_dir = p.parent / 'tests' return str(test_dir / f'test_{stem}.py') # TS/JS: foo.test.ts if suffix in ('.ts', '.tsx', '.js', '.jsx'): return str(p.parent / f'{stem}.test{suffix}') return None def format_contracts(reason: ReasonNode) -> str: """Format contracts for 
def check_all_drift(store: ICPGStore) -> list[DriftEvent]:
    """Full drift scan across all tracked symbols.

    Walks the CREATES edges of every reason that is neither rejected nor
    abandoned and runs ``check_symbol_drift`` on each target symbol. A symbol
    reachable through several CREATES edges is checked once only, so one
    scan reports at most one event per symbol.
    """
    events = []
    checked = set()
    for reason in store.list_reasons():
        if reason.status in ('rejected', 'abandoned'):
            continue
        for edge in store.get_edges_from(reason.id, 'CREATES'):
            target_id = edge.to_id
            if target_id in checked:
                continue
            checked.add(target_id)
            # check_symbol_drift returns None for ids that no longer
            # resolve to a symbol, so no separate lookup is needed here.
            event = check_symbol_drift(store, target_id)
            if event:
                events.append(event)
    return events
def _check_spec_drift(store, sym, reason) -> float | None:
    """Score spec drift for one stored symbol.

    The symbol's file is re-extracted and the first symbol with the same
    name is compared against the stored checksum.

    Returns:
        0.8 when no symbol of that name is found any more, 0.6 when the
        checksum differs and no MODIFIES edge points at the symbol,
        otherwise None.
    """
    # Re-extract and locate the live counterpart of the stored symbol.
    for candidate in extract_symbols(sym.file_path):
        if candidate.name == sym.name:
            live = candidate
            break
    else:
        live = None

    if not live:
        return 0.8  # Symbol removed entirely

    if live.checksum == sym.checksum:
        return None

    # A MODIFIES edge explains the change.
    if store.get_edges_to(sym.id, 'MODIFIES'):
        return None
    return 0.6  # Changed without explanation
| None: """ReasonNode postconditions no longer hold.""" if not reason.postconditions: return None failed = 0 for predicate in reason.postconditions: if not evaluate_predicate(predicate, store.project_dir): failed += 1 if failed > 0: return min(1.0, failed / len(reason.postconditions)) return None def _check_ownership_drift(store, sym) -> float | None: """Symbol touched by >3 different owners.""" edges = store.get_edges_to(sym.id) owners = set() for edge in edges: reason = store.get_reason(edge.from_id) if reason: owners.add(reason.owner) if len(owners) > 3: return min(1.0, (len(owners) - 3) / 5) return None def _check_test_drift(store, reason) -> float | None: """VALIDATED_BY tests no longer exist or fail.""" test_edges = store.get_edges_from(reason.id, 'VALIDATED_BY') if not test_edges: # No tests linked — mild concern return 0.3 missing = 0 for edge in test_edges: test_sym = store._get_symbol(edge.to_id) if not test_sym or not Path(test_sym.file_path).exists(): missing += 1 if missing > 0: return min(1.0, missing / len(test_edges)) return None def _check_usage_drift(store, sym, reason) -> float | None: """Symbol imported from scopes outside original ReasonNode scope.""" if not reason.scope: return None # Use grep to find imports/usages of the symbol try: result = subprocess.run( ['grep', '-rl', sym.name, '.'], capture_output=True, text=True, timeout=5, cwd=str(store.project_dir) ) except (subprocess.TimeoutExpired, FileNotFoundError): return None if result.returncode != 0: return None usage_files = [ f.strip().lstrip('./') for f in result.stdout.strip().split('\n') if f.strip() ] out_of_scope = 0 for uf in usage_files: if not any(uf.startswith(s.rstrip('/')) for s in reason.scope): out_of_scope += 1 if out_of_scope > 2: return min(1.0, out_of_scope / 10) return None def _check_dependency_drift(store, reason) -> float | None: """Downstream REQUIRES reasons have drifted or changed status.""" req_edges = store.get_edges_to(reason.id, 'REQUIRES') if not req_edges: 
return None drifted = 0 for edge in req_edges: dep_reason = store.get_reason(edge.from_id) if dep_reason and dep_reason.status == 'drifted': drifted += 1 if drifted > 0: return min(1.0, drifted / len(req_edges)) return None def evaluate_predicate(predicate: str, project_dir: Path) -> bool: """Evaluate a single structured predicate against codebase state. Supported predicates: file_exists("path") test_exists("path") symbol_count("dir/") <= N function_signature("name") == "sig" """ predicate = predicate.strip() # file_exists("path") m = _match_predicate(predicate, 'file_exists') if m: return (project_dir / m).exists() # test_exists("path") m = _match_predicate(predicate, 'test_exists') if m: return (project_dir / m).exists() # symbol_count("dir/") <= N import re sc = re.match( r'symbol_count\("([^"]+)"\)\s*(<=|>=|==|<|>)\s*(\d+)', predicate ) if sc: dir_path, op, threshold = sc.group(1), sc.group(2), int(sc.group(3)) count = _count_symbols_in_dir(project_dir / dir_path) return _compare(count, op, threshold) # Unrecognized predicate — pass (don't block on unknown) return True def _match_predicate(predicate: str, func_name: str) -> str | None: import re m = re.match(rf'{func_name}\("([^"]+)"\)', predicate) return m.group(1) if m else None def _count_symbols_in_dir(dir_path: Path) -> int: if not dir_path.is_dir(): return 0 count = 0 for f in dir_path.rglob('*'): if f.is_file(): count += len(extract_symbols(str(f))) return count def _compare(value: int, op: str, threshold: int) -> bool: ops = { '<=': value <= threshold, '>=': value >= threshold, '==': value == threshold, '<': value < threshold, '>': value > threshold, } return ops.get(op, True) ================================================ FILE: scripts/icpg/models.py ================================================ """Data models for iCPG — ReasonNode, Symbol, Edge, DriftEvent.""" from __future__ import annotations import hashlib import uuid from dataclasses import dataclass, field from datetime import datetime, 
def symbol_id(file_path: str, name: str, symbol_type: str) -> str:
    """Deterministic ID for a symbol.

    The ID is the first 16 hex characters of the SHA-256 digest of
    ``file_path:name:symbol_type``.
    """
    key = '{}:{}:{}'.format(file_path, name, symbol_type)
    digest = hashlib.sha256(key.encode())
    return digest.hexdigest()[:16]
@dataclass
class DriftEvent:
    """Auto-generated when behavior diverges from intent.

    One event describes the drift found for a single symbol. The first three
    fields are required; the rest have defaults.
    """
    # ID of the Symbol the drift was detected on.
    symbol_id: str
    # ID of the ReasonNode the symbol is measured against.
    from_reason_id: str
    # Human-readable summary of the drift.
    description: str
    # Unique event ID; a fresh UUID unless given.
    id: str = field(default_factory=_uuid)
    # Names of the dimensions that drifted (see DRIFT_DIMENSIONS).
    drift_dimensions: list[str] = field(default_factory=list)
    # Overall severity score of the event.
    severity: float = 0.5
    # False until the event has been marked as resolved.
    resolved: bool = False
    # Detection timestamp; defaults to the current UTC time in ISO format.
    detected_at: str = field(default_factory=_now)
def init_db(self) -> None:
    """Create .icpg/ directory and initialize schema.

    The directory gets a ``.gitignore`` that excludes everything in it,
    unless one already exists. The schema statements all use
    ``IF NOT EXISTS``, so calling this on an existing database is harmless.
    """
    self.icpg_dir.mkdir(parents=True, exist_ok=True)

    gitignore = self.icpg_dir / '.gitignore'
    if not gitignore.exists():
        gitignore.write_text('*\n')

    # `with conn:` only scopes the transaction; the connection itself has
    # to be closed explicitly.
    conn = self._conn()
    try:
        with conn:
            conn.executescript(SCHEMA)
    finally:
        conn.close()
def update_reason_status(
    self,
    reason_id: str,
    status: str,
    fulfilled_at: str | None = None
) -> None:
    """Set the status of a ReasonNode.

    ``fulfilled_at`` is always written along with the status: leaving it at
    None stores NULL and thereby clears any earlier value. An unknown
    ``reason_id`` updates nothing and is not reported.
    """
    # `with conn:` commits or rolls back but does not close the connection.
    conn = self._conn()
    try:
        with conn:
            conn.execute(
                'UPDATE reasons SET status = ?, fulfilled_at = ? WHERE id = ?',
                (status, fulfilled_at, reason_id)
            )
    finally:
        conn.close()
def get_edges_from(
    self,
    node_id: str,
    edge_type: str | None = None
) -> list[Edge]:
    """Return the edges that start at ``node_id``.

    Args:
        node_id: ID stored in the edge's ``from_id`` column.
        edge_type: When given (and non-empty), only edges of this type are
            returned.
    """
    sql = 'SELECT * FROM edges WHERE from_id = ?'
    params = [node_id]
    if edge_type:
        sql += ' AND edge_type = ?'
        params.append(edge_type)

    # `with conn:` would not close the connection, so close it explicitly.
    # The fetched rows are fully materialized and stay usable afterwards.
    conn = self._conn()
    try:
        rows = conn.execute(sql, params).fetchall()
    finally:
        conn.close()
    return [self._row_to_edge(row) for row in rows]
def resolve_drift(self, event_id: str) -> None:
    """Mark a drift event as resolved.

    An unknown ``event_id`` updates nothing and is not reported.
    """
    # `with conn:` commits or rolls back but does not close the connection.
    conn = self._conn()
    try:
        with conn:
            conn.execute(
                'UPDATE drift_events SET resolved = 1 WHERE id = ?',
                (event_id,)
            )
    finally:
        conn.close()
def get_constraints_for_scope(
    self, file_paths: list[str]
) -> list[dict[str, Any]]:
    """Collect the contracts attached to the given files.

    For each path, every reason returned by ``get_reasons_for_file`` that
    has at least one non-empty precondition, postcondition or invariant
    list contributes one entry.

    Returns:
        Dicts with keys ``reason_id``, ``goal``, ``file``,
        ``preconditions``, ``postconditions`` and ``invariants``.
    """
    constraints = []
    for path in file_paths:
        for reason in self.get_reasons_for_file(path):
            if not (
                reason.invariants
                or reason.postconditions
                or reason.preconditions
            ):
                continue
            constraints.append({
                'reason_id': reason.id,
                'goal': reason.goal,
                'file': path,
                'preconditions': reason.preconditions,
                'postconditions': reason.postconditions,
                'invariants': reason.invariants,
            })
    return constraints


def get_blast_radius(self, reason_id: str) -> dict[str, Any]:
    """Symbols touched by a reason plus the reasons that REQUIRE it.

    Returns:
        Dict with ``reason`` (result of ``get_reason``), ``symbols``
        (targets of CREATES edges, then MODIFIES edges, skipping ids that
        no longer resolve), ``dependent_reasons`` (sources of incoming
        REQUIRES edges that still resolve), and the two list lengths as
        ``symbol_count`` / ``dependent_count``.
    """
    symbols = []
    for edge_type in ('CREATES', 'MODIFIES'):
        for edge in self.get_edges_from(reason_id, edge_type):
            symbol = self._get_symbol(edge.to_id)
            if symbol:
                symbols.append(symbol)

    dependent_reasons = []
    for edge in self.get_edges_to(reason_id, 'REQUIRES'):
        dependent = self.get_reason(edge.from_id)
        if dependent:
            dependent_reasons.append(dependent)

    return {
        'reason': self.get_reason(reason_id),
        'symbols': symbols,
        'dependent_reasons': dependent_reasons,
        'symbol_count': len(symbols),
        'dependent_count': len(dependent_reasons),
    }


def get_risk_profile(self, symbol_name: str) -> dict[str, Any]:
    """Owners, modification count and drift history for a symbol.

    Only the first symbol returned by ``get_symbol_by_name`` is profiled.

    Returns:
        ``{'found': False, 'symbol': symbol_name}`` when no symbol has that
        name; otherwise a dict with ``found``, ``symbol``, ``owners``
        (distinct owners of the reasons behind incoming CREATES/MODIFIES
        edges, in no particular order), ``modify_count``, ``drift_events``
        (newest first) and ``active_drift`` (True if any event is
        unresolved).
    """
    matches = self.get_symbol_by_name(symbol_name)
    if not matches:
        return {'found': False, 'symbol': symbol_name}

    sym = matches[0]
    creating_edges = self.get_edges_to(sym.id, 'CREATES')
    modifying_edges = self.get_edges_to(sym.id, 'MODIFIES')

    owners = set()
    for edge in creating_edges + modifying_edges:
        reason = self.get_reason(edge.from_id)
        if reason:
            owners.add(reason.owner)

    with self._conn() as conn:
        drift_rows = conn.execute(
            'SELECT * FROM drift_events WHERE symbol_id = ? '
            'ORDER BY detected_at DESC',
            (sym.id,),
        ).fetchall()

    # Convert each row once and reuse the objects for both result fields.
    drift_events = [self._row_to_drift(row) for row in drift_rows]
    return {
        'found': True,
        'symbol': sym,
        'owners': list(owners),
        'modify_count': len(modifying_edges),
        'drift_events': drift_events,
        'active_drift': any(not event.resolved for event in drift_events),
    }


def get_stats(self) -> dict[str, int]:
    """Row counts for reasons, symbols and edges, plus unresolved drift events."""
    counters = (
        ('reasons', 'SELECT COUNT(*) FROM reasons'),
        ('symbols', 'SELECT COUNT(*) FROM symbols'),
        ('edges', 'SELECT COUNT(*) FROM edges'),
        (
            'unresolved_drift',
            'SELECT COUNT(*) FROM drift_events WHERE resolved = 0',
        ),
    )
    with self._conn() as conn:
        return {
            label: conn.execute(sql).fetchone()[0]
            for label, sql in counters
        }


# --- Helpers ---

def _get_symbol(self, symbol_id: str) -> Symbol | None:
    """Return the symbol with id ``symbol_id``, or ``None`` if no row matches."""
    with self._conn() as conn:
        row = conn.execute(
            'SELECT * FROM symbols WHERE id = ?', (symbol_id,)
        ).fetchone()
    if not row:
        return None
    return self._row_to_symbol(row)


@staticmethod
def _row_to_reason(row: sqlite3.Row) -> ReasonNode:
    """Build a ``ReasonNode`` from a ``reasons`` row.

    ``scope``, ``preconditions``, ``postconditions`` and ``invariants`` are
    decoded from their JSON text columns; all other columns are passed
    through unchanged.
    """
    decoded = {
        column: json.loads(row[column])
        for column in (
            'scope', 'preconditions', 'postconditions', 'invariants'
        )
    }
    return ReasonNode(
        id=row['id'],
        goal=row['goal'],
        decision_type=row['decision_type'],
        owner=row['owner'],
        agent=row['agent'],
        status=row['status'],
        source=row['source'],
        task_id=row['task_id'],
        parent_id=row['parent_id'],
        created_at=row['created_at'],
        fulfilled_at=row['fulfilled_at'],
        **decoded,
    )


@staticmethod
def _row_to_symbol(row: sqlite3.Row) -> Symbol:
    """Build a ``Symbol`` from a ``symbols`` row (columns map 1:1 to fields)."""
    columns = (
        'id', 'name', 'file_path', 'symbol_type',
        'language', 'signature', 'checksum', 'created_at',
    )
    return Symbol(**{column: row[column] for column in columns})
# --- Language detection ---

# Extensions grouped per language; flattened below into the extension -> language
# lookup table that the rest of the module uses.
_EXTENSIONS_BY_LANGUAGE = {
    'python': ('.py',),
    'typescript': ('.ts', '.tsx'),
    'javascript': ('.js', '.jsx'),
    'go': ('.go',),
    'rust': ('.rs',),
    'java': ('.java',),
    'ruby': ('.rb',),
    'php': ('.php',),
    'swift': ('.swift',),
    'kotlin': ('.kt',),
    'c': ('.c', '.h'),
    'cpp': ('.cpp', '.hpp'),
    'csharp': ('.cs',),
    'scala': ('.scala',),
    'lua': ('.lua',),
    'vue': ('.vue',),
    'svelte': ('.svelte',),
    'elixir': ('.ex', '.exs'),
}

LANG_MAP = {
    extension: language
    for language, extensions in _EXTENSIONS_BY_LANGUAGE.items()
    for extension in extensions
}


def detect_language(file_path: str) -> str | None:
    """Map a path to a language name by its lower-cased final suffix.

    Returns ``None`` when the suffix is not a key of ``LANG_MAP`` (including
    paths with no suffix at all).
    """
    suffix = Path(file_path).suffix.lower()
    if suffix in LANG_MAP:
        return LANG_MAP[suffix]
    return None


def checksum_content(content: str) -> str:
    """Return the first 16 hex characters of the SHA-256 of ``content``.

    The text is encoded with ``str.encode()`` defaults before hashing.
    """
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
'function' if any( isinstance(d, ast.Name) and d.id == 'staticmethod' for d in node.decorator_list ): stype = 'function' symbols.append(Symbol( name=node.name, file_path=file_path, symbol_type=stype, language='python', signature=sig, checksum=checksum_content(body) )) return symbols def _python_func_sig(node: ast.FunctionDef) -> str: args = [] for a in node.args.args: ann = '' if a.annotation: ann = f': {ast.dump(a.annotation)}' args.append(f'{a.arg}{ann}') ret = '' if node.returns: ret = f' -> {ast.dump(node.returns)}' prefix = 'async def' if isinstance(node, ast.AsyncFunctionDef) else 'def' return f'{prefix} {node.name}({", ".join(args)}){ret}' def _python_class_sig(node: ast.ClassDef) -> str: bases = [ast.dump(b) for b in node.bases] if bases: return f'class {node.name}({", ".join(bases)})' return f'class {node.name}' # --- TypeScript/JavaScript extraction (regex) --- _TS_PATTERNS = [ # export function name(...) (r'export\s+(?:async\s+)?function\s+(\w+)\s*\([^)]*\)', 'function'), # export class Name (r'export\s+(?:abstract\s+)?class\s+(\w+)', 'class'), # export const Name = ... (r'export\s+const\s+(\w+)\s*[=:]', 'constant'), # export interface Name (r'export\s+interface\s+(\w+)', 'interface'), # export type Name (r'export\s+type\s+(\w+)', 'type'), # React components: export const Name = (...) 
=> (r'export\s+const\s+((?:[A-Z]\w+))\s*=\s*(?:\([^)]*\)|[^=])\s*=>', 'component'), # Hooks: export function use* (r'export\s+(?:async\s+)?function\s+(use\w+)', 'hook'), ] def _extract_typescript(file_path: str, source: str) -> list[Symbol]: lang = 'typescript' if file_path.endswith(('.ts', '.tsx')) else 'javascript' symbols = [] seen = set() for pattern, stype in _TS_PATTERNS: for match in re.finditer(pattern, source): name = match.group(1) if name in seen: continue seen.add(name) # Get the line for signature line_start = source.rfind('\n', 0, match.start()) + 1 line_end = source.find('\n', match.end()) if line_end == -1: line_end = len(source) sig = source[line_start:line_end].strip() symbols.append(Symbol( name=name, file_path=file_path, symbol_type=stype, language=lang, signature=sig[:200], checksum=checksum_content(sig) )) return symbols # --- Go extraction (regex) --- _GO_PATTERNS = [ (r'func\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)\s*\(', 'function'), (r'type\s+(\w+)\s+struct\s*\{', 'class'), (r'type\s+(\w+)\s+interface\s*\{', 'interface'), ] def _extract_go(file_path: str, source: str) -> list[Symbol]: symbols = [] seen = set() for pattern, stype in _GO_PATTERNS: for match in re.finditer(pattern, source): name = match.group(1) if name in seen: continue seen.add(name) line_start = source.rfind('\n', 0, match.start()) + 1 line_end = source.find('\n', match.end()) if line_end == -1: line_end = len(source) sig = source[line_start:line_end].strip() symbols.append(Symbol( name=name, file_path=file_path, symbol_type=stype, language='go', signature=sig[:200], checksum=checksum_content(sig) )) return symbols # --- Rust extraction (regex) --- _RUST_PATTERNS = [ (r'(?:pub\s+)?(?:async\s+)?fn\s+(\w+)', 'function'), (r'(?:pub\s+)?struct\s+(\w+)', 'class'), (r'(?:pub\s+)?enum\s+(\w+)', 'type'), (r'(?:pub\s+)?trait\s+(\w+)', 'interface'), (r'impl\s+(\w+)', 'class'), ] def _extract_rust(file_path: str, source: str) -> list[Symbol]: symbols = [] seen = set() for pattern, stype in 
_RUST_PATTERNS: for match in re.finditer(pattern, source): name = match.group(1) if name in seen: continue seen.add(name) line_start = source.rfind('\n', 0, match.start()) + 1 line_end = source.find('\n', match.end()) if line_end == -1: line_end = len(source) sig = source[line_start:line_end].strip() symbols.append(Symbol( name=name, file_path=file_path, symbol_type=stype, language='rust', signature=sig[:200], checksum=checksum_content(sig) )) return symbols # --- Elixir extraction (regex) --- _ELIXIR_PATTERNS = [ (r'defmodule\s+([\w.]+)', 'module'), (r'def\s+(\w+)\s*\(', 'function'), (r'defp\s+(\w+)\s*\(', 'function'), (r'schema\s+"(\w+)"', 'schema'), ] def _extract_elixir(file_path: str, source: str) -> list[Symbol]: symbols = [] seen = set() for pattern, stype in _ELIXIR_PATTERNS: for match in re.finditer(pattern, source): name = match.group(1) if name in seen: continue seen.add(name) line_start = source.rfind('\n', 0, match.start()) + 1 line_end = source.find('\n', match.end()) if line_end == -1: line_end = len(source) sig = source[line_start:line_end].strip() symbols.append(Symbol( name=name, file_path=file_path, symbol_type=stype, language='elixir', signature=sig[:200], checksum=checksum_content(sig) )) return symbols # --- Public API --- EXTRACTORS = { 'python': _extract_python, 'typescript': _extract_typescript, 'javascript': _extract_typescript, 'go': _extract_go, 'rust': _extract_rust, 'elixir': _extract_elixir, } def extract_symbols(file_path: str) -> list[Symbol]: """Extract symbols from a source file.""" lang = detect_language(file_path) if not lang: return [] path = Path(file_path) if not path.exists(): return [] try: source = path.read_text(encoding='utf-8') except (OSError, UnicodeDecodeError): return [] extractor = EXTRACTORS.get(lang) if not extractor: return [] return extractor(str(file_path), source) def extract_symbols_from_files(file_paths: list[str]) -> list[Symbol]: """Extract symbols from multiple files.""" all_symbols = [] for fp in 
file_paths: all_symbols.extend(extract_symbols(fp)) return all_symbols ================================================ FILE: scripts/icpg/vectors.py ================================================ """Vector-based duplicate detection for search_prior_work query. Tiered fallback: 1. chromadb + sentence-transformers (best quality) 2. TF-IDF cosine similarity via scikit-learn (no GPU needed) 3. Exact substring matching (zero deps) """ from __future__ import annotations import json import os from pathlib import Path from .store import ICPGStore VECTORS_DIR = '.icpg' TFIDF_CACHE = '.icpg/tfidf_cache.json' class VectorStore: """Tiered vector search for ReasonNode deduplication.""" def __init__(self, project_dir: str = '.'): self.project_dir = Path(project_dir).resolve() self.icpg_dir = self.project_dir / VECTORS_DIR self._backend = _detect_backend() def add_reason(self, reason_id: str, goal: str, scope: list[str]) -> None: """Index a ReasonNode for similarity search.""" text = f'{goal} | scope: {", ".join(scope)}' if self._backend == 'chromadb': _chromadb_add(self.icpg_dir, reason_id, text) elif self._backend == 'tfidf': _tfidf_add(self.icpg_dir, reason_id, text) else: _exact_add(self.icpg_dir, reason_id, text) def search_similar( self, goal_text: str, threshold: float = 0.75, top_k: int = 5 ) -> list[tuple[str, float]]: """Find similar ReasonNodes. 
Returns [(id, score), ...].""" if self._backend == 'chromadb': return _chromadb_search( self.icpg_dir, goal_text, threshold, top_k ) elif self._backend == 'tfidf': return _tfidf_search( self.icpg_dir, goal_text, threshold, top_k ) else: return _exact_search(self.icpg_dir, goal_text, threshold) def remove_reason(self, reason_id: str) -> None: """Remove a ReasonNode from the vector index.""" if self._backend == 'chromadb': _chromadb_remove(self.icpg_dir, reason_id) elif self._backend == 'tfidf': _tfidf_remove(self.icpg_dir, reason_id) else: _exact_remove(self.icpg_dir, reason_id) def _detect_backend() -> str: """Detect best available vector search backend.""" try: import chromadb import sentence_transformers return 'chromadb' except ImportError: pass try: from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity return 'tfidf' except ImportError: pass return 'exact' # --- ChromaDB backend --- def _get_chroma_collection(icpg_dir: Path): import chromadb client = chromadb.PersistentClient(path=str(icpg_dir / 'chroma')) return client.get_or_create_collection( name='reasons', metadata={'hnsw:space': 'cosine'} ) def _chromadb_add(icpg_dir: Path, reason_id: str, text: str) -> None: col = _get_chroma_collection(icpg_dir) col.upsert(ids=[reason_id], documents=[text]) def _chromadb_search( icpg_dir: Path, query: str, threshold: float, top_k: int ) -> list[tuple[str, float]]: col = _get_chroma_collection(icpg_dir) if col.count() == 0: return [] results = col.query( query_texts=[query], n_results=min(top_k, col.count()) ) pairs = [] if results['ids'] and results['distances']: for rid, dist in zip(results['ids'][0], results['distances'][0]): # chromadb cosine distance: 0 = identical, 2 = opposite score = 1.0 - (dist / 2.0) if score >= threshold: pairs.append((rid, round(score, 3))) return pairs def _chromadb_remove(icpg_dir: Path, reason_id: str) -> None: col = _get_chroma_collection(icpg_dir) try: 
col.delete(ids=[reason_id]) except Exception: pass # --- TF-IDF backend --- def _tfidf_load(icpg_dir: Path) -> dict[str, str]: cache_path = icpg_dir / 'tfidf_cache.json' if cache_path.exists(): return json.loads(cache_path.read_text()) return {} def _tfidf_save(icpg_dir: Path, data: dict[str, str]) -> None: cache_path = icpg_dir / 'tfidf_cache.json' cache_path.write_text(json.dumps(data)) def _tfidf_add(icpg_dir: Path, reason_id: str, text: str) -> None: data = _tfidf_load(icpg_dir) data[reason_id] = text _tfidf_save(icpg_dir, data) def _tfidf_search( icpg_dir: Path, query: str, threshold: float, top_k: int ) -> list[tuple[str, float]]: from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity data = _tfidf_load(icpg_dir) if not data: return [] ids = list(data.keys()) texts = list(data.values()) texts.append(query) vectorizer = TfidfVectorizer() tfidf_matrix = vectorizer.fit_transform(texts) query_vec = tfidf_matrix[-1] doc_vecs = tfidf_matrix[:-1] scores = cosine_similarity(query_vec, doc_vecs).flatten() pairs = [ (ids[i], round(float(scores[i]), 3)) for i in range(len(ids)) if scores[i] >= threshold ] pairs.sort(key=lambda x: x[1], reverse=True) return pairs[:top_k] def _tfidf_remove(icpg_dir: Path, reason_id: str) -> None: data = _tfidf_load(icpg_dir) data.pop(reason_id, None) _tfidf_save(icpg_dir, data) # --- Exact match backend --- def _exact_load(icpg_dir: Path) -> dict[str, str]: cache_path = icpg_dir / 'exact_cache.json' if cache_path.exists(): return json.loads(cache_path.read_text()) return {} def _exact_save(icpg_dir: Path, data: dict[str, str]) -> None: cache_path = icpg_dir / 'exact_cache.json' cache_path.write_text(json.dumps(data)) def _exact_add(icpg_dir: Path, reason_id: str, text: str) -> None: data = _exact_load(icpg_dir) data[reason_id] = text.lower() _exact_save(icpg_dir, data) def _exact_search( icpg_dir: Path, query: str, threshold: float ) -> list[tuple[str, float]]: data = 
#!/bin/bash
# install-graph-tools.sh - Install code graph MCP servers
#
# Tier 1: codebase-memory-mcp (default, always installed)
#   - Single static binary, zero dependencies
#   - 64 languages, sub-ms queries, 14 MCP tools
#
# Tier 2: Joern CPG via CodeBadger (opt-in, --joern)
#   - Full CPG: AST + CFG + CDG + DDG + PDG
#   - Requires Docker + Python 3.10+
#
# Tier 3: CodeQL (opt-in, --codeql)
#   - Interprocedural taint analysis, security queries
#   - Requires CodeQL CLI

set -e

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Defaults
INSTALL_JOERN=false
INSTALL_CODEQL=false
INSTALL_DIR="$HOME/.local/bin"
CODEBADGER_DIR="$HOME/.claude/tools/codebadger"
TEMP_DIR=""

# Remove the download scratch directory on every exit path, including an
# abort triggered by `set -e`.
cleanup() {
  if [ -n "$TEMP_DIR" ]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
trap cleanup EXIT

# Succeeds when the named command can be found on PATH.
have() {
  command -v "$1" > /dev/null 2>&1
}

# Prints the path of an installed codebase-memory-mcp binary. INSTALL_DIR is
# checked as well as PATH so a fresh install is recognised even when
# INSTALL_DIR is not on PATH yet. Returns 1 when no binary is found.
find_tier1_bin() {
  if have codebase-memory-mcp; then
    command -v codebase-memory-mcp
  elif [ -x "$INSTALL_DIR/codebase-memory-mcp" ]; then
    printf '%s\n' "$INSTALL_DIR/codebase-memory-mcp"
  else
    return 1
  fi
}

print_tier1_manual_steps() {
  echo ""
  echo " Manual install:"
  echo " 1. Go to https://github.com/DeusData/codebase-memory-mcp/releases"
  echo " 2. Download codebase-memory-mcp-${OS}-${ARCH}.tar.gz"
  echo " 3. Extract and move to $INSTALL_DIR/"
  echo " 4. Run: codebase-memory-mcp install"
}

# Tier 2. Every "skip" path returns 0 so that `set -e` does not abort the
# remaining tiers or the summary.
install_joern() {
  local python_cmd=""
  local py_version=""

  echo ""
  echo "── Tier 2: Joern CPG (CodeBadger) ───────────────────────────"
  echo ""

  # Check Docker
  if ! have docker; then
    echo -e "${RED}✗ Docker not found${NC}"
    echo " Joern requires Docker. Install from: https://docker.com"
    echo " Skipping Tier 2 installation."
    return 0
  fi
  if ! docker info > /dev/null 2>&1; then
    echo -e "${RED}✗ Docker is not running${NC}"
    echo " Start Docker Desktop and try again."
    echo " Skipping Tier 2 installation."
    return 0
  fi
  echo -e "${GREEN}✓ Docker is running${NC}"

  # Check Python
  if have python3; then
    python_cmd="python3"
  elif have python; then
    python_cmd="python"
  fi
  if [ -n "$python_cmd" ]; then
    # %-formatting instead of an f-string so that an old interpreter reports
    # its version rather than dying on a syntax error.
    py_version=$("$python_cmd" -c 'import sys; print("%d.%d" % sys.version_info[:2])' 2> /dev/null) || py_version=""
  fi
  if [ -z "$python_cmd" ] \
    || ! "$python_cmd" -c 'import sys; sys.exit(0 if sys.version_info >= (3, 10) else 1)' 2> /dev/null; then
    echo -e "${RED}✗ Python 3.10+ not found${NC}"
    if [ -n "$py_version" ]; then
      echo " Found Python $py_version"
    fi
    echo " Install Python: https://python.org"
    echo " Skipping Tier 2 installation."
    return 0
  fi
  echo -e "${GREEN}✓ Python $py_version found${NC}"

  if [ -d "$CODEBADGER_DIR" ]; then
    echo -e "${GREEN}✓ CodeBadger already cloned${NC}"
    echo " Pulling latest..."
    # Best effort: an offline or diverged checkout is still usable.
    git -C "$CODEBADGER_DIR" pull 2> /dev/null || true
  else
    echo "Cloning CodeBadger..."
    mkdir -p "$HOME/.claude/tools"
    git clone https://github.com/lekssays/joern-mcp.git "$CODEBADGER_DIR" 2> /dev/null || {
      echo -e "${RED}✗ Failed to clone CodeBadger${NC}"
      echo " Manual install: https://github.com/lekssays/joern-mcp"
    }
  fi
  if [ ! -d "$CODEBADGER_DIR" ]; then
    return 0
  fi

  echo "Installing Python dependencies..."
  "$python_cmd" -m pip install -r "$CODEBADGER_DIR/requirements.txt" --quiet 2> /dev/null || {
    echo -e "${YELLOW}⚠ pip install failed. You may need to install dependencies manually:${NC}"
    echo " $python_cmd -m pip install -r $CODEBADGER_DIR/requirements.txt"
  }

  echo "Starting Joern Docker services..."
  (cd "$CODEBADGER_DIR" && docker compose up -d 2> /dev/null) || {
    echo -e "${YELLOW}⚠ Docker compose failed. You may need to start manually:${NC}"
    echo " cd $CODEBADGER_DIR && docker compose up -d"
  }

  echo -e "${GREEN}✓ Joern/CodeBadger installed${NC}"
  echo ""
  echo " To start the MCP server:"
  echo " cd $CODEBADGER_DIR && $python_cmd main.py"
  echo ""
  echo " MCP endpoint: http://localhost:4242/mcp"
}

# Tier 3. Installs through Homebrew when available; otherwise prints manual
# steps (this script does not download the CodeQL CLI itself).
install_codeql() {
  local lang
  local failed=0

  echo ""
  echo "── Tier 3: CodeQL ───────────────────────────────────────────"
  echo ""

  if have codeql; then
    echo -e "${GREEN}✓ CodeQL already installed${NC}"
    codeql version 2> /dev/null || true
  else
    if have brew; then
      echo "Installing CodeQL via Homebrew..."
      brew install codeql 2> /dev/null || {
        echo -e "${YELLOW}⚠ brew install codeql failed${NC}"
      }
    fi
    if ! have codeql; then
      echo "CodeQL CLI could not be installed automatically."
      echo ""
      echo " Manual install from:"
      echo " https://github.com/github/codeql-cli-binaries/releases"
      echo ""
      echo " After download:"
      echo " 1. Extract to $INSTALL_DIR/codeql/"
      echo " 2. Add to PATH: export PATH=\"$INSTALL_DIR/codeql:\$PATH\""
    fi
  fi

  if have codeql; then
    echo ""
    echo "Installing CodeQL query packs..."
    for lang in javascript python java go; do
      codeql pack download "codeql/${lang}-queries" 2> /dev/null || failed=$((failed + 1))
    done
    if [ "$failed" -eq 0 ]; then
      echo -e "${GREEN}✓ CodeQL query packs installed${NC}"
    else
      echo -e "${YELLOW}⚠ $failed of 4 CodeQL query packs failed to download${NC}"
    fi
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --joern)
      INSTALL_JOERN=true
      shift
      ;;
    --codeql)
      INSTALL_CODEQL=true
      shift
      ;;
    --all)
      INSTALL_JOERN=true
      INSTALL_CODEQL=true
      shift
      ;;
    --help | -h)
      echo "Usage: install-graph-tools.sh [OPTIONS]"
      echo ""
      echo "Install code graph MCP servers for Maggy."
      echo ""
      echo "Options:"
      echo " (no flags) Install Tier 1 only (codebase-memory-mcp)"
      echo " --joern Also install Tier 2 (Joern CPG via CodeBadger)"
      echo " --codeql Also install Tier 3 (CodeQL)"
      echo " --all Install all tiers"
      echo " --help Show this help"
      echo ""
      echo "Tiers:"
      echo " 1 codebase-memory-mcp AST graph, 64 langs, sub-ms (always)"
      echo " 2 Joern/CodeBadger Full CPG (AST+CFG+PDG), 12 langs (opt-in)"
      echo " 3 CodeQL Taint analysis, security, 10+ langs (opt-in)"
      exit 0
      ;;
    *)
      echo -e "${RED}Unknown option: $1${NC}"
      echo "Run with --help for usage."
      exit 1
      ;;
  esac
done

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Code Graph Tools Installer"
echo "════════════════════════════════════════════════════════════════"
echo ""

# Detect platform
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
ARCH=$(uname -m)
case "$ARCH" in
  aarch64 | arm64) ARCH="arm64" ;;
  x86_64 | amd64) ARCH="amd64" ;;
esac
echo -e "${BLUE}Platform: ${OS}-${ARCH}${NC}"
echo ""

# ─────────────────────────────────────────────────────────────────
# Tier 1: codebase-memory-mcp
# ─────────────────────────────────────────────────────────────────
echo "── Tier 1: codebase-memory-mcp ──────────────────────────────"
echo ""

mkdir -p "$INSTALL_DIR"

if TIER1_BIN=$(find_tier1_bin); then
  echo -e "${GREEN}✓ codebase-memory-mcp already installed${NC}"
  "$TIER1_BIN" --version 2> /dev/null || true
else
  DOWNLOAD_URL="https://github.com/DeusData/codebase-memory-mcp/releases/latest/download/codebase-memory-mcp-${OS}-${ARCH}.tar.gz"
  TEMP_DIR=$(mktemp -d)
  ARCHIVE="$TEMP_DIR/codebase-memory-mcp.tar.gz"

  echo "Downloading from GitHub releases..."
  echo " URL: $DOWNLOAD_URL"

  if ! curl -fsSL "$DOWNLOAD_URL" -o "$ARCHIVE"; then
    echo -e "${RED}✗ Failed to download codebase-memory-mcp${NC}"
    print_tier1_manual_steps
  elif ! tar xzf "$ARCHIVE" -C "$TEMP_DIR" \
    || ! mv "$TEMP_DIR/codebase-memory-mcp" "$INSTALL_DIR/codebase-memory-mcp" \
    || ! chmod +x "$INSTALL_DIR/codebase-memory-mcp"; then
    # Report the failure and continue instead of aborting silently.
    echo -e "${RED}✗ Failed to unpack codebase-memory-mcp${NC}"
    print_tier1_manual_steps
  else
    echo -e "${GREEN}✓ Installed codebase-memory-mcp to $INSTALL_DIR${NC}"

    # Auto-configure for Claude Code and other agents
    echo ""
    echo "Running auto-configuration..."
    "$INSTALL_DIR/codebase-memory-mcp" install 2> /dev/null || true
  fi

  rm -rf -- "$TEMP_DIR"
  TEMP_DIR=""
fi

# Check PATH. Compare whole, colon-delimited entries: a substring or regex
# match would accept e.g. "$INSTALL_DIR-old".
case ":$PATH:" in
  *":$INSTALL_DIR:"*) ;;
  *)
    echo ""
    echo -e "${YELLOW}⚠ $INSTALL_DIR is not in your PATH${NC}"
    echo " Add to your shell profile:"
    echo " export PATH=\"$INSTALL_DIR:\$PATH\""
    ;;
esac

# ─────────────────────────────────────────────────────────────────
# Tier 2: Joern CPG via CodeBadger (opt-in)
# ─────────────────────────────────────────────────────────────────
if [ "$INSTALL_JOERN" = true ]; then
  install_joern
fi

# ─────────────────────────────────────────────────────────────────
# Tier 3: CodeQL (opt-in)
# ─────────────────────────────────────────────────────────────────
if [ "$INSTALL_CODEQL" = true ]; then
  install_codeql
fi

# ─────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────
echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Installation Summary"
echo "════════════════════════════════════════════════════════════════"
echo ""

if find_tier1_bin > /dev/null; then
  echo -e " ${GREEN}✓ Tier 1: codebase-memory-mcp (AST graph, 64 langs)${NC}"
else
  echo -e " ${RED}✗ Tier 1: codebase-memory-mcp NOT installed${NC}"
fi

if [ "$INSTALL_JOERN" = true ]; then
  if [ -d "$CODEBADGER_DIR" ]; then
    echo -e " ${GREEN}✓ Tier 2: Joern CPG via CodeBadger${NC}"
  else
    echo -e " ${RED}✗ Tier 2: Joern NOT installed${NC}"
  fi
fi

if [ "$INSTALL_CODEQL" = true ]; then
  if have codeql; then
    echo -e " ${GREEN}✓ Tier 3: CodeQL${NC}"
  else
    echo -e " ${RED}✗ Tier 3: CodeQL NOT installed${NC}"
  fi
fi

echo ""
echo "Next steps:"
echo " 1. Run /initialize-project in your project"
echo " 2. The MCP servers will be auto-configured in .mcp.json"
echo " 3. Claude will use the graph for optimized code navigation"
echo ""
#######################################
# Copy every skill directory from a source tree into a target directory.
# A skill is an immediate sub-directory of the source that contains SKILL.md;
# other sub-directories are skipped.
# Arguments:
#   $1 - source directory holding one sub-directory per skill
#   $2 - target directory (created if missing)
# Outputs:
#   Number of skills copied, on stdout. Error messages on stderr.
# Returns:
#   0 on success; 1 if the target cannot be created or a copy fails.
#   Failures are checked explicitly because callers invoke this inside
#   $(...), where bash does not apply `set -e` by default.
#######################################
copy_skills() {
  local source="$1"
  local target="$2"
  local count=0
  local skill_dir
  local name

  mkdir -p -- "$target" || {
    echo "Error: cannot create target dir '$target'" >&2
    return 1
  }

  for skill_dir in "$source"/*/; do
    # An unmatched glob stays literal, so the -d test also covers "no skills".
    [ -d "$skill_dir" ] || continue
    [ -f "$skill_dir/SKILL.md" ] || continue
    name=$(basename "$skill_dir")
    cp -r -- "${skill_dir%/}" "$target/" || {
      echo "Error: failed to copy skill '$name' to '$target'" >&2
      return 1
    }
    count=$((count + 1))
  done

  echo "$count"
}
def main(argv: list[str] | None = None) -> int:
    """Parse the ``mnemos`` command line and run the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code of the chosen handler, or 1 (after printing help)
        when no sub-command was supplied.
    """
    cli = argparse.ArgumentParser(
        prog='mnemos',
        description='Mnemos -- Task-Scoped Memory Lifecycle',
    )
    cli.add_argument(
        '--version', action='version', version=f'mnemos {__version__}'
    )
    cli.add_argument(
        '--project', default='.', help='Project directory (default: .)'
    )

    # Registration order is kept as-is because it is the order shown in --help.
    commands = cli.add_subparsers(dest='command')

    commands.add_parser(
        'init', help='Initialize .mnemos/ directory and database'
    )
    commands.add_parser('status', help='Show Mnemos statistics and fatigue')
    commands.add_parser('fatigue', help='Show detailed fatigue breakdown')

    checkpoint_cmd = commands.add_parser(
        'checkpoint', help='Write a checkpoint'
    )
    checkpoint_cmd.add_argument(
        '--force', action='store_true', help='Write even if fatigue is low'
    )
    checkpoint_cmd.add_argument('--task-id', help='Task ID for checkpoint')

    resume_cmd = commands.add_parser(
        'resume', help='Output latest checkpoint for context injection'
    )
    resume_cmd.add_argument('--path', help='Specific checkpoint file path')

    consolidate_cmd = commands.add_parser(
        'consolidate', help='Run micro-consolidation pass'
    )
    consolidate_cmd.add_argument(
        '--scope', default='', help='Current scope tag'
    )

    nodes_cmd = commands.add_parser('nodes', help='List active MnemoNodes')
    nodes_cmd.add_argument('--type', dest='node_type', help='Filter by type')
    nodes_cmd.add_argument(
        '--all', action='store_true', help='Include non-active nodes'
    )

    add_cmd = commands.add_parser('add', help='Add a MnemoNode')
    add_cmd.add_argument(
        'type',
        choices=['goal', 'constraint', 'context', 'working', 'result'],
    )
    add_cmd.add_argument('content', help='Node content')
    add_cmd.add_argument('--task-id', default='manual', help='Task ID')
    add_cmd.add_argument('--scope', nargs='+', default=[], help='Scope tags')

    commands.add_parser(
        'bridge-icpg', help='Import iCPG ReasonNodes as MnemoNodes'
    )

    options = cli.parse_args(argv)
    # The store object is built for every invocation, including "no command".
    store = MnemosStore(options.project)

    # Handlers take different arguments, so each entry is a zero-arg thunk;
    # nothing runs until the matching one is called below.
    handlers = {
        'init': lambda: cmd_init(store),
        'status': lambda: cmd_status(store, options),
        'fatigue': lambda: cmd_fatigue(store, options),
        'checkpoint': lambda: cmd_checkpoint(store, options),
        'resume': lambda: cmd_resume(options),
        'consolidate': lambda: cmd_consolidate(store, options),
        'nodes': lambda: cmd_nodes(store, options),
        'add': lambda: cmd_add(store, options),
        'bridge-icpg': lambda: cmd_bridge_icpg(store, options),
    }

    run = handlers.get(options.command)
    if run is None:
        cli.print_help()
        return 1
    return run()
def cmd_fatigue(store: MnemosStore, args) -> int:
    """Print the detailed fatigue report for ``args.project``.

    Reads the fatigue file, computes the composite fatigue state, prints the
    per-dimension breakdown, signal-log statistics and a recommendation, and
    logs the result to the store when the store exists.

    Returns:
        Always 0, including when no fatigue data is available.
    """
    raw = read_fatigue_file(args.project)
    if not raw:
        print('No fatigue data. Statusline not configured or no API calls yet.')
        print('Configure mnemos-statusline.sh to start tracking.')
        return 0

    fatigue = compute_fatigue(raw, args.project)
    bar = _fatigue_bar(fatigue.composite_score)

    print('MNEMOS FATIGUE ANALYSIS')
    print(f' {bar}')
    print(f' Composite: {fatigue.composite_score:.4f} -> {fatigue.state.upper()}')
    print()

    print(' Dimensions (all passively observed from hooks):')
    # (label, value, displayed weight, signal source) for each dimension row.
    dimension_rows = (
        ('Token utilization', fatigue.token_utilization, '0.40', 'statusline'),
        ('Scope scatter', fatigue.scope_scatter, '0.25', 'PreToolUse file paths'),
        ('Re-read ratio', fatigue.reread_ratio, '0.20', 'PreToolUse Read calls'),
        ('Error density', fatigue.error_density, '0.15', 'PostToolUse outcomes'),
    )
    for label, value, weight, source in dimension_rows:
        print(f' {label}: {value:.4f} (weight: {weight}) [{source}]')
    print()

    # Signal-log statistics are shown only when at least one event was logged.
    session = get_session_stats(args.project)
    if session.get('total_signals', 0) > 0:
        print(f' Signal log: {session["total_signals"]} events')
        if session.get('tool_calls'):
            per_tool = ', '.join(
                f'{name}:{calls}' for name, calls in session['tool_calls'].items()
            )
            print(f' Tools: {per_tool}')
        print(f' Unique files read: {session.get("unique_files_read", 0)}')
        print(f' Re-reads: {session.get("rereads", 0)}')
        print(f' Errors: {session.get("errors", 0)}/{session.get("total_outcomes", 0)}')
        print()

    # Recommendation text per fatigue state; unknown states print nothing.
    advice = {
        'flow': (
            ' Status: Operating normally. No action needed.',
        ),
        'compress': (
            ' Status: Consider micro-consolidation.',
            ' Run: mnemos consolidate',
        ),
        'pre_sleep': (
            ' Status: Write checkpoint and consolidate.',
            ' Run: mnemos checkpoint && mnemos consolidate',
        ),
        'rem': (
            ' WARNING: High fatigue. Checkpoint immediately.',
            ' Run: mnemos checkpoint --force',
        ),
        'emergency': (
            ' EMERGENCY: Context nearly full. Checkpoint NOW.',
            ' Run: mnemos checkpoint --force',
        ),
    }
    for message in advice.get(fatigue.state, ()):
        print(message)

    # Persist the reading only when a database already exists.
    if store.exists():
        store.log_fatigue(fatigue)
    return 0
def cmd_nodes(store: MnemosStore, args) -> int:
    """List MnemoNodes, optionally filtered by type and/or including all statuses.

    Args:
        store: Store to query.
        args: Parsed CLI namespace; ``node_type`` (optional type filter) and
            ``all`` (include every status) are read with ``getattr`` so both
            may be absent.

    Returns:
        Always 0, including when there is no database or nothing matches.
    """
    if not store.exists():
        print('No Mnemos database.')
        return 0

    # One icon per lifecycle status. The keys double as the complete status
    # list for ``--all --type X`` so that listing can never fall behind the
    # statuses this command knows how to render.
    status_icons = {
        'active': '+',
        'compressed': '~',
        'evicted': '-',
        'promoted': '^',
        'handed_off': '>'
    }

    node_type = getattr(args, 'node_type', None)
    show_all = getattr(args, 'all', False)

    if node_type and show_all:
        # Previously only active/compressed/evicted were queried, which
        # silently hid promoted and handed_off nodes of the requested type.
        nodes = []
        for status in status_icons:
            nodes.extend(store.get_by_type(node_type, status=status))
    elif node_type:
        # No status given: relies on the store's default status filter.
        nodes = store.get_by_type(node_type)
    elif show_all:
        with store._conn() as conn:
            rows = conn.execute(
                'SELECT * FROM mnemo_nodes ORDER BY type, activation_weight DESC'
            ).fetchall()
            nodes = [store._row_to_node(r) for r in rows]
    else:
        nodes = store.get_active_nodes()

    if not nodes:
        print('No matching nodes.')
        return 0

    print(f'MNEMO NODES ({len(nodes)}):')
    for n in nodes:
        icon = status_icons.get(n.status, '?')
        weight = f'{n.activation_weight:.2f}'
        # Prefer the summary when the node has one; fall back to raw content.
        content = n.summary or n.content
        content_preview = content[:60] if content else '(empty)'
        print(f' [{icon}] {n.type:12s} w={weight} {content_preview}')
        if n.scope_tags:
            print(f' scope: {", ".join(n.scope_tags[:3])}')
    return 0
_try_load_icpg(args.project) if not icpg_store: print('No iCPG database found. Run `icpg init` first.') return 1 stats = store.load_from_icpg(icpg_store) print(f'iCPG Bridge complete:') print(f' GoalNodes imported: {stats["goals_imported"]}') print(f' ConstraintNodes imported: {stats["constraints_imported"]}') return 0 def _try_load_icpg(project_dir: str): """Try to import and load iCPG store. Returns None if unavailable.""" try: icpg_path = Path(project_dir).resolve() / '.icpg' / 'reason.db' if not icpg_path.exists(): return None # Try importing from sibling package sys.path.insert(0, str(Path(__file__).parent.parent)) from icpg.store import ICPGStore store = ICPGStore(project_dir) if store.exists(): return store except ImportError: pass return None def _fatigue_bar(score: float) -> str: """Render a visual fatigue bar.""" filled = int(score * 20) empty = 20 - filled bar = '#' * filled + '.' * empty if score >= 0.90: label = 'EMERGENCY' elif score >= 0.75: label = 'REM' elif score >= 0.60: label = 'PRE-SLEEP' elif score >= 0.40: label = 'COMPRESS' else: label = 'FLOW' return f'[{bar}] {score:.2f} {label}' if __name__ == '__main__': sys.exit(main()) ================================================ FILE: scripts/mnemos/checkpoint.py ================================================ """Checkpoint write/load for Mnemos session persistence.""" from __future__ import annotations import json import subprocess import time from collections import Counter from pathlib import Path from .models import CheckpointNode, _now, _uuid from .signals import read_recent_signals from .store import MnemosStore def write_checkpoint( store: MnemosStore, fatigue_score: float = 0.0, icpg_store=None, task_id: str | None = None ) -> CheckpointNode: """Write a CheckpointNode capturing current MnemoGraph state. Always includes: GoalNode content, all ConstraintNodes, current sub-goal. Optionally includes: iCPG state, git state, compressed ResultNodes. 
Writes to: .mnemos/checkpoint-latest.json (always overwritten) .mnemos/checkpoints/.json (archived copy) Returns the created CheckpointNode. """ # Determine task_id from active GoalNodes goal_nodes = store.get_by_type('goal') if not task_id and goal_nodes: task_id = goal_nodes[0].task_id task_id = task_id or 'unknown' # Gather goal goal_text = '; '.join(n.content for n in goal_nodes) or 'No active goal' # Gather constraints (never evicted) constraint_nodes = store.get_by_type('constraint') constraints = [n.content for n in constraint_nodes] # Gather result summaries (compressed or active) result_nodes = store.get_by_type('result') results = [] for rn in result_nodes[:20]: # Cap at 20 most recent if rn.summary: results.append(rn.summary) elif rn.content: results.append(rn.content[:200]) # Current sub-goal from working nodes working_nodes = store.get_by_type('working') current_subgoal = working_nodes[0].content if working_nodes else '' # Working memory working_memory = '\n'.join( n.content for n in working_nodes[:3] ) # Task narrative and recent files from signals narrative, recent_files = build_task_narrative(store.project_dir) # Git state git_state = _get_git_state(store.project_dir) # iCPG state icpg_state = None if icpg_store and icpg_store.exists(): icpg_state = _get_icpg_state(icpg_store) # Node summary (counts by type and status) stats = store.get_stats() node_summary = { 'total': stats['total_nodes'], 'active': stats['active'], 'compressed': stats['compressed'], 'by_type': stats['by_type'] } cp = CheckpointNode( id=_uuid(), task_id=task_id, goal=goal_text, active_constraints=constraints, active_results=results, current_subgoal=current_subgoal, working_memory=working_memory, task_narrative=narrative, recent_files=recent_files, fatigue_at_checkpoint=fatigue_score, git_state=git_state, icpg_state=icpg_state, node_summary=node_summary, created_at=_now() ) # Persist to DB store.save_checkpoint(cp) # Write to JSON files cp_data = _checkpoint_to_dict(cp) # Latest 
checkpoint (overwrite) latest_path = store.mnemos_dir / 'checkpoint-latest.json' latest_path.write_text(json.dumps(cp_data, indent=2)) # Archived copy archive_dir = store.mnemos_dir / 'checkpoints' archive_dir.mkdir(exist_ok=True) archive_path = archive_dir / f'{cp.id}.json' archive_path.write_text(json.dumps(cp_data, indent=2)) return cp def load_checkpoint( project_dir: str = '.', path: str | None = None ) -> str | None: """Load latest checkpoint and format as context for session injection. Returns formatted markdown string, or None if no checkpoint exists. """ if path: cp_path = Path(path) else: cp_path = Path(project_dir).resolve() / '.mnemos' / 'checkpoint-latest.json' if not cp_path.exists(): return None try: data = json.loads(cp_path.read_text()) except (json.JSONDecodeError, OSError): return None return _format_checkpoint(data) def _format_checkpoint(data: dict) -> str: """Format checkpoint data as structured markdown for context injection.""" lines = [] lines.append('## Mnemos Session Resume') lines.append(f'Checkpoint: {data.get("id", "unknown")[:8]}') lines.append(f'Created: {data.get("created_at", "unknown")}') lines.append(f'Fatigue at checkpoint: {data.get("fatigue_at_checkpoint", 0):.2f}') lines.append('') # Goal lines.append('### Goal') lines.append(data.get('goal', 'No goal recorded')) lines.append('') # Constraints constraints = data.get('active_constraints', []) if constraints: lines.append('### Active Constraints (DO NOT VIOLATE)') for c in constraints: lines.append(f'- {c}') lines.append('') # What was being worked on (task narrative) narrative = data.get('task_narrative', '') if narrative: lines.append('### What You Were Working On') lines.append(narrative) lines.append('') # Current task subgoal = data.get('current_subgoal', '') if subgoal: lines.append('### Current Sub-Goal') lines.append(subgoal) lines.append('') # Working memory working = data.get('working_memory', '') if working: lines.append('### Working Memory') lines.append(working) 
lines.append('') # Progress (result summaries) results = data.get('active_results', []) if results: lines.append('### Progress So Far') for r in results: lines.append(f'- {r}') lines.append('') # Recent files recent = data.get('recent_files', []) if recent: lines.append('### Key Files (from recent activity)') for f in recent[:10]: parts = [] if f.get('edits', 0) > 0: parts.append(f'edited {f["edits"]}x') if f.get('reads', 0) > 0: parts.append(f'read {f["reads"]}x') detail = ', '.join(parts) if parts else 'touched' lines.append(f'- {f.get("path", "?")} ({detail})') lines.append('') # Git state git = data.get('git_state', {}) if git.get('branch'): lines.append('### Git State') lines.append(f'Branch: {git["branch"]}') if git.get('uncommitted'): lines.append('Uncommitted files:') for f in git['uncommitted'][:10]: lines.append(f' - {f}') lines.append('') # iCPG state icpg = data.get('icpg_state') if icpg: lines.append('### iCPG Context') if icpg.get('active_reason'): lines.append(f'Active intent: {icpg["active_reason"]}') if icpg.get('unresolved_drift'): lines.append(f'Unresolved drift: {icpg["unresolved_drift"]}') if icpg.get('stats'): s = icpg['stats'] lines.append( f'Graph: {s.get("reasons", 0)} intents, ' f'{s.get("symbols", 0)} symbols' ) lines.append('') # Node summary summary = data.get('node_summary', {}) if summary: lines.append('### MnemoGraph Summary') lines.append( f'Nodes: {summary.get("active", 0)} active, ' f'{summary.get("compressed", 0)} compressed, ' f'{summary.get("total", 0)} total' ) by_type = summary.get('by_type', {}) if by_type: parts = [f'{t}:{c}' for t, c in by_type.items()] lines.append(f'Types: {", ".join(parts)}') return '\n'.join(lines) def _get_git_state(project_dir: Path) -> dict: """Get current git branch and uncommitted files.""" state = {} try: result = subprocess.run( ['git', 'branch', '--show-current'], capture_output=True, text=True, timeout=5, cwd=str(project_dir) ) if result.returncode == 0: state['branch'] = result.stdout.strip() 
result = subprocess.run( ['git', 'diff', '--name-only'], capture_output=True, text=True, timeout=5, cwd=str(project_dir) ) if result.returncode == 0: files = [ f.strip() for f in result.stdout.strip().split('\n') if f.strip() ] state['uncommitted'] = files result = subprocess.run( ['git', 'diff', '--cached', '--name-only'], capture_output=True, text=True, timeout=5, cwd=str(project_dir) ) if result.returncode == 0: staged = [ f.strip() for f in result.stdout.strip().split('\n') if f.strip() ] if staged: state['staged'] = staged except (subprocess.TimeoutExpired, FileNotFoundError): pass return state def _get_icpg_state(icpg_store) -> dict: """Extract summary iCPG state for checkpoint.""" state = {} try: stats = icpg_store.get_stats() state['stats'] = stats # Find most recent executing reason executing = icpg_store.list_reasons(status='executing') if executing: r = executing[-1] state['active_reason'] = f'{r.id[:8]} -- {r.goal}' # Unresolved drift count drift = icpg_store.get_unresolved_drift() state['unresolved_drift'] = len(drift) except Exception: pass return state def build_task_narrative(project_dir: str | Path) -> tuple[str, list[dict]]: """Build a human-readable task narrative from recent signals. Reads signals.jsonl and produces: 1. A narrative string describing recent activity 2. 
A list of recent files with read/edit counts Returns: (narrative_text, recent_files_list) """ signals = read_recent_signals(str(project_dir), limit=50) if not signals: return ('', []) # Count file interactions file_edits: Counter = Counter() file_reads: Counter = Counter() tool_counts: Counter = Counter() error_count = 0 total_outcomes = 0 for s in signals: tool = s.get('tool', '') fp = s.get('file_path', '') tool_counts[tool] += 1 if fp: if tool in ('Edit', 'Write'): file_edits[fp] += 1 elif tool == 'Read': file_reads[fp] += 1 if 'success' in s: total_outcomes += 1 if not s['success']: error_count += 1 # Build narrative parts = [] # Most-edited files top_edits = file_edits.most_common(5) if top_edits: edit_parts = [] for fp, count in top_edits: name = Path(fp).name edit_parts.append(f'{name} ({count}x)') parts.append(f'Editing: {", ".join(edit_parts)}') # Most-read files top_reads = file_reads.most_common(5) if top_reads: read_parts = [] for fp, count in top_reads: name = Path(fp).name read_parts.append(f'{name} ({count}x)') parts.append(f'Reading: {", ".join(read_parts)}') # Tool activity other_tools = {t: c for t, c in tool_counts.items() if t not in ('Edit', 'Write', 'Read')} if other_tools: tool_parts = [f'{t}:{c}' for t, c in sorted(other_tools.items(), key=lambda x: -x[1])] parts.append(f'Other tools: {", ".join(tool_parts[:5])}') # Focus area (most common directory) all_files = list(file_edits.keys()) + list(file_reads.keys()) if all_files: dir_counts: Counter = Counter() for fp in all_files: parent = str(Path(fp).parent) # Shorten to relative if possible try: parent = str(Path(parent).relative_to(Path.cwd())) except ValueError: pass dir_counts[parent] += 1 top_dir = dir_counts.most_common(1)[0] parts.append(f'Focus area: {top_dir[0]}/') # Errors if error_count > 0: parts.append(f'Errors: {error_count}/{total_outcomes} tool calls failed') narrative = '. '.join(parts) + '.' 
def format_for_post_compact_injection(
    project_dir: str = '.',
    checkpoint_path: str | None = None
) -> str | None:
    """Format a checkpoint as an injection block for post-compaction context.

    Args:
        project_dir: Project root; used to locate
            ``.mnemos/checkpoint-latest.json`` when no explicit path is given.
        checkpoint_path: Explicit checkpoint JSON file to read instead.

    Returns:
        The formatted text block, or None when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    if checkpoint_path:
        cp_path = Path(checkpoint_path)
    else:
        cp_path = Path(project_dir).resolve() / '.mnemos' / 'checkpoint-latest.json'

    if not cp_path.exists():
        return None

    try:
        data = json.loads(cp_path.read_text())
    except (json.JSONDecodeError, OSError):
        return None
    # Valid JSON that is not an object (e.g. a list) cannot be a checkpoint.
    if not isinstance(data, dict):
        return None

    lines = []
    lines.append('=== MNEMOS: CONTEXT RESTORED AFTER COMPACTION ===')
    lines.append('')
    lines.append('Compaction just occurred. Your previous context was summarized.')
    lines.append('Resume from this checkpoint -- DO NOT re-derive information already captured below.')
    lines.append('')

    # Goal
    lines.append('## Goal')
    lines.append(data.get('goal') or 'No goal recorded')
    lines.append('')

    # Constraints (``or`` fallbacks also cover explicit JSON nulls)
    constraints = data.get('active_constraints') or []
    if constraints:
        lines.append('## Active Constraints (DO NOT VIOLATE)')
        for c in constraints:
            lines.append(f'- {c}')
        lines.append('')

    # Task narrative
    narrative = data.get('task_narrative') or ''
    if narrative:
        lines.append('## What You Were Working On')
        lines.append(narrative)
        lines.append('')

    # Current sub-goal
    subgoal = data.get('current_subgoal') or ''
    if subgoal:
        lines.append('## Current Sub-Goal')
        lines.append(subgoal)
        lines.append('')

    # Working memory
    working = data.get('working_memory') or ''
    if working:
        lines.append('## Working Memory')
        lines.append(working)
        lines.append('')

    # Progress
    results = data.get('active_results') or []
    if results:
        lines.append('## Progress So Far')
        for r in results:
            lines.append(f'- {r}')
        lines.append('')

    # Recent files (at most 10 are listed)
    recent = data.get('recent_files') or []
    if recent:
        lines.append('## Key Files (from recent activity)')
        for f in recent[:10]:
            parts = []
            if f.get('edits', 0) > 0:
                parts.append(f'edited {f["edits"]}x')
            if f.get('reads', 0) > 0:
                parts.append(f'read {f["reads"]}x')
            detail = ', '.join(parts) if parts else 'touched'
            lines.append(f'- {f.get("path", "?")} ({detail})')
        lines.append('')

    # Git state: the section is emitted only when a branch name is recorded.
    git = data.get('git_state') or {}
    if git.get('branch'):
        lines.append('## Git State')
        lines.append(f'Branch: {git["branch"]}')
        uncommitted = git.get('uncommitted') or []
        staged = git.get('staged') or []
        if uncommitted:
            lines.append('Uncommitted:')
            for gf in uncommitted[:10]:
                lines.append(f' - {gf}')
        if staged:
            lines.append('Staged:')
            for gf in staged[:10]:
                lines.append(f' - {gf}')
        # Only claim a clean tree when nothing is modified AND nothing is
        # staged; staged-only changes used to be reported as clean.
        if not uncommitted and not staged:
            lines.append('Working tree clean.')
        lines.append('')

    # iCPG
    icpg = data.get('icpg_state')
    if icpg:
        lines.append('## iCPG Context')
        if icpg.get('active_reason'):
            lines.append(f'Active intent: {icpg["active_reason"]}')
        if icpg.get('unresolved_drift'):
            lines.append(f'Unresolved drift: {icpg["unresolved_drift"]}')
        lines.append('')

    # Checkpoint metadata
    lines.append(
        f'Checkpoint: {str(data.get("id") or "?")[:8]} '
        f'at {data.get("created_at", "?")}'
    )
    lines.append(
        f'Fatigue at checkpoint: {data.get("fatigue_at_checkpoint") or 0:.2f}'
    )
    lines.append('')
    lines.append('=== Resume work from this checkpoint. Ask the user to confirm the task if unclear. ===')

    return '\n'.join(lines)
""" marker = Path(project_dir).resolve() / '.mnemos' / 'just-compacted' consumed = marker.with_suffix('.consumed') try: marker.rename(consumed) consumed.unlink(missing_ok=True) return True except (OSError, FileNotFoundError): return False def _checkpoint_to_dict(cp: CheckpointNode) -> dict: """Serialize CheckpointNode to JSON-safe dict.""" return { 'id': cp.id, 'task_id': cp.task_id, 'goal': cp.goal, 'active_constraints': cp.active_constraints, 'active_results': cp.active_results, 'current_subgoal': cp.current_subgoal, 'working_memory': cp.working_memory, 'task_narrative': cp.task_narrative, 'recent_files': cp.recent_files, 'fatigue_at_checkpoint': cp.fatigue_at_checkpoint, 'git_state': cp.git_state, 'icpg_state': cp.icpg_state, 'node_summary': cp.node_summary, 'created_at': cp.created_at } ================================================ FILE: scripts/mnemos/consolidation.py ================================================ """Micro-consolidation -- rule-based, in-context, Tier 0 only. Triggered when fatigue >= 0.40 (COMPRESS state). No LLM calls. Target: <100ms execution time. Actions: 1. Compress 3 oldest ResultNodes (status=COMPRESSED, summary kept) 2. Evict 1 cold ContextNode (weight < 0.2, no scope overlap) 3. Decay weights on all evictable active nodes """ from __future__ import annotations from .models import MnemoNode from .store import MnemosStore def micro_consolidate( store: MnemosStore, current_scope: str = '', max_compress: int = 3, max_evict: int = 1 ) -> dict: """Run micro-consolidation pass. Rule-based, no LLM. Args: store: MnemosStore instance. current_scope: Current scope tag for eviction decisions. max_compress: Max ResultNodes to compress per pass. max_evict: Max ContextNodes to evict per pass. Returns: Stats: {compressed, evicted, decayed}. """ stats = {'compressed': 0, 'evicted': 0, 'decayed': 0} # 1. 
Compress oldest active ResultNodes result_nodes = store.get_by_type('result', status='active') # Sort by created_at ascending (oldest first) result_nodes.sort(key=lambda n: n.created_at) compressed = 0 for node in result_nodes: if compressed >= max_compress: break summary = _compress_result_node(node) store.compress_node(node.id, summary) compressed += 1 stats['compressed'] = compressed # 2. Evict cold ContextNodes context_nodes = store.get_by_type('context', status='active') evicted = 0 for node in context_nodes: if evicted >= max_evict: break if _should_evict(node, current_scope): store.evict_node(node.id) evicted += 1 stats['evicted'] = evicted # 3. Decay weights on all evictable nodes decayed = store.decay_weights(factor=0.95) stats['decayed'] = decayed return stats def _compress_result_node(node: MnemoNode) -> str: """Produce a summary from a ResultNode. Rule-based: first 200 chars of content as summary. """ content = node.content.strip() if not content: return node.summary or '(empty result)' if len(content) <= 200: return content # Truncate at word boundary truncated = content[:200] last_space = truncated.rfind(' ') if last_space > 150: truncated = truncated[:last_space] return truncated + '...' def _should_evict(node: MnemoNode, current_scope: str) -> bool: """Determine if a ContextNode should be evicted. Evict when: - activation_weight < 0.2 - No scope_tag overlap with current scope - Access count is low (< 3) """ if node.activation_weight >= 0.2: return False if node.access_count >= 3: return False if not current_scope: return True # Check scope overlap if node.scope_tags: for tag in node.scope_tags: if current_scope.startswith(tag) or tag.startswith(current_scope): return False return True ================================================ FILE: scripts/mnemos/fatigue.py ================================================ """4-dimension fatigue computation -- all dimensions passively observable. 
Every dimension is derived from actual hook data (tool calls, file paths, errors). No agent cooperation or manual input required. Signals: 1. Token utilization -- statusline writes context_window.used_percentage 2. Scope scatter -- PreToolUse logs file paths -> unique dirs ratio 3. Re-read ratio -- PreToolUse logs Read calls -> duplicate file ratio 4. Error density -- PostToolUse logs success/failure -> error ratio """ from __future__ import annotations import json from pathlib import Path from .models import FATIGUE_WEIGHTS, FatigueState, _now from .signals import ( compute_error_density, compute_reread_ratio, compute_scope_scatter, read_recent_signals ) def compute_fatigue( context_data: dict, project_dir: str = '.' ) -> FatigueState: """Compute 4-dimension fatigue score from observable signals. Args: context_data: Dict with used_percentage (from fatigue.json). project_dir: Project directory to read signals from. Returns: FatigueState with per-dimension scores and composite. """ # Dimension 1: Token utilization (real -- from statusline) token_util = min(1.0, context_data.get('used_percentage', 0) / 100) # Read behavioral signals from hook log signals = read_recent_signals(project_dir) # Dimension 2: Scope scatter (real -- from PreToolUse file paths) scatter = compute_scope_scatter(signals) # Dimension 3: Re-read ratio (real -- from PreToolUse Read calls) reread = compute_reread_ratio(signals) # Dimension 4: Error density (real -- from PostToolUse outcomes) errors = compute_error_density(signals) # Weighted composite score = ( FATIGUE_WEIGHTS['token_utilization'] * token_util + FATIGUE_WEIGHTS['scope_scatter'] * scatter + FATIGUE_WEIGHTS['reread_ratio'] * reread + FATIGUE_WEIGHTS['error_density'] * errors ) score = min(1.0, max(0.0, score)) state = FatigueState.score_to_state(score) return FatigueState( token_utilization=round(token_util, 4), scope_scatter=round(scatter, 4), reread_ratio=round(reread, 4), error_density=round(errors, 4), 
def read_fatigue_file(project_dir: str = '.') -> dict:
    """Read ``<project_dir>/.mnemos/fatigue.json``.

    Returns:
        The parsed JSON object. Falls back to an empty dict when the file is
        missing, unreadable, not decodable as text, not valid JSON, or valid
        JSON that is not an object -- callers use ``.get`` on the result, so
        anything other than a dict would crash them.
    """
    fatigue_path = Path(project_dir).resolve() / '.mnemos' / 'fatigue.json'
    if not fatigue_path.exists():
        return {}
    try:
        data = json.loads(fatigue_path.read_text())
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}


def write_fatigue_file(
    project_dir: str,
    used_pct: float,
    remaining_pct: float
) -> None:
    """Write ``<project_dir>/.mnemos/fatigue.json``, creating the directory.

    The file holds ``used_percentage``, ``remaining_percentage`` and a
    ``timestamp`` taken from ``time.time()`` at write time.
    """
    import time

    mnemos_dir = Path(project_dir).resolve() / '.mnemos'
    mnemos_dir.mkdir(parents=True, exist_ok=True)
    data = {
        'used_percentage': used_pct,
        'remaining_percentage': remaining_pct,
        'timestamp': time.time()
    }
    fatigue_path = mnemos_dir / 'fatigue.json'
    fatigue_path.write_text(json.dumps(data))
'rem': (0.75, 0.90), 'emergency': (0.90, 1.0) } # --- Fatigue dimension weights --- # All 4 dimensions are passively observable from hook data. # No agent cooperation required. FATIGUE_WEIGHTS = { 'token_utilization': 0.40, # from statusline context_window.used_percentage 'scope_scatter': 0.25, # unique dirs in recent tool calls (PreToolUse) 'reread_ratio': 0.20, # files Read more than once (PreToolUse) 'error_density': 0.15 # failed tool calls ratio (PostToolUse) } # --- Eviction policies per type --- # never = GoalNode/ConstraintNode survive all compaction # compress_first = content replaced with summary before eviction # evictable = can be evicted when cold EVICTION_POLICIES = { 'goal': 'never', 'constraint': 'never', 'context': 'evictable', 'working': 'compress_first', 'result': 'compress_first', 'skill': 'compress_first', 'checkpoint': 'never', 'handoff': 'never' } @dataclass class MnemoNode: """A typed memory node in the MnemoGraph. Types and eviction: goal -- never evicted, task's primary objective constraint -- never evicted, invariants and contracts context -- evictable when activation_weight drops working -- compressed first, then evicted result -- compressed first (summary kept), then evicted skill -- compressed first, promotable to persistent checkpoint -- never evicted, serialized session state handoff -- never evicted, task completion summary """ type: str task_id: str content: str id: str = field(default_factory=_uuid) summary: str | None = None activation_weight: float = 1.0 status: str = 'active' origin: str = 'agent_generated' confidence: float = 1.0 scope_tags: list[str] = field(default_factory=list) links: list[str] = field(default_factory=list) created_at: str = field(default_factory=_now) last_accessed: str = field(default_factory=_now) access_count: int = 0 @property def eviction_policy(self) -> str: return EVICTION_POLICIES.get(self.type, 'evictable') @property def is_evictable(self) -> bool: return self.eviction_policy == 'evictable' 
@dataclass
class FatigueState:
    """Snapshot of the four fatigue dimensions and their combined score.

    Every dimension is a passively observed signal:
        token_utilization -- context_window.used_percentage / 100 (statusline)
        scope_scatter     -- unique dirs in recent tool calls (PreToolUse)
        reread_ratio      -- files Read more than once (PreToolUse)
        error_density     -- failed tool calls / total (PostToolUse)

    ``composite_score`` is the weighted average of the dimensions and
    ``state`` is the named band that score falls into.
    """
    token_utilization: float = 0.0
    scope_scatter: float = 0.0
    reread_ratio: float = 0.0
    error_density: float = 0.0
    composite_score: float = 0.0
    state: str = 'flow'
    computed_at: str = field(default_factory=_now)

    @staticmethod
    def score_to_state(score: float) -> str:
        """Return the named fatigue state whose band contains ``score``."""
        # Bands are checked from the highest floor down; the first floor the
        # score reaches wins, and anything below the lowest floor is 'flow'.
        bands = (
            (0.90, 'emergency'),
            (0.75, 'rem'),
            (0.60, 'pre_sleep'),
            (0.40, 'compress'),
        )
        for floor, label in bands:
            if score >= floor:
                return label
        return 'flow'
""" task_id: str goal: str id: str = field(default_factory=_uuid) active_constraints: list[str] = field(default_factory=list) active_results: list[str] = field(default_factory=list) current_subgoal: str = '' working_memory: str = '' task_narrative: str = '' recent_files: list[dict] = field(default_factory=list) fatigue_at_checkpoint: float = 0.0 git_state: dict = field(default_factory=dict) icpg_state: dict | None = None node_summary: dict = field(default_factory=dict) created_at: str = field(default_factory=_now) ================================================ FILE: scripts/mnemos/pyproject.toml ================================================ [project] name = "mnemos" version = "0.1.0" description = "Task-Scoped Memory Lifecycle for Autonomous Agents" requires-python = ">=3.10" dependencies = [] [project.scripts] mnemos = "mnemos.__main__:main" [build-system] requires = ["setuptools>=68.0"] build-backend = "setuptools.build_meta" ================================================ FILE: scripts/mnemos/signals.py ================================================ """Behavioral signal collection from Claude Code hooks. Hooks receive rich JSON on stdin (tool_name, tool_input, tool_response). Instead of relying on agent cooperation (manually setting scope_tags), we passively observe tool call patterns to derive fatigue signals. Signals collected: - File paths from Read/Edit/Write tool calls (scope scatter) - Re-reads: same file Read'd more than once (context loss) - Tool errors from PostToolUse (struggling agent) - Edit frequency to same file (fix-retry loops) Storage: .mnemos/signals.jsonl (append-only, one JSON line per event) """ from __future__ import annotations import json import os import time from pathlib import Path SIGNALS_FILE = 'signals.jsonl' # Rolling window for fatigue computation WINDOW_SIZE = 30 def append_signal(project_dir: str, signal: dict) -> None: """Append a signal event to signals.jsonl. 
def compute_scope_scatter(signals: list[dict]) -> float:
    """Scope scatter: how many different directories is the agent touching?

    Low scatter (focused on 1-2 dirs) = 0.0 (no fatigue).
    High scatter (unique dirs approaching the number of calls) = 1.0.

    Only signals that carry a non-empty ``file_path`` are considered.

    Args:
        signals: Recent signal dicts as stored in signals.jsonl.

    Returns:
        A value in [0.0, 1.0].
    """
    dirs = []
    for s in signals:
        fp = s.get('file_path', '')
        if not fp:
            continue
        # Bucket by the *containing directory*, capped at three leading
        # components. Using the parent (rather than the raw path) keeps
        # several files in one shallow directory, e.g. src/a.py and
        # src/b.py, in the same bucket instead of counting each file as
        # its own "directory".
        # NOTE(review): a directory passed as `path` (e.g. a search root)
        # is bucketed by its parent as well -- confirm that is acceptable.
        parent_parts = Path(fp).parent.parts[:3]
        dirs.append('/'.join(parent_parts) or '.')

    if not dirs:
        return 0.0

    unique_dirs = len(set(dirs))
    # Touching only one or two directories is "focused" by definition, no
    # matter how few calls are in the window; without this guard a single
    # call would yield ratio 1.0 and therefore maximum scatter.
    if unique_dirs <= 2:
        return 0.0

    ratio = unique_dirs / len(dirs)
    # Scale: ratio of 0.1 (1 dir in 10 calls) = 0, ratio of 0.5+ = 1.0
    return min(1.0, max(0.0, (ratio - 0.1) / 0.4))
def extract_signal_from_pre_tool(hook_input: dict) -> dict | None:
    """Extract a signal from PreToolUse hook JSON input.

    Args:
        hook_input: Decoded hook payload; ``tool_name`` and ``tool_input``
            are read from it.

    Returns:
        A signal dict to append. File-bearing tools (Read, Edit, Write,
        Glob, Grep) additionally carry a normalized ``file_path``; every
        other tool, including Bash, is logged without one.
    """
    tool = hook_input.get('tool_name', '')

    if tool not in ('Read', 'Edit', 'Write', 'Glob', 'Grep'):
        # Bash commands and all other tools are not file signals.
        return {'tool': tool, 'event': 'pre'}

    # The payload may carry `"tool_input": null` (or another non-object);
    # treat that as "no input" instead of crashing the hook.
    tool_input = hook_input.get('tool_input')
    if not isinstance(tool_input, dict):
        tool_input = {}

    file_path = (
        tool_input.get('file_path')
        or tool_input.get('path')
        or ''
    )
    return {
        'tool': tool,
        'event': 'pre',
        'file_path': _normalize_path(file_path)
    }
def _tail(filepath: str, n: int) -> list[str]:
    """Read the last ``n`` lines from a file without loading all of it.

    The file is read backwards in fixed-size blocks until enough line
    breaks have been collected, so the result does not depend on a guess
    about the average line length and never starts with a truncated line.

    Args:
        filepath: Path of the text file (decoded as UTF-8, undecodable
            bytes are replaced).
        n: Maximum number of lines to return.

    Returns:
        Up to ``n`` trailing lines, oldest first, without line terminators.
        Empty list if ``n`` is not positive, the file is empty or blank,
        or it cannot be read.
    """
    if n <= 0:
        return []
    block_size = 8192
    try:
        with open(filepath, 'rb') as f:
            f.seek(0, 2)
            position = f.tell()
            buffer = b''
            # n complete lines need n + 1 line breaks in the buffer when we
            # stop short of the file start: one before the first wanted
            # line plus one after each line.
            while position > 0 and buffer.count(b'\n') <= n:
                step = min(block_size, position)
                position -= step
                f.seek(position)
                buffer = f.read(step) + buffer
    except OSError:
        return []

    if position > 0:
        # The buffer starts mid-file, so everything up to the first line
        # break may be a partial line (or half a multi-byte character).
        buffer = buffer.split(b'\n', 1)[1]

    text = buffer.decode('utf-8', errors='replace').strip()
    if not text:
        return []
    return text.split('\n')[-n:]
'success' in s: total_outcomes += 1 if not s['success']: errors += 1 return { 'total_signals': len(signals), 'tool_calls': tools, 'unique_files_read': len(files_read), 'rereads': rereads, 'errors': errors, 'total_outcomes': total_outcomes, 'error_rate': errors / max(total_outcomes, 1) } ================================================ FILE: scripts/mnemos/store.py ================================================ """SQLite storage layer for Mnemos MnemoGraph.""" from __future__ import annotations import json import sqlite3 from pathlib import Path from .models import CheckpointNode, FatigueState, MnemoNode, _now MNEMOS_DIR = '.mnemos' DB_NAME = 'mnemo.db' SCHEMA = """ CREATE TABLE IF NOT EXISTS mnemo_nodes ( id TEXT PRIMARY KEY, type TEXT NOT NULL, task_id TEXT NOT NULL, content TEXT NOT NULL, summary TEXT, activation_weight REAL DEFAULT 1.0, status TEXT DEFAULT 'active', origin TEXT DEFAULT 'agent_generated', confidence REAL DEFAULT 1.0, scope_tags TEXT DEFAULT '[]', links TEXT DEFAULT '[]', created_at TEXT NOT NULL, last_accessed TEXT NOT NULL, access_count INTEGER DEFAULT 0 ); CREATE TABLE IF NOT EXISTS checkpoints ( id TEXT PRIMARY KEY, task_id TEXT NOT NULL, goal TEXT NOT NULL, active_constraints TEXT DEFAULT '[]', active_results TEXT DEFAULT '[]', current_subgoal TEXT DEFAULT '', working_memory TEXT DEFAULT '', fatigue_at_checkpoint REAL DEFAULT 0.0, git_state TEXT DEFAULT '{}', icpg_state TEXT, node_summary TEXT DEFAULT '{}', created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS fatigue_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, token_utilization REAL, scope_scatter REAL, reread_ratio REAL, error_density REAL, composite_score REAL, state TEXT, computed_at TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS idx_mnemo_type ON mnemo_nodes(type); CREATE INDEX IF NOT EXISTS idx_mnemo_task ON mnemo_nodes(task_id); CREATE INDEX IF NOT EXISTS idx_mnemo_status ON mnemo_nodes(status); CREATE INDEX IF NOT EXISTS idx_mnemo_weight ON mnemo_nodes(activation_weight); CREATE INDEX 
def _conn(self) -> sqlite3.Connection:
    """Open a configured connection to the store's database file.

    The returned object is a ``sqlite3.Connection`` with ``sqlite3.Row``
    rows, WAL journaling and foreign keys enabled. Used as a context
    manager (``with self._conn() as conn:``) it commits or rolls back like
    a plain connection and is then *closed*. A plain ``sqlite3.Connection``
    only ends the transaction on ``__exit__`` and stays open, which leaked
    one connection per store call.

    Returns:
        An open connection to ``self.db_path``.
    """

    class _ClosingConnection(sqlite3.Connection):
        """Connection whose ``with`` block also closes it on exit."""

        def __exit__(self, exc_type, exc_value, traceback):
            try:
                # Commit on success / roll back on error, as usual.
                return super().__exit__(exc_type, exc_value, traceback)
            finally:
                self.close()

    conn = sqlite3.connect(str(self.db_path), factory=_ClosingConnection)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA foreign_keys=ON')
    except Exception:
        # Do not leak the handle if the database refuses the pragmas.
        conn.close()
        raise
    return conn
def nodes_for_scope(self, scope_tags: list[str]) -> list[MnemoNode]:
    """Get active nodes whose scope_tags overlap with the given tags.

    Args:
        scope_tags: Tags to match; a node qualifies when it shares at
            least one tag with this list.

    Returns:
        Matching nodes in the order ``get_active_nodes()`` yields them.
    """
    # Build the lookup set once instead of once per node.
    wanted = set(scope_tags)
    return [
        node for node in self.get_active_nodes()
        if not wanted.isdisjoint(node.scope_tags)
    ]
def compress_node(self, node_id: str, summary: str) -> None:
    """Mark a node as compressed, keeping only its summary.

    The row's status becomes 'compressed', ``summary`` is stored and the
    full ``content`` is blanked.
    """
    statement = (
        "UPDATE mnemo_nodes SET status = 'compressed', "
        "summary = ?, content = '' WHERE id = ?"
    )
    bindings = (summary, node_id)
    with self._conn() as db:
        db.execute(statement, bindings)
def save_checkpoint(self, cp: CheckpointNode) -> str:
    """Insert a checkpoint row and return its id.

    List and dict fields are stored as JSON text. ``icpg_state`` is stored
    as SQL NULL only when it is ``None``; an empty dict is written as
    ``'{}'`` so it reads back as an empty dict rather than ``None``.

    NOTE(review): ``CheckpointNode.task_narrative`` and ``recent_files``
    are not written here because the ``checkpoints`` table has no columns
    for them, so they come back as defaults after a reload -- confirm
    whether the schema should be extended.

    Args:
        cp: Checkpoint to persist.

    Returns:
        ``cp.id``.
    """
    icpg_json = (
        json.dumps(cp.icpg_state) if cp.icpg_state is not None else None
    )
    with self._conn() as conn:
        conn.execute(
            """INSERT INTO checkpoints
            (id, task_id, goal, active_constraints, active_results,
             current_subgoal, working_memory, fatigue_at_checkpoint,
             git_state, icpg_state, node_summary, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                cp.id, cp.task_id, cp.goal,
                json.dumps(cp.active_constraints),
                json.dumps(cp.active_results),
                cp.current_subgoal, cp.working_memory,
                cp.fatigue_at_checkpoint,
                json.dumps(cp.git_state),
                icpg_json,
                json.dumps(cp.node_summary),
                cp.created_at
            )
        )
    return cp.id
def load_from_icpg(self, icpg_store, task_id: str = 'icpg-bridge') -> dict:
    """Mirror iCPG ReasonNodes into the MnemoGraph.

    Every reason returned by ``icpg_store.list_reasons()`` whose status is
    neither 'rejected' nor 'abandoned' becomes one goal node; each of its
    invariants and postconditions becomes a constraint node linked to that
    goal node.

    Returns:
        Counters ``{'goals_imported': int, 'constraints_imported': int}``.
    """
    stats = {'goals_imported': 0, 'constraints_imported': 0}
    skipped_statuses = ('rejected', 'abandoned')

    for reason in icpg_store.list_reasons():
        if reason.status in skipped_statuses:
            continue

        # ReasonNode -> GoalNode
        goal_node = MnemoNode(
            type='goal',
            task_id=task_id,
            content=f'{reason.goal} [iCPG:{reason.id[:8]}]',
            origin='loaded',
            scope_tags=reason.scope,
            confidence=1.0
        )
        self.create_node(goal_node)
        stats['goals_imported'] += 1

        def add_constraint(text: str) -> None:
            # One constraint node per clause, linked back to its goal.
            self.create_node(MnemoNode(
                type='constraint',
                task_id=task_id,
                content=text,
                origin='loaded',
                scope_tags=reason.scope,
                links=[goal_node.id]
            ))
            stats['constraints_imported'] += 1

        # Invariants first, then postconditions.
        for inv in reason.invariants:
            add_constraint(f'INV: {inv} [from: {reason.goal[:40]}]')
        for post in reason.postconditions:
            add_constraint(f'POST: {post} [from: {reason.goal[:40]}]')

    return stats
@staticmethod
def _row_to_fatigue(row: sqlite3.Row) -> FatigueState:
    """Build a FatigueState from one ``fatigue_log`` row.

    Each listed column is copied verbatim into the field of the same name.
    """
    columns = (
        'token_utilization',
        'scope_scatter',
        'reread_ratio',
        'error_density',
        'composite_score',
        'state',
        'computed_at',
    )
    values = {column: row[column] for column in columns}
    return FatigueState(**values)
def cmd_status(args: argparse.Namespace) -> int:
    """Show task states from the store.

    Loads the configuration from the default config directory, opens the
    store rooted at its ``workspace_root`` and prints one line per task.

    Args:
        args: Parsed CLI arguments (unused by this command).

    Returns:
        Process exit code, always 0.
    """
    # load_config() requires the config directory; calling it without an
    # argument raised TypeError before any status could be printed.
    cfg = load_config(default_config_dir())
    store_dir = Path(cfg.get("workspace_root", "~/.polyphony")).expanduser()
    store = PolyphonyStore(store_dir)
    store.init_db()
    tasks = store.list_tasks()
    if not tasks:
        print("No tasks.")
        return 0
    for t in tasks:
        print(f" [{t.state:12s}] {t.id[:8]} {t.title}")
    return 0
def get_adapter(agent_type: str):
    """Instantiate the adapter registered under ``agent_type``.

    Raises:
        KeyError: ``agent_type`` is not in the registry.
    """
    adapter_cls = _REGISTRY.get(agent_type)
    if adapter_cls is not None:
        return adapter_cls()
    raise KeyError(agent_type)
class ClaudeAdapter:
    """Adapter for Claude Code CLI."""

    def build_command(
        self,
        profile: AgentProfile,
        run_spec: RunSpec,
    ) -> list[str]:
        """Build claude CLI command list.

        Args:
            profile: Agent profile; ``cli_command`` is the base command line.
            run_spec: Run spec; a truthy ``max_turns`` adds ``--max-turns``.

        Returns:
            Argument vector: the tokenized base command followed by
            ``--output-format stream-json`` and the optional turn limit.

        Raises:
            ValueError: ``cli_command`` contains unbalanced quotes.
        """
        import shlex

        # shlex honours shell quoting, so a configured command such as
        # `claude -p "two words"` keeps the quoted text as one argument;
        # str.split() would cut it at every space and keep the quotes.
        parts = shlex.split(profile.cli_command)
        parts += ["--output-format", "stream-json"]
        if run_spec.max_turns:
            parts += ["--max-turns", str(run_spec.max_turns)]
        return parts

    def detect_completion(self, event: dict) -> bool:
        """Check if event signals task completion (``type == "result"``)."""
        return event.get("type") == "result"

    def detect_quota(self, text: str) -> bool:
        """Check if output mentions a rate limit or quota (case-insensitive)."""
        lower = text.lower()
        return "rate limit" in lower or "quota" in lower
def load_config(config_dir: Path | None = None) -> dict:
    """Load config.yaml, merging with defaults.

    Args:
        config_dir: Directory that holds ``config.yaml``. When omitted,
            ``default_config_dir()`` is used, so callers that do not carry
            a directory around can call ``load_config()`` directly.

    Returns:
        A new dict: a copy of ``DEFAULTS`` updated with the top-level keys
        found in ``config.yaml`` (if that file exists).
    """
    if config_dir is None:
        config_dir = default_config_dir()
    cfg = dict(DEFAULTS)
    path = Path(config_dir) / "config.yaml"
    if path.exists():
        with open(path) as f:
            # An empty file parses to None; treat it as "no overrides".
            loaded = yaml.safe_load(f) or {}
        cfg.update(loaded)
    return cfg
def load_routing(config_dir: Path) -> dict:
    """Load routing.yaml, merging with defaults.

    Args:
        config_dir: Directory that holds ``routing.yaml``.

    Returns:
        A routing dict with ``rules`` and ``default`` keys. Values come
        from ``routing.yaml`` where present and from ``DEFAULT_ROUTING``
        otherwise. The result never shares nested objects with
        ``DEFAULT_ROUTING``, so callers may mutate it freely.
    """
    import copy

    # A shallow dict() copy would hand out the module-level "rules" list
    # and "default" dict themselves; editing the result would then change
    # the defaults for every later call.
    routing = copy.deepcopy(DEFAULT_ROUTING)
    path = Path(config_dir) / "routing.yaml"
    if not path.exists():
        return routing
    with open(path) as f:
        data = yaml.safe_load(f) or {}
    for key in ("rules", "default"):
        if key in data:
            routing[key] = data[key]
    return routing
def resolve_identity(
    name: str,
    identities: list[Identity],
) -> Identity:
    """Return the first identity whose ``name`` equals ``name``.

    Raises:
        KeyError: no identity in ``identities`` carries that name.
    """
    match = next(
        (candidate for candidate in identities if candidate.name == name),
        None,
    )
    if match is None:
        raise KeyError(name)
    return match
def validate_identity(identity: Identity) -> list[str]:
    """Collect validation problems for an identity.

    Returns:
        One message per failed check, in check order; an empty list means
        the identity is valid.
    """
    # (value that must be truthy, message reported when it is not)
    checks = (
        (identity.name, "name is required"),
        (identity.volumes, "At least one volume is required"),
    )
    return [message for value, message in checks if not value]
profile (§3.3).""" name: str agent_type: str cli_command: str context_window_tokens: int = 200000 strengths: list[str] = field(default_factory=list) event_protocol: str = "ndjson" auth_path: str = "" @dataclass class RunSpec: """Immutable execution spec for one attempt (§3.4).""" task_id: str agent: str identity: str workspace: str image: str id: str = field(default_factory=_uuid) attempt: int = 1 model: str = "" fallback: list[str] = field(default_factory=list) max_turns: int = 25 allowed_paths: list[str] = field(default_factory=list) proof_of_work: list[str] = field(default_factory=list) env_overlay: dict[str, str] = field(default_factory=dict) volume_mounts: list[str] = field(default_factory=list) hooks_pre: list[str] = field(default_factory=list) hooks_post: list[str] = field(default_factory=list) deadline_seconds: int = 1800 @dataclass class Result: """Outcome of a single run attempt (§3.5).""" task_id: str run_spec_id: str agent: str status: str id: str = field(default_factory=_uuid) turns: int = 0 duration_seconds: int = 0 cost_usd: float | None = None artifacts: dict[str, str] = field(default_factory=dict) events: list[dict] = field(default_factory=list) completed_at: str = field(default_factory=_now) ================================================ FILE: scripts/polyphony/orchestrator.py ================================================ """Supervisor loop (§4 orchestrator). 
def discover_tasks(store: PolyphonyStore) -> list[Task]:
    """Return the tasks the store reports in the 'discovered' state."""
    pending = store.list_tasks(state="discovered")
    return pending


def claim_task(
    task: Task,
    store: PolyphonyStore,
) -> Task:
    """Move ``task`` to 'claimed', persist it, and return it."""
    updated = transition(task, "claimed")
    store.save_task(updated)
    return updated


def provision_workspace(
    task: Task,
    base_dir: Path,
    ref: str,
) -> Path:
    """Create the workspace for ``task`` and return its path."""
    workspace = _create_ws(task, base_dir, ref)
    return workspace


def run_agent(run_spec: RunSpec) -> Result:
    """Execute the agent described by ``run_spec`` and return its Result."""
    outcome = _execute_container(run_spec)
    return outcome


def verify_result(result: Result) -> bool:
    """Report whether ``result`` finished with status 'succeeded'."""
    status = result.status
    return status == "succeeded"


class Orchestrator:
    """Supervisor that drives tasks through their lifecycle."""

    def __init__(
        self,
        store: PolyphonyStore,
        agents: list[AgentProfile],
        policy: dict,
        identities: list[Identity] | None = None,
    ):
        self._store = store
        self._agents = agents
        self._policy = policy
        self._identities = identities if identities else []

    def step(self) -> int:
        """Run one cycle: claim every discovered task.

        Returns:
            The number of tasks claimed during this cycle.
        """
        processed = 0
        for processed, pending in enumerate(
            discover_tasks(self._store), start=1
        ):
            claim_task(pending, self._store)
        return processed


def _create_ws(
    task: Task,
    base_dir: Path,
    ref: str,
) -> Path:
    """Workspace-creation seam (kept separate so it can be mocked)."""
    # Imported lazily, inside the function, as in the original layout.
    from .workspace import create_workspace

    # The attempt number and repository URL are fixed placeholders here.
    return create_workspace(
        base_dir=base_dir,
        task_id=task.id,
        attempt=1,
        repo_url="",
        ref=ref,
    )
def route(
    task: Task,
    agents: list[AgentProfile],
    policy: dict,
    identity: str = "",
) -> RunSpec:
    """Route a task to an agent and describe the attempt as a RunSpec.

    The agent and the fallback chain are taken from the same policy
    entry: the first matching rule whose agent is registered, or the
    policy default when no such rule exists.

    Args:
        task: Task to route.
        agents: Registered agent profiles.
        policy: Routing policy with optional ``rules`` and a ``default``.
        identity: Identity name recorded on the RunSpec.

    Returns:
        A RunSpec with empty ``workspace`` and ``image`` fields.

    Raises:
        KeyError: If the policy default is missing or names an
            unregistered agent and no rule applies.
    """
    agent = select_agent(task, agents, policy)
    fallback = _get_fallback(task, policy, agents)
    return RunSpec(
        task_id=task.id,
        agent=agent.name,
        identity=identity,
        workspace="",
        image="",
        fallback=fallback,
    )


def select_agent(
    task: Task,
    agents: list[AgentProfile],
    policy: dict,
) -> AgentProfile:
    """Select the agent named by the first usable rule, or the default.

    A matching rule whose agent is not registered is skipped.

    Raises:
        KeyError: If a matching rule has no ``agent`` key, or the default
            agent is missing from the policy or from ``agents``.
    """
    agent_map = {a.name: a for a in agents}
    rule = _first_usable_rule(task, policy, agent_map)
    if rule is not None:
        return agent_map[rule["agent"]]
    default_name = policy["default"]["agent"]
    return agent_map[default_name]


def match_rule(task: Task, rule: dict) -> bool:
    """Check whether a task satisfies every predicate of a rule.

    Each key in ``rule["match"]`` names a task attribute. A list value
    means "attribute is one of these"; any other value means equality.
    A missing attribute reads as ``None``. A rule without predicates
    matches every task.
    """
    predicates = rule.get("match", {})
    for attr, expected in predicates.items():
        actual = getattr(task, attr, None)
        if isinstance(expected, list):
            if actual not in expected:
                return False
        elif actual != expected:
            return False
    return True


def _first_usable_rule(task: Task, policy: dict, known_agents) -> dict | None:
    """Return the first matching rule whose agent is in ``known_agents``.

    With ``known_agents`` set to None, the agent is not checked and the
    first matching rule is returned.
    """
    for rule in policy.get("rules", []):
        if not match_rule(task, rule):
            continue
        if known_agents is None or rule["agent"] in known_agents:
            return rule
    return None


def _get_fallback(
    task: Task,
    policy: dict,
    agents: list[AgentProfile] | None = None,
) -> list[str]:
    """Get the fallback chain for a task's route.

    Args:
        task: Task being routed.
        policy: Routing policy.
        agents: When given, matching rules whose agent is not registered
            are skipped, so the chain comes from the same rule that
            ``select_agent`` picks. When omitted, the first matching rule
            is used regardless of its agent.

    Returns:
        A new list, so callers cannot mutate the policy through it.
        A missing or empty ``fallback`` yields ``[]``.
    """
    known = None if agents is None else {a.name for a in agents}
    rule = _first_usable_rule(task, policy, known)
    source = rule if rule is not None else policy["default"]
    return list(source.get("fallback") or [])
""" from __future__ import annotations import re import subprocess from .models import RunSpec def build_docker_args(run_spec: RunSpec) -> list[str]: """Build docker create argument list from RunSpec.""" safe_name = re.sub(r"[^\w\-]", "-", run_spec.task_id) name = f"polyphony-{safe_name}-{run_spec.attempt}" args = ["docker", "create", "--name", name] # Workspace mount args += ["-v", f"{run_spec.workspace}:/workspace"] # Identity volume mounts for mount in run_spec.volume_mounts: args += ["-v", mount] # Environment variables for key, val in run_spec.env_overlay.items(): args += ["-e", f"{key}={val}"] args.append(run_spec.image) return args def create_container(run_spec: RunSpec) -> str: """Create a Docker container. Returns container ID.""" args = build_docker_args(run_spec) result = _run_docker(args) if result.returncode != 0: raise RuntimeError(result.stderr.strip()) return result.stdout.strip() def start_container(container_id: str) -> None: """Start a created container.""" _run_docker(["docker", "start", container_id]) def stop_container( container_id: str, timeout: int | None = None, ) -> None: """Stop a running container.""" cmd = ["docker", "stop"] if timeout is not None: cmd += ["-t", str(timeout)] cmd.append(container_id) _run_docker(cmd) def remove_container(container_id: str) -> None: """Remove a container.""" _run_docker(["docker", "rm", container_id]) def container_logs(container_id: str) -> str: """Get container stdout/stderr logs.""" result = _run_docker(["docker", "logs", container_id]) return result.stdout def wait_container(container_id: str) -> int: """Wait for container to exit. Returns exit code.""" result = _run_docker( ["docker", "wait", container_id], ) return int(result.stdout.strip()) def _run_docker(cmd: list[str]) -> subprocess.CompletedProcess: """Run a docker command. 
DIMENSIONS = (
    "cyclomatic",
    "fan_out",
    "security",
    "concurrency",
    "domain",
)

SEC_KEYWORDS = frozenset({
    "auth", "org_id", "user_id", "pii", "rls",
    "billing", "payment", "secret", "token",
    "session", "csrf", "xss",
})

CONCURRENCY_KEYWORDS = frozenset({
    "asyncio.lock", "for update", "transaction",
    "session.begin", "mutex", "semaphore",
    "atomic", "lock",
})


def score_task(task: Task) -> int:
    """Total complexity score: the sum of the five dimension scores (0-10)."""
    scorers = (
        score_cyclomatic,
        score_fan_out,
        score_security,
        score_concurrency,
        score_domain,
    )
    return sum(scorer(task) for scorer in scorers)


def score_cyclomatic(task: Task) -> int:
    """0-2 from ``metadata["loc"]`` and the number of scope entries."""
    lines_changed = task.metadata.get("loc", 0)
    files_touched = len(task.scope)
    score = 0
    if lines_changed >= 10 or files_touched >= 2:
        score = 1
    if lines_changed >= 50 or files_touched >= 5:
        score = 2
    return score


def score_fan_out(task: Task) -> int:
    """0-2 from ``metadata["callers"]`` (thresholds: 3 and 11)."""
    callers = task.metadata.get("callers", 0)
    if callers < 3:
        return 0
    return 2 if callers >= 11 else 1


def _score_keyword_hits(task: Task, vocabulary: frozenset) -> int:
    """Score 0-2 by how many distinct vocabulary words the task mentions."""
    matched = _extract_keywords(task) & vocabulary
    return min(len(matched), 2)


def score_security(task: Task) -> int:
    """0-2 from the number of distinct security keywords present."""
    return _score_keyword_hits(task, SEC_KEYWORDS)


def score_concurrency(task: Task) -> int:
    """0-2 from the number of distinct concurrency keywords present."""
    return _score_keyword_hits(task, CONCURRENCY_KEYWORDS)


def score_domain(task: Task) -> int:
    """0-2 from the task's risk level and task type."""
    if task.risk == "high":
        return 2
    elevated = task.risk == "medium" or task.task_type == "refactor"
    return 1 if elevated else 0


def _extract_keywords(task: Task) -> set[str]:
    """Lower-cased ``metadata["keywords"]`` plus whitespace-split title words."""
    collected = {entry.lower() for entry in task.metadata.get("keywords", [])}
    collected.update(task.title.lower().split())
    return collected
""" from __future__ import annotations import json import subprocess from ..models import Task class GitHubSource: """GitHub Issues as task source.""" def __init__( self, repo: str = "", label_filter: str = "agent-ready", ): self._repo = repo self._label = label_filter def poll(self) -> list[Task]: """Fetch open issues matching the label filter.""" cmd = [ "gh", "api", f"repos/{self._repo}/issues", "--jq", ".", "-q", f"label:{self._label}", ] result = _run_gh(cmd) if result.returncode != 0: return [] try: issues = json.loads(result.stdout) except (json.JSONDecodeError, ValueError): return [] return [self._issue_to_task(i) for i in issues] def _issue_to_task(self, issue: dict) -> Task: """Convert a GitHub issue dict to a Task.""" return Task( title=issue.get("title", ""), source="github", source_ref=f"{self._repo}#{issue.get('number', '')}", ) def _run_gh(cmd: list[str]) -> subprocess.CompletedProcess: """Run a gh CLI command. Thin wrapper for mocking.""" return subprocess.run( cmd, capture_output=True, text=True, check=False, ) ================================================ FILE: scripts/polyphony/sources/local.py ================================================ """Local SQLite task queue (§2). Simple task queue backed by a SQLite database file. 
""" from __future__ import annotations import sqlite3 from pathlib import Path from ..models import Task class LocalSource: """File-based local task queue.""" def __init__(self, db_path: Path | None = None): self._path = db_path or Path("~/.polyphony/queue.db") self._path = Path(str(self._path).strip()) self._init_db() def _init_db(self) -> None: self._path.parent.mkdir(parents=True, exist_ok=True) con = sqlite3.connect(str(self._path)) con.execute( "CREATE TABLE IF NOT EXISTS tasks (" " id TEXT PRIMARY KEY," " title TEXT NOT NULL," " task_type TEXT DEFAULT 'feature'," " risk TEXT DEFAULT 'low'," " claimed INTEGER DEFAULT 0" ")" ) con.commit() con.close() def add_task( self, title: str, task_type: str = "feature", risk: str = "low", ) -> Task: """Add a task to the local queue.""" task = Task( title=title, source="local", source_ref="local", task_type=task_type, risk=risk, ) con = sqlite3.connect(str(self._path)) con.execute( "INSERT INTO tasks (id, title, task_type, risk)" " VALUES (?, ?, ?, ?)", (task.id, task.title, task.task_type, task.risk), ) con.commit() con.close() return task def poll(self) -> list[Task]: """Return unclaimed tasks.""" con = sqlite3.connect(str(self._path)) cur = con.execute( "SELECT id, title, task_type, risk" " FROM tasks WHERE claimed = 0" ) tasks = [] for row in cur.fetchall(): tasks.append(Task( id=row[0], title=row[1], source="local", source_ref="local", task_type=row[2], risk=row[3], )) con.close() return tasks def mark_claimed(self, task_id: str) -> None: """Mark a task as claimed.""" con = sqlite3.connect(str(self._path)) con.execute( "UPDATE tasks SET claimed = 1 WHERE id = ?", (task_id,), ) con.commit() con.close() ================================================ FILE: scripts/polyphony/state_machine.py ================================================ """Task state machine for Polyphony (spec §4).""" from __future__ import annotations from .models import Task, _now TASK_STATES = ( "discovered", "claimed", "routed", "provisioned", 
"running", "verifying", "landed", "failed", "blocked", ) TRANSITIONS: dict[str, tuple[str, ...]] = { "discovered": ("claimed",), "claimed": ("routed",), "routed": ("provisioned",), "provisioned": ("running",), "running": ("verifying", "failed"), "verifying": ("landed", "failed"), "failed": ("claimed", "blocked"), } TERMINAL_STATES = ("landed", "blocked") def can_transition(current: str, target: str) -> bool: """Check if a state transition is valid.""" allowed = TRANSITIONS.get(current, ()) return target in allowed def transition(task: Task, target: str) -> Task: """Transition a task to a new state. Raises on invalid.""" if not can_transition(task.state, target): msg = f"Invalid transition: {task.state} -> {target}" raise ValueError(msg) task.state = target task.updated_at = _now() return task def is_terminal(state: str) -> bool: """Check if a state is terminal (no further transitions).""" return state in TERMINAL_STATES ================================================ FILE: scripts/polyphony/store.py ================================================ """SQLite storage layer for Polyphony.""" from __future__ import annotations import json import sqlite3 from pathlib import Path from .models import Result, RunSpec, Task, _now DB_NAME = "orchestrator.db" SCHEMA = """ CREATE TABLE IF NOT EXISTS tasks ( id TEXT PRIMARY KEY, title TEXT NOT NULL, source TEXT NOT NULL, source_ref TEXT NOT NULL, state TEXT NOT NULL DEFAULT 'discovered', task_type TEXT DEFAULT 'feature', scope TEXT DEFAULT '[]', risk TEXT DEFAULT 'low', context_tokens INTEGER DEFAULT 0, requires_web INTEGER DEFAULT 0, run_spec_id TEXT, metadata TEXT DEFAULT '{}', created_at TEXT NOT NULL, updated_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS run_specs ( id TEXT PRIMARY KEY, task_id TEXT NOT NULL, agent TEXT NOT NULL, identity TEXT NOT NULL, workspace TEXT NOT NULL, image TEXT NOT NULL, attempt INTEGER DEFAULT 1, model TEXT DEFAULT '', fallback TEXT DEFAULT '[]', max_turns INTEGER DEFAULT 25, allowed_paths 
def save_task(self, task: Task) -> None:
    """Insert ``task`` into the ``tasks`` table, replacing any row with the same id.

    ``scope`` and ``metadata`` are stored as JSON text and
    ``requires_web`` as an integer. Values are bound positionally, in
    the column order of the ``tasks`` table.

    The connection is closed even if the statement or the commit raises.
    """
    row = (
        task.id,
        task.title,
        task.source,
        task.source_ref,
        task.state,
        task.task_type,
        json.dumps(task.scope),
        task.risk,
        task.context_tokens,
        int(task.requires_web),
        task.run_spec_id,
        json.dumps(task.metadata),
        task.created_at,
        task.updated_at,
    )
    conn = self._connect()
    try:
        conn.execute(
            "INSERT OR REPLACE INTO tasks VALUES "
            "(?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
            row,
        )
        conn.commit()
    finally:
        conn.close()
def get_run_spec(self, rs_id: str) -> RunSpec | None:
    """Load one RunSpec by id.

    Returns:
        The stored RunSpec, or None when no row has that id.

    The connection is closed even if the query raises.
    """
    conn = self._connect()
    try:
        row = conn.execute(
            "SELECT * FROM run_specs WHERE id=?", (rs_id,),
        ).fetchone()
    finally:
        conn.close()
    return self._row_to_run_spec(row) if row else None
save_result(self, result: Result) -> None: conn = self._connect() conn.execute( "INSERT OR REPLACE INTO results VALUES " "(?,?,?,?,?,?,?,?,?,?,?)", ( result.id, result.task_id, result.run_spec_id, result.agent, result.status, result.turns, result.duration_seconds, result.cost_usd, json.dumps(result.artifacts), json.dumps(result.events), result.completed_at, ), ) conn.commit() conn.close() def get_result(self, result_id: str) -> Result | None: conn = self._connect() row = conn.execute( "SELECT * FROM results WHERE id=?", (result_id,), ).fetchone() conn.close() return self._row_to_result(row) if row else None def list_results(self, task_id: str) -> list[Result]: conn = self._connect() rows = conn.execute( "SELECT * FROM results WHERE task_id=?", (task_id,), ).fetchall() conn.close() return [self._row_to_result(r) for r in rows] def _row_to_result(self, row: sqlite3.Row) -> Result: return Result( id=row["id"], task_id=row["task_id"], run_spec_id=row["run_spec_id"], agent=row["agent"], status=row["status"], turns=row["turns"], duration_seconds=row["duration_seconds"], cost_usd=row["cost_usd"], artifacts=json.loads(row["artifacts"]), events=json.loads(row["events"]), completed_at=row["completed_at"], ) # --- State log --- def log_transition(self, task_id: str, from_s: str, to_s: str) -> None: conn = self._connect() conn.execute( "INSERT INTO state_log (task_id, from_state, to_state, timestamp) " "VALUES (?,?,?,?)", (task_id, from_s, to_s, _now()), ) conn.commit() conn.close() def get_state_log(self, task_id: str) -> list[dict]: conn = self._connect() rows = conn.execute( "SELECT * FROM state_log WHERE task_id=? 
" "ORDER BY id", (task_id,), ).fetchall() conn.close() return [ { "from_state": r["from_state"], "to_state": r["to_state"], "timestamp": r["timestamp"], } for r in rows ] ================================================ FILE: scripts/polyphony/workspace.py ================================================ """Workspace manager — per-task git clone lifecycle (spec §6). Each task+attempt gets an isolated directory with a full git clone. """ from __future__ import annotations import re import shutil import subprocess from pathlib import Path def workspace_path( base_dir: Path, task_id: str, attempt: int, ) -> Path: """Build workspace directory path, sanitizing task_id.""" safe_id = re.sub(r"[^\w\-.]", "_", task_id) return base_dir / safe_id / str(attempt) def create_workspace( base_dir: Path, task_id: str, attempt: int, repo_url: str, ref: str, mirror_path: Path | None = None, ) -> Path: """Clone repo into workspace and checkout ref.""" ws = workspace_path(base_dir, task_id, attempt) ws.mkdir(parents=True, exist_ok=True) clone_cmd = ["git", "clone"] if mirror_path and mirror_path.exists(): clone_cmd += [ "--reference", str(mirror_path), "--dissociate", ] clone_cmd += [repo_url, str(ws)] _run_git(clone_cmd) checkout_cmd = ["git", "-C", str(ws), "checkout", ref] _run_git(checkout_cmd) return ws def cleanup_workspace(ws_path: Path) -> None: """Remove workspace directory. No error if missing.""" if ws_path.exists(): shutil.rmtree(ws_path) def list_workspaces(base_dir: Path) -> list[Path]: """List all workspace directories under base_dir.""" if not base_dir.exists(): return [] result: list[Path] = [] for task_dir in sorted(base_dir.iterdir()): if task_dir.is_dir(): for attempt_dir in sorted(task_dir.iterdir()): if attempt_dir.is_dir(): result.append(attempt_dir) return result def _run_git(cmd: list[str]) -> subprocess.CompletedProcess: """Run a git command. 
def discover_skills(skills_dir: Path, skill_filter: str | None = None) -> list[Path]:
    """List skill directories directly under ``skills_dir``.

    Hidden entries (names starting with '.') and non-directories are
    skipped. The result is sorted. With ``skill_filter`` set, only the
    directory with exactly that name is kept.

    Returns:
        The matching directories; empty when ``skills_dir`` is not a
        directory.
    """
    if not skills_dir.is_dir():
        return []

    found: list[Path] = []
    for entry in skills_dir.iterdir():
        if entry.is_dir() and not entry.name.startswith('.'):
            found.append(entry)
    found.sort()

    if not skill_filter:
        return found
    return [entry for entry in found if entry.name == skill_filter]


def lint_skill(skill_dir: Path, skills_dir: Path) -> list:
    """Run every registered checker on one skill and pool the findings.

    Each checker is handed the skill's ``SKILL.md`` path, the skill
    directory, and the skills root.
    """
    from . import Finding

    target = skill_dir / 'SKILL.md'
    collected: list[Finding] = []
    for checker in CHECKERS:
        collected += checker.check(target, skill_dir, skills_dir)
    return collected


def severity_from_str(s: str) -> Severity:
    """Map 'error' / 'warning' / 'info' (any case) to a Severity.

    Raises:
        ValueError: For any other string.
    """
    lookup = dict(
        error=Severity.ERROR,
        warning=Severity.WARNING,
        info=Severity.INFO,
    )
    key = s.lower()
    if key in lookup:
        return lookup[key]
    raise ValueError(f'Unknown severity: {s}')
# ASCII art box characters (outside code blocks)
ASCII_ART_RE = re.compile(
    r'[╔╗╚╝╠╣╦╩╬║═│┌┐└┘├┤┬┴┼─┃━┏┓┗┛┣┫┳┻╋]'
    r'|[+|]{2,}\s*[-=]{3,}'
    r'|[-=]{3,}\s*[+|]{2,}'
    r'|^\s*[+][\-+]{3,}[+]\s*$'
    r'|^\s*[|].*[|]\s*$'
)

VAGUE_PHRASES = [
    'follow best practices',
    'ensure quality',
    'as appropriate',
    'when necessary',
    'use proper',
    'handle appropriately',
    'do the right thing',
    'be careful',
    'use common sense',
    'as needed',
]

FILLER_WORDS_RE = re.compile(
    r'\b(MANDATORY|NON-NEGOTIABLE|ABSOLUTELY|CRITICAL|MUST ALWAYS|'
    r'NEVER EVER|UNDER NO CIRCUMSTANCES|WITHOUT EXCEPTION|'
    r'ZERO TOLERANCE|NO EXCEPTIONS)\b',
    re.IGNORECASE
)

STALE_LOAD_RE = re.compile(r'\*?Load with:\s+\S+\.md\*?', re.IGNORECASE)


def _code_block_mask(lines: list[str]) -> list[bool]:
    """Compute, in one pass, the fence state of every line.

    ``mask[i]`` equals ``_in_code_block(lines, i)``: the opening fence
    line counts as inside the block, the closing fence line as outside.
    """
    mask: list[bool] = []
    in_fence = False
    for line in lines:
        if line.strip().startswith('```'):
            in_fence = not in_fence
        mask.append(in_fence)
    return mask


def _in_code_block(lines: list[str], target_idx: int) -> bool:
    """Check if a line index is inside a fenced code block.

    The fence state is read after the target line itself has been
    processed. An index outside ``lines`` yields False.
    """
    in_fence = False
    for i, line in enumerate(lines):
        if line.strip().startswith('```'):
            in_fence = not in_fence
        if i == target_idx:
            return in_fence
    return False


def check(skill_path: Path, skill_dir: Path, skills_dir: Path) -> list[Finding]:
    """Run content quality checks (CQ001-CQ006) on a single skill.

    Args:
        skill_path: Path to the skill's SKILL.md.
        skill_dir: Skill directory (not read by these checks).
        skills_dir: Skills root (not read by these checks).

    Returns:
        Findings in rule order; empty when ``skill_path`` does not exist.
        A rule listed in a ``<!-- skill-lint: disable=... -->`` comment
        within the first 10 lines is skipped.
    """
    findings: list[Finding] = []

    if not skill_path.exists():
        return findings

    content = skill_path.read_text(encoding='utf-8')
    lines = content.split('\n')

    # Fence state is computed once and shared by every rule, instead of
    # rescanning the file from the top for each line.
    in_code = _code_block_mask(lines)
    prose = [(i, line) for i, line in enumerate(lines) if not in_code[i]]

    # Check for inline suppression in first 10 lines
    suppressed: set[str] = set()
    for line in lines[:10]:
        if '<!-- skill-lint: disable=' in line:
            start = line.index('disable=') + 8
            end = line.index('-->', start) if '-->' in line[start:] else len(line)
            rules = line[start:end].strip().rstrip(' >')
            for rule in rules.split(','):
                suppressed.add(rule.strip())

    # CQ001: no ASCII art boxes outside code blocks
    if 'CQ001' not in suppressed:
        ascii_art_lines = [
            i + 1 for i, line in prose if ASCII_ART_RE.search(line)
        ]
        if ascii_art_lines:
            sample = ascii_art_lines[:3]
            findings.append(Finding(
                rule_id='CQ001',
                severity=Severity.WARNING,
                message=f'ASCII art detected outside code blocks (lines: {sample})',
                line=ascii_art_lines[0],
                suggestion='Remove decorative ASCII art to save tokens'
            ))

    # CQ002: no vague phrases
    if 'CQ002' not in suppressed:
        vague_found = []
        for i, line in prose:
            lower = line.lower()
            for phrase in VAGUE_PHRASES:
                if phrase in lower:
                    vague_found.append((i + 1, phrase))
        if vague_found:
            sample = vague_found[:3]
            phrases = ', '.join(f'"{p}" (L{n})' for n, p in sample)
            findings.append(Finding(
                rule_id='CQ002',
                severity=Severity.INFO,
                message=f'Vague phrases found: {phrases}',
                line=vague_found[0][0],
                suggestion='Replace vague guidance with specific, actionable instructions'
            ))

    # CQ003: filler intensity <= 2 per 100 lines
    if 'CQ003' not in suppressed:
        filler_count = sum(
            len(FILLER_WORDS_RE.findall(line)) for _, line in prose
        )
        if len(lines) > 0:
            intensity = (filler_count / len(lines)) * 100
            if intensity > 2:
                findings.append(Finding(
                    rule_id='CQ003',
                    severity=Severity.WARNING,
                    message=f'Filler intensity {intensity:.1f} per 100 lines (max: 2.0)',
                    suggestion='Reduce emphatic language (MANDATORY, NON-NEGOTIABLE, etc.)'
                ))

    # CQ004: >= 1 code block per 50 lines of content
    if 'CQ004' not in suppressed:
        code_blocks = content.count('```') // 2
        content_lines = len([l for l in lines if l.strip()])
        if content_lines >= 50:
            expected = content_lines / 50
            if code_blocks < expected:
                findings.append(Finding(
                    rule_id='CQ004',
                    severity=Severity.WARNING,
                    message=f'{code_blocks} code blocks for {content_lines} content lines '
                            f'(expected >= {int(expected)})',
                    suggestion='Add concrete code examples to illustrate patterns'
                ))

    # CQ005: no stale "Load with:" references
    if 'CQ005' not in suppressed:
        for i, line in prose:
            if STALE_LOAD_RE.search(line):
                findings.append(Finding(
                    rule_id='CQ005',
                    severity=Severity.WARNING,
                    message=f'Stale "Load with:" reference at line {i + 1}',
                    line=i + 1,
                    suggestion='Remove stale loading instructions'
                ))
                break  # One finding is enough

    # CQ006: H1 heading present after frontmatter
    if 'CQ006' not in suppressed:
        # Find end of frontmatter (index of the second '---' line, or 0
        # when the frontmatter is absent or never closed).
        in_fm = False
        fm_end = 0
        for i, line in enumerate(lines):
            if line.strip() == '---':
                if not in_fm:
                    in_fm = True
                else:
                    fm_end = i
                    break

        has_h1 = any(
            line.strip().startswith('# ') for line in lines[fm_end:]
        )
        if not has_h1:
            findings.append(Finding(
                rule_id='CQ006',
                severity=Severity.WARNING,
                message='No H1 heading found after frontmatter',
                suggestion='Add a top-level heading: # Skill Name'
            ))

    return findings
""" lines = content.split('\n') if not lines or lines[0].strip() != '---': return {}, 0 fields: dict[str, str] = {} end_line = 0 for i, line in enumerate(lines[1:], start=2): if line.strip() == '---': end_line = i break match = re.match(r'^(\w[\w-]*)\s*:\s*(.*)', line) if match: key = match.group(1).strip() value = match.group(2).strip() # Strip surrounding quotes if len(value) >= 2 and value[0] in ('"', "'") and value[-1] == value[0]: value = value[1:-1] fields[key] = value return fields, end_line NAME_PATTERN = re.compile(r'^[a-z][a-z0-9]*(-[a-z0-9]+)*$') def check(skill_path: Path, skill_dir: Path, skills_dir: Path) -> list[Finding]: """Run all frontmatter checks on a single skill.""" findings: list[Finding] = [] content = skill_path.read_text(encoding='utf-8') dir_name = skill_dir.name # FM001: frontmatter delimiters present lines = content.split('\n') if not lines or lines[0].strip() != '---': findings.append(Finding( rule_id='FM001', severity=Severity.ERROR, message='SKILL.md missing YAML frontmatter (must start with ---)', line=1, suggestion='Add frontmatter: ---\\nname: ' + dir_name + '\\ndescription: ...\\n---' )) return findings # Can't check other rules without frontmatter fields, end_line = parse_frontmatter(content) if end_line == 0: findings.append(Finding( rule_id='FM001', severity=Severity.ERROR, message='YAML frontmatter not closed (missing second ---)', line=1, suggestion='Add closing --- after frontmatter fields' )) return findings # FM002: name field present name = fields.get('name', '').strip() if not name: findings.append(Finding( rule_id='FM002', severity=Severity.ERROR, message="'name' field missing or empty in frontmatter", line=None, suggestion=f'Add: name: {dir_name}' )) # FM003: description field present desc = fields.get('description', '').strip() if not desc: findings.append(Finding( rule_id='FM003', severity=Severity.ERROR, message="'description' field missing or empty in frontmatter", line=None, suggestion='Add: description: One-line 
description of what this skill does' )) # FM004: name matches directory name if name and name != dir_name: findings.append(Finding( rule_id='FM004', severity=Severity.ERROR, message=f"name '{name}' does not match directory name '{dir_name}'", line=None, suggestion=f'Change to: name: {dir_name}' )) # FM005: name format (lowercase, hyphens, 1-64 chars) if name: if len(name) > 64: findings.append(Finding( rule_id='FM005', severity=Severity.ERROR, message=f'name is {len(name)} chars (max 64)', line=None )) elif not NAME_PATTERN.match(name): findings.append(Finding( rule_id='FM005', severity=Severity.ERROR, message=f"name '{name}' must be lowercase alphanumeric with hyphens", line=None, suggestion='Use only lowercase letters, numbers, and hyphens' )) # FM006: description length if desc: if len(desc) > 1024: findings.append(Finding( rule_id='FM006', severity=Severity.WARNING, message=f'description is {len(desc)} chars (max 1024)', line=None, suggestion='Shorten description to under 1024 characters' )) # FM007: when-to-use present if 'when-to-use' not in fields: findings.append(Finding( rule_id='FM007', severity=Severity.WARNING, message="'when-to-use' field missing", line=None, suggestion='Add: when-to-use: When to activate this skill' )) # FM008: user-invocable present if 'user-invocable' not in fields: findings.append(Finding( rule_id='FM008', severity=Severity.INFO, message="'user-invocable' field missing", line=None, suggestion='Add: user-invocable: true|false' )) # FM009: effort field valid effort = fields.get('effort', '').strip() if effort and effort not in ('low', 'medium', 'high'): findings.append(Finding( rule_id='FM009', severity=Severity.INFO, message=f"effort '{effort}' is not one of: low, medium, high", line=None )) elif not effort: findings.append(Finding( rule_id='FM009', severity=Severity.INFO, message="'effort' field missing", line=None, suggestion='Add: effort: low|medium|high' )) return findings ================================================ FILE: 
# Match skill references like: skills/base, skills/security, .claude/skills/llm-patterns
SKILL_REF_RE = re.compile(
    r'(?:\.claude/)?skills/([a-z][a-z0-9-]+)'
)


def check(skill_path: Path, skill_dir: Path, skills_dir: Path) -> list[Finding]:
    """Run the cross-reference checks (RI001-RI002) on a single skill.

    Args:
        skill_path: Path to the skill's SKILL.md file.
        skill_dir: Directory of the skill being checked.
        skills_dir: Root directory holding all skill directories; README.md
            is looked up in its parent.

    Returns:
        The findings for this skill; empty when SKILL.md does not exist.
    """
    if not skill_path.exists():
        return []

    own_name = skill_dir.name
    text = skill_path.read_text(encoding='utf-8')
    findings: list[Finding] = []

    # RI001: every other skill named in the text must be a non-hidden
    # directory directly under skills_dir.
    known: set[str] = set()
    for entry in skills_dir.iterdir():
        if entry.name.startswith('.') or not entry.is_dir():
            continue
        known.add(entry.name)

    mentioned = {hit.group(1) for hit in SKILL_REF_RE.finditer(text)}
    mentioned.discard(own_name)  # a skill may freely refer to itself

    dangling = sorted(mentioned - known)
    if dangling:
        findings.append(Finding(
            rule_id='RI001',
            severity=Severity.WARNING,
            message=f'Broken skill references: {", ".join(dangling)}',
            suggestion='Fix or remove references to non-existent skills'
        ))

    # RI002: the skill name must appear somewhere in the repository README
    # (plain substring match; skipped when there is no README).
    readme_file = skills_dir.parent / 'README.md'
    if readme_file.exists() and own_name not in readme_file.read_text(encoding='utf-8'):
        findings.append(Finding(
            rule_id='RI002',
            severity=Severity.INFO,
            message=f'Skill "{own_name}" not found in README.md',
            suggestion='Add skill to the skills table in README.md'
        ))

    return findings
def format_json(
    results: dict[str, list[Finding]],
    min_severity: Severity = Severity.INFO
) -> str:
    """Render lint results as a JSON document.

    Args:
        results: Mapping of skill name to the findings raised for it.
        min_severity: Least severe level to include; findings ranked below
            it (ERROR > WARNING > INFO) are left out of the output.

    Returns:
        A 2-space-indented JSON string with a ``summary`` object (skill
        counts plus error/warning/info totals of the included findings) and
        a ``skills`` object keyed by skill name, in sorted order.
    """
    rank = {Severity.ERROR: 0, Severity.WARNING: 1, Severity.INFO: 2}
    cutoff = rank[min_severity]

    n_errors = n_warnings = n_info = 0
    per_skill: dict[str, dict] = {}

    for skill_name in sorted(results):
        visible = [item for item in results[skill_name] if rank[item.severity] <= cutoff]

        rows = []
        for item in visible:
            row = {
                'rule_id': item.rule_id,
                'severity': item.severity.value,
                'message': item.message,
            }
            # Optional keys are omitted rather than emitted as null/empty.
            if item.line is not None:
                row['line'] = item.line
            if item.suggestion:
                row['suggestion'] = item.suggestion
            rows.append(row)

        skill_errors = sum(1 for item in visible if item.severity == Severity.ERROR)
        skill_warnings = sum(1 for item in visible if item.severity == Severity.WARNING)
        n_errors += skill_errors
        n_warnings += skill_warnings
        # Everything that is neither an error nor a warning counts as info.
        n_info += len(visible) - skill_errors - skill_warnings

        per_skill[skill_name] = {
            'findings': rows,
            'error_count': skill_errors,
            'warning_count': skill_warnings,
        }

    # "Clean" is judged on the unfiltered findings, independent of min_severity.
    clean = sum(1 for found in results.values() if not found)

    document = {
        'summary': {
            'total_skills': len(results),
            'clean_skills': clean,
            'errors': n_errors,
            'warnings': n_warnings,
            'info': n_info,
        },
        'skills': per_skill,
    }
    return json.dumps(document, indent=2)
def check(skill_path: Path, skill_dir: Path, skills_dir: Path) -> list[Finding]:
    """Run spec compliance checks (SP001-SP003, SR001) on a single skill.

    Args:
        skill_path: Path to the skill's SKILL.md file.
        skill_dir: Directory of the skill; passed to skills-ref when present.
        skills_dir: Root directory holding all skills (unused here; kept so
            every checker shares the same signature).

    Returns:
        The findings for this skill. When SKILL.md is missing the list holds
        only the SP001 error, since nothing else can be checked.
    """
    findings: list[Finding] = []

    # SP001: SKILL.md exists
    if not skill_path.exists():
        findings.append(Finding(
            rule_id='SP001',
            severity=Severity.ERROR,
            message='SKILL.md not found in skill directory',
            suggestion='Create SKILL.md with frontmatter and content'
        ))
        return findings

    content = skill_path.read_text(encoding='utf-8')
    # splitlines() does not produce a phantom empty last line for a file that
    # ends with a newline, so a 500-line file is counted as 500, not 501.
    lines = content.splitlines()
    line_count = len(lines)

    # Check for inline suppression in first 10 lines
    suppressed: set[str] = set()
    for line in lines[:10]:
        if '<!-- skill-lint: disable=' in line:
            # Extract rule IDs: <!-- skill-lint: disable=SP002,SP003 -->
            start = line.index('disable=') + 8  # 8 == len('disable=')
            end = line.index('-->', start) if '-->' in line[start:] else len(line)
            rules = line[start:end].strip().rstrip(' >')
            for rule in rules.split(','):
                suppressed.add(rule.strip())

    # SP002: under 500 lines
    if line_count > 500 and 'SP002' not in suppressed:
        findings.append(Finding(
            rule_id='SP002',
            severity=Severity.WARNING,
            message=f'SKILL.md is {line_count} lines (limit: 500)',
            suggestion='Split into focused sections; move reference material to companion files'
        ))

    # SP003: under 300 lines (ideal); only raised when SP002 does not apply
    if 300 < line_count <= 500 and 'SP003' not in suppressed:
        findings.append(Finding(
            rule_id='SP003',
            severity=Severity.INFO,
            message=f'SKILL.md is {line_count} lines (ideal: under 300)',
            suggestion='Consider trimming for better token efficiency'
        ))

    # SR001: skills-ref validate (if installed)
    try:
        from skills_ref import validate as sr_validate
        # NOTE(review): presumably returns a sliceable sequence of problem
        # descriptions - confirm against the skills-ref package.
        problems = sr_validate(str(skill_dir))
        if problems:
            for p in problems[:5]:  # cap the noise at five findings per skill
                findings.append(Finding(
                    rule_id='SR001',
                    severity=Severity.WARNING,
                    message=f'skills-ref: {p}',
                ))
    except ImportError:
        pass  # skills-ref not installed, skip

    return findings
================================================ --- name: aeo-optimization description: AI Engine Optimization - semantic triples, page templates, content clusters for AI citations when-to-use: When optimizing content for AI engine discovery and citations user-invocable: false effort: medium --- # AI Engine Optimization (AEO) Skill **Purpose:** Optimize content for AI engines (ChatGPT, Claude, Perplexity, Google AI Overviews) so your brand gets cited in AI-generated answers. **Source:** Based on [HubSpot's AEO Guide](https://www.hubspot.com/aeo) and industry best practices. --- ## Why AEO Matters Now ``` ┌────────────────────────────────────────────────────────────────┐ │ THE GREAT DECOUPLING │ │ ──────────────────────────────────────────────────────────── │ │ Impressions ≠ Clicks anymore. │ │ AI engines compile answers from multiple sources. │ │ More buyer journey happens inside chat experiences. │ │ 58% of Google searches = zero clicks (AI overviews). │ ├────────────────────────────────────────────────────────────────┤ │ THE OPPORTUNITY │ │ ──────────────────────────────────────────────────────────── │ │ Shape what AI engines say about your category and product. │ │ Get cited as the authoritative source. │ │ Best answer > Best page ranking. │ └────────────────────────────────────────────────────────────────┘ ``` **Key Stats:** - 70% of consumers use ChatGPT for searches - 47% of Google queries show AI overviews - Average ChatGPT prompt: 23 words (vs 4.2 for Google) - AEO market: $886M (2024) → $7.3B (2031) --- ## How AI Engines Choose Answers AI engines use three main signals to select content for answers: ### 1. Consensus Facts that appear across multiple credible sources get trusted and reused. **How to build consensus:** - Repeat key facts consistently across your own pages - Use same terminology as industry leaders - Link to and from authoritative external sources - Create internal content clusters that reinforce each other ### 2. 
Information Gain Net-new insight beats generic advice. AI engines prefer content that adds value. **How to add information gain:** - Original research and data - Concrete examples with specifics - Clear point of view (not fence-sitting) - Expert quotes with credentials - Case studies with metrics ### 3. Entities & Structure Clear entities and tidy structure reduce ambiguity and boost quotability. **How to optimize structure:** - Use semantic triples (Subject → Verb → Object) - Clear headings with entity names - Schema markup (Article, FAQ, Product) - Short, scannable paragraphs (2-4 sentences) --- ## Semantic Triples (Critical for AEO) **What they are:** Compact facts that AI engines (and humans) can't misread. **Pattern:** `[Subject]` `[verb]` `[object]`. ### Examples ``` ✅ GOOD (clear triples): - HubSpot CRM syncs contact and company data. - Lead Scoring assigns priority based on engagement. - Workflows trigger email sequences from events. ❌ BAD (vague, no clear entity): - The system helps with various tasks. - It can do many things for users. - This improves overall performance. ``` ### Triple Checklist For every key claim, ask: - [ ] Is the subject a clear entity (product, feature, brand)? - [ ] Is the verb specific and active? - [ ] Is the object concrete and measurable? --- ## Paragraph Pattern (Feature → How → Outcome) Every substantive paragraph should follow this structure: ``` [Feature] helps [User/Role] with [Job]. It [mechanism/inputs] to [process]. Teams see [metric/result] in [timeframe/context]. Triples: - [Subject] [verb] [object]. - [Subject] [verb] [object]. ``` ### Example ```markdown Lead Scoring helps sales teams prioritize prospects. It combines page views, email engagement, and firmographic data to assign a numeric score, then auto-enrolls high scorers into follow-up sequences. Reps focus on qualified accounts and book 40% more meetings. - Lead Scoring assigns scores from engagement data. - High scorers trigger automated follow-up sequences. 
``` --- ## Page Templates ### Template 1: Category Explainer **Goal:** Define the category, tie it to your product, earn citations. ```markdown # What is [Category]? — [1-2 line value promise] ## What is [Category]? (~80 words) [Plain definition in everyday language. Name adjacent entities.] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ## Why it matters now (~60 words) [One paragraph. Mention shift to answers over links; tie to buyer outcomes.] ## How to apply it (3-5 bullets) - [Action 1] - [Action 2] - [Action 3] ## FAQ **Q: [Question]?** A: [~1 sentence answer] **Q: [Question]?** A: [~1 sentence answer] **Q: [Question]?** A: [~1 sentence answer] --- **Links:** [Category hub] | [Product/Feature] | [Credible source 1] | [Credible source 2] **CTA:** [Demo / Template / Signup] **Schema:** Article + FAQ. Author + last updated. ``` --- ### Template 2: Product & Feature Page **Goal:** Clarify capability, fit, and next step; reinforce category linkage. ```markdown # [Product/Feature] — [Outcome in 3-5 words] **[Product/Feature] enables [Outcome] for [User/Role].** ## [Feature Area 1] [2-4 sentences using Feature → How → Outcome] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ## [Feature Area 2] [2-4 sentences using Feature → How → Outcome] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ## [Feature Area 3] [2-4 sentences using Feature → How → Outcome] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ## FAQ **Q: [Question]?** A: [~1 sentence] **Q: [Question]?** A: [~1 sentence] **Q: [Question]?** A: [~1 sentence] --- **Links:** Back to [Category Explainer] | Forward to [Demo/Trial] **Proof:** [Benchmark/Analyst/Customer proof] **Notes:** Requirements/limits (pricing tier, integrations) **Schema:** Article + FAQ. Author + last updated. ``` --- ### Template 3: Comparison / Alternatives Page **Goal:** Help readers decide with clear criteria; earn fair citations. 
```markdown # [Product] vs. [Alternative] — Which fits [Use case]? ## Comparison Table | Criterion | [Product] | [Alt A] | [Alt B] | Source | |-----------|-----------|---------|---------|--------| | [Feature/Limit] | [value] | [value] | [value] | [link] | | [Requirement] | [value] | [value] | [value] | [link] | | [Best for] | [value] | [value] | [value] | [link] | *Source-back all claims in the table or footnotes.* ## Fit Statements 1. **[Product]** suits [Team/Use case] when [Condition]. 2. **[Alt A]** fits [Team/Use case] when [Condition]. 3. **[Alt B]** works for [Team/Use case] when [Condition]. --- **Links:** [Category Explainer] | [Feature pages] **CTA:** [Try / Demo / Talk to Sales] **Schema:** Article. Author + last updated. ``` --- ### Template 4: Use Case / Industry Page **Goal:** Connect product to outcomes in a context readers recognize. ```markdown # [Industry/Use Case] — [Outcome KPI] **Teams reduce [Metric] by [Y%] in [Timeframe].** ## Mini Case Study [Company/Role] used [Product/Feature] to [Action], resulting in [Metric improvement] within [Timeframe]. ## How It Works ### [Feature 1] [Feature → How → Outcome paragraph] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ### [Feature 2] [Feature → How → Outcome paragraph] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. ## Who Uses This **Roles:** [Role 1], [Role 2], [Role 3] **Workflows:** [Workflow 1], [Workflow 2] **Integrations:** [Integration 1], [Integration 2] --- **Links:** [Product/Feature pages] | [Supporting blog] **CTA:** [Industry template / Demo variant] **Schema:** Article. Author + last updated. ``` --- ### Template 5: Supporting Blog Post **Goal:** Add information gain and support your content cluster. ```markdown # [Topic] — [Specific promise] ## Opening (~60-80 words) [State the problem. Align terminology with Category Explainer. Preview outcome.] ## [Section 1 Heading] (~120 words max) [Feature → How → Outcome] Triples: 1. 
[Subject] [verb] [object]. 2. [Subject] [verb] [object]. **Internal link:** [Related page] **External citation:** [Credible source] ## [Section 2 Heading] (~120 words max) [Feature → How → Outcome] Triples: 1. [Subject] [verb] [object]. 2. [Subject] [verb] [object]. **Internal link:** [Related page] **External citation:** [Credible source] ## Key Takeaway [1-2 lines summarizing the main point] **CTA:** [Single primary action] --- **Schema:** Article. Author + last updated. ``` --- ## Site-Wide Trust Signals ### Required on Every Page | Element | Implementation | |---------|----------------| | **Schema markup** | Article + FAQ (if FAQ exists) | | **Author attribution** | Name, bio, credentials, photo | | **Last updated date** | Visible, machine-readable | | **Internal links** | 3-5 per page (upstream/downstream) | | **External citations** | 1-2 credible sources per section | | **Single CTA** | Demo, template, or signup (repeated once near end) | ### Schema Implementation ```html <!-- Article Schema --> <script type="application/ld+json"> { "@context": "https://schema.org", "@type": "Article", "headline": "[Page Title]", "author": { "@type": "Person", "name": "[Author Name]", "url": "[Author Bio URL]" }, "datePublished": "[ISO Date]", "dateModified": "[ISO Date]", "publisher": { "@type": "Organization", "name": "[Company]", "logo": "[Logo URL]" } } </script> <!-- FAQ Schema (if FAQ section exists) --> <script type="application/ld+json"> { "@context": "https://schema.org", "@type": "FAQPage", "mainEntity": [ { "@type": "Question", "name": "[Question 1]", "acceptedAnswer": { "@type": "Answer", "text": "[Answer 1]" } }, { "@type": "Question", "name": "[Question 2]", "acceptedAnswer": { "@type": "Answer", "text": "[Answer 2]" } } ] } </script> ``` --- ## Content Cluster Architecture ``` ┌─────────────────────┐ │ Category Explainer │ │ "What is AEO?" 
│ └──────────┬──────────┘ │ ┌──────────────────────┼──────────────────────┐ │ │ │ ▼ ▼ ▼ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ Product Page │ │ Product Page │ │ Product Page │ │ "Feature A" │ │ "Feature B" │ │ "Feature C" │ └───────┬───────┘ └───────┬───────┘ └───────┬───────┘ │ │ │ ▼ ▼ ▼ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ Blog Post │ │ Use Case │ │ Comparison │ │ (supports) │ │ (industry) │ │ (vs. alt) │ └───────────────┘ └───────────────┘ └───────────────┘ ``` **Linking Rules:** - Category Explainer links DOWN to all product pages - Product pages link UP to Category Explainer - Product pages link ACROSS to related features - Blog posts link UP to Product pages - Comparison pages link to Category Explainer + relevant Product pages --- ## AEO Writing Checklist ### Per-Paragraph Checklist - [ ] Follows Feature → How → Outcome pattern - [ ] Contains 2-4 sentences (scannable) - [ ] Includes 1-2 semantic triples - [ ] Names specific entities (not vague "it" or "this") - [ ] Uses active voice verbs ### Per-Section Checklist - [ ] Has 1 internal link (upstream or downstream) - [ ] Has 1 external citation (credible source) - [ ] Section heading names an entity - [ ] ~120 words max ### Per-Page Checklist - [ ] H1 contains primary entity + value promise - [ ] Opening claim is a semantic triple - [ ] 3-5 internal links total - [ ] 1-2 external citations total - [ ] Mini-FAQ with 3 questions (if applicable) - [ ] Single primary CTA - [ ] Schema markup (Article + FAQ) - [ ] Author name + bio link - [ ] Last updated date visible ### Site-Wide Checklist - [ ] Category Explainer exists for each key category - [ ] Product pages link back to Category Explainer - [ ] Content cluster architecture documented - [ ] Author bio pages exist with credentials - [ ] Consistent terminology across all pages --- ## Measuring AEO Success ### Key Metrics | Metric | How to Track | |--------|--------------| | **AI citations** | Manual checks in ChatGPT, Claude, 
Perplexity | | **Brand mentions in AI** | Search "[brand] + [category]" in AI engines | | **Share of answer** | How often you're cited vs competitors | | **LLM traffic** | GA4 referral from chatgpt.com, claude.ai, perplexity.ai | | **Impressions-to-clicks gap** | GSC impressions vs actual clicks | ### Tools - **HubSpot AEO Grader** - Grade your brand's AI visibility - **Google Analytics 4** - Track LLM referral traffic - **Google Search Console** - Monitor impressions vs clicks gap - **Manual AI queries** - Regularly test your brand in AI engines --- ## Common AEO Mistakes | Mistake | Fix | |---------|-----| | Vague language ("it helps with things") | Use specific entities and triples | | No clear structure | Use Feature → How → Outcome | | Missing schema | Add Article + FAQ schema | | No author attribution | Add author name, bio, credentials | | Generic content | Add original data, examples, POV | | Orphan pages | Link into content cluster | | Fence-sitting ("it depends") | Take a clear position | | No external citations | Add 1-2 credible sources per section | --- ## AEO vs Traditional SEO | Aspect | Traditional SEO | AEO | |--------|-----------------|-----| | **Goal** | Rank on page 1 | Get cited in AI answers | | **Success metric** | Click-through rate | Share of answer | | **Content focus** | Keywords | Entities + facts | | **Structure** | Headers for scanning | Triples for extraction | | **Links** | Backlinks for authority | Citations for consensus | | **Updates** | Periodic refresh | Continuous accuracy | --- ## Quick Reference ### Semantic Triple Pattern ``` [Entity/Product] [active verb] [concrete object/result]. ``` ### Paragraph Pattern ``` [Feature] helps [User] with [Job]. It [mechanism] to [process]. Teams see [result] in [timeframe]. ``` ### Page Minimums - 3-5 internal links - 1-2 external citations per section - 3 FAQ questions with schema - Author + last updated - Single CTA ### Content Hierarchy 1. Category Explainer (top) 2. 
Product/Feature pages (middle) 3. Use case / Comparison / Blog (supporting) ================================================ FILE: skills/agent-teams/SKILL.md ================================================ --- name: agent-teams description: Claude Code Agent Teams - default team-based development with strict TDD pipeline enforcement when-to-use: When spawning agent teams for parallel feature development with TDD pipeline user-invocable: false effort: high --- # Agent Teams Skill **Purpose:** Every project initialized with Maggy runs as a coordinated team of AI agents. This is the default workflow, not optional. Teams enforce a strict TDD pipeline where no step can be skipped. **Setup:** Agent definitions go in `.claude/agents/` with proper frontmatter (name, description, model, tools, disallowedTools, maxTurns, effort). See agent files for the format. --- ## Core Principle Every feature follows an immutable pipeline enforced by task dependencies: ``` ┌─────────────────────────────────────────────────────────────────┐ │ STRICT FEATURE PIPELINE (IMMUTABLE) │ │ ────────────────────────────────────────────────────────────── │ │ │ │ 1. SPEC Write feature specification │ │ ↓ (Feature Agent) │ │ 2. REVIEW Quality Agent reviews spec completeness │ │ ↓ (Quality Agent) │ │ 3. TESTS Write failing tests for all acceptance criteria │ │ ↓ (Feature Agent) │ │ 4. RED VERIFY Quality Agent confirms ALL tests FAIL │ │ ↓ (Quality Agent) │ │ 5. IMPLEMENT Write minimum code to pass tests │ │ ↓ (Feature Agent) │ │ 6. GREEN VERIFY Quality Agent confirms ALL tests PASS + coverage│ │ ↓ (Quality Agent) │ │ 7. VALIDATE Lint + type check + full test suite │ │ ↓ (Feature Agent) │ │ 8. CODE REVIEW Multi-engine review, block on Critical/High │ │ ↓ (Code Review Agent) │ │ 9. SECURITY OWASP scan, secrets detection, dependency audit │ │ ↓ (Security Agent) │ │ 10. BRANCH+PR Create feature branch, stage files, create PR │ │ (Merger Agent) │ │ │ │ No step can be skipped. 
Task dependencies enforce ordering. │ │ Quality Agent verifies RED/GREEN transitions. │ │ Code Review + Security Agents gate the merge path. │ │ Merger Agent handles branching and PR creation. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Default Agent Roster Every project spawns 5 permanent agents + N feature agents: ``` ┌─────────────────────────────────────────────────────────────────┐ │ DEFAULT TEAM ROSTER │ │ ────────────────────────────────────────────────────────────── │ │ │ │ PERMANENT AGENTS (always present) │ │ ───────────────────────────────── │ │ Team Lead Orchestration, task breakdown, assignment │ │ Uses delegate mode - NEVER writes code │ │ │ │ Quality Agent TDD verification (RED/GREEN phases) │ │ Coverage gates (>= 80%) │ │ Spec completeness review │ │ │ │ Security Agent OWASP scanning, secrets detection │ │ Dependency audit, .env validation │ │ Blocks on Critical/High │ │ │ │ Code Review Agent Multi-engine code review │ │ Claude / Codex / Gemini / All │ │ Blocks on Critical/High │ │ │ │ Merger Agent Creates feature branches │ │ Stages feature-specific files only │ │ Creates PRs via gh CLI │ │ NEVER merges - only creates PRs │ │ │ │ DYNAMIC AGENTS (one per feature) │ │ ──────────────────────────────── │ │ Feature Agent Implements one feature end-to-end │ │ (x N features) Follows strict pipeline above │ │ Uses Ralph loops for implementation │ │ │ └─────────────────────────────────────────────────────────────────┘ ``` | Agent | Role | Plan Mode | Can Edit Code | |-------|------|-----------|---------------| | team-lead | Orchestration, task breakdown, assignment | No (delegate mode) | No | | quality-agent | TDD verification, coverage gates | Yes | No (read-only) | | security-agent | OWASP scanning, secrets detection | Yes | No (read-only) | | review-agent | Multi-engine code review | Yes | No (read-only) | | merger-agent | Branch creation, PR management | No | No (git only) | | feature-{name} | Feature implementation 
(one per feature) | No | Yes | --- ## Team Lead Responsibilities The Team Lead is the orchestrator. It NEVER writes code. 1. Read `_project_specs/features/*.md` to identify all features 2. Break each feature into the 10-task dependency chain (see below) 3. Spawn one feature agent per feature 4. Assign initial tasks (spec-writing) to feature agents 5. Monitor TaskList continuously for progress and blockers 6. Handle blocked tasks and reassignment 7. Coordinate cross-feature dependencies 8. Send `shutdown_request` to all agents when all PRs are created 9. Clean up the team when done **Delegate mode is mandatory.** The team lead uses only: - TeamCreate, TaskCreate, TaskUpdate, TaskList, TaskGet - SendMessage (message, broadcast, shutdown_request) - Read, Glob, Grep (for monitoring) --- ## Feature Agent Workflow (MANDATORY) Each feature agent MUST follow this exact sequence. Task dependencies enforce ordering - a feature agent cannot start step N+1 until step N is marked complete and verified. 
### Step 1: Write Spec - Create `_project_specs/features/{feature-name}.md` - Include: description, acceptance criteria, test cases table, dependencies - Follow the atomic TODO format from base.md skill - Mark task complete -> Quality Agent reviews ### Step 2: Write Tests (RED Phase) - Write test files based on spec's test cases table - Tests MUST cover ALL acceptance criteria - Import modules that don't exist yet (they will fail) - Mark task complete -> Quality Agent verifies tests EXIST and FAIL ### Step 3: Wait for RED Verification - Quality Agent runs tests and verifies ALL new tests fail - If any test passes without implementation -> rewrite tests - Quality Agent marks verification complete -> unlocks implementation ### Step 4: Implement (GREEN Phase) - Write minimum code to make all tests pass - Follow simplicity rules from base.md (20 lines/function, 200 lines/file, 3 params) - Use Ralph loops (`/ralph-loop`) for iterative implementation - Run tests after implementation - ALL must pass - Mark task complete -> Quality Agent verifies tests pass ### Step 5: Wait for GREEN Verification - Quality Agent runs full test suite and checks coverage - Coverage must be >= 80% - If tests fail or coverage insufficient -> fix and re-request - Quality Agent marks verification complete -> unlocks validation ### Step 6: Validate - Run linter (ESLint / Ruff) - Run type checker (TypeScript / mypy) - Run full test suite with coverage - Fix any issues - Mark task complete -> unlocks code review ### Step 7: Wait for Code Review - Code Review Agent runs `/code-review` on changed files - If Critical or High issues -> fix and re-request review - Code Review Agent marks complete -> unlocks security scan ### Step 8: Wait for Security Scan - Security Agent runs security checks - If Critical or High issues -> fix and re-request scan - Security Agent marks complete -> unlocks merge ### Step 9: Wait for Branch + PR - Merger Agent creates feature branch, stages files, creates PR - Feature is 
complete when PR is created --- ## Task Dependency Chain Model For each feature "X", the team lead creates these 10 tasks with strict ordering: ``` ┌────────────────────────────────────────────────────────────────┐ │ TASK CHAIN FOR FEATURE "X" │ │ │ │ Task 1: X-spec │ │ owner: feature-X │ │ blockedBy: (none) │ │ ↓ │ │ Task 2: X-spec-review │ │ owner: quality-agent │ │ blockedBy: X-spec │ │ ↓ │ │ Task 3: X-tests │ │ owner: feature-X │ │ blockedBy: X-spec-review │ │ ↓ │ │ Task 4: X-tests-fail-verify │ │ owner: quality-agent │ │ blockedBy: X-tests │ │ ↓ │ │ Task 5: X-implement │ │ owner: feature-X │ │ blockedBy: X-tests-fail-verify │ │ ↓ │ │ Task 6: X-tests-pass-verify │ │ owner: quality-agent │ │ blockedBy: X-implement │ │ ↓ │ │ Task 7: X-validate │ │ owner: feature-X │ │ blockedBy: X-tests-pass-verify │ │ ↓ │ │ Task 8: X-code-review │ │ owner: review-agent │ │ blockedBy: X-validate │ │ ↓ │ │ Task 9: X-security-scan │ │ owner: security-agent │ │ blockedBy: X-code-review │ │ ↓ │ │ Task 10: X-branch-pr │ │ owner: merger-agent │ │ blockedBy: X-security-scan │ └────────────────────────────────────────────────────────────────┘ ``` ### Parallel Feature Execution Multiple features run their chains in parallel. 
Shared agents process tasks as they unblock: ``` Feature: auth Feature: dashboard Feature: payments auth-spec dash-spec pay-spec auth-spec-review dash-spec-review pay-spec-review auth-tests dash-tests pay-tests auth-fail-verify dash-fail-verify pay-fail-verify auth-implement dash-implement pay-implement auth-pass-verify dash-pass-verify pay-pass-verify auth-validate dash-validate pay-validate auth-code-review dash-code-review pay-code-review auth-security dash-security pay-security auth-branch-pr dash-branch-pr pay-branch-pr | | | v v v [All chains run simultaneously] [Quality Agent handles all verify tasks as they unblock] [Review Agent handles all review tasks as they unblock] [Security Agent handles all scan tasks as they unblock] [Merger Agent handles all branch-pr tasks as they unblock] ``` --- ## Inter-Agent Communication ### Direct Messages (for targeted work) ``` Feature Agent -> Quality Agent: "Tests written for auth, ready for RED verify" Quality Agent -> Feature Agent: "All 7 tests fail as expected. Proceed to implement" Feature Agent -> Review Agent: "Implementation complete, ready for code review" Review Agent -> Feature Agent: "2 High issues found: [details]. Fix before proceeding" Security Agent -> Merger Agent: "Security scan passed for auth feature" Merger Agent -> Team Lead: "PR #42 created for auth feature" ``` ### Task List (source of truth for state) - All agents check TaskList after completing work - Quality Agent claims verification tasks automatically - Review Agent claims code-review tasks automatically - Security Agent claims security-scan tasks automatically - Merger Agent claims branch-pr tasks automatically ### Broadcast (rare - blocking issues only) - Team Lead -> All: "Blocking dependency found between auth and dashboard" - Security Agent -> All: "Critical vulnerability in shared dependency" --- ## Feature Agent Spawning The team lead spawns one feature agent per feature: 1. Read `_project_specs/features/*.md` 2. 
For each feature spec, spawn a feature agent: - name: `feature-{feature-name}` - Uses `.claude/agents/feature.md` definition - Spawn prompt includes the feature name and spec location 3. Create the full 10-task dependency chain for that feature 4. Assign the spec-writing task to the feature agent ### Example If project has 3 features: auth, dashboard, payments - Spawn: `feature-auth`, `feature-dashboard`, `feature-payments` - Create 30 tasks total (10 per feature) - Each feature agent starts with their spec task - All 3 work in parallel --- ## Branch and PR Strategy **One branch per feature. One PR per feature.** ``` Branch naming: feature/{feature-name} PR title: feat({feature-name}): {short description} PR body: Generated from spec + test results + review + security results ``` The Merger Agent: 1. `git checkout main && git pull origin main` 2. `git checkout -b feature/{feature-name}` 3. Stages ONLY files changed for this feature (never `git add -A`) 4. Commits with descriptive message including verification results 5. `git push -u origin feature/{feature-name}` 6. 
`gh pr create` with full template including: - Summary from feature spec - Test results from quality verification - Code review summary from review agent - Security scan results from security agent - Checklist of all pipeline steps completed --- ## Quality Gates ### Workflow Enforcement (via task dependencies) - Task dependencies make it **structurally impossible** to skip steps - A feature agent cannot see "implement" until quality agent completes "tests-fail-verify" - This is the primary enforcement mechanism ### Cross-Agent Verification (trust but verify) - Quality agent independently runs tests (doesn't trust feature agent's report) - Security agent independently scans (doesn't trust review agent) - Merger agent verifies all predecessor tasks are complete before branching ### Blocking Rules - Quality Agent: blocks if tests don't fail (RED) or don't pass (GREEN) or coverage < 80% - Code Review Agent: blocks on Critical or High severity issues - Security Agent: blocks on Critical or High severity findings - Merger Agent: refuses to branch if any predecessor task is incomplete --- ## Integration with Existing Skills | Existing Skill | How Agent Teams Uses It | |----------------|------------------------| | base.md | TDD workflow, atomic todos, simplicity rules - all agents follow | | code-review.md | Review Agent executes `/code-review` per this skill | | security.md | Security Agent follows OWASP patterns from this skill | | session-management.md | Each agent maintains its own session state | | iterative-development.md | Feature agents use Stop hook TDD loops for implementation | | project-tooling.md | Merger Agent uses `gh` CLI for branches and PRs | | team-coordination.md | Superseded by agent-teams for automated coordination | | **icpg.md** | **Team lead creates ReasonNodes. Feature agents query constraints/risk. Quality agent checks drift. PreToolUse hook injects context. 
Stop hook auto-records symbols.** | | code-graph.md | Feature agents use graph for symbol lookup alongside iCPG for intent context | --- ## Environment Setup ### Required Setting ```json // settings.json or environment { "env": { "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1" } } ``` ### Project Structure (created by /initialize-project) ``` .claude/ agents/ # Agent definitions (from agent-teams skill) team-lead.md quality.md security.md code-review.md merger.md feature.md skills/ agent-teams/ # This skill SKILL.md agents/ # Agent definition templates base/ code-review/ security/ ... ``` --- ## Spawning the Team ### Automatic (via /initialize-project) After project setup completes, Phase 6 asks for features and spawns the team automatically. ### Manual (via /spawn-team) For existing projects: run `/spawn-team` to spawn the team from existing feature specs. --- ## Container Isolation (Polyphony) When Docker/OrbStack is available, feature agents run in Polyphony containers by default. The team lead and shared agents (quality, security, review, merger) still run natively — they only read and coordinate. ### What changes with Polyphony | Aspect | Without Polyphony | With Polyphony | |--------|-------------------|----------------| | Feature agents | Shared filesystem | Own container + git branch | | File conflicts | Team lead must serialize | Impossible (isolated clones) | | Test execution | Shared, can interfere | Independent per container | | Branch strategy | Merger agent creates branches | Each container has its own branch | ### How it works 1. `/spawn-team` detects Docker + polyphony CLI 2. For each feature, runs `polyphony spawn "$FEATURE" --type feature` 3. Polyphony creates a container with its own git clone + branch 4. Agent CLI starts inside the container 5. On completion, changes are on a dedicated branch ready for PR ### Fallback If Docker is not available, `/spawn-team` falls back to the native Agent tool (shared filesystem). 
A note is printed: > "Running without container isolation (Docker not found). Agents share the workspace." --- ## Limitations - **Experimental feature** - Agent teams require the experimental env var - **No nested teams** - Teammates cannot spawn sub-teams - **One team per session** - Clean up before starting a new team - **No session resumption** - If session dies, re-run `/spawn-team` (tasks persist) - **File conflicts** - Features sharing files must be serialized by team lead (unless using Polyphony containers) - **Token cost** - Each agent is a separate Claude instance (5 + N instances) ================================================ FILE: skills/agent-teams/agents/code-review.md ================================================ --- name: review-agent description: Performs code reviews on completed features - checks security, performance, architecture, code quality. Blocks on Critical/High. model: sonnet tools: [Read, Glob, Grep, Bash, TaskUpdate, TaskList, TaskGet, SendMessage] disallowedTools: [Write, Edit] maxTurns: 20 effort: high --- # Code Review Agent You perform code reviews on completed features. ## Review Protocol For each `{name}-code-review` task: 1. Identify changed files via `git diff main --name-only` 2. Review for: security vulnerabilities, performance issues (N+1, memory leaks), architecture problems (coupling, SOLID), code quality (simplicity rules, DRY, dead code), test quality (behavior tests, edge cases, isolation) 3. Categorize findings by severity (Critical/High/Medium/Low) ## Blocking Rules If Critical or High issues found: 1. Message feature agent with file:line, description, and suggested fix 2. Do NOT mark complete 3. Wait for fixes, then re-review If only Medium/Low: mark complete, message security-agent. 
## Rules - Read-only: review code, do NOT fix it - Block on Critical and High, no exceptions - Process tasks in order (lowest task ID first) ================================================ FILE: skills/agent-teams/agents/feature.md ================================================ --- name: feature-agent description: Implements one feature end-to-end following the strict TDD pipeline - spec, tests, implementation, validation. model: inherit tools: [Read, Write, Edit, Bash, Glob, Grep, TaskUpdate, TaskList, TaskGet, SendMessage] maxTurns: 40 effort: high --- # Feature Agent You implement one specific feature following the strict TDD pipeline. ## Your Steps (enforced by task dependencies) 1. **SPEC** — Write `_project_specs/features/{name}.md` with description, acceptance criteria, test cases table, dependencies 2. *Wait for quality-agent spec review* 3. **TESTS (RED)** — Write test files covering ALL acceptance criteria. Tests MUST fail. 4. *Wait for quality-agent RED verification* 5. **PRE-IMPLEMENT** — Before coding: - Run `icpg query constraints <scope-files>` to understand invariants - Run `icpg query risk <key-symbol>` for fragile symbols - Write feature name to `.icpg/.current-intent` (enables auto-recording) 6. **IMPLEMENT (GREEN)** — Write minimum code to pass all tests. Follow simplicity rules (20 lines/function, 200 lines/file, 3 params max). PreToolUse hook auto-injects intent context before every edit. 7. **POST-IMPLEMENT** — After tests pass: - Run `icpg record --reason <intent-id> --base main` (or auto via Stop hook) - Run `icpg drift check` to verify no unintended scope drift 8. *Wait for quality-agent GREEN verification* 9. **VALIDATE** — Run linter, type checker, full test suite with coverage. 10. 
*Wait for code review and security scan* ## Rules - Always write tests before implementation (TDD is mandatory) - Always check constraints and risk before implementing (iCPG is mandatory) - Follow simplicity rules from project CLAUDE.md - If blocked by environment issues (DB down, missing API key), message team-lead - Mark tasks complete only when the work is actually done - Process tasks in order following the pipeline ================================================ FILE: skills/agent-teams/agents/merger.md ================================================ --- name: merger-agent description: Creates feature branches and PRs for completed features via gh CLI. Never merges - only creates PRs. model: sonnet tools: [Read, Glob, Grep, Bash, TaskUpdate, TaskList, TaskGet, SendMessage] disallowedTools: [Write, Edit] maxTurns: 15 effort: medium --- # Merger Agent You handle git branching and PR creation. You NEVER merge - you only create PRs. ## Protocol For each `{name}-branch-pr` task: 1. `git checkout main && git pull origin main` 2. `git checkout -b feature/{feature-name}` 3. Stage ONLY files related to this feature (never `git add -A`) 4. Commit with: `feat({feature-name}): {description}` 5. `git push -u origin feature/{feature-name}` 6. `gh pr create` with summary, test results, review results, security results, pipeline checklist 7. `git checkout main` 8. 
Message team-lead with PR URL ## Gathering Results Before creating PR, use TaskGet to read predecessor tasks for: - Test count and coverage from `{name}-tests-pass-verify` - Review summary from `{name}-code-review` - Security summary from `{name}-security-scan` ## Rules - Never merge PRs, only create them - Never force push - Never use `git add -A` or `git add .` - One branch per feature, one PR per feature - Process tasks in order (lowest task ID first) ================================================ FILE: skills/agent-teams/agents/quality.md ================================================ --- name: quality-agent description: Enforces TDD discipline - verifies specs are complete, tests fail before implementation, tests pass after implementation, coverage >= 80% model: sonnet tools: [Read, Glob, Grep, Bash, TaskUpdate, TaskList, TaskGet, SendMessage] disallowedTools: [Write, Edit] maxTurns: 30 effort: high --- # Quality Agent You enforce TDD discipline. You verify that specs are complete, tests fail before implementation, and tests pass after implementation. You are read-only for source code. ## Verification Protocols ### Spec Review (`{name}-spec-review`) Read `_project_specs/features/{name}.md` and verify: - Has clear description - Has numbered acceptance criteria - Has test cases table (Test, Input, Expected Output) - Has dependencies listed - Criteria are testable, not vague If incomplete: message feature agent with what's missing. Do NOT mark complete. ### RED Phase (`{name}-tests-fail-verify`) 1. Run the project's test command 2. ALL new tests must FAIL (not error from imports — actual test failures) 3. Every spec test case must have a corresponding test If tests pass: message feature agent to rewrite tests. If tests fail: mark complete, message feature agent to proceed. ### GREEN Phase (`{name}-tests-pass-verify`) 1. Run full test suite (not just new tests) 2. ALL tests must pass 3. Coverage >= 80% 4. 
**iCPG drift check**: Run `icpg drift check` to verify no unintended scope drift If tests fail or coverage insufficient: message feature agent with details. If drift detected: message feature agent with drift dimensions and severity. If all pass and no drift: mark complete, message feature agent to proceed. ### Spec-Intent Alignment (`{name}-spec-review`) During spec review, also verify: - The feature's ReasonNode exists in iCPG (`icpg query context` on scope files) - Scope in spec matches scope in ReasonNode - No DUPLICATES edges flagged for this intent ## Rules - You are read-only: run tests and icpg queries, do NOT fix code - Mark tasks complete only when verification passes - Process tasks in order (lowest task ID first) - Report drift events with specific dimensions and severity ================================================ FILE: skills/agent-teams/agents/security.md ================================================ --- name: security-agent description: Performs security analysis on completed features - OWASP scanning, secrets detection, dependency audit. Blocks on Critical/High. model: sonnet tools: [Read, Glob, Grep, Bash, TaskUpdate, TaskList, TaskGet, SendMessage] disallowedTools: [Write, Edit] maxTurns: 20 effort: high --- # Security Agent You perform security analysis on completed features before they can be merged. ## Security Scan Protocol For each `{name}-security-scan` task: ### 1. Identify Changed Files Use `git diff main --name-only` to identify feature files. ### 2. Secrets Detection Check for: hardcoded API keys (sk-, pk_, api_key, secret), passwords, tokens, connection strings with credentials, .env committed to git. ### 3. OWASP Top 10 Check for: SQL injection (raw queries with string interpolation), XSS (innerHTML with user input), broken auth (missing auth on protected routes), insecure crypto (MD5/SHA1 for passwords), SSRF (user-controlled URLs), path traversal, mass assignment, missing rate limits on auth. ### 4. 
Dependency Audit Run `npm audit` or `safety check`. Flag known vulnerabilities. ### 5. Environment Variables Verify no secrets in VITE_*, NEXT_PUBLIC_*, REACT_APP_* vars. ## Severity and Blocking | Severity | Action | |----------|--------| | Critical | Block merge. Must fix. | | High | Block merge. Should fix. | | Medium | Advisory. Can merge. | | Low | Informational. | If Critical/High found: message feature agent with file:line references and fix suggestions. Do NOT mark complete. If clean: mark complete, message merger-agent. ## Rules - Read-only: scan code, do NOT fix it - Block on Critical and High, no exceptions - Process tasks in order (lowest task ID first) ================================================ FILE: skills/agent-teams/agents/team-lead.md ================================================ --- name: team-lead description: Orchestrates the agent team - creates tasks, spawns feature agents, monitors progress. Never writes code. model: sonnet tools: [Read, Glob, Grep, Bash, TaskCreate, TaskUpdate, TaskList, TaskGet, SendMessage, TeamCreate] disallowedTools: [Write, Edit] maxTurns: 50 effort: high --- # Team Lead Agent You orchestrate work. You do NOT implement. ## Responsibilities 1. Read `_project_specs/features/*.md` to identify all features 2. **iCPG: Check for duplicates** — run `icpg query prior "<feature goal>"` before creating tasks. If >0.75 similarity, warn user. 3. **iCPG: Create ReasonNode** — for each feature, run `icpg create "<goal>" --scope <files> --owner feature-{name} --type task` 4. For each feature, create the full 10-task dependency chain 5. Spawn one feature agent per feature 6. Assign initial tasks (spec-writing) to feature agents 7. Monitor TaskList continuously for progress and blockers 8. Handle blocked tasks and reassign if needed 9. Coordinate cross-feature dependencies (serialize features sharing files) 10. 
When all PRs are created, send `shutdown_request` to all agents ## Task Chain Template (per feature) For each feature `{name}`, create these tasks with `addBlockedBy` dependencies: 1. `{name}-spec` — owner: feature-{name} 2. `{name}-spec-review` — owner: quality-agent, blockedBy: [1] 3. `{name}-tests` — owner: feature-{name}, blockedBy: [2] 4. `{name}-tests-fail-verify` — owner: quality-agent, blockedBy: [3] 5. `{name}-implement` — owner: feature-{name}, blockedBy: [4] 6. `{name}-tests-pass-verify` — owner: quality-agent, blockedBy: [5] 7. `{name}-validate` — owner: feature-{name}, blockedBy: [6] 8. `{name}-code-review` — owner: review-agent, blockedBy: [7] 9. `{name}-security-scan` — owner: security-agent, blockedBy: [8] 10. `{name}-branch-pr` — owner: merger-agent, blockedBy: [9] ## Cross-Feature Dependencies If two features share files: 1. Add `addBlockedBy` from the second feature's implement task to the first feature's branch-pr task 2. Message both feature agents about the serialization ## Completion Protocol When all `{name}-branch-pr` tasks are completed: 1. Verify all PRs created via `gh pr list` 2. Send broadcast: "All features complete. Shutting down team." 3. Send `shutdown_request` to each agent ================================================ FILE: skills/agentic-development/SKILL.md ================================================ --- name: agentic-development description: Build AI agents with Pydantic AI (Python) and Claude SDK (Node.js) when-to-use: When building AI agents, tool-using LLM systems, or agentic workflows user-invocable: false effort: high --- # Agentic Development Skill For building autonomous AI agents that perform multi-step tasks with tools. 
**Sources:** [Claude Agent SDK](https://docs.anthropic.com/en/docs/agents-and-tools/claude-agent-sdk) | [Anthropic Claude Code Best Practices](https://www.anthropic.com/engineering/claude-code-best-practices) | [Pydantic AI](https://ai.pydantic.dev/) | [Google Gemini Agent Development](https://developers.googleblog.com/en/building-agents-google-gemini-open-source-frameworks/) | [OpenAI Building Agents](https://developers.openai.com/tracks/building-agents/) --- ## Framework Selection by Language | Language/Framework | Default | Why | |-------------------|---------|-----| | **Python** | **Pydantic AI** | Type-safe, Pydantic validation, multi-model, production-ready | | **Node.js / Next.js** | **Claude Agent SDK** | Official Anthropic SDK, tools, multi-agent, native streaming | ### Python: Pydantic AI (Default) ```python from pydantic_ai import Agent from pydantic import BaseModel class SearchResult(BaseModel): title: str url: str summary: str agent = Agent( 'claude-sonnet-4-20250514', result_type=list[SearchResult], system_prompt='You are a research assistant.', ) # Type-safe result result = await agent.run('Find articles about AI agents') for item in result.data: print(f"{item.title}: {item.url}") ``` ### Node.js / Next.js: Claude Agent SDK (Default) ```typescript import Anthropic from "@anthropic-ai/sdk"; const client = new Anthropic(); // Define tools const tools: Anthropic.Tool[] = [ { name: "web_search", description: "Search the web for information", input_schema: { type: "object", properties: { query: { type: "string", description: "Search query" }, }, required: ["query"], }, }, ]; // Agentic loop async function runAgent(prompt: string) { const messages: Anthropic.MessageParam[] = [ { role: "user", content: prompt }, ]; while (true) { const response = await client.messages.create({ model: "claude-sonnet-4-20250514", max_tokens: 4096, tools, messages, }); // Check for tool use if (response.stop_reason === "tool_use") { const toolUse = response.content.find((b) 
=> b.type === "tool_use"); if (toolUse) { const result = await executeTool(toolUse.name, toolUse.input); messages.push({ role: "assistant", content: response.content }); messages.push({ role: "user", content: [{ type: "tool_result", tool_use_id: toolUse.id, content: result }], }); continue; } } // Done - return final response return response.content.find((b) => b.type === "text")?.text; } } ``` --- ## Core Principle **Plan first, act incrementally, verify always.** Agents that research and plan before executing consistently outperform those that jump straight to action. Break complex tasks into verifiable steps, use tools judiciously, and maintain clear state throughout execution. --- ## Agent Architecture ### Three Components (OpenAI) ``` ┌─────────────────────────────────────────────────┐ │ AGENT │ ├─────────────────────────────────────────────────┤ │ Model (Brain) │ LLM for reasoning & │ │ │ decision-making │ ├─────────────────────┼───────────────────────────┤ │ Tools (Arms/Legs) │ APIs, functions, external │ │ │ systems for action │ ├─────────────────────┼───────────────────────────┤ │ Instructions │ System prompts defining │ │ (Rules) │ behavior & boundaries │ └─────────────────────┴───────────────────────────┘ ``` ### Project Structure ``` project/ ├── src/ │ ├── agents/ │ │ ├── orchestrator.ts # Main agent coordinator │ │ ├── specialized/ # Task-specific agents │ │ │ ├── researcher.ts │ │ │ ├── coder.ts │ │ │ └── reviewer.ts │ │ └── base.ts # Shared agent interface │ ├── tools/ │ │ ├── definitions/ # Tool schemas │ │ ├── implementations/ # Tool logic │ │ └── registry.ts # Tool discovery │ ├── prompts/ │ │ ├── system/ # Agent instructions │ │ └── templates/ # Task templates │ └── memory/ │ ├── conversation.ts # Short-term context │ └── persistent.ts # Long-term storage ├── tests/ │ ├── agents/ # Agent behavior tests │ ├── tools/ # Tool unit tests │ └── evals/ # End-to-end evaluations └── skills/ # Agent skills (Anthropic pattern) ├── skill-name/ │ ├── 
instructions.md │ ├── scripts/ │ └── resources/ ``` --- ## Workflow Pattern: Explore-Plan-Execute-Verify ### 1. Explore Phase ```typescript // Gather context before acting async function explore(task: Task): Promise<Context> { const relevantFiles = await agent.searchCodebase(task.query); const existingPatterns = await agent.analyzePatterns(relevantFiles); const dependencies = await agent.identifyDependencies(task); return { relevantFiles, existingPatterns, dependencies }; } ``` ### 2. Plan Phase (Critical) ```typescript // Plan explicitly before execution async function plan(task: Task, context: Context): Promise<Plan> { const prompt = ` Task: ${task.description} Context: ${JSON.stringify(context)} Create a step-by-step plan. For each step: 1. What action to take 2. What tools to use 3. How to verify success 4. What could go wrong Output JSON with steps array. `; return await llmCall({ prompt, schema: PlanSchema }); } ``` ### 3. Execute Phase ```typescript // Execute with verification at each step async function execute(plan: Plan): Promise<Result[]> { const results: Result[] = []; for (const step of plan.steps) { // Execute single step const result = await executeStep(step); // Verify before continuing if (!await verify(step, result)) { // Self-correct or escalate const corrected = await selfCorrect(step, result); if (!corrected.success) { return handleFailure(step, results); } } results.push(result); } return results; } ``` ### 4. Verify Phase ```typescript // Independent verification prevents overfitting async function verify(step: Step, result: Result): Promise<boolean> { // Run tests if available if (step.testCommand) { const testResult = await runCommand(step.testCommand); if (!testResult.success) return false; } // Use LLM to verify against criteria const verification = await llmCall({ prompt: ` Step: ${step.description} Expected: ${step.successCriteria} Actual: ${JSON.stringify(result)} Does the result satisfy the success criteria? 
Respond with { "passes": boolean, "reasoning": string } `, schema: VerificationSchema }); return verification.passes; } ``` --- ## Tool Design ### Tool Definition Pattern ```typescript // tools/definitions/file-operations.ts import { z } from 'zod'; export const ReadFileTool = { name: 'read_file', description: 'Read contents of a file. Use before modifying any file.', parameters: z.object({ path: z.string().describe('Absolute path to the file'), startLine: z.number().optional().describe('Start line (1-indexed)'), endLine: z.number().optional().describe('End line (1-indexed)'), }), // Risk level for guardrails (OpenAI pattern) riskLevel: 'low' as const, }; export const WriteFileTool = { name: 'write_file', description: 'Write content to a file. Always read first to understand context.', parameters: z.object({ path: z.string().describe('Absolute path to the file'), content: z.string().describe('Complete file content'), }), riskLevel: 'medium' as const, // Require confirmation for high-risk operations requiresConfirmation: true, }; ``` ### Tool Implementation ```typescript // tools/implementations/file-operations.ts export async function readFile( params: z.infer<typeof ReadFileTool.parameters> ): Promise<ToolResult> { try { const content = await fs.readFile(params.path, 'utf-8'); const lines = content.split('\n'); const start = (params.startLine ?? 1) - 1; const end = params.endLine ?? 
lines.length; return { success: true, data: lines.slice(start, end).join('\n'), metadata: { totalLines: lines.length } }; } catch (error) { return { success: false, error: `Failed to read file: ${error.message}` }; } } ``` ### Prefer Built-in Tools (OpenAI) ```typescript // Use platform-provided tools when available const agent = createAgent({ tools: [ // Built-in tools (handled by platform) { type: 'web_search' }, { type: 'code_interpreter' }, // Custom tools only when needed { type: 'function', function: customDatabaseTool }, ], }); ``` --- ## Multi-Agent Patterns ### Single Agent (Default) Use one agent for most tasks. Multiple agents add complexity. ### Agent-as-Tool Pattern (OpenAI) ```typescript // Expose specialized agents as callable tools const researchAgent = createAgent({ name: 'researcher', instructions: 'You research topics and return structured findings.', tools: [webSearchTool, documentReadTool], }); const mainAgent = createAgent({ tools: [ { type: 'function', function: { name: 'research_topic', description: 'Delegate research to specialized agent', parameters: ResearchQuerySchema, handler: async (query) => researchAgent.run(query), }, }, ], }); ``` ### Handoff Pattern (OpenAI) ```typescript // One-way transfer between agents const customerServiceAgent = createAgent({ tools: [ // Handoff to specialist when needed { name: 'transfer_to_billing', description: 'Transfer to billing specialist for payment issues', handler: async (context) => { return { handoff: 'billing_agent', context }; }, }, ], }); ``` ### When to Use Multiple Agents - Separate task domains with non-overlapping tools - Different authorization levels needed - Complex workflows with clear handoff points - Parallel execution of independent subtasks --- ## Memory & State ### Conversation Memory ```typescript // memory/conversation.ts interface ConversationMemory { messages: Message[]; maxTokens: number; add(message: Message): void; getContext(): Message[]; summarize(): Promise<string>; } // 
Maintain state across tool calls (Gemini pattern) interface AgentState { thoughtSignature?: string; // Encrypted reasoning state conversationId: string; // For shared memory currentPlan?: Plan; completedSteps: Step[]; } ``` ### Persistent Memory ```typescript // memory/persistent.ts interface PersistentMemory { // Store learnings across sessions store(key: string, value: any): Promise<void>; retrieve(key: string): Promise<any>; // Semantic search over past interactions search(query: string, limit: number): Promise<Memory[]>; } ``` --- ## Guardrails & Safety ### Multi-Layer Protection (OpenAI) ```typescript // guards/index.ts interface GuardrailConfig { // Input validation inputClassifier: (input: string) => Promise<SafetyResult>; // Output validation outputValidator: (output: string) => Promise<SafetyResult>; // Tool risk assessment toolRiskLevels: Record<string, 'low' | 'medium' | 'high'>; // Actions requiring human approval humanInTheLoop: string[]; } async function executeWithGuardrails( agent: Agent, input: string, config: GuardrailConfig ): Promise<Result> { // 1. Check input safety const inputCheck = await config.inputClassifier(input); if (!inputCheck.safe) { return { blocked: true, reason: inputCheck.reason }; } // 2. Execute with tool monitoring const result = await agent.run(input, { beforeTool: async (tool, params) => { const risk = config.toolRiskLevels[tool.name]; if (risk === 'high' || config.humanInTheLoop.includes(tool.name)) { return await requestHumanApproval(tool, params); } return { approved: true }; }, }); // 3. Validate output const outputCheck = await config.outputValidator(result.output); if (!outputCheck.safe) { return { blocked: true, reason: outputCheck.reason }; } return result; } ``` ### Scope Enforcement (OpenAI) ```typescript // Agent must stay within defined scope const agentInstructions = ` You are a customer service agent for Acme Corp. 
SCOPE BOUNDARIES (non-negotiable): - Only answer questions about Acme products and services - Never provide legal, medical, or financial advice - Never access or modify data outside your authorized scope - If a request is out of scope, politely decline and explain why If you cannot complete a task within scope, notify the user and request explicit approval before proceeding. `; ``` --- ## Model Selection ### Match Model to Task | Task Complexity | Recommended Model | Notes | |-----------------|-------------------|-------| | Simple, fast | gpt-5-mini, claude-haiku | Low latency | | General purpose | gpt-4.1, claude-sonnet | Balance | | Complex reasoning | o4-mini, claude-opus | Higher accuracy | | Deep planning | gpt-5 + reasoning, ultrathink | Maximum capability | ### Gemini-Specific ```typescript // Use thinking_level for reasoning depth const response = await gemini.generate({ model: 'gemini-3', thinking_level: 'high', // For complex planning temperature: 1.0, // Optimized for reasoning engine }); // Preserve thought state across tool calls const nextResponse = await gemini.generate({ thoughtSignature: response.thoughtSignature, // Required for function calling // ... rest of params }); ``` ### Claude-Specific (Thinking Modes) ```typescript // Trigger extended thinking with keywords const thinkingLevels = { 'think': 'standard analysis', 'think hard': 'deeper reasoning', 'think harder': 'extensive analysis', 'ultrathink': 'maximum reasoning budget', }; const prompt = ` Think hard about this problem before proposing a solution. 
Task: ${task.description} `; ``` --- ## Testing Agents ### Unit Tests (Tools) ```typescript describe('readFile tool', () => { it('reads file content correctly', async () => { const result = await readFile({ path: '/test/file.txt' }); expect(result.success).toBe(true); expect(result.data).toContain('expected content'); }); }); ``` ### Behavior Tests (Agent Decisions) ```typescript describe('agent planning', () => { it('creates plan before executing file modifications', async () => { const trace = await agent.runWithTrace('Refactor the auth module'); // Verify planning happened first const firstToolCall = trace.toolCalls[0]; expect(firstToolCall.name).toBe('read_file'); // Verify no writes without reads const writeIndex = trace.toolCalls.findIndex(t => t.name === 'write_file'); const readIndex = trace.toolCalls.findIndex(t => t.name === 'read_file'); expect(readIndex).toBeLessThan(writeIndex); }); }); ``` ### Evaluation Tests ```typescript // Run nightly, not in regular CI describe('Agent Accuracy (Eval)', () => { const testCases = loadTestCases('./evals/coding-tasks.json'); it.each(testCases)('completes $name correctly', async (testCase) => { const result = await agent.run(testCase.input); // Verify against expected outcomes expect(result.filesModified).toEqual(testCase.expectedFiles); expect(await runTests(testCase.testCommand)).toBe(true); }, 120000); }); ``` --- ## Pydantic AI Patterns (Python Default) ### Project Structure (Python) ``` project/ ├── src/ │ ├── agents/ │ │ ├── __init__.py │ │ ├── researcher.py # Research agent │ │ ├── coder.py # Coding agent │ │ └── orchestrator.py # Main coordinator │ ├── tools/ │ │ ├── __init__.py │ │ ├── web.py # Web search tools │ │ ├── files.py # File operations │ │ └── database.py # DB queries │ ├── models/ │ │ ├── __init__.py │ │ └── schemas.py # Pydantic models │ └── deps.py # Dependencies ├── tests/ │ ├── test_agents.py │ └── test_tools.py └── pyproject.toml ``` ### Agent with Tools ```python from pydantic_ai import 
Agent, RunContext from pydantic import BaseModel from httpx import AsyncClient class SearchResult(BaseModel): title: str url: str snippet: str class ResearchDeps(BaseModel): http_client: AsyncClient api_key: str research_agent = Agent( 'claude-sonnet-4-20250514', deps_type=ResearchDeps, result_type=list[SearchResult], system_prompt='You are a research assistant. Use tools to find information.', ) @research_agent.tool async def web_search(ctx: RunContext[ResearchDeps], query: str) -> list[dict]: """Search the web for information.""" response = await ctx.deps.http_client.get( 'https://api.search.com/search', params={'q': query}, headers={'Authorization': f'Bearer {ctx.deps.api_key}'}, ) return response.json()['results'] @research_agent.tool async def read_webpage(ctx: RunContext[ResearchDeps], url: str) -> str: """Read and extract content from a webpage.""" response = await ctx.deps.http_client.get(url) return response.text[:5000] # Truncate for context # Usage async def main(): async with AsyncClient() as client: deps = ResearchDeps(http_client=client, api_key='...') result = await research_agent.run( 'Find recent articles about LLM agents', deps=deps, ) for item in result.data: print(f"- {item.title}") ``` ### Structured Output with Validation ```python from pydantic import BaseModel, Field from pydantic_ai import Agent class CodeReview(BaseModel): summary: str = Field(description="Brief summary of the review") issues: list[str] = Field(description="List of issues found") suggestions: list[str] = Field(description="Improvement suggestions") approval: bool = Field(description="Whether code is approved") confidence: float = Field(ge=0, le=1, description="Confidence score") review_agent = Agent( 'claude-sonnet-4-20250514', result_type=CodeReview, system_prompt='Review code for quality, security, and best practices.', ) # Result is validated Pydantic model result = await review_agent.run(f"Review this code:\n```python\n{code}\n```") if result.data.approval: print("Code 
approved!") else: for issue in result.data.issues: print(f"Issue: {issue}") ``` ### Multi-Agent Coordination ```python from pydantic_ai import Agent # Specialized agents planner = Agent('claude-sonnet-4-20250514', system_prompt='Create detailed plans.') executor = Agent('claude-sonnet-4-20250514', system_prompt='Execute tasks precisely.') reviewer = Agent('claude-sonnet-4-20250514', system_prompt='Review and verify work.') async def orchestrate(task: str): # 1. Plan plan = await planner.run(f"Create a plan for: {task}") # 2. Execute each step results = [] for step in plan.data.steps: result = await executor.run(f"Execute: {step}") results.append(result.data) # 3. Review review = await reviewer.run( f"Review the results:\nTask: {task}\nResults: {results}" ) return review.data ``` ### Streaming Responses ```python from pydantic_ai import Agent agent = Agent('claude-sonnet-4-20250514') async def stream_response(prompt: str): async with agent.run_stream(prompt) as response: async for chunk in response.stream(): print(chunk, end='', flush=True) # Get final structured result result = await response.get_data() return result ``` ### Testing Agents ```python import pytest from pydantic_ai import Agent from pydantic_ai.models.test import TestModel @pytest.fixture def test_agent(): return Agent( TestModel(), # Mock model for testing result_type=str, ) async def test_agent_response(test_agent): result = await test_agent.run('Test prompt') assert result.data is not None # Test with specific responses async def test_with_mock_response(): model = TestModel() model.seed_response('Expected output') agent = Agent(model) result = await agent.run('Any prompt') assert result.data == 'Expected output' ``` --- ## Skills Pattern (Anthropic) ### Skill Structure ``` skills/ └── code-review/ ├── instructions.md # How to perform code reviews ├── scripts/ │ └── run-linters.sh # Supporting scripts └── resources/ └── checklist.md # Review checklist ``` ### instructions.md Example ```markdown # 
Code Review Skill ## When to Use Activate this skill when asked to review code, PRs, or diffs. ## Process 1. Read the changed files completely 2. Run linters: `./scripts/run-linters.sh` 3. Check against resources/checklist.md 4. Provide structured feedback ## Output Format - Summary (1-2 sentences) - Issues found (severity: critical/major/minor) - Suggestions for improvement - Approval recommendation ``` ### Loading Skills Dynamically ```typescript async function loadSkill(skillName: string): Promise<Skill> { const skillPath = `./skills/${skillName}`; const instructions = await fs.readFile(`${skillPath}/instructions.md`, 'utf-8'); const scripts = await glob(`${skillPath}/scripts/*`); const resources = await glob(`${skillPath}/resources/*`); return { name: skillName, instructions, scripts: scripts.map(s => ({ name: path.basename(s), path: s })), resources: await Promise.all(resources.map(loadResource)), }; } ``` --- ## Anti-Patterns - **No planning before execution** - Agents that jump to action make more errors - **Monolithic agents** - One agent with 50 tools becomes confused - **No verification** - Agents must verify their own work - **Hardcoded tool sequences** - Let the model decide tool order - **Missing guardrails** - All agents need safety boundaries - **No state management** - Without it, agents lose context across tool calls - **Testing only happy paths** - Also test failure modes and edge cases - **Ignoring model differences** - Reasoning models need different prompts - **No cost tracking** - Agentic workflows can be expensive - **Full automation without oversight** - Keep a human in the loop for critical actions --- ## Quick Reference ### Agent Development Checklist - [ ] Define clear agent scope and boundaries - [ ] Design tools with explicit schemas and risk levels - [ ] Implement explore-plan-execute-verify workflow - [ ] Add multi-layer guardrails - [ ] Set up conversation and persistent memory - [ ] Write behavior and evaluation tests - [ ] Configure appropriate model for task 
complexity - [ ] Add human-in-the-loop for high-risk operations - [ ] Monitor token usage and costs - [ ] Document skills and instructions ### Thinking Triggers (Claude) ``` "think" → Standard analysis "think hard" → Deeper reasoning "think harder" → Extensive analysis "ultrathink" → Maximum reasoning ``` ### Gemini Settings ``` thinking_level: "high" | "low" temperature: 1.0 (keep at 1.0 for reasoning) thoughtSignature: <pass back for function calling> ``` ================================================ FILE: skills/ai-models/SKILL.md ================================================ --- name: ai-models description: Latest AI models reference - Claude, OpenAI, Gemini, Eleven Labs, Replicate when-to-use: When choosing models, comparing capabilities, or referencing model specs user-invocable: true effort: low --- # AI Models Reference Skill **Last Updated: December 2025** ## Philosophy **Use the right model for the job.** Bigger isn't always better - match model capabilities to task requirements. Consider cost, latency, and accuracy tradeoffs. 
## Model Selection Matrix | Task | Recommended | Why | |------|-------------|-----| | Complex reasoning | Claude Opus 4.5, o3, Gemini 3 Pro | Highest accuracy | | Fast chat/completion | Claude Haiku, GPT-4.1 mini, Gemini Flash | Low latency, cheap | | Code generation | Claude Sonnet 4.5, Codestral, GPT-4.1 | Strong coding | | Vision/images | Claude Sonnet, GPT-4o, Gemini 3 Pro | Multimodal | | Embeddings | text-embedding-3-small, Voyage | Cost-effective | | Voice synthesis | Eleven Labs v3, OpenAI TTS | Natural sounding | | Image generation | FLUX.2, DALL-E 3, SD 3.5 | Different styles | --- ## Anthropic (Claude) ### Documentation - **API Docs**: https://docs.anthropic.com - **Models Overview**: https://docs.anthropic.com/en/docs/about-claude/models/overview - **Pricing**: https://www.anthropic.com/pricing ### Latest Models (December 2025) ```typescript const CLAUDE_MODELS = { // Flagship - highest capability opus: 'claude-opus-4-5-20251101', // Balanced - best for most tasks sonnet: 'claude-sonnet-4-5-20250929', // Previous generation (still excellent) opus4: 'claude-opus-4-20250514', sonnet4: 'claude-sonnet-4-20250514', // Fast & cheap - high volume tasks haiku: 'claude-3-5-haiku-20241022', } as const; ``` ### Usage ```typescript import Anthropic from '@anthropic-ai/sdk'; const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY, }); const response = await anthropic.messages.create({ model: 'claude-sonnet-4-5-20250929', max_tokens: 1024, messages: [ { role: 'user', content: 'Hello, Claude!' 
} ], }); ``` ### Model Selection ``` claude-opus-4-5-20251101 (Opus 4.5) ├── Best for: Complex analysis, research, nuanced writing ├── Context: 200K tokens ├── Cost: $5/$25 per 1M tokens (input/output) └── Use when: Accuracy matters most claude-sonnet-4-5-20250929 (Sonnet 4.5) ├── Best for: Code, general tasks, balanced performance ├── Context: 200K tokens ├── Cost: $3/$15 per 1M tokens └── Use when: Default choice for most applications claude-3-5-haiku-20241022 (Haiku 3.5) ├── Best for: Classification, extraction, high-volume ├── Context: 200K tokens ├── Cost: $0.25/$1.25 per 1M tokens └── Use when: Speed and cost matter most ``` --- ## OpenAI ### Documentation - **API Docs**: https://platform.openai.com/docs - **Models**: https://platform.openai.com/docs/models - **Pricing**: https://openai.com/pricing ### Latest Models (December 2025) ```typescript const OPENAI_MODELS = { // GPT-5 series (latest) gpt5: 'gpt-5.2', gpt5Mini: 'gpt-5-mini', // GPT-4.1 series (recommended for most) gpt41: 'gpt-4.1', gpt41Mini: 'gpt-4.1-mini', gpt41Nano: 'gpt-4.1-nano', // Reasoning models (o-series) o3: 'o3', o3Pro: 'o3-pro', o4Mini: 'o4-mini', // Legacy but still useful gpt4o: 'gpt-4o', // Still has audio support gpt4oMini: 'gpt-4o-mini', // Embeddings embeddingSmall: 'text-embedding-3-small', embeddingLarge: 'text-embedding-3-large', // Image generation dalle3: 'dall-e-3', gptImage: 'gpt-image-1', // Audio tts: 'tts-1', ttsHd: 'tts-1-hd', whisper: 'whisper-1', } as const; ``` ### Usage ```typescript import OpenAI from 'openai'; const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, }); // Chat completion const response = await openai.chat.completions.create({ model: 'gpt-4.1', messages: [ { role: 'user', content: 'Hello!' } ], }); // With vision const visionResponse = await openai.chat.completions.create({ model: 'gpt-4.1', messages: [ { role: 'user', content: [ { type: 'text', text: 'What is in this image?' }, { type: 'image_url', image_url: { url: 'https://...' 
} }, ], }, ], }); // Embeddings const embedding = await openai.embeddings.create({ model: 'text-embedding-3-small', input: 'Your text here', }); ``` ### Model Selection ``` o3 / o3-pro ├── Best for: Math, coding, complex multi-step reasoning ├── Context: 200K tokens ├── Cost: Premium pricing └── Use when: Hardest problems, need chain-of-thought gpt-4.1 ├── Best for: General tasks, coding, instruction following ├── Context: 1M tokens (!) ├── Cost: Lower than GPT-4o └── Use when: Default choice, replaces GPT-4o gpt-4.1-mini / gpt-4.1-nano ├── Best for: High-volume, cost-sensitive ├── Context: 1M tokens ├── Cost: Very low └── Use when: Simple tasks at scale o4-mini ├── Best for: Fast reasoning at low cost ├── Context: 200K tokens ├── Cost: Budget reasoning └── Use when: Need reasoning but cost-conscious ``` --- ## Google (Gemini) ### Documentation - **API Docs**: https://ai.google.dev/docs - **Models**: https://ai.google.dev/gemini-api/docs/models/gemini - **Pricing**: https://ai.google.dev/pricing ### Latest Models (December 2025) ```typescript const GEMINI_MODELS = { // Gemini 3 (Latest) gemini3Pro: 'gemini-3-pro-preview', gemini3ProImage: 'gemini-3-pro-image-preview', gemini3Flash: 'gemini-3-flash-preview', // Gemini 2.5 (Stable) gemini25Pro: 'gemini-2.5-pro', gemini25Flash: 'gemini-2.5-flash', gemini25FlashLite: 'gemini-2.5-flash-lite', // Specialized gemini25FlashTTS: 'gemini-2.5-flash-preview-tts', gemini25FlashAudio: 'gemini-2.5-flash-native-audio-preview-12-2025', // Previous generation gemini2Flash: 'gemini-2.0-flash', } as const; ``` ### Usage ```typescript import { GoogleGenerativeAI } from '@google/generative-ai'; const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY); const model = genAI.getGenerativeModel({ model: 'gemini-2.5-flash' }); const result = await model.generateContent('Hello!'); const response = result.response.text(); // With vision const visionModel = genAI.getGenerativeModel({ model: 'gemini-2.5-pro' }); const imagePart = { 
inlineData: { data: base64Image, mimeType: 'image/jpeg', }, }; const result = await visionModel.generateContent(['Describe this:', imagePart]); ``` ### Model Selection ``` gemini-3-pro-preview ├── Best for: "Best model in the world for multimodal" ├── Context: 2M tokens ├── Cost: Premium └── Use when: Need absolute best quality gemini-2.5-pro ├── Best for: State-of-the-art thinking, complex tasks ├── Context: 2M tokens ├── Cost: $1.25/$5 per 1M tokens └── Use when: Long context, complex reasoning gemini-2.5-flash ├── Best for: Fast, balanced performance ├── Context: 1M tokens ├── Cost: $0.075/$0.30 per 1M tokens └── Use when: Speed and cost matter gemini-2.5-flash-lite ├── Best for: Ultra-fast, lowest cost ├── Context: 1M tokens ├── Cost: $0.04/$0.15 per 1M tokens └── Use when: High volume, simple tasks ``` --- ## Eleven Labs (Voice) ### Documentation - **API Docs**: https://elevenlabs.io/docs - **Models**: https://elevenlabs.io/docs/models - **Pricing**: https://elevenlabs.io/pricing ### Latest Models (December 2025) ```typescript const ELEVENLABS_MODELS = { // Latest - highest quality (alpha) v3: 'eleven_v3', // Production ready multilingualV2: 'eleven_multilingual_v2', turboV2_5: 'eleven_turbo_v2_5', // Ultra-low latency flashV2_5: 'eleven_flash_v2_5', flashV2: 'eleven_flash_v2', // English only } as const; ``` ### Usage ```typescript import { ElevenLabsClient } from 'elevenlabs'; const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, }); // Text to speech const audio = await elevenlabs.textToSpeech.convert('voice-id', { text: 'Hello, world!', model_id: 'eleven_turbo_v2_5', voice_settings: { stability: 0.5, similarity_boost: 0.75, }, }); // Stream audio (for real-time) const audioStream = await elevenlabs.textToSpeech.convertAsStream('voice-id', { text: 'Streaming audio...', model_id: 'eleven_flash_v2_5', }); ``` ### Model Selection ``` eleven_v3 (Alpha) ├── Best for: Highest quality, emotional range ├── Latency: ~1s+ (not for 
real-time) ├── Languages: 74 └── Use when: Quality over speed, pre-rendered eleven_turbo_v2_5 ├── Best for: Balanced quality and speed ├── Latency: ~250-300ms ├── Languages: 32 └── Use when: Good quality with reasonable latency eleven_flash_v2_5 ├── Best for: Real-time, conversational AI ├── Latency: <75ms ├── Languages: 32 └── Use when: Live voice agents, chatbots ``` --- ## Replicate ### Documentation - **API Docs**: https://replicate.com/docs - **Models**: https://replicate.com/explore - **Pricing**: https://replicate.com/pricing ### Popular Models (December 2025) ```typescript const REPLICATE_MODELS = { // FLUX.2 (Latest - November 2025) flux2Pro: 'black-forest-labs/flux-2-pro', flux2Flex: 'black-forest-labs/flux-2-flex', flux2Dev: 'black-forest-labs/flux-2-dev', // FLUX.1 (Still excellent) flux11Pro: 'black-forest-labs/flux-1.1-pro', fluxKontext: 'black-forest-labs/flux-kontext', // Image editing fluxSchnell: 'black-forest-labs/flux-schnell', // Video stableVideo4D: 'stability-ai/sv4d-2.0', // Audio musicgen: 'meta/musicgen', // LLMs (if needed outside main providers) llama: 'meta/llama-3.2-90b-vision', } as const; ``` ### Usage ```typescript import Replicate from 'replicate'; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, }); // Image generation with FLUX.2 const output = await replicate.run('black-forest-labs/flux-2-pro', { input: { prompt: 'A serene mountain landscape at sunset', aspect_ratio: '16:9', output_format: 'webp', }, }); // Image editing with Kontext const edited = await replicate.run('black-forest-labs/flux-kontext', { input: { image: 'https://...', prompt: 'Change the sky to sunset colors', }, }); ``` ### Model Selection ``` flux-2-pro ├── Best for: Highest quality, up to 4MP ├── Speed: ~6s ├── Cost: $0.015 + per megapixel └── Use when: Professional quality needed flux-2-flex ├── Best for: Fine details, typography ├── Speed: ~22s ├── Cost: $0.06 per megapixel └── Use when: Need precise control flux-2-dev (Open source) 
├── Best for: Fast generation ├── Speed: ~2.5s ├── Cost: $0.012 per megapixel └── Use when: Speed over quality flux-kontext ├── Best for: Image editing with text ├── Speed: Variable ├── Cost: Per run └── Use when: Edit existing images ``` --- ## Stability AI ### Documentation - **API Docs**: https://platform.stability.ai/docs/api-reference - **Models**: https://stability.ai/stable-image - **Pricing**: https://platform.stability.ai/pricing ### Latest Models (December 2025) ```typescript const STABILITY_MODELS = { // Image generation sd35Large: 'sd3.5-large', sd35LargeTurbo: 'sd3.5-large-turbo', sd3Medium: 'sd3-medium', // Video sv4d: 'sv4d-2.0', // Stable Video 4D 2.0 // Upscaling upscale: 'esrgan-v1-x2plus', } as const; ``` ### Usage ```typescript const response = await fetch( 'https://api.stability.ai/v2beta/stable-image/generate/sd3', { method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${process.env.STABILITY_API_KEY}`, }, body: JSON.stringify({ prompt: 'A futuristic city at night', output_format: 'webp', aspect_ratio: '16:9', model: 'sd3.5-large', }), } ); ``` --- ## Mistral AI ### Documentation - **API Docs**: https://docs.mistral.ai - **Models**: https://docs.mistral.ai/getting-started/models - **Pricing**: https://mistral.ai/technology/#pricing ### Latest Models (December 2025) ```typescript const MISTRAL_MODELS = { // Flagship large: 'mistral-large-latest', // Points to 2411 // Medium tier medium: 'mistral-medium-2505', // Medium 3 // Small/Fast small: 'mistral-small-2506', // Small 3.2 // Code specialized codestral: 'codestral-2508', devstral: 'devstral-medium-2507', // Reasoning (Magistral) magistralMedium: 'magistral-medium-2507', magistralSmall: 'magistral-small-2507', // Audio voxtral: 'voxtral-small-2507', // OCR ocr: 'mistral-ocr-2505', } as const; ``` ### Usage ```typescript import MistralClient from '@mistralai/mistralai'; const client = new MistralClient(process.env.MISTRAL_API_KEY); const response = await 
client.chat({ model: 'mistral-large-latest', messages: [{ role: 'user', content: 'Hello!' }], }); // Code completion with Codestral const codeResponse = await client.chat({ model: 'codestral-2508', messages: [{ role: 'user', content: 'Write a Python function to...' }], }); ``` ### Model Selection ``` mistral-large-latest (123B params) ├── Best for: Complex reasoning, knowledge tasks ├── Context: 128K tokens └── Use when: Need high capability codestral-2508 ├── Best for: Code generation, 80+ languages ├── Speed: 2.5x faster than predecessor └── Use when: Code-focused tasks magistral-medium-2507 ├── Best for: Multi-step reasoning ├── Specialty: Transparent chain-of-thought └── Use when: Need reasoning traces ``` --- ## Voyage AI (Embeddings) ### Documentation - **API Docs**: https://docs.voyageai.com - **Models**: https://docs.voyageai.com/docs/embeddings - **Pricing**: https://www.voyageai.com/pricing ### Latest Models (December 2025) ```typescript const VOYAGE_MODELS = { // General purpose large2: 'voyage-large-2', large2Instruct: 'voyage-large-2-instruct', // Code specialized code2: 'voyage-code-2', code3: 'voyage-code-3', // Multilingual multilingual2: 'voyage-multilingual-2', // Domain specific law2: 'voyage-law-2', finance2: 'voyage-finance-2', } as const; ``` ### Usage ```typescript const response = await fetch('https://api.voyageai.com/v1/embeddings', { method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, }, body: JSON.stringify({ model: 'voyage-code-3', input: ['Your code to embed'], }), }); const { data } = await response.json(); const embedding = data[0].embedding; ``` --- ## Quick Reference ### Cost Comparison (per 1M tokens, approx.) 
| Provider | Cheap | Mid | Premium | |----------|-------|-----|---------| | Anthropic | $0.25 (Haiku) | $3 (Sonnet 4.5) | $5 (Opus 4.5) | | OpenAI | $0.15 (4.1-nano) | $2 (4.1) | $15+ (o3) | | Google | $0.04 (Flash-lite) | $0.08 (Flash) | $1.25 (Pro) | | Mistral | $0.25 (Small) | $2.70 (Medium) | $8 (Large) | ### Best For Each Task ``` Reasoning/Analysis → Claude Opus 4.5, o3, Gemini 3 Pro Code Generation → Claude Sonnet 4.5, Codestral 2508, GPT-4.1 Fast Responses → Claude Haiku, GPT-4.1-mini, Gemini Flash Long Context → Gemini 2.5 Pro (2M), GPT-4.1 (1M), Claude (200K) Vision → GPT-4.1, Claude Sonnet, Gemini 3 Pro Embeddings → Voyage code-3, text-embedding-3-small Voice Synthesis → Eleven Labs v3/flash, OpenAI TTS Image Generation → FLUX.2 Pro, DALL-E 3, SD 3.5 Video Generation → Stable Video 4D 2.0, Runway Image Editing → FLUX Kontext, gpt-image-1 ``` ### Environment Variables Template ```bash # .env.example (NEVER commit actual keys) # LLMs ANTHROPIC_API_KEY=sk-ant-... OPENAI_API_KEY=sk-... GOOGLE_API_KEY=AI... MISTRAL_API_KEY=... # Media ELEVENLABS_API_KEY=... REPLICATE_API_TOKEN=r8_... STABILITY_API_KEY=sk-... # Embeddings VOYAGE_API_KEY=pa-... 
``` ### Model Update Checklist ``` When models update: □ Check official changelog/blog □ Update model ID strings □ Test with existing prompts □ Compare output quality □ Check pricing changes □ Update context limits if changed ``` --- ## Sources - [Anthropic Models](https://docs.anthropic.com/en/docs/about-claude/models/overview) - [OpenAI Models](https://platform.openai.com/docs/models) - [OpenAI o3 Announcement](https://openai.com/index/introducing-o3-and-o4-mini/) - [GPT-4.1 Announcement](https://openai.com/index/gpt-4-1/) - [Google Gemini Models](https://ai.google.dev/gemini-api/docs/models/gemini) - [Eleven Labs Models](https://elevenlabs.io/docs/models) - [Replicate FLUX.2](https://replicate.com/blog/run-flux-2-on-replicate) - [Mistral Models](https://docs.mistral.ai/getting-started/models) - [Voyage AI](https://docs.voyageai.com) ================================================ FILE: skills/android-java/SKILL.md ================================================ --- name: android-java description: Android Java development with MVVM, ViewBinding, and Espresso testing when-to-use: When working on Android Java source files user-invocable: false paths: ["**/*.java", "android/**", "**/build.gradle"] effort: medium --- # Android Java Skill --- ## Project Structure ``` project/ ├── app/ │ ├── src/ │ │ ├── main/ │ │ │ ├── java/com/example/app/ │ │ │ │ ├── data/ # Data layer │ │ │ │ │ ├── local/ # Room database, SharedPreferences │ │ │ │ │ ├── remote/ # Retrofit services, API clients │ │ │ │ │ └── repository/ # Repository implementations │ │ │ │ ├── di/ # Dependency injection (Hilt/Dagger) │ │ │ │ ├── domain/ # Business logic │ │ │ │ │ ├── model/ # Domain models │ │ │ │ │ ├── repository/ # Repository interfaces │ │ │ │ │ └── usecase/ # Use cases │ │ │ │ ├── ui/ # Presentation layer │ │ │ │ │ ├── feature/ # Feature screens │ │ │ │ │ │ ├── FeatureActivity.java │ │ │ │ │ │ ├── FeatureFragment.java │ │ │ │ │ │ └── FeatureViewModel.java │ │ │ │ │ └── common/ # Shared UI 
components │ │ │ │ └── App.java # Application class │ │ │ ├── res/ │ │ │ │ ├── layout/ │ │ │ │ ├── values/ │ │ │ │ └── drawable/ │ │ │ └── AndroidManifest.xml │ │ ├── test/ # Unit tests │ │ └── androidTest/ # Instrumentation tests │ └── build.gradle ├── build.gradle # Project-level build file ├── gradle.properties ├── settings.gradle └── CLAUDE.md ``` --- ## Gradle Configuration ### App-level build.gradle ```groovy plugins { id 'com.android.application' } android { namespace 'com.example.app' compileSdk 34 defaultConfig { applicationId "com.example.app" minSdk 24 targetSdk 34 versionCode 1 versionName "1.0" testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" } buildTypes { release { minifyEnabled true proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' } } compileOptions { sourceCompatibility JavaVersion.VERSION_17 targetCompatibility JavaVersion.VERSION_17 } buildFeatures { viewBinding true } } dependencies { // AndroidX implementation 'androidx.core:core:1.12.0' implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.11.0' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' // Lifecycle implementation 'androidx.lifecycle:lifecycle-viewmodel:2.7.0' implementation 'androidx.lifecycle:lifecycle-livedata:2.7.0' // Testing testImplementation 'junit:junit:4.13.2' testImplementation 'org.mockito:mockito-core:5.8.0' androidTestImplementation 'androidx.test.ext:junit:1.1.5' androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1' } ``` --- ## Architecture Patterns ### MVVM with ViewModel ```java // ViewModel - holds UI state, survives configuration changes public class UserViewModel extends ViewModel { private final UserRepository repository; private final MutableLiveData<User> user = new MutableLiveData<>(); private final MutableLiveData<Boolean> loading = new MutableLiveData<>(false); private final MutableLiveData<String> error = new 
MutableLiveData<>(); public UserViewModel(UserRepository repository) { this.repository = repository; } public LiveData<User> getUser() { return user; } public LiveData<Boolean> isLoading() { return loading; } public LiveData<String> getError() { return error; } public void loadUser(String userId) { loading.setValue(true); repository.getUser(userId, new Callback<User>() { @Override public void onSuccess(User result) { user.setValue(result); loading.setValue(false); } @Override public void onError(String message) { error.setValue(message); loading.setValue(false); } }); } } ``` ### Repository Pattern ```java // Repository interface (domain layer) public interface UserRepository { void getUser(String userId, Callback<User> callback); void saveUser(User user, Callback<Void> callback); } // Repository implementation (data layer) public class UserRepositoryImpl implements UserRepository { private final UserApi api; private final UserDao dao; public UserRepositoryImpl(UserApi api, UserDao dao) { this.api = api; this.dao = dao; } @Override public void getUser(String userId, Callback<User> callback) { // Try cache first, then network User cached = dao.getUserById(userId); if (cached != null) { callback.onSuccess(cached); return; } api.getUser(userId).enqueue(new retrofit2.Callback<User>() { @Override public void onResponse(Call<User> call, Response<User> response) { if (response.isSuccessful() && response.body() != null) { dao.insert(response.body()); callback.onSuccess(response.body()); } else { callback.onError("Failed to load user"); } } @Override public void onFailure(Call<User> call, Throwable t) { callback.onError(t.getMessage()); } }); } } ``` --- ## Activity & Fragment Patterns ### Activity with ViewBinding ```java public class MainActivity extends AppCompatActivity { private ActivityMainBinding binding; private MainViewModel viewModel; @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); binding = 
ActivityMainBinding.inflate(getLayoutInflater()); setContentView(binding.getRoot()); viewModel = new ViewModelProvider(this).get(MainViewModel.class); setupObservers(); setupListeners(); } private void setupObservers() { viewModel.getUser().observe(this, user -> { binding.userName.setText(user.getName()); }); viewModel.isLoading().observe(this, isLoading -> { binding.progressBar.setVisibility(isLoading ? View.VISIBLE : View.GONE); }); } private void setupListeners() { binding.refreshButton.setOnClickListener(v -> { viewModel.loadUser(getCurrentUserId()); }); } @Override protected void onDestroy() { super.onDestroy(); binding = null; } } ``` ### Fragment with ViewBinding ```java public class UserFragment extends Fragment { private FragmentUserBinding binding; private UserViewModel viewModel; @Override public View onCreateView(LayoutInflater inflater, ViewGroup container, Bundle savedInstanceState) { binding = FragmentUserBinding.inflate(inflater, container, false); return binding.getRoot(); } @Override public void onViewCreated(View view, Bundle savedInstanceState) { super.onViewCreated(view, savedInstanceState); viewModel = new ViewModelProvider(requireActivity()).get(UserViewModel.class); setupObservers(); } private void setupObservers() { viewModel.getUser().observe(getViewLifecycleOwner(), user -> { binding.userName.setText(user.getName()); }); } @Override public void onDestroyView() { super.onDestroyView(); binding = null; // Prevent memory leaks } } ``` --- ## Testing ### Unit Tests with JUnit & Mockito ```java @RunWith(MockitoJUnitRunner.class) public class UserViewModelTest { @Mock private UserRepository repository; @Rule public InstantTaskExecutorRule instantTaskExecutorRule = new InstantTaskExecutorRule(); private UserViewModel viewModel; @Before public void setup() { viewModel = new UserViewModel(repository); } @Test public void loadUser_success_updatesUserLiveData() { // Arrange User expectedUser = new User("1", "John Doe"); doAnswer(invocation -> { 
Callback<User> callback = invocation.getArgument(1); callback.onSuccess(expectedUser); return null; }).when(repository).getUser(eq("1"), any()); // Act viewModel.loadUser("1"); // Assert assertEquals(expectedUser, viewModel.getUser().getValue()); assertFalse(viewModel.isLoading().getValue()); } @Test public void loadUser_error_updatesErrorLiveData() { // Arrange doAnswer(invocation -> { Callback<User> callback = invocation.getArgument(1); callback.onError("Network error"); return null; }).when(repository).getUser(eq("1"), any()); // Act viewModel.loadUser("1"); // Assert assertEquals("Network error", viewModel.getError().getValue()); assertFalse(viewModel.isLoading().getValue()); } } ``` ### Instrumentation Tests with Espresso ```java @RunWith(AndroidJUnit4.class) public class MainActivityTest { @Rule public ActivityScenarioRule<MainActivity> activityRule = new ActivityScenarioRule<>(MainActivity.class); @Test public void userName_isDisplayed() { onView(withId(R.id.userName)) .check(matches(isDisplayed())); } @Test public void refreshButton_click_triggersRefresh() { onView(withId(R.id.refreshButton)) .perform(click()); onView(withId(R.id.progressBar)) .check(matches(isDisplayed())); } @Test public void userList_scrollToItem_displaysCorrectly() { onView(withId(R.id.userList)) .perform(RecyclerViewActions.scrollToPosition(10)); onView(withText("User 10")) .check(matches(isDisplayed())); } } ``` --- ## GitHub Actions ```yaml name: Android CI on: push: branches: [main] pull_request: branches: [main] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up JDK 17 uses: actions/setup-java@v4 with: java-version: '17' distribution: 'temurin' - name: Setup Gradle uses: gradle/actions/setup-gradle@v3 - name: Grant execute permission for gradlew run: chmod +x gradlew - name: Run Lint run: ./gradlew lint - name: Run Unit Tests run: ./gradlew testDebugUnitTest - name: Build Debug APK run: ./gradlew assembleDebug - name: Upload APK uses: 
actions/upload-artifact@v4 with: name: debug-apk path: app/build/outputs/apk/debug/app-debug.apk instrumentation-tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up JDK 17 uses: actions/setup-java@v4 with: java-version: '17' distribution: 'temurin' - name: Enable KVM run: | echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules sudo udevadm control --reload-rules sudo udevadm trigger --name-match=kvm - name: Run Instrumentation Tests uses: reactivecircus/android-emulator-runner@v2 with: api-level: 29 script: ./gradlew connectedDebugAndroidTest ``` --- ## Lint Configuration ### lint.xml ```xml <?xml version="1.0" encoding="UTF-8"?> <lint> <!-- Treat these as errors --> <issue id="HardcodedText" severity="error" /> <issue id="MissingTranslation" severity="error" /> <issue id="UnusedResources" severity="warning" /> <!-- Memory leak detection --> <issue id="StaticFieldLeak" severity="error" /> <!-- Security --> <issue id="HardcodedDebugMode" severity="error" /> <issue id="AllowBackup" severity="warning" /> <!-- Performance --> <issue id="ViewHolder" severity="error" /> <issue id="Overdraw" severity="warning" /> <!-- Ignore for tests --> <issue id="InvalidPackage"> <ignore path="**/test/**" /> <ignore path="**/androidTest/**" /> </issue> </lint> ``` ### build.gradle lint options ```groovy android { lint { abortOnError true warningsAsErrors false checkReleaseBuilds true xmlReport true htmlReport true } } ``` --- ## Common Patterns ### Null-Safe Callbacks ```java // Define callback interface public interface Callback<T> { void onSuccess(T result); void onError(String message); } // Use with null checks public void fetchData(Callback<Data> callback) { if (callback == null) return; try { Data result = performFetch(); callback.onSuccess(result); } catch (Exception e) { callback.onError(e.getMessage()); } } ``` ### Safe Context Usage ```java // Use application context for long-lived 
objects public class DataManager { private final Context appContext; public DataManager(Context context) { // Always use application context to prevent Activity leaks this.appContext = context.getApplicationContext(); } } // Check for null context in callbacks private void updateUI() { Context context = getContext(); if (context == null || !isAdded()) return; // Safe to use context } ``` ### Thread-Safe Singleton ```java public class ApiClient { private static volatile ApiClient instance; private final Retrofit retrofit; private ApiClient() { retrofit = new Retrofit.Builder() .baseUrl(BASE_URL) .addConverterFactory(GsonConverterFactory.create()) .build(); } public static ApiClient getInstance() { if (instance == null) { synchronized (ApiClient.class) { if (instance == null) { instance = new ApiClient(); } } } return instance; } } ``` --- ## Android Anti-Patterns - ❌ **Context leaks** - Never hold Activity/Fragment references in static fields or singletons - ❌ **Memory leaks in callbacks** - Always use WeakReference or clear callbacks in onDestroy - ❌ **UI updates on background thread** - Always post to main thread for UI changes - ❌ **Hardcoded strings** - Use string resources for all user-visible text - ❌ **God Activities** - Keep Activities under 200 lines, extract logic to ViewModels - ❌ **NetworkOnMainThreadException** - Never perform network calls on main thread - ❌ **Ignoring lifecycle** - Always respect Activity/Fragment lifecycle states - ❌ **Blocking the main thread** - Keep main thread operations under 16ms - ❌ **Not handling configuration changes** - Use ViewModel to survive rotation - ❌ **Hardcoded dimensions** - Use dp/sp units and dimension resources - ❌ **Deep view hierarchies** - Keep layout depth under 10 levels, use ConstraintLayout - ❌ **Not closing resources** - Always close Cursor, InputStream, database connections ================================================ FILE: skills/android-kotlin/SKILL.md 
================================================ --- name: android-kotlin description: Android Kotlin development with Coroutines, Jetpack Compose, Hilt, and MockK testing when-to-use: When working on Android Kotlin source files user-invocable: false paths: ["**/*.kt", "**/*.kts", "android/**", "**/build.gradle.kts"] effort: medium --- # Android Kotlin Skill --- ## Project Structure ``` project/ ├── app/ │ ├── src/ │ │ ├── main/ │ │ │ ├── kotlin/com/example/app/ │ │ │ │ ├── data/ # Data layer │ │ │ │ │ ├── local/ # Room database │ │ │ │ │ ├── remote/ # Retrofit/Ktor services │ │ │ │ │ └── repository/ # Repository implementations │ │ │ │ ├── di/ # Hilt modules │ │ │ │ ├── domain/ # Business logic │ │ │ │ │ ├── model/ # Domain models │ │ │ │ │ ├── repository/ # Repository interfaces │ │ │ │ │ └── usecase/ # Use cases │ │ │ │ ├── ui/ # Presentation layer │ │ │ │ │ ├── feature/ # Feature screens │ │ │ │ │ │ ├── FeatureScreen.kt # Compose UI │ │ │ │ │ │ └── FeatureViewModel.kt │ │ │ │ │ ├── components/ # Reusable Compose components │ │ │ │ │ └── theme/ # Material theme │ │ │ │ └── App.kt # Application class │ │ │ ├── res/ │ │ │ └── AndroidManifest.xml │ │ ├── test/ # Unit tests │ │ └── androidTest/ # Instrumentation tests │ └── build.gradle.kts ├── build.gradle.kts # Project-level build file ├── gradle.properties ├── settings.gradle.kts └── CLAUDE.md ``` --- ## Gradle Configuration (Kotlin DSL) ### App-level build.gradle.kts ```kotlin plugins { id("com.android.application") id("org.jetbrains.kotlin.android") id("com.google.dagger.hilt.android") id("com.google.devtools.ksp") } android { namespace = "com.example.app" compileSdk = 34 defaultConfig { applicationId = "com.example.app" minSdk = 24 targetSdk = 34 versionCode = 1 versionName = "1.0" testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" } buildTypes { release { isMinifyEnabled = true proguardFiles( getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro" ) } } compileOptions 
{ sourceCompatibility = JavaVersion.VERSION_17 targetCompatibility = JavaVersion.VERSION_17 } kotlinOptions { jvmTarget = "17" } buildFeatures { compose = true } composeOptions { kotlinCompilerExtensionVersion = "1.5.8" } } dependencies { // Compose BOM val composeBom = platform("androidx.compose:compose-bom:2024.01.00") implementation(composeBom) implementation("androidx.compose.ui:ui") implementation("androidx.compose.ui:ui-tooling-preview") implementation("androidx.compose.material3:material3") implementation("androidx.activity:activity-compose:1.8.2") implementation("androidx.lifecycle:lifecycle-viewmodel-compose:2.7.0") // Coroutines implementation("org.jetbrains.kotlinx:kotlinx-coroutines-android:1.7.3") // Hilt implementation("com.google.dagger:hilt-android:2.50") ksp("com.google.dagger:hilt-compiler:2.50") implementation("androidx.hilt:hilt-navigation-compose:1.1.0") // Room implementation("androidx.room:room-runtime:2.6.1") implementation("androidx.room:room-ktx:2.6.1") ksp("androidx.room:room-compiler:2.6.1") // Testing testImplementation("junit:junit:4.13.2") testImplementation("io.mockk:mockk:1.13.9") testImplementation("org.jetbrains.kotlinx:kotlinx-coroutines-test:1.7.3") testImplementation("app.cash.turbine:turbine:1.0.0") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.compose.ui:ui-test-junit4") debugImplementation("androidx.compose.ui:ui-tooling") debugImplementation("androidx.compose.ui:ui-test-manifest") } ``` --- ## Kotlin Coroutines & Flow ### ViewModel with StateFlow ```kotlin @HiltViewModel class UserViewModel @Inject constructor( private val getUserUseCase: GetUserUseCase, private val savedStateHandle: SavedStateHandle ) : ViewModel() { private val _uiState = MutableStateFlow(UserUiState()) val uiState: StateFlow<UserUiState> = _uiState.asStateFlow() private val userId: String = checkNotNull(savedStateHandle["userId"]) init { loadUser() } fun loadUser() { viewModelScope.launch { _uiState.update 
{ it.copy(isLoading = true) } getUserUseCase(userId) .catch { e -> _uiState.update { it.copy(isLoading = false, error = e.message) } } .collect { user -> _uiState.update { it.copy(isLoading = false, user = user, error = null) } } } } fun clearError() { _uiState.update { it.copy(error = null) } } } data class UserUiState( val user: User? = null, val isLoading: Boolean = false, val error: String? = null ) ``` ### Repository with Flow ```kotlin interface UserRepository { fun getUser(userId: String): Flow<User> fun observeUsers(): Flow<List<User>> suspend fun saveUser(user: User) } class UserRepositoryImpl @Inject constructor( private val api: UserApi, private val dao: UserDao, private val dispatcher: CoroutineDispatcher = Dispatchers.IO ) : UserRepository { override fun getUser(userId: String): Flow<User> = flow { // Emit cached data first dao.getUserById(userId)?.let { emit(it) } // Fetch from network and update cache val remoteUser = api.getUser(userId) dao.insert(remoteUser) emit(remoteUser) }.flowOn(dispatcher) override fun observeUsers(): Flow<List<User>> = dao.observeAllUsers().flowOn(dispatcher) override suspend fun saveUser(user: User) = withContext(dispatcher) { api.saveUser(user) dao.insert(user) } } ``` --- ## Jetpack Compose ### Screen with ViewModel ```kotlin @Composable fun UserScreen( viewModel: UserViewModel = hiltViewModel(), onNavigateBack: () -> Unit ) { val uiState by viewModel.uiState.collectAsStateWithLifecycle() UserScreenContent( uiState = uiState, onRefresh = viewModel::loadUser, onErrorDismiss = viewModel::clearError, onNavigateBack = onNavigateBack ) } @Composable private fun UserScreenContent( uiState: UserUiState, onRefresh: () -> Unit, onErrorDismiss: () -> Unit, onNavigateBack: () -> Unit ) { Scaffold( topBar = { TopAppBar( title = { Text("User Profile") }, navigationIcon = { IconButton(onClick = onNavigateBack) { Icon(Icons.AutoMirrored.Filled.ArrowBack, "Back") } } ) } ) { padding -> Box( modifier = Modifier .fillMaxSize() 
.padding(padding) ) { when { uiState.isLoading -> { CircularProgressIndicator( modifier = Modifier.align(Alignment.Center) ) } uiState.user != null -> { UserContent(user = uiState.user) } } uiState.error?.let { error -> Snackbar( modifier = Modifier.align(Alignment.BottomCenter), action = { TextButton(onClick = onErrorDismiss) { Text("Dismiss") } } ) { Text(error) } } } } } ``` --- ## Sealed Classes for State ### Result Wrapper ```kotlin sealed interface Result<out T> { data class Success<T>(val data: T) : Result<T> data class Error(val exception: Throwable) : Result<Nothing> data object Loading : Result<Nothing> } fun <T> Result<T>.getOrNull(): T? = (this as? Result.Success)?.data inline fun <T, R> Result<T>.map(transform: (T) -> R): Result<R> = when (this) { is Result.Success -> Result.Success(transform(data)) is Result.Error -> this is Result.Loading -> this } ``` --- ## Testing with MockK & Turbine ### ViewModel Tests ```kotlin @OptIn(ExperimentalCoroutinesApi::class) class UserViewModelTest { @get:Rule val mainDispatcherRule = MainDispatcherRule() private val getUserUseCase: GetUserUseCase = mockk() private val savedStateHandle = SavedStateHandle(mapOf("userId" to "123")) private lateinit var viewModel: UserViewModel @Before fun setup() { viewModel = UserViewModel(getUserUseCase, savedStateHandle) } @Test fun `loadUser success updates state with user`() = runTest { val user = User("123", "John Doe", "john@example.com") coEvery { getUserUseCase("123") } returns flowOf(user) viewModel.uiState.test { val initial = awaitItem() assertFalse(initial.isLoading) viewModel.loadUser() val loading = awaitItem() assertTrue(loading.isLoading) val success = awaitItem() assertFalse(success.isLoading) assertEquals(user, success.user) } } } class MainDispatcherRule( private val dispatcher: TestDispatcher = UnconfinedTestDispatcher() ) : TestWatcher() { override fun starting(description: Description) { Dispatchers.setMain(dispatcher) } override fun finished(description: 
Description) { Dispatchers.resetMain() } } ``` --- ## GitHub Actions ```yaml name: Android Kotlin CI on: push: branches: [main] pull_request: branches: [main] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up JDK 17 uses: actions/setup-java@v4 with: java-version: '17' distribution: 'temurin' - name: Setup Gradle uses: gradle/actions/setup-gradle@v3 - name: Run Detekt run: ./gradlew detekt - name: Run Ktlint run: ./gradlew ktlintCheck - name: Run Unit Tests run: ./gradlew testDebugUnitTest - name: Build Debug APK run: ./gradlew assembleDebug ``` --- ## Lint Configuration ### detekt.yml ```yaml build: maxIssues: 0 complexity: LongMethod: threshold: 20 LongParameterList: functionThreshold: 4 TooManyFunctions: thresholdInFiles: 10 style: MaxLineLength: maxLineLength: 120 WildcardImport: active: true coroutines: GlobalCoroutineUsage: active: true ``` --- ## Kotlin Anti-Patterns - ❌ **Blocking coroutines on Main** - Never use `runBlocking` on main thread - ❌ **GlobalScope usage** - Use structured concurrency with viewModelScope/lifecycleScope - ❌ **Collecting flows in init** - Use `repeatOnLifecycle` or `collectAsStateWithLifecycle` - ❌ **Mutable state exposure** - Expose `StateFlow` not `MutableStateFlow` - ❌ **Not handling exceptions in flows** - Always use `catch` operator - ❌ **Lateinit for nullable** - Use `lazy` or nullable with `?` - ❌ **Hardcoded dispatchers** - Inject dispatchers for testability - ❌ **Not using sealed classes** - Prefer sealed for finite state sets - ❌ **Side effects in Composables** - Use `LaunchedEffect`/`SideEffect` - ❌ **Unstable Compose parameters** - Use stable/immutable types or `@Stable` ================================================ FILE: skills/aws-aurora/SKILL.md ================================================ --- name: aws-aurora description: AWS Aurora Serverless v2, RDS Proxy, Data API, connection pooling when-to-use: When working with AWS Aurora/RDS databases user-invocable: false paths: 
["**/rds*", "**/aurora*", "serverless.*", "template.yaml"] effort: medium --- # AWS Aurora Skill Amazon Aurora is a MySQL/PostgreSQL-compatible relational database with serverless scaling, high availability, and enterprise features. **Sources:** [Aurora Docs](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/) | [Serverless v2](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.html) | [RDS Proxy](https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-proxy.html) --- ## Core Principle **Use RDS Proxy for serverless, Data API for simplicity, connection pooling always.** Aurora excels at ACID-compliant workloads. For serverless architectures (Lambda), always use RDS Proxy or Data API to handle connection management. Never open raw connections from Lambda functions. --- ## Aurora Options | Option | Best For | |--------|----------| | **Aurora Serverless v2** | Variable workloads, auto-scaling (0.5-128 ACUs) | | **Aurora Provisioned** | Predictable workloads, maximum performance | | **Aurora Global** | Multi-region, disaster recovery | | **Data API** | Serverless without VPC, simple HTTP access | | **RDS Proxy** | Connection pooling for Lambda, high concurrency | --- ## Connection Strategies ### Strategy 1: RDS Proxy (Recommended for Lambda) ``` Lambda → RDS Proxy → Aurora (pool) ``` - Connection pooling and reuse - Automatic failover handling - IAM authentication support - Works with existing SQL clients ### Strategy 2: Data API (Simplest for Serverless) ``` Lambda → Data API (HTTP) → Aurora ``` - No VPC required - No connection management - Higher latency per query - Limited to Aurora Serverless ### Strategy 3: Direct Connection (Not for Lambda) ``` App Server → Aurora (persistent connection) ``` - Only for long-running servers (ECS, EC2) - Manage connection pool yourself - Not suitable for serverless --- ## RDS Proxy Setup ### Create Proxy (AWS Console/CDK) ```typescript // CDK example import * as rds from 
'aws-cdk-lib/aws-rds'; const proxy = new rds.DatabaseProxy(this, 'Proxy', { proxyTarget: rds.ProxyTarget.fromCluster(cluster), secrets: [cluster.secret!], vpc, securityGroups: [proxySecurityGroup], requireTLS: true, idleClientTimeout: cdk.Duration.minutes(30), maxConnectionsPercent: 90, maxIdleConnectionsPercent: 10, borrowTimeout: cdk.Duration.seconds(30) }); ``` ### Connect via Proxy (TypeScript/Node.js) ```typescript // lib/db.ts import { Pool } from 'pg'; import { Signer } from '@aws-sdk/rds-signer'; const signer = new Signer({ hostname: process.env.RDS_PROXY_ENDPOINT!, port: 5432, username: process.env.DB_USER!, region: process.env.AWS_REGION! }); // IAM authentication (tokens expire after 15 minutes, so pg is given a function and fetches a fresh token for every new connection) async function getPool(): Promise<Pool> { return new Pool({ host: process.env.RDS_PROXY_ENDPOINT, port: 5432, database: process.env.DB_NAME, user: process.env.DB_USER, password: () => signer.getAuthToken(), ssl: { rejectUnauthorized: true }, max: 1, // Single connection for Lambda idleTimeoutMillis: 120000, connectionTimeoutMillis: 10000 }); } // Usage in Lambda let pool: Pool | null = null; export async function handler(event: any) { if (!pool) { pool = await getPool(); } const result = await pool.query('SELECT * FROM users WHERE id = $1', [event.userId]); return result.rows[0]; } ``` ### Proxy Configuration Best Practices ```bash # Key settings for Lambda workloads MaxConnectionsPercent: 90 # Use most of DB connections MaxIdleConnectionsPercent: 10 # Keep some idle for bursts ConnectionBorrowTimeout: 30s # Wait for available connection IdleClientTimeout: 30min # Close idle proxy connections # Monitor these CloudWatch metrics: # - DatabaseConnectionsCurrentlyBorrowed # - DatabaseConnectionsCurrentlySessionPinned # - QueryDatabaseResponseLatency ``` --- ## Data API (HTTP-based) ### Enable Data API ```bash # Must be Aurora Serverless aws rds modify-db-cluster \ --db-cluster-identifier my-cluster \ --enable-http-endpoint ``` ### TypeScript with Data API Client v2 ```bash npm
install data-api-client ``` ```typescript // lib/db.ts import DataAPIClient from 'data-api-client'; const db = DataAPIClient({ secretArn: process.env.DB_SECRET_ARN!, resourceArn: process.env.DB_CLUSTER_ARN!, database: process.env.DB_NAME!, region: process.env.AWS_REGION! }); // Simple query const users = await db.query('SELECT * FROM users WHERE active = :active', { active: true }); // Insert with returning const result = await db.query( 'INSERT INTO users (email, name) VALUES (:email, :name) RETURNING *', { email: 'user@test.com', name: 'Test User' } ); // Transaction const transaction = await db.transaction(); try { await transaction.query('UPDATE accounts SET balance = balance - :amount WHERE id = :from', { amount: 100, from: 1 }); await transaction.query('UPDATE accounts SET balance = balance + :amount WHERE id = :to', { amount: 100, to: 2 }); await transaction.commit(); } catch (error) { await transaction.rollback(); throw error; } ``` ### Python with boto3 ```python # requirements.txt boto3>=1.34.0 # db.py import boto3 import os rds_data = boto3.client('rds-data') CLUSTER_ARN = os.environ['DB_CLUSTER_ARN'] SECRET_ARN = os.environ['DB_SECRET_ARN'] DATABASE = os.environ['DB_NAME'] def execute_sql(sql: str, parameters: list = None): """Execute SQL via Data API.""" params = { 'resourceArn': CLUSTER_ARN, 'secretArn': SECRET_ARN, 'database': DATABASE, 'sql': sql } if parameters: params['parameters'] = parameters return rds_data.execute_statement(**params) def get_user(user_id: int): result = execute_sql( 'SELECT * FROM users WHERE id = :id', [{'name': 'id', 'value': {'longValue': user_id}}] ) return result.get('records', []) def create_user(email: str, name: str): result = execute_sql( 'INSERT INTO users (email, name) VALUES (:email, :name) RETURNING *', [ {'name': 'email', 'value': {'stringValue': email}}, {'name': 'name', 'value': {'stringValue': name}} ] ) return result.get('generatedFields') # Transaction def transfer_funds(from_id: int, to_id: int, amount: 
float): transaction = rds_data.begin_transaction( resourceArn=CLUSTER_ARN, secretArn=SECRET_ARN, database=DATABASE ) transaction_id = transaction['transactionId'] try: rds_data.execute_statement( resourceArn=CLUSTER_ARN, secretArn=SECRET_ARN, database=DATABASE, transactionId=transaction_id, sql='UPDATE accounts SET balance = balance - :amount WHERE id = :id', parameters=[ {'name': 'amount', 'value': {'doubleValue': amount}}, {'name': 'id', 'value': {'longValue': from_id}} ] ) rds_data.execute_statement( resourceArn=CLUSTER_ARN, secretArn=SECRET_ARN, database=DATABASE, transactionId=transaction_id, sql='UPDATE accounts SET balance = balance + :amount WHERE id = :id', parameters=[ {'name': 'amount', 'value': {'doubleValue': amount}}, {'name': 'id', 'value': {'longValue': to_id}} ] ) rds_data.commit_transaction( resourceArn=CLUSTER_ARN, secretArn=SECRET_ARN, transactionId=transaction_id ) except Exception as e: rds_data.rollback_transaction( resourceArn=CLUSTER_ARN, secretArn=SECRET_ARN, transactionId=transaction_id ) raise e ``` --- ## Prisma with Aurora ### Setup (VPC Connection via RDS Proxy) ```bash npm install prisma @prisma/client npx prisma init ``` ```prisma // prisma/schema.prisma generator client { provider = "prisma-client-js" } datasource db { provider = "postgresql" url = env("DATABASE_URL") } model User { id Int @id @default(autoincrement()) email String @unique name String posts Post[] createdAt DateTime @default(now()) updatedAt DateTime @updatedAt } model Post { id Int @id @default(autoincrement()) title String content String?
published Boolean @default(false) author User @relation(fields: [authorId], references: [id]) authorId Int createdAt DateTime @default(now()) } ``` ### Environment ```bash # Use RDS Proxy endpoint DATABASE_URL="postgresql://user:password@proxy-endpoint.proxy-xxx.region.rds.amazonaws.com:5432/mydb?schema=public&connection_limit=1" ``` ### Lambda Handler with Prisma ```typescript // handlers/users.ts import { PrismaClient } from '@prisma/client'; // Reuse client across invocations let prisma: PrismaClient | null = null; function getPrisma(): PrismaClient { if (!prisma) { prisma = new PrismaClient({ datasources: { db: { url: process.env.DATABASE_URL } } }); } return prisma; } export async function handler(event: any) { const db = getPrisma(); const users = await db.user.findMany({ include: { posts: true }, take: 10 }); return { statusCode: 200, body: JSON.stringify(users) }; } ``` --- ## Aurora Serverless v2 ### Capacity Configuration ```typescript // CDK const cluster = new rds.DatabaseCluster(this, 'Cluster', { engine: rds.DatabaseClusterEngine.auroraPostgres({ version: rds.AuroraPostgresEngineVersion.VER_15_4 }), serverlessV2MinCapacity: 0.5, // Minimum ACUs serverlessV2MaxCapacity: 16, // Maximum ACUs writer: rds.ClusterInstance.serverlessV2('writer'), readers: [ rds.ClusterInstance.serverlessV2('reader', { scaleWithWriter: true }) ], vpc, vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS } }); ``` ### Capacity Guidelines | Workload | Min ACUs | Max ACUs | |----------|----------|----------| | Dev/Test | 0.5 | 2 | | Small Production | 2 | 8 | | Medium Production | 4 | 32 | | Large Production | 8 | 128 | ### Handle Scale-to-Zero Wake-up ```typescript // Data API Client v2 handles this automatically // For direct connections, implement retry logic: import { Pool } from 'pg'; async function queryWithRetry( pool: Pool, sql: string, params: any[], maxRetries = 3 ): Promise<any> { for (let attempt = 1; attempt <= maxRetries; attempt++) { try { return await 
pool.query(sql, params); } catch (error: any) { // Aurora Serverless waking up if (error.code === 'ETIMEDOUT' || error.message?.includes('Communications link failure')) { if (attempt === maxRetries) throw error; // Exponential backoff await new Promise(resolve => setTimeout(resolve, Math.pow(2, attempt) * 1000)); continue; } throw error; } } } ``` --- ## Migrations ### Using Prisma Migrate ```bash # Development (creates migration) npx prisma migrate dev --name add_users_table # Production (apply migrations) npx prisma migrate deploy # Generate client npx prisma generate ``` ### CI/CD Migration Script ```yaml # .github/workflows/deploy.yml - name: Run migrations run: | # Connect via bastion or use a migration Lambda npx prisma migrate deploy env: DATABASE_URL: ${{ secrets.DATABASE_URL }} ``` ### Migration Lambda ```typescript // lambdas/migrate.ts import { execSync } from 'child_process'; export async function handler() { try { execSync('npx prisma migrate deploy', { env: { ...process.env, DATABASE_URL: process.env.DATABASE_URL }, stdio: 'inherit' }); return { statusCode: 200, body: 'Migrations applied' }; } catch (error) { console.error('Migration failed:', error); throw error; } } ``` --- ## Connection Pooling (Non-Lambda) ### PgBouncer Sidecar (ECS/EKS) ```yaml # docker-compose.yml services: app: build: . 
environment: DATABASE_URL: postgresql://user:pass@pgbouncer:6432/mydb pgbouncer: image: edoburu/pgbouncer environment: DATABASE_URL: postgresql://user:pass@aurora-endpoint:5432/mydb POOL_MODE: transaction MAX_CLIENT_CONN: 1000 DEFAULT_POOL_SIZE: 20 ``` ### Application-Level Pooling ```typescript // For long-running servers (not Lambda) import { Pool } from 'pg'; const pool = new Pool({ host: process.env.DB_HOST, port: 5432, database: process.env.DB_NAME, user: process.env.DB_USER, password: process.env.DB_PASSWORD, max: 20, // Max connections idleTimeoutMillis: 30000, // Close idle after 30s connectionTimeoutMillis: 10000 }); // Use pool for all queries export async function query(sql: string, params?: any[]) { const client = await pool.connect(); try { return await client.query(sql, params); } finally { client.release(); } } ``` --- ## Monitoring ### Key CloudWatch Metrics ``` # Aurora - CPUUtilization - DatabaseConnections - FreeableMemory - ServerlessDatabaseCapacity (ACUs) - AuroraReplicaLag # RDS Proxy - DatabaseConnectionsCurrentlyBorrowed - DatabaseConnectionsCurrentlySessionPinned - QueryDatabaseResponseLatency - ClientConnectionsReceived ``` ### Performance Insights ```bash # Enable via console or CLI aws rds modify-db-cluster \ --db-cluster-identifier my-cluster \ --enable-performance-insights \ --performance-insights-retention-period 7 ``` --- ## Security ### IAM Database Authentication ```typescript import { Signer } from '@aws-sdk/rds-signer'; const signer = new Signer({ hostname: process.env.DB_HOST!, port: 5432, username: 'iam_user', region: 'us-east-1' }); const token = await signer.getAuthToken(); // Use token as password (valid for 15 minutes) const pool = new Pool({ host: process.env.DB_HOST, user: 'iam_user', password: token, ssl: true }); ``` ### Secrets Manager Rotation ```typescript import { SecretsManagerClient, GetSecretValueCommand } from '@aws-sdk/client-secrets-manager'; const client = new SecretsManagerClient({ region: 'us-east-1' }); 
async function getDbCredentials() { const response = await client.send( new GetSecretValueCommand({ SecretId: process.env.DB_SECRET_ARN }) ); return JSON.parse(response.SecretString!); } ``` --- ## CLI Quick Reference ```bash # Cluster operations aws rds describe-db-clusters aws rds create-db-cluster --engine aurora-postgresql --db-cluster-identifier my-cluster aws rds delete-db-cluster --db-cluster-identifier my-cluster --skip-final-snapshot # Serverless v2 aws rds modify-db-cluster \ --db-cluster-identifier my-cluster \ --serverless-v2-scaling-configuration MinCapacity=0.5,MaxCapacity=16 # Data API aws rds-data execute-statement \ --resource-arn "$CLUSTER_ARN" \ --secret-arn "$SECRET_ARN" \ --database mydb \ --sql "SELECT * FROM users" # Proxy aws rds describe-db-proxies aws rds create-db-proxy --db-proxy-name my-proxy --engine-family POSTGRESQL ... # Snapshots aws rds create-db-cluster-snapshot --db-cluster-identifier my-cluster --db-cluster-snapshot-identifier backup-1 aws rds restore-db-cluster-from-snapshot --db-cluster-identifier restored --snapshot-identifier backup-1 --engine aurora-postgresql ``` --- ## Anti-Patterns - **Direct Lambda→Aurora connections** - Always use RDS Proxy or Data API - **No connection limits** - Set `max: 1` for Lambda, use pooling for servers - **Ignoring cold starts** - Serverless v2 needs time to scale; keep minimum ACUs for production - **No read replicas** - Offload reads to replicas for heavy workloads - **Missing IAM auth** - Use IAM over static passwords when possible - **No retry logic** - Handle transient errors from scaling/failover - **Over-provisioned capacity** - Use Serverless v2 for variable workloads - **Skipping Secrets Manager** - Never hardcode credentials ================================================ FILE: skills/aws-dynamodb/SKILL.md ================================================ --- name: aws-dynamodb description: AWS DynamoDB single-table design, GSI patterns, SDK v3 TypeScript/Python when-to-use: When working with DynamoDB tables or
AWS SDK data operations user-invocable: false paths: ["**/dynamodb*", "**/dynamo*", "serverless.*", "template.yaml"] effort: medium --- # AWS DynamoDB Skill DynamoDB is a fully managed NoSQL database designed for single-digit millisecond performance at any scale. Master single-table design and access pattern modeling. **Sources:** [DynamoDB Docs](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/) | [SDK v3](https://docs.aws.amazon.com/AWSJavaScriptSDK/v3/latest/client/dynamodb/) | [Best Practices](https://aws.amazon.com/blogs/database/single-table-vs-multi-table-design-in-amazon-dynamodb/) --- ## Core Principle **Design for access patterns, not entities. Think access-pattern-first.** DynamoDB requires you to know your queries before designing your schema. Model around how you'll access data, not how data relates. Single-table design stores multiple entity types in one table using generic key attributes. --- ## Key Concepts | Concept | Description | |---------|-------------| | **Partition Key (PK)** | Primary key attribute - determines data distribution | | **Sort Key (SK)** | Optional secondary key for range queries within partition | | **GSI** | Global Secondary Index - alternate partition/sort keys | | **LSI** | Local Secondary Index - same partition, different sort | | **Item** | Single record (max 400 KB) | | **Attribute** | Field within an item | --- ## Single-Table Design ### Why Single Table? - Fetch related data in single query - Reduce round trips and costs - Enable transactions across entity types - Simplify operations (backup, restore, IAM) ### Generic Key Pattern ```typescript // Instead of entity-specific keys: // userId, orderId, productId // Use generic keys that work for all entities: interface BaseItem { PK: string; // Partition Key SK: string; // Sort Key GSI1PK?: string; // First GSI partition key GSI1SK?: string; // First GSI sort key EntityType: string; // ... 
entity-specific attributes } ``` ### Example: E-commerce Schema ```typescript // Users { PK: 'USER#123', SK: 'PROFILE', EntityType: 'User', name: 'John', email: 'john@test.com' } { PK: 'USER#123', SK: 'ADDRESS#1', EntityType: 'Address', street: '123 Main', city: 'NYC' } // Orders for user (1:N relationship) { PK: 'USER#123', SK: 'ORDER#2024-001', EntityType: 'Order', total: 99.99, status: 'shipped' } { PK: 'USER#123', SK: 'ORDER#2024-002', EntityType: 'Order', total: 49.99, status: 'pending' } // Order details (query by order ID using GSI) { PK: 'USER#123', SK: 'ORDER#2024-001', GSI1PK: 'ORDER#2024-001', GSI1SK: 'ORDER', ... } { PK: 'ORDER#2024-001', SK: 'ITEM#1', GSI1PK: 'ORDER#2024-001', GSI1SK: 'ITEM#1', productId: 'PROD#456', qty: 2 } // Products { PK: 'PROD#456', SK: 'PRODUCT', EntityType: 'Product', name: 'Widget', price: 29.99 } ``` ### Access Patterns Covered ``` 1. Get user profile → Query PK='USER#123', SK='PROFILE' 2. Get user with addresses → Query PK='USER#123', SK begins_with 'ADDRESS' 3. Get all user orders → Query PK='USER#123', SK begins_with 'ORDER' 4. Get order by ID → Query GSI1, PK='ORDER#2024-001' 5. Get order with items → Query GSI1, PK='ORDER#2024-001' 6. 
Get product details → Query PK='PROD#456', SK='PRODUCT' ``` --- ## SDK v3 Setup (TypeScript) ### Install Dependencies ```bash npm install @aws-sdk/client-dynamodb @aws-sdk/lib-dynamodb ``` ### Client Configuration ```typescript // lib/dynamodb.ts import { DynamoDBClient } from '@aws-sdk/client-dynamodb'; import { DynamoDBDocumentClient } from '@aws-sdk/lib-dynamodb'; const client = new DynamoDBClient({ region: process.env.AWS_REGION || 'us-east-1', // For local development with DynamoDB Local ...(process.env.DYNAMODB_LOCAL && { endpoint: 'http://localhost:8000', credentials: { accessKeyId: 'local', secretAccessKey: 'local' } }) }); // Document client for simplified operations export const docClient = DynamoDBDocumentClient.from(client, { marshallOptions: { removeUndefinedValues: true, // Important: match v2 behavior convertClassInstanceToMap: true }, unmarshallOptions: { wrapNumbers: false } }); export const TABLE_NAME = process.env.DYNAMODB_TABLE || 'MyTable'; ``` ### Type Definitions ```typescript // types/dynamodb.ts export interface BaseItem { PK: string; SK: string; GSI1PK?: string; GSI1SK?: string; EntityType: string; createdAt: string; updatedAt: string; } export interface User extends BaseItem { EntityType: 'User'; userId: string; email: string; name: string; } export interface Order extends BaseItem { EntityType: 'Order'; orderId: string; userId: string; total: number; status: 'pending' | 'paid' | 'shipped' | 'delivered'; } // Key builders export const keys = { user: (userId: string) => ({ PK: `USER#${userId}`, SK: 'PROFILE' }), userOrders: (userId: string) => ({ PK: `USER#${userId}`, SKPrefix: 'ORDER#' }), order: (userId: string, orderId: string) => ({ PK: `USER#${userId}`, SK: `ORDER#${orderId}`, GSI1PK: `ORDER#${orderId}`, GSI1SK: 'ORDER' }) }; ``` --- ## CRUD Operations ### Put Item (Create/Update) ```typescript import { PutCommand } from '@aws-sdk/lib-dynamodb'; import { docClient, TABLE_NAME } from './dynamodb'; import { User, keys } from './types'; 
async function createUser(userId: string, data: { email: string; name: string }): Promise<User> { const now = new Date().toISOString(); const item: User = { ...keys.user(userId), EntityType: 'User', userId, email: data.email, name: data.name, createdAt: now, updatedAt: now }; await docClient.send(new PutCommand({ TableName: TABLE_NAME, Item: item, ConditionExpression: 'attribute_not_exists(PK)' // Prevent overwrite })); return item; } ``` ### Get Item (Read) ```typescript import { GetCommand } from '@aws-sdk/lib-dynamodb'; async function getUser(userId: string): Promise<User | null> { const result = await docClient.send(new GetCommand({ TableName: TABLE_NAME, Key: keys.user(userId) })); return (result.Item as User) || null; } ``` ### Query (List/Search) ```typescript import { QueryCommand } from '@aws-sdk/lib-dynamodb'; // Get all orders for a user async function getUserOrders(userId: string): Promise<Order[]> { const result = await docClient.send(new QueryCommand({ TableName: TABLE_NAME, KeyConditionExpression: 'PK = :pk AND begins_with(SK, :sk)', ExpressionAttributeValues: { ':pk': `USER#${userId}`, ':sk': 'ORDER#' }, ScanIndexForward: false // Newest first })); return (result.Items as Order[]) || []; } // Query GSI by order ID async function getOrderById(orderId: string): Promise<Order | null> { const result = await docClient.send(new QueryCommand({ TableName: TABLE_NAME, IndexName: 'GSI1', KeyConditionExpression: 'GSI1PK = :pk', ExpressionAttributeValues: { ':pk': `ORDER#${orderId}` } })); return (result.Items?.[0] as Order) || null; } // Paginated query async function getUserOrdersPaginated( userId: string, pageSize: number = 20, lastKey?: Record<string, any> ): Promise<{ items: Order[]; lastKey?: Record<string, any> }> { const result = await docClient.send(new QueryCommand({ TableName: TABLE_NAME, KeyConditionExpression: 'PK = :pk AND begins_with(SK, :sk)', ExpressionAttributeValues: { ':pk': `USER#${userId}`, ':sk': 'ORDER#' }, Limit: pageSize, 
ExclusiveStartKey: lastKey })); return { items: (result.Items as Order[]) || [], lastKey: result.LastEvaluatedKey }; } ``` ### Update Item ```typescript import { UpdateCommand } from '@aws-sdk/lib-dynamodb'; async function updateUser(userId: string, updates: Partial<Pick<User, 'name' | 'email'>>): Promise<User> { // Build update expression dynamically const updateParts: string[] = ['#updatedAt = :updatedAt']; const names: Record<string, string> = { '#updatedAt': 'updatedAt' }; const values: Record<string, any> = { ':updatedAt': new Date().toISOString() }; if (updates.name !== undefined) { updateParts.push('#name = :name'); names['#name'] = 'name'; values[':name'] = updates.name; } if (updates.email !== undefined) { updateParts.push('#email = :email'); names['#email'] = 'email'; values[':email'] = updates.email; } const result = await docClient.send(new UpdateCommand({ TableName: TABLE_NAME, Key: keys.user(userId), UpdateExpression: `SET ${updateParts.join(', ')}`, ExpressionAttributeNames: names, ExpressionAttributeValues: values, ReturnValues: 'ALL_NEW', ConditionExpression: 'attribute_exists(PK)' // Must exist })); return result.Attributes as User; } // Atomic counter increment async function incrementOrderCount(userId: string): Promise<void> { await docClient.send(new UpdateCommand({ TableName: TABLE_NAME, Key: keys.user(userId), UpdateExpression: 'SET orderCount = if_not_exists(orderCount, :zero) + :inc', ExpressionAttributeValues: { ':zero': 0, ':inc': 1 } })); } ``` ### Delete Item ```typescript import { DeleteCommand } from '@aws-sdk/lib-dynamodb'; async function deleteUser(userId: string): Promise<void> { await docClient.send(new DeleteCommand({ TableName: TABLE_NAME, Key: keys.user(userId), ConditionExpression: 'attribute_exists(PK)' })); } ``` --- ## Batch Operations ### Batch Write (Up to 25 items) ```typescript import { BatchWriteCommand } from '@aws-sdk/lib-dynamodb'; async function batchCreateItems(items: BaseItem[]): Promise<void> { // DynamoDB 
allows max 25 items per batch const chunks = []; for (let i = 0; i < items.length; i += 25) { chunks.push(items.slice(i, i + 25)); } for (const chunk of chunks) { await docClient.send(new BatchWriteCommand({ RequestItems: { [TABLE_NAME]: chunk.map(item => ({ PutRequest: { Item: item } })) } })); } } ``` ### Batch Get (Up to 100 items) ```typescript import { BatchGetCommand } from '@aws-sdk/lib-dynamodb'; async function batchGetUsers(userIds: string[]): Promise<User[]> { const result = await docClient.send(new BatchGetCommand({ RequestItems: { [TABLE_NAME]: { Keys: userIds.map(id => keys.user(id)) } } })); return (result.Responses?.[TABLE_NAME] as User[]) || []; } ``` --- ## Transactions ### TransactWrite (Atomic Multi-Item) ```typescript import { TransactWriteCommand } from '@aws-sdk/lib-dynamodb'; async function createOrderWithItems( userId: string, orderId: string, orderData: { total: number }, items: { productId: string; quantity: number }[] ): Promise<void> { const now = new Date().toISOString(); const transactItems = [ // Create order { Put: { TableName: TABLE_NAME, Item: { ...keys.order(userId, orderId), EntityType: 'Order', orderId, userId, total: orderData.total, status: 'pending', createdAt: now, updatedAt: now }, ConditionExpression: 'attribute_not_exists(PK)' } }, // Update user's order count { Update: { TableName: TABLE_NAME, Key: keys.user(userId), UpdateExpression: 'SET orderCount = if_not_exists(orderCount, :zero) + :inc', ExpressionAttributeValues: { ':zero': 0, ':inc': 1 } } }, // Add order items ...items.map((item, index) => ({ Put: { TableName: TABLE_NAME, Item: { PK: `ORDER#${orderId}`, SK: `ITEM#${index}`, GSI1PK: `ORDER#${orderId}`, GSI1SK: `ITEM#${index}`, EntityType: 'OrderItem', productId: item.productId, quantity: item.quantity, createdAt: now } } })) ]; await docClient.send(new TransactWriteCommand({ TransactItems: transactItems })); } ``` --- ## GSI Patterns ### Sparse Index ```typescript // Only items with GSI1PK attribute appear in the 
index // Useful for "featured" or "flagged" items // Featured products (only some products have GSI1PK) { PK: 'PROD#1', SK: 'PRODUCT', GSI1PK: 'FEATURED', GSI1SK: 'PROD#1', ... } // In index { PK: 'PROD#2', SK: 'PRODUCT', ... } // Not in index (no GSI1PK) // Query featured products const featured = await docClient.send(new QueryCommand({ TableName: TABLE_NAME, IndexName: 'GSI1', KeyConditionExpression: 'GSI1PK = :pk', ExpressionAttributeValues: { ':pk': 'FEATURED' } })); ``` ### Inverted Index (GSI) ```typescript // Main table: User -> Orders (PK=USER#, SK=ORDER#) // GSI: Orders by status (GSI1PK=STATUS#, GSI1SK=ORDER#) { PK: 'USER#123', SK: 'ORDER#001', GSI1PK: 'STATUS#pending', GSI1SK: 'ORDER#001', ... } { PK: 'USER#456', SK: 'ORDER#002', GSI1PK: 'STATUS#shipped', GSI1SK: 'ORDER#002', ... } // Get all pending orders across all users const pending = await docClient.send(new QueryCommand({ TableName: TABLE_NAME, IndexName: 'GSI1', KeyConditionExpression: 'GSI1PK = :pk', ExpressionAttributeValues: { ':pk': 'STATUS#pending' } })); ``` ### Multi-Attribute Composite Keys (Nov 2025+) ```typescript // New feature: Up to 4 attributes per partition/sort key // No more synthetic keys like "TOURNAMENT#WINTER2024#REGION#NA-EAST" // Table definition (IaC) const table = { AttributeDefinitions: [ { AttributeName: 'tournament', AttributeType: 'S' }, { AttributeName: 'region', AttributeType: 'S' }, { AttributeName: 'score', AttributeType: 'N' } ], GlobalSecondaryIndexes: [{ IndexName: 'TournamentRegionIndex', KeySchema: [ { AttributeName: 'tournament', KeyType: 'HASH' }, // Composite PK part 1 { AttributeName: 'region', KeyType: 'HASH' }, // Composite PK part 2 { AttributeName: 'score', KeyType: 'RANGE' } ] }] }; ``` --- ## Python (boto3) ### Setup ```python # requirements.txt boto3>=1.34.0 # db.py import boto3 from boto3.dynamodb.conditions import Key, Attr import os dynamodb = boto3.resource( 'dynamodb', region_name=os.getenv('AWS_REGION', 'us-east-1'), 
endpoint_url=os.getenv('DYNAMODB_LOCAL_ENDPOINT') # For local dev ) table = dynamodb.Table(os.getenv('DYNAMODB_TABLE', 'MyTable')) ``` ### Operations ```python from datetime import datetime from typing import Optional, List from decimal import Decimal def create_user(user_id: str, email: str, name: str) -> dict: now = datetime.utcnow().isoformat() item = { 'PK': f'USER#{user_id}', 'SK': 'PROFILE', 'EntityType': 'User', 'userId': user_id, 'email': email, 'name': name, 'createdAt': now, 'updatedAt': now } table.put_item( Item=item, ConditionExpression='attribute_not_exists(PK)' ) return item def get_user(user_id: str) -> Optional[dict]: response = table.get_item( Key={'PK': f'USER#{user_id}', 'SK': 'PROFILE'} ) return response.get('Item') def get_user_orders(user_id: str) -> List[dict]: response = table.query( KeyConditionExpression=Key('PK').eq(f'USER#{user_id}') & Key('SK').begins_with('ORDER#'), ScanIndexForward=False ) return response.get('Items', []) def update_user(user_id: str, **updates) -> dict: update_parts = ['#updatedAt = :updatedAt'] names = {'#updatedAt': 'updatedAt'} values = {':updatedAt': datetime.utcnow().isoformat()} for key, value in updates.items(): update_parts.append(f'#{key} = :{key}') names[f'#{key}'] = key values[f':{key}'] = value response = table.update_item( Key={'PK': f'USER#{user_id}', 'SK': 'PROFILE'}, UpdateExpression=f'SET {", ".join(update_parts)}', ExpressionAttributeNames=names, ExpressionAttributeValues=values, ReturnValues='ALL_NEW' ) return response['Attributes'] def delete_user(user_id: str) -> None: table.delete_item( Key={'PK': f'USER#{user_id}', 'SK': 'PROFILE'} ) ``` --- ## Local Development ### DynamoDB Local ```bash # Docker docker run -d -p 8000:8000 amazon/dynamodb-local # Create table locally aws dynamodb create-table \ --endpoint-url http://localhost:8000 \ --table-name MyTable \ --attribute-definitions \ AttributeName=PK,AttributeType=S \ AttributeName=SK,AttributeType=S \ AttributeName=GSI1PK,AttributeType=S \ 
AttributeName=GSI1SK,AttributeType=S \ --key-schema \ AttributeName=PK,KeyType=HASH \ AttributeName=SK,KeyType=RANGE \ --global-secondary-indexes \ 'IndexName=GSI1,KeySchema=[{AttributeName=GSI1PK,KeyType=HASH},{AttributeName=GSI1SK,KeyType=RANGE}],Projection={ProjectionType=ALL}' \ --billing-mode PAY_PER_REQUEST ``` ### NoSQL Workbench AWS provides [NoSQL Workbench](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/workbench.html) for visual data modeling and querying. --- ## CLI Quick Reference ```bash # Table operations aws dynamodb create-table --cli-input-json file://table.json aws dynamodb describe-table --table-name MyTable aws dynamodb delete-table --table-name MyTable # Item operations aws dynamodb put-item --table-name MyTable --item '{"PK":{"S":"USER#1"},"SK":{"S":"PROFILE"}}' aws dynamodb get-item --table-name MyTable --key '{"PK":{"S":"USER#1"},"SK":{"S":"PROFILE"}}' aws dynamodb delete-item --table-name MyTable --key '{"PK":{"S":"USER#1"},"SK":{"S":"PROFILE"}}' # Query aws dynamodb query --table-name MyTable \ --key-condition-expression "PK = :pk" \ --expression-attribute-values '{":pk":{"S":"USER#1"}}' # Scan (avoid in production) aws dynamodb scan --table-name MyTable --limit 10 ``` --- ## Anti-Patterns - **Scan operations** - Always use Query with proper key conditions - **Hot partitions** - Distribute writes with high-cardinality partition keys - **Large items** - Keep items under 400KB; use S3 for large data - **Too many GSIs** - Each GSI duplicates data; design carefully - **Ignoring capacity** - Monitor consumed capacity, use on-demand for variable loads - **No condition expressions** - Always validate with ConditionExpression - **Fetching all attributes** - Use ProjectionExpression to limit data - **Multi-table design without reason** - Single-table is preferred unless access patterns don't overlap ================================================ FILE: skills/azure-cosmosdb/SKILL.md 
================================================ --- name: azure-cosmosdb description: Azure Cosmos DB partition keys, consistency levels, change feed, SDK patterns when-to-use: When working with Azure Cosmos DB user-invocable: false paths: ["**/cosmos*", "**/azure*"] effort: medium --- ## Core Principle **Choose the partition key wisely, design for your access patterns, understand consistency tradeoffs.** Cosmos DB distributes data across partitions. Your partition key choice determines scalability, performance, and cost. Design for even distribution and query efficiency. --- ## Cosmos DB APIs | API | Use Case | |-----|----------| | **NoSQL (Core)** | Document database, most flexible | | **MongoDB** | MongoDB wire protocol compatible | | **PostgreSQL** | Distributed PostgreSQL (Citus) | | **Apache Cassandra** | Wide-column store | | **Apache Gremlin** | Graph database | | **Table** | Key-value (Azure Table Storage compatible) | This skill focuses on the **NoSQL (Core) API** - the most common choice. --- ## Key Concepts | Concept | Description | |---------|-------------| | **Container** | Collection of items (like a table) | | **Item** | Single document/record (JSON) | | **Partition Key** | Determines data distribution | | **Logical Partition** | Items with the same partition key value (max 20GB) | | **Physical Partition** | Storage unit (max 50GB, 10K RU/s) | | **RU (Request Unit)** | Throughput currency | --- ## Partition Key Design ### Good Partition Keys ```typescript // High cardinality, even distribution, used in queries // E-commerce: userId for user data { "id": "order-123", "userId": "user-456", ... } // PK: /userId // Multi-tenant: tenantId { "id": "doc-1", "tenantId": "tenant-abc", ... } // PK: /tenantId // IoT: deviceId for telemetry { "id": "reading-1", "deviceId": "device-789", ... } // PK: /deviceId // Logs: synthetic key (date + category) { "id": "log-1", "partitionKey": "2024-01-15_errors", ...
} // PK: /partitionKey ``` ### Hierarchical Partition Keys ```typescript // For multi-level distribution (e.g., tenant → user) // Container created with: /tenantId, /userId { "id": "order-123", "tenantId": "acme-corp", "userId": "user-456", "items": [...] } // Query within tenant and user efficiently ``` ### Bad Partition Keys ```typescript // Avoid: // - Low cardinality (status, type, boolean) // - Monotonically increasing (timestamp, auto-increment) // - Frequently updated fields // - Fields not used in queries // Bad: Only 3 values → hot partitions { "status": "pending" | "completed" | "cancelled" } // Bad: All writes go to latest partition { "timestamp": "2024-01-15T10:30:00Z" } ``` --- ## SDK Setup (TypeScript) ### Install ```bash npm install @azure/cosmos ``` ### Initialize Client ```typescript // lib/cosmosdb.ts import { CosmosClient, Database, Container } from '@azure/cosmos'; const endpoint = process.env.COSMOS_ENDPOINT!; const key = process.env.COSMOS_KEY!; const databaseId = process.env.COSMOS_DATABASE!; const client = new CosmosClient({ endpoint, key }); // Or with connection string // const client = new CosmosClient(process.env.COSMOS_CONNECTION_STRING!); export const database: Database = client.database(databaseId); export function getContainer(containerId: string): Container { return database.container(containerId); } ``` ### Type Definitions ```typescript // types/cosmos.ts export interface BaseItem { id: string; _ts?: number; // Auto-generated timestamp _etag?: string; // For optimistic concurrency } export interface User extends BaseItem { userId: string; // Partition key email: string; name: string; createdAt: string; updatedAt: string; } export interface Order extends BaseItem { userId: string; // Partition key orderId: string; items: OrderItem[]; total: number; status: 'pending' | 'paid' | 'shipped' | 'delivered'; createdAt: string; } export interface OrderItem { productId: string; name: string; quantity: number; price: number; } ``` --- ## 
CRUD Operations ### Create Item ```typescript import { getContainer } from './cosmosdb'; import { User } from './types'; const usersContainer = getContainer('users'); async function createUser(data: Omit<User, 'id' | 'createdAt' | 'updatedAt'>): Promise<User> { const now = new Date().toISOString(); const user: User = { id: crypto.randomUUID(), ...data, createdAt: now, updatedAt: now }; const { resource } = await usersContainer.items.create(user); return resource as User; } ``` ### Read Item (Point Read) ```typescript // Most efficient read - requires id AND partition key async function getUser(userId: string, id: string): Promise<User | null> { try { const { resource } = await usersContainer.item(id, userId).read<User>(); return resource || null; } catch (error: any) { if (error.code === 404) return null; throw error; } } // If id equals partition key value async function getUserById(userId: string): Promise<User | null> { try { const { resource } = await usersContainer.item(userId, userId).read<User>(); return resource || null; } catch (error: any) { if (error.code === 404) return null; throw error; } } ``` ### Query Items ```typescript // Query within partition (efficient) async function getUserOrders(userId: string): Promise<Order[]> { const ordersContainer = getContainer('orders'); const { resources } = await ordersContainer.items .query<Order>({ query: 'SELECT * FROM c WHERE c.userId = @userId ORDER BY c.createdAt DESC', parameters: [{ name: '@userId', value: userId }] }) .fetchAll(); return resources; } // Cross-partition query (use sparingly) async function getOrdersByStatus(status: string): Promise<Order[]> { const ordersContainer = getContainer('orders'); const { resources } = await ordersContainer.items .query<Order>({ query: 'SELECT * FROM c WHERE c.status = @status', parameters: [{ name: '@status', value: status }] }) .fetchAll(); return resources; } // Paginated query async function getOrdersPaginated( userId: string, pageSize: number = 10, 
continuationToken?: string ): Promise<{ items: Order[]; continuationToken?: string }> { const ordersContainer = getContainer('orders'); const queryIterator = ordersContainer.items.query<Order>( { query: 'SELECT * FROM c WHERE c.userId = @userId ORDER BY c.createdAt DESC', parameters: [{ name: '@userId', value: userId }] }, { maxItemCount: pageSize, continuationToken } ); const { resources, continuationToken: nextToken } = await queryIterator.fetchNext(); return { items: resources, continuationToken: nextToken }; } ``` ### Update Item ```typescript // Replace entire item async function updateUser(userId: string, id: string, updates: Partial<User>): Promise<User> { const existing = await getUser(userId, id); if (!existing) throw new Error('User not found'); const updated: User = { ...existing, ...updates, updatedAt: new Date().toISOString() }; const { resource } = await usersContainer.item(id, userId).replace(updated); return resource as User; } // Partial update (patch operations) async function patchUser(userId: string, id: string, operations: any[]): Promise<User> { const { resource } = await usersContainer.item(id, userId).patch(operations); return resource as User; } // Usage: await patchUser('user-123', 'user-123', [ { op: 'set', path: '/name', value: 'New Name' }, { op: 'set', path: '/updatedAt', value: new Date().toISOString() }, { op: 'incr', path: '/loginCount', value: 1 } ]); ``` ### Delete Item ```typescript async function deleteUser(userId: string, id: string): Promise<void> { await usersContainer.item(id, userId).delete(); } ``` ### Optimistic Concurrency (ETags) ```typescript async function updateUserWithETag( userId: string, id: string, updates: Partial<User>, etag: string ): Promise<User> { const existing = await getUser(userId, id); if (!existing) throw new Error('User not found'); const updated: User = { ...existing, ...updates, updatedAt: new Date().toISOString() }; try { const { resource } = await usersContainer.item(id, userId).replace(updated, 
{ accessCondition: { type: 'IfMatch', condition: etag } }); return resource as User; } catch (error: any) { if (error.code === 412) { throw new Error('Document was modified by another process'); } throw error; } } ``` --- ## Consistency Levels | Level | Guarantees | Latency | Use Case | |-------|-----------|---------|----------| | **Strong** | Linearizable reads | Highest | Financial, inventory | | **Bounded Staleness** | Consistent within bounds | High | Leaderboards, counters | | **Session** | Read your writes | Medium | User sessions (default) | | **Consistent Prefix** | Ordered reads | Low | Social feeds | | **Eventual** | No ordering guarantee | Lowest | Analytics, logs | ### Set Consistency Per Request ```typescript // Override default consistency const { resource } = await usersContainer.item(id, userId).read<User>({ consistencyLevel: 'Strong' }); // For queries const { resources } = await container.items.query( { query: 'SELECT * FROM c' }, { consistencyLevel: 'BoundedStaleness' } ).fetchAll(); ``` --- ## Batch Operations ### Transactional Batch (Same Partition) ```typescript async function createOrderWithItems(userId: string, order: Order, items: any[]): Promise<void> { const ordersContainer = getContainer('orders'); const operations = [ { operationType: 'Create' as const, resourceBody: order }, ...items.map(item => ({ operationType: 'Create' as const, resourceBody: { ...item, userId, orderId: order.orderId } })) ]; const { result } = await ordersContainer.items.batch(operations, userId); // Check if any operation failed if (result.some(r => r.statusCode >= 400)) { throw new Error('Batch operation failed'); } } ``` ### Bulk Operations ```typescript // For large-scale imports (not transactional) async function bulkImportUsers(users: User[]): Promise<void> { const operations = users.map(user => ({ operationType: 'Create' as const, resourceBody: user, partitionKey: user.userId })); // Process in chunks const chunkSize = 100; for (let i = 0; i < 
operations.length; i += chunkSize) { const chunk = operations.slice(i, i + chunkSize); await usersContainer.items.bulk(chunk); } } ``` --- ## Change Feed ### Process Changes ```typescript import { ChangeFeedStartFrom } from '@azure/cosmos'; async function processChangeFeed(): Promise<void> { const container = getContainer('orders'); const changeFeedIterator = container.items.changeFeed({ changeFeedStartFrom: ChangeFeedStartFrom.Beginning() }); while (changeFeedIterator.hasMoreResults) { const { result: items, statusCode } = await changeFeedIterator.fetchNext(); if (statusCode === 304) { // No new changes await sleep(1000); continue; } for (const item of items) { console.log('Changed item:', item); // Process the change... } } } // For production, use Change Feed Processor with lease container ``` ### Change Feed Processor Pattern ```typescript async function startChangeFeedProcessor(): Promise<void> { const sourceContainer = getContainer('orders'); const leaseContainer = getContainer('leases'); const changeFeedProcessor = sourceContainer.items.changeFeed .for(item => { // Process each change console.log('Processing:', item); }) .withLeaseContainer(leaseContainer) .build(); await changeFeedProcessor.start(); } ``` --- ## Python SDK ### Install ```bash pip install azure-cosmos ``` ### Setup and Operations ```python # cosmos_db.py import os from azure.cosmos import CosmosClient, PartitionKey from azure.cosmos.exceptions import CosmosResourceNotFoundError from typing import Optional, List from datetime import datetime import uuid # Initialize client endpoint = os.environ['COSMOS_ENDPOINT'] key = os.environ['COSMOS_KEY'] database_name = os.environ['COSMOS_DATABASE'] client = CosmosClient(endpoint, key) database = client.get_database_client(database_name) def get_container(container_name: str): return database.get_container_client(container_name) # CRUD Operations users_container = get_container('users') def create_user(email: str, name: str, user_id: str = None) -> 
dict: user_id = user_id or str(uuid.uuid4()) now = datetime.utcnow().isoformat() user = { 'id': user_id, 'userId': user_id, # Partition key 'email': email, 'name': name, 'createdAt': now, 'updatedAt': now } return users_container.create_item(user) def get_user(user_id: str) -> Optional[dict]: try: return users_container.read_item(item=user_id, partition_key=user_id) except CosmosResourceNotFoundError: return None def query_users(email_domain: str) -> List[dict]: query = "SELECT * FROM c WHERE CONTAINS(c.email, @domain)" parameters = [{'name': '@domain', 'value': email_domain}] return list(users_container.query_items( query=query, parameters=parameters, enable_cross_partition_query=True )) def update_user(user_id: str, **updates) -> dict: user = get_user(user_id) if not user: raise ValueError('User not found') user.update(updates) user['updatedAt'] = datetime.utcnow().isoformat() return users_container.replace_item(item=user_id, body=user) def delete_user(user_id: str) -> None: users_container.delete_item(item=user_id, partition_key=user_id) # Paginated query def get_users_paginated(page_size: int = 10, continuation_token: str = None): query = "SELECT * FROM c ORDER BY c.createdAt DESC" items = users_container.query_items( query=query, enable_cross_partition_query=True, max_item_count=page_size, continuation_token=continuation_token ) page = items.by_page() results = list(next(page)) return { 'items': results, 'continuation_token': page.continuation_token } ``` --- ## Indexing ### Custom Indexing Policy ```json { "indexingMode": "consistent", "automatic": true, "includedPaths": [ { "path": "/userId/?" }, { "path": "/status/?" }, { "path": "/createdAt/?" } ], "excludedPaths": [ { "path": "/content/*" }, { "path": "/_etag/?" 
} ], "compositeIndexes": [ [ { "path": "/userId", "order": "ascending" }, { "path": "/createdAt", "order": "descending" } ] ] } ``` ### Create Container with Index ```typescript await database.containers.createIfNotExists({ id: 'orders', partitionKey: { paths: ['/userId'] }, indexingPolicy: { indexingMode: 'consistent', includedPaths: [ { path: '/userId/?' }, { path: '/status/?' }, { path: '/createdAt/?' } ], excludedPaths: [ { path: '/*' } // Exclude all by default ] } }); ``` --- ## Throughput Management ### Provisioned Throughput ```typescript // Container level await database.containers.createIfNotExists({ id: 'orders', partitionKey: { paths: ['/userId'] }, throughput: 1000 // RU/s }); // Scale throughput const container = database.container('orders'); await container.throughput.replace(2000); ``` ### Autoscale ```typescript await database.containers.createIfNotExists({ id: 'orders', partitionKey: { paths: ['/userId'] }, maxThroughput: 10000 // Auto-scales 10% to 100% }); ``` ### Serverless ```typescript // Serverless is an account-level capacity mode chosen when the account is created // Containers in a serverless account need no throughput configuration // Pay per request (good for dev/test, intermittent workloads) await database.containers.createIfNotExists({ id: 'orders', partitionKey: { paths: ['/userId'] } // No throughput setting (serverless account) }); ``` --- ## CLI Quick Reference ```bash # Azure CLI az cosmosdb create --name myaccount --resource-group mygroup az cosmosdb sql database create --account-name myaccount --name mydb --resource-group mygroup az cosmosdb sql container create \ --account-name myaccount \ --database-name mydb \ --name orders \ --partition-key-path /userId \ --throughput 400 # Query az cosmosdb sql query --account-name myaccount --database-name mydb \ --container-name orders --query "SELECT * FROM c" # Keys az cosmosdb keys list --name myaccount --resource-group mygroup az cosmosdb keys list --name myaccount --resource-group mygroup --type connection-strings ``` --- ## Cost Optimization | Strategy | Impact | |----------|--------| | **Right partition key**
| Avoid hot partitions (wasted RUs) | | **Index only what you query** | Reduce write RU cost | | **Use point reads** | ~1 RU for a 1KB item vs 3+ RU for an equivalent query | | **Serverless for dev/test** | Pay per request | | **Autoscale for production** | Scale down during low traffic | | **TTL for temporary data** | Auto-delete old items | ### Time-to-Live (TTL) ```typescript // Enable TTL on container await database.containers.createIfNotExists({ id: 'sessions', partitionKey: { paths: ['/userId'] }, defaultTtl: 3600 // 1 hour }); // Per-item TTL const session = { id: 'session-123', userId: 'user-456', ttl: 1800 // Override: 30 minutes }; ``` --- ## Anti-Patterns - **Bad partition key** - Low cardinality causes hot partitions - **Cross-partition queries** - Expensive; design for single-partition queries - **Over-indexing** - Increases write cost; index only queried paths - **Large items** - Max 2MB; store blobs in Azure Blob Storage - **Ignoring RU cost** - Monitor and optimize expensive queries - **Strong consistency everywhere** - Use Session (default) unless stronger guarantees are required - **No retry logic** - Handle 429 (throttling) with exponential backoff - **Missing TTL** - Set TTL for temporary/session data ================================================ FILE: skills/base/SKILL.md ================================================ --- name: base description: Universal coding patterns, constraints, TDD workflow, atomic todos when-to-use: Always loaded as foundation for all projects - TDD workflow, simplicity rules, atomic todos user-invocable: false effort: medium --- # Base Skill - Universal Patterns ## Core Principle Complexity is the enemy. Every line of code is a liability. The goal is software simple enough that any engineer (or AI) can understand the entire system in one session. --- ## Simplicity Rules These limits apply to every file created or modified.
### Function Level - **Maximum 20 lines per function** - if longer, decompose IMMEDIATELY - **Maximum 3 parameters per function** - if more, use an options object or decompose - **Maximum 2 levels of nesting** - flatten with early returns or extract functions - **Single responsibility** - each function does exactly one thing - **Descriptive names over comments** - if you need a comment to explain what the code does, rename the function or variable instead ### File Level - **Maximum 200 lines per file** - if longer, split by responsibility BEFORE continuing - **Maximum 10 functions per file** - keeps cognitive load manageable - **One primary export per file** - a file should have one primary purpose ### Module Level - **Maximum 3 levels of directory nesting** - flat is better than nested - **Clear boundaries** - each module has a single public interface - **No circular dependencies** - ever ### Enforcement Protocol **Before completing ANY file:** 1. Count total lines - if > 200, STOP and split 2. Count functions - if > 10, STOP and split 3. Check each function length - if any > 20 lines, STOP and decompose 4. Check parameter counts - if any > 3, STOP and refactor **If limits are exceeded during development:** ``` ⚠️ FILE SIZE VIOLATION DETECTED [filename] has [X] lines (limit: 200) Splitting into: - [filename-a].ts - [responsibility A] - [filename-b].ts - [responsibility B] ``` **Never defer refactoring.** Fix violations immediately, not "later".
--- ## Architectural Patterns ### Functional Core, Imperative Shell - Pure functions for business logic - no side effects, deterministic - Side effects only at boundaries - API calls, database, file system at edges - Data in, data out - functions transform data, they don't mutate state ### Composition Over Inheritance - No inheritance deeper than 1 level - prefer interfaces/composition - Small, composable utilities - build complex from simple - Dependency injection - pass dependencies, don't import them directly ### Error Handling - Fail fast, fail loud - errors surface immediately - No silent failures - every error is logged or thrown - Design APIs where misuse is impossible --- ## Testing Philosophy - **100% coverage on business logic** - the functional core - **Integration tests for boundaries** - API endpoints, database operations - **No untested code merges** - CI blocks without passing tests - **Test behavior, not implementation** - tests survive refactoring - **Each test runs in isolation** - no interdependence --- ## Anti-Patterns (Never Do This) - ❌ Global state - ❌ Magic numbers/strings - use named constants - ❌ Deep nesting - flatten or extract - ❌ Long parameter lists - use objects - ❌ Comments explaining "what" - code should be self-documenting - ❌ Dead code - delete it, git remembers - ❌ Copy-paste duplication - extract to shared function - ❌ God objects/files - split by responsibility - ❌ Circular dependencies - ❌ Premature optimization - ❌ Large PRs - small, focused changes only - ❌ Mixing refactoring with features - separate commits --- ## Documentation Structure Every project must have clear separation between code docs and project specs: ``` project/ ├── docs/ # Code documentation │ ├── architecture.md # System design decisions │ ├── api.md # API reference (if applicable) │ └── setup.md # Development setup guide ├── _project_specs/ # Project specifications │ ├── overview.md # Project vision and goals │ ├── features/ # Feature specifications │ │ 
├── feature-a.md │ │ └── feature-b.md │ ├── todos/ # Atomic todos tracking │ │ ├── active.md # Current sprint/focus │ │ ├── backlog.md # Future work │ │ └── completed.md # Done items (for reference) │ ├── session/ # Session state (see session-management.md) │ │ ├── current-state.md # Live session state │ │ ├── decisions.md # Key decisions log │ │ ├── code-landmarks.md # Important code locations │ │ └── archive/ # Past session summaries │ └── prompts/ # LLM prompt specifications (if AI-first) └── CLAUDE.md # Claude instructions (references skills) ``` ### What Goes Where | Location | Content | |----------|---------| | `docs/` | Technical documentation, API refs, setup guides | | `_project_specs/` | Business logic, features, requirements, todos | | `_project_specs/session/` | Session state, decisions, context for resumability | | `CLAUDE.md` | Claude-specific instructions and skill references | --- ## Atomic Todos All work is tracked as atomic todos with validation and test criteria. ### Todo Format (Required) ```markdown ## [TODO-001] Short descriptive title **Status:** pending | in-progress | blocked | done **Priority:** high | medium | low **Estimate:** XS | S | M | L | XL ### Description One paragraph describing what needs to be done. ### Acceptance Criteria - [ ] Criterion 1 - specific, measurable - [ ] Criterion 2 - specific, measurable ### Validation How to verify this is complete: - Manual: [steps to manually test] - Automated: [test file/command that validates this] ### Test Cases | Input | Expected Output | Notes | |-------|-----------------|-------| | ... | ... | ... | ### Dependencies - Depends on: [TODO-xxx] (if any) - Blocks: [TODO-yyy] (if any) ### TDD Execution Log | Phase | Command | Result | Timestamp | |-------|---------|--------|-----------| | RED | `[test command]` | - | - | | GREEN | `[test command]` | - | - | | VALIDATE | `[lint && typecheck && test --coverage]` | - | - | | COMPLETE | Moved to completed.md | - | - | ``` ### Todo Rules 1. 
**Atomic** - Each todo is a single, completable unit of work 2. **Testable** - Every todo has validation criteria and test cases 3. **Sized** - If larger than "M", break it down further 4. **Independent** - Minimize dependencies between todos 5. **Tracked** - Move between active.md → completed.md when done ### Todo Execution Workflow (TDD - Mandatory) **Every todo MUST follow this exact workflow. No exceptions.** ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. RED: Write Tests First │ │ └─ Create test file(s) based on Test Cases table │ │ └─ Tests should cover all acceptance criteria │ │ └─ Run tests → ALL MUST FAIL (proves tests are valid) │ ├─────────────────────────────────────────────────────────────┤ │ 2. GREEN: Implement the Feature │ │ └─ Write minimum code to make tests pass │ │ └─ Follow simplicity rules (20 lines/function, etc.) │ │ └─ Run tests → ALL MUST PASS │ ├─────────────────────────────────────────────────────────────┤ │ 3. VALIDATE: Quality Gates │ │ └─ Run linter (auto-fix if possible) │ │ └─ Run type checker (tsc/mypy/pyright) │ │ └─ Run full test suite with coverage │ │ └─ Verify coverage threshold (≥80%) │ ├─────────────────────────────────────────────────────────────┤ │ 4. COMPLETE: Mark Done │ │ └─ Only after ALL validations pass │ │ └─ Move todo to completed.md │ │ └─ Checkpoint session state │ └─────────────────────────────────────────────────────────────┘ ``` #### Execution Commands by Stack **Node.js/TypeScript:** ```bash # 1. RED - Run tests (expect failures) npm test -- --grep "todo-description" # 2. GREEN - Run tests (expect pass) npm test -- --grep "todo-description" # 3. VALIDATE - Full quality check npm run lint && npm run typecheck && npm test -- --coverage ``` **Python:** ```bash # 1. RED - Run tests (expect failures) pytest -k "todo_description" -v # 2. GREEN - Run tests (expect pass) pytest -k "todo_description" -v # 3. VALIDATE - Full quality check ruff check . && mypy . 
&& pytest --cov --cov-fail-under=80 ``` **React/Next.js:** ```bash # 1. RED - Run tests (expect failures) npm test -- --testPathPattern="ComponentName" # 2. GREEN - Run tests (expect pass) npm test -- --testPathPattern="ComponentName" # 3. VALIDATE - Full quality check npm run lint && npm run typecheck && npm test -- --coverage --watchAll=false ``` #### Blocking Conditions **NEVER mark a todo as complete if:** - ❌ Tests were not written first (skipped RED phase) - ❌ Tests did not fail initially (invalid tests) - ❌ Any test is failing - ❌ Linter has errors (warnings may be acceptable) - ❌ Type checker has errors - ❌ Coverage dropped below threshold **If blocked by failures:** ```markdown ## [TODO-042] - BLOCKED **Blocking Reason:** [Lint error in X / Test failure in Y / Coverage at 75%] **Action Required:** [Specific fix needed] ``` ### Bug Fix Workflow (TDD - Mandatory) **When a user reports a bug, NEVER jump to fixing it directly.** ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. DIAGNOSE: Identify the Test Gap │ │ └─ Run existing tests - do any fail? │ │ └─ If tests pass but bug exists → tests are incomplete │ │ └─ Document: "Test gap: [what was missed]" │ ├─────────────────────────────────────────────────────────────┤ │ 2. RED: Write a Failing Test for the Bug │ │ └─ Create test that reproduces the exact bug │ │ └─ Test should FAIL with current code │ │ └─ This proves the test catches the bug │ ├─────────────────────────────────────────────────────────────┤ │ 3. GREEN: Fix the Bug │ │ └─ Write minimum code to make the test pass │ │ └─ Run test → must PASS now │ ├─────────────────────────────────────────────────────────────┤ │ 4. 
VALIDATE: Full Quality Check │ │ └─ Run ALL tests (not just the new one) │ │ └─ Run linter and type checker │ │ └─ Verify no regression in coverage │ └─────────────────────────────────────────────────────────────┘ ``` #### Bug Report Todo Format ```markdown ## [BUG-001] Short description of the bug **Status:** pending **Priority:** high **Reported:** [how user reported it / reproduction steps] ### Bug Description What is happening vs. what should happen. ### Reproduction Steps 1. Step one 2. Step two 3. Observe: [incorrect behavior] 4. Expected: [correct behavior] ### Test Gap Analysis - Existing test coverage: [list relevant test files] - Gap identified: [what the tests missed] - New test needed: [describe the test to add] ### Test Cases for Bug | Input | Current (Bug) | Expected (Fixed) | |-------|---------------|------------------| | ... | ... | ... | ### TDD Execution Log | Phase | Command | Result | Timestamp | |-------|---------|--------|-----------| | DIAGNOSE | `npm test` | All pass (gap!) | - | | RED | `npm test -- --grep "bug description"` | 1 test failed ✓ | - | | GREEN | `npm test -- --grep "bug description"` | 1 test passed ✓ | - | | VALIDATE | `npm run lint && npm run typecheck && npm test -- --coverage` | Pass ✓ | - | ``` #### Bug Fix Anti-Patterns - ❌ **Fixing without a test** - Bug will likely return - ❌ **Writing test after fix** - Can't prove test catches the bug - ❌ **Skipping test gap analysis** - Misses why tests didn't catch it - ❌ **Only testing the fix** - Must run full test suite for regressions ### Example Atomic Todo ```markdown ## [TODO-042] Add email validation to signup form **Status:** pending **Priority:** high **Estimate:** S ### Description Validate email format on the signup form before submission. Show inline error if invalid. 
### Acceptance Criteria - [ ] Email field shows error for invalid format - [ ] Error clears when user fixes the email - [ ] Form cannot submit with invalid email - [ ] Valid emails pass through without error ### Validation - Manual: Enter "notanemail" in signup form, verify error appears - Automated: `npm test -- --grep "email validation"` ### Test Cases | Input | Expected Output | Notes | |-------|-----------------|-------| | user@example.com | Valid, no error | Standard email | | user@sub.example.com | Valid, no error | Subdomain | | notanemail | Invalid, show error | No @ symbol | | user@ | Invalid, show error | No domain | | @example.com | Invalid, show error | No local part | ### Dependencies - Depends on: [TODO-041] Signup form component - Blocks: [TODO-045] Signup flow integration test ### TDD Execution Log | Phase | Command | Result | Timestamp | |-------|---------|--------|-----------| | RED | `npm test -- --grep "email validation"` | 5 tests failed ✓ | - | | GREEN | `npm test -- --grep "email validation"` | 5 tests passed ✓ | - | | VALIDATE | `npm run lint && npm run typecheck && npm test -- --coverage` | Pass, 84% coverage ✓ | - | | COMPLETE | Moved to completed.md | ✓ | - | ``` --- ## Credentials Management When a project needs API keys, always ask the user for their centralized access file first. ### Workflow ``` 1. Ask: "Do you have an access keys file? (e.g., ~/Documents/Access.txt)" 2. Read and parse the file for known key patterns 3. Validate keys are working 4. Create project .env with found keys 5. 
Report missing keys and where to get them ``` ### Key Patterns to Detect | Service | Pattern | Env Variable | |---------|---------|--------------| | OpenAI | `sk-proj-*` | `OPENAI_API_KEY` | | Claude | `sk-ant-*` | `ANTHROPIC_API_KEY` | | Render | `rnd_*` | `RENDER_API_KEY` | | Replicate | `r8_*` | `REPLICATE_API_TOKEN` | | Reddit | client_id + secret | `REDDIT_CLIENT_ID`, `REDDIT_CLIENT_SECRET` | See `credentials.md` for full parsing logic and validation commands. --- ## Security Every project must meet these security requirements. See `security.md` skill for detailed patterns. ### Essential Security Checks 1. **No secrets in code** - Use environment variables, never commit secrets 2. **`.env` in `.gitignore`** - Always, no exceptions 3. **No secrets in client-exposed env vars** - Never use `VITE_*`, `NEXT_PUBLIC_*` for secrets 4. **Validate all input** - Use Zod/Pydantic at API boundaries 5. **Parameterized queries only** - No string concatenation for SQL 6. **Hash passwords properly** - bcrypt with 12+ rounds 7. **Dependency scanning** - npm audit / safety check must pass ### Required Files - `.gitignore` with secrets patterns - `.env.example` with all required vars (no values) - `scripts/security-check.sh` for pre-commit validation ### Security in CI Every PR must pass: - Secret scanning (detect-secrets / trufflehog) - Dependency audit (npm audit / safety) - Static analysis (CodeQL) --- ## Quality Gates ### Coverage Threshold - **Minimum 80% code coverage** - CI must fail below this - Business logic (core/) should aim for 100% - Integration tests cover boundaries ### Pre-Commit Hooks All projects must have pre-commit hooks that run: 1. Linting (auto-fix where possible) 2. Type checking 3. Tests (at minimum, affected tests) This catches issues before they hit CI, saving time and keeping the main branch clean. --- ## Session Management Maintain context for resumability. See `session-management.md` for full details. 
### Core Rule: Checkpoint at Natural Breakpoints After completing any task, ask: 1. **Decision made?** → Log to `_project_specs/session/decisions.md` 2. **>10 tool calls?** → Full checkpoint to `current-state.md` 3. **Major feature done?** → Archive to `session/archive/` 4. **Otherwise** → Quick update to `current-state.md` ### Session Start 1. Read `_project_specs/session/current-state.md` 2. Check `_project_specs/todos/active.md` 3. Continue from documented "Next Steps" ### Session End 1. Archive current session 2. Update `current-state.md` with handoff notes 3. Ensure next steps are specific and actionable --- ## Response Format When implementing features (following TDD): 1. **Clarify requirements** if ambiguous 2. **Propose structure** - outline before code 3. **Write tests FIRST** - based on test cases table (RED phase) 4. **Run tests to verify they fail** - proves tests are valid 5. **Implement minimum code** to make tests pass (GREEN phase) 6. **Run full validation** - lint, typecheck, coverage (VALIDATE phase) 7. **Flag complexity** - warn if approaching limits 8. **Checkpoint after completing** - update session state, log TDD execution **TDD is non-negotiable.** Tests must exist and fail before any implementation begins. When you notice code violating these rules, **stop and refactor** before continuing. --- ## Automatic TDD Loops (via Stop Hook) The Stop hook in `.claude/settings.json` runs tests after each response. If tests fail, the failure output is fed back to Claude automatically. No manual intervention needed. See the `iterative-development` skill for setup details. ### How It Works 1. You ask Claude to implement something 2. Claude writes tests + implementation 3. Stop hook runs tests automatically 4. If failures: output fed back to Claude, it fixes and tries again 5. If all pass: Claude stops, work is done ### When It Activates | Task Type | TDD Loop? 
| |-----------|-----------| | New feature | Yes - tests run after each response | | Bug fix | Yes - write failing test first | | Refactoring | Yes - existing tests catch regressions | | Simple question/explanation | No - no code changes | | One-line fix | No - trivial change | ================================================ FILE: skills/cloudflare-d1/SKILL.md ================================================ --- name: cloudflare-d1 description: Cloudflare D1 SQLite database with Workers, Drizzle ORM, migrations when-to-use: When working with Cloudflare D1 or Workers user-invocable: false paths: ["wrangler.toml", "src/worker*", "**/d1/**"] effort: medium --- # Cloudflare D1 Skill Cloudflare D1 is a serverless SQLite database designed for Cloudflare Workers with global distribution and zero cold starts. **Sources:** [D1 Docs](https://developers.cloudflare.com/d1/) | [Drizzle + D1](https://orm.drizzle.team/docs/connect-cloudflare-d1) | [Wrangler CLI](https://developers.cloudflare.com/workers/wrangler/) --- ## Core Principle **SQLite at the edge, migrations in version control, Drizzle for type safety.** D1 brings SQLite's simplicity to serverless. Design for horizontal scale (multiple small databases) rather than vertical (one large database). Use Drizzle ORM for type-safe queries and migrations. 
--- ## D1 Stack | Component | Purpose | |-----------|---------| | **D1** | Serverless SQLite database | | **Workers** | Edge runtime for your application | | **Wrangler** | CLI for development and deployment | | **Drizzle ORM** | Type-safe ORM with migrations | | **Drizzle Kit** | Migration tooling | | **Hono** | Lightweight web framework (optional) | --- ## Project Setup ### Create Worker Project ```bash # Create new project npm create cloudflare@latest my-app -- --template "worker-typescript" cd my-app # Install dependencies npm install drizzle-orm npm install -D drizzle-kit ``` ### Create D1 Database ```bash # Create database (provisions the remote database; the local copy is created on first --local use) npx wrangler d1 create my-database # Output: # [[d1_databases]] # binding = "DB" # database_name = "my-database" # database_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" ``` ### Configure wrangler.toml ```toml name = "my-app" main = "src/index.ts" compatibility_date = "2024-01-01" [[d1_databases]] binding = "DB" database_name = "my-database" database_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" migrations_dir = "drizzle" migrations_table = "drizzle_migrations" ``` ### Generate TypeScript Types ```bash # Generate env types from wrangler.toml npx wrangler types # Creates worker-configuration.d.ts: # interface Env { # DB: D1Database; # } ``` --- ## Drizzle ORM Setup ### Schema Definition ```typescript // src/db/schema.ts import { sqliteTable, text, integer, real, blob } from 'drizzle-orm/sqlite-core'; import { sql } from 'drizzle-orm'; export const users = sqliteTable('users', { id: integer('id').primaryKey({ autoIncrement: true }), email: text('email').notNull().unique(), name: text('name').notNull(), role: text('role', { enum: ['user', 'admin'] }).default('user'), createdAt: text('created_at').default(sql`CURRENT_TIMESTAMP`), updatedAt: text('updated_at').default(sql`CURRENT_TIMESTAMP`) }); export const posts = sqliteTable('posts', { id: integer('id').primaryKey({ autoIncrement: true }), title:
text('title').notNull(), content: text('content'), authorId: integer('author_id').references(() => users.id), published: integer('published', { mode: 'boolean' }).default(false), viewCount: integer('view_count').default(0), createdAt: text('created_at').default(sql`CURRENT_TIMESTAMP`) }); export const tags = sqliteTable('tags', { id: integer('id').primaryKey({ autoIncrement: true }), name: text('name').notNull().unique() }); export const postTags = sqliteTable('post_tags', { postId: integer('post_id').references(() => posts.id), tagId: integer('tag_id').references(() => tags.id) }); // Type exports export type User = typeof users.$inferSelect; export type NewUser = typeof users.$inferInsert; export type Post = typeof posts.$inferSelect; export type NewPost = typeof posts.$inferInsert; ``` ### Drizzle Config ```typescript // drizzle.config.ts import { defineConfig } from 'drizzle-kit'; export default defineConfig({ schema: './src/db/schema.ts', out: './drizzle', dialect: 'sqlite', driver: 'd1-http', dbCredentials: { accountId: process.env.CLOUDFLARE_ACCOUNT_ID!, databaseId: process.env.CLOUDFLARE_DATABASE_ID!, token: process.env.CLOUDFLARE_D1_TOKEN! 
} }); ``` ### Database Client ```typescript // src/db/index.ts import { drizzle } from 'drizzle-orm/d1'; import * as schema from './schema'; export function createDb(d1: D1Database) { return drizzle(d1, { schema }); } export type Database = ReturnType<typeof createDb>; export * from './schema'; ``` --- ## Migration Workflow ### Generate Migration ```bash # Generate migration from schema changes npx drizzle-kit generate # Output: drizzle/0000_initial.sql ``` ### Apply Migrations Locally ```bash # Apply to local D1 npx wrangler d1 migrations apply my-database --local # Or via Drizzle Kit (targets the database configured in drizzle.config.ts; with the d1-http driver that is the REMOTE database, not local) npx drizzle-kit migrate ``` ### Apply Migrations to Production ```bash # Apply to remote D1 npx wrangler d1 migrations apply my-database --remote # Review pending (unapplied) migrations first npx wrangler d1 migrations list my-database --remote ``` ### Migration File Example ```sql -- drizzle/0000_initial.sql CREATE TABLE `users` ( `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, `email` text NOT NULL, `name` text NOT NULL, `role` text DEFAULT 'user', `created_at` text DEFAULT CURRENT_TIMESTAMP, `updated_at` text DEFAULT CURRENT_TIMESTAMP ); CREATE UNIQUE INDEX `users_email_unique` ON `users` (`email`); CREATE TABLE `posts` ( `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, `title` text NOT NULL, `content` text, `author_id` integer REFERENCES `users`(`id`), `published` integer DEFAULT false, `view_count` integer DEFAULT 0, `created_at` text DEFAULT CURRENT_TIMESTAMP ); ``` --- ## Worker Implementation ### Basic Worker with Hono ```typescript // src/index.ts import { Hono } from 'hono'; import { createDb, users, posts } from './db'; import { eq, desc } from 'drizzle-orm'; type Bindings = { DB: D1Database; }; const app = new Hono<{ Bindings: Bindings }>(); // Middleware to inject db app.use('*', async (c, next) => { c.set('db', createDb(c.env.DB)); await next(); }); // List users app.get('/users', async (c) => { const db = c.get('db'); const allUsers = await db.select().from(users); return
c.json(allUsers); }); // Get user by ID app.get('/users/:id', async (c) => { const db = c.get('db'); const id = parseInt(c.req.param('id')); const user = await db.select().from(users).where(eq(users.id, id)).get(); if (!user) { return c.json({ error: 'User not found' }, 404); } return c.json(user); }); // Create user app.post('/users', async (c) => { const db = c.get('db'); const body = await c.req.json<{ email: string; name: string }>(); const result = await db.insert(users).values({ email: body.email, name: body.name }).returning(); return c.json(result[0], 201); }); // Update user app.put('/users/:id', async (c) => { const db = c.get('db'); const id = parseInt(c.req.param('id')); const body = await c.req.json<Partial<{ email: string; name: string }>>(); const result = await db.update(users) .set({ ...body, updatedAt: new Date().toISOString() }) .where(eq(users.id, id)) .returning(); if (result.length === 0) { return c.json({ error: 'User not found' }, 404); } return c.json(result[0]); }); // Delete user app.delete('/users/:id', async (c) => { const db = c.get('db'); const id = parseInt(c.req.param('id')); const result = await db.delete(users).where(eq(users.id, id)).returning(); if (result.length === 0) { return c.json({ error: 'User not found' }, 404); } return c.json({ deleted: true }); }); export default app; ``` ### Raw D1 API (Without ORM) ```typescript // src/index.ts export default { async fetch(request: Request, env: Env): Promise<Response> { const url = new URL(request.url); if (url.pathname === '/users' && request.method === 'GET') { const { results } = await env.DB.prepare( 'SELECT * FROM users ORDER BY created_at DESC' ).all(); return Response.json(results); } if (url.pathname === '/users' && request.method === 'POST') { const body = await request.json() as { email: string; name: string }; const result = await env.DB.prepare( 'INSERT INTO users (email, name) VALUES (?, ?) 
RETURNING *' ).bind(body.email, body.name).first(); return Response.json(result, { status: 201 }); } return new Response('Not Found', { status: 404 }); } }; ``` --- ## Query Patterns ### Select Queries ```typescript import { eq, and, or, like, gt, desc, asc, count, sql } from 'drizzle-orm'; // Basic select const allPosts = await db.select().from(posts); // Select specific columns const titles = await db.select({ id: posts.id, title: posts.title }).from(posts); // Where clause const published = await db.select().from(posts).where(eq(posts.published, true)); // Multiple conditions const recentPublished = await db.select().from(posts).where( and( eq(posts.published, true), gt(posts.createdAt, '2024-01-01') ) ); // OR conditions const featured = await db.select().from(posts).where( or( eq(posts.viewCount, 1000), like(posts.title, '%featured%') ) ); // Order and limit const topPosts = await db.select() .from(posts) .orderBy(desc(posts.viewCount)) .limit(10); // Pagination const page2 = await db.select() .from(posts) .orderBy(desc(posts.createdAt)) .limit(10) .offset(10); // Count const postCount = await db.select({ count: count() }).from(posts); ``` ### Joins ```typescript // Inner join const postsWithAuthors = await db.select({ post: posts, author: users }) .from(posts) .innerJoin(users, eq(posts.authorId, users.id)); // Left join const allPostsWithAuthors = await db.select() .from(posts) .leftJoin(users, eq(posts.authorId, users.id)); // Many-to-many via junction table const postsWithTags = await db.select({ post: posts, tag: tags }) .from(posts) .leftJoin(postTags, eq(posts.id, postTags.postId)) .leftJoin(tags, eq(postTags.tagId, tags.id)); ``` ### Insert, Update, Delete ```typescript // Insert single const newUser = await db.insert(users).values({ email: 'user@example.com', name: 'John Doe' }).returning(); // Insert multiple await db.insert(users).values([ { email: 'a@test.com', name: 'Alice' }, { email: 'b@test.com', name: 'Bob' } ]); // Upsert (insert or update on 
conflict) await db.insert(users) .values({ email: 'user@test.com', name: 'New Name' }) .onConflictDoUpdate({ target: users.email, set: { name: 'New Name' } }); // Update await db.update(posts) .set({ published: true }) .where(eq(posts.id, 1)); // Update with increment await db.update(posts) .set({ viewCount: sql`${posts.viewCount} + 1` }) .where(eq(posts.id, 1)); // Delete await db.delete(posts).where(eq(posts.id, 1)); ``` ### Transactions ```typescript // D1 supports transactions via batch const results = await db.batch([ db.insert(users).values({ email: 'a@test.com', name: 'A' }), db.insert(users).values({ email: 'b@test.com', name: 'B' }), db.update(posts).set({ published: true }).where(eq(posts.id, 1)) ]); // Raw D1 batch const batchResults = await env.DB.batch([ env.DB.prepare('INSERT INTO users (email, name) VALUES (?, ?)').bind('a@test.com', 'A'), env.DB.prepare('INSERT INTO users (email, name) VALUES (?, ?)').bind('b@test.com', 'B') ]); ``` --- ## Local Development ### Start Dev Server ```bash # Local development with D1 npx wrangler dev # With specific port npx wrangler dev --port 8787 ``` ### Database Management ```bash # Execute SQL locally npx wrangler d1 execute my-database --local --command "SELECT * FROM users" # Execute SQL file npx wrangler d1 execute my-database --local --file ./seed.sql # List tables (sqlite3 dot-commands like .tables are not supported) npx wrangler d1 execute my-database --local --command "SELECT name FROM sqlite_master WHERE type='table'" ``` ### Drizzle Studio ```bash # Run Drizzle Studio for visual DB management npx drizzle-kit studio ``` ### Seed Data ```sql -- seed.sql INSERT INTO users (email, name, role) VALUES ('admin@example.com', 'Admin User', 'admin'), ('user@example.com', 'Test User', 'user'); INSERT INTO posts (title, content, author_id, published) VALUES ('First Post', 'Hello World!', 1, true), ('Draft Post', 'Work in progress...', 1, false); ``` ```bash # Seed local database npx wrangler d1 execute my-database --local --file ./seed.sql ``` --- ## Multi-Environment Setup ### wrangler.toml ```toml name =
"my-app" main = "src/index.ts" compatibility_date = "2024-01-01" # Development [env.dev] [[env.dev.d1_databases]] binding = "DB" database_name = "my-database-dev" database_id = "dev-database-id" # Staging [env.staging] [[env.staging.d1_databases]] binding = "DB" database_name = "my-database-staging" database_id = "staging-database-id" # Production [env.production] [[env.production.d1_databases]] binding = "DB" database_name = "my-database-prod" database_id = "prod-database-id" ``` ### Deploy to Environments ```bash # Deploy to staging npx wrangler deploy --env staging # Deploy to production npx wrangler deploy --env production # Apply migrations to staging npx wrangler d1 migrations apply my-database-staging --remote --env staging ``` --- ## Testing ### Integration Tests ```typescript // tests/api.test.ts import { unstable_dev } from 'wrangler'; import type { UnstableDevWorker } from 'wrangler'; import { describe, beforeAll, afterAll, it, expect } from 'vitest'; describe('API', () => { let worker: UnstableDevWorker; beforeAll(async () => { worker = await unstable_dev('src/index.ts', { experimental: { disableExperimentalWarning: true } }); }); afterAll(async () => { await worker.stop(); }); it('should list users', async () => { const res = await worker.fetch('/users'); expect(res.status).toBe(200); const data = await res.json(); expect(Array.isArray(data)).toBe(true); }); it('should create user', async () => { const res = await worker.fetch('/users', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ email: 'test@test.com', name: 'Test' }) }); expect(res.status).toBe(201); }); }); ``` --- ## CLI Quick Reference ```bash # Database wrangler d1 create <name> # Create database wrangler d1 list # List databases wrangler d1 info <name> # Database info wrangler d1 delete <name> # Delete database # Migrations wrangler d1 migrations list <name> # List migrations wrangler d1 migrations apply <name> --local # Apply locally wrangler d1 
migrations apply <name> --remote # Apply to production # SQL execution wrangler d1 execute <name> --command "SQL" # Run SQL wrangler d1 execute <name> --file ./file.sql # Run SQL file wrangler d1 execute <name> --local # Run on local wrangler d1 execute <name> --remote # Run on production # Development wrangler dev # Start local server wrangler types # Generate TypeScript types wrangler deploy # Deploy to production # Drizzle drizzle-kit generate # Generate migrations drizzle-kit migrate # Apply migrations drizzle-kit studio # Open Drizzle Studio drizzle-kit push # Push schema (dev only) ``` --- ## D1 Limits & Considerations | Limit | Value | |-------|-------| | **Database size** | 10 GB max | | **Row size** | 1 MB max | | **SQL statement** | 100 KB max | | **Batch size** | 1000 statements | | **Reads per day (free)** | 5 million | | **Writes per day (free)** | 100,000 | --- ## Anti-Patterns - **Single large database** - Design for multiple smaller databases (per-tenant) - **No migrations** - Always version control schema changes - **Raw SQL everywhere** - Use Drizzle for type safety - **No connection to remote** - Always test against real D1 before deploy - **Large blobs in D1** - Use R2 for file storage - **Complex joins** - D1 is SQLite; keep queries simple - **No batching** - Use batch for multiple operations - **Ignoring limits** - Monitor usage on free tier ================================================ FILE: skills/code-deduplication/SKILL.md ================================================ --- name: code-deduplication description: Prevent semantic code duplication with capability index and check-before-write when-to-use: Before creating new utility functions or shared code user-invocable: false effort: medium --- # Code Deduplication Skill **Purpose:** Prevent semantic duplication and code bloat. Maintain a capability index so Claude always knows what exists before writing something new. 
--- ## Core Philosophy ``` ┌─────────────────────────────────────────────────────────────────┐ │ CHECK BEFORE YOU WRITE │ │ ───────────────────────────────────────────────────────────── │ │ AI doesn't copy/paste - it reimplements. │ │ The problem isn't duplicate code, it's duplicate PURPOSE. │ │ │ │ Before writing ANY new function: │ │ 1. Check CODE_INDEX.md for existing capabilities │ │ 2. Search codebase for similar functionality │ │ 3. Extend existing code if possible │ │ 4. Only create new if nothing suitable exists │ ├─────────────────────────────────────────────────────────────────┤ │ AFTER WRITING: Update the index immediately. │ │ PERIODICALLY: Run /audit-duplicates to catch overlap. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Code Index Structure Maintain `CODE_INDEX.md` in project root, organized by **capability** not file location: ```markdown # Code Index *Last updated: [timestamp]* *Run `/update-code-index` to regenerate* ## Quick Reference | Category | Count | Location | |----------|-------|----------| | Date/Time | 5 functions | src/utils/dates.ts | | Validation | 8 functions | src/utils/validate.ts | | API Clients | 12 functions | src/api/*.ts | | Auth | 6 functions | src/auth/*.ts | --- ## Date/Time Operations | Function | Location | Does What | Params | |----------|----------|-----------|--------| | `formatDate()` | utils/dates.ts:15 | Formats Date → "Jan 15, 2024" | `(date: Date, format?: string)` | | `formatRelative()` | utils/dates.ts:32 | Formats Date → "2 days ago" | `(date: Date)` | | `parseDate()` | utils/dates.ts:48 | Parses string → Date | `(str: string, format?: string)` | | `isExpired()` | auth/tokens.ts:22 | Checks if timestamp past now | `(timestamp: number)` | | `addDays()` | utils/dates.ts:61 | Adds days to date | `(date: Date, days: number)` | --- ## Validation | Function | Location | Does What | Params | |----------|----------|-----------|--------| | `isEmail()` | utils/validate.ts:10 | 
Validates email format | `(email: string)` | | `isPhone()` | utils/validate.ts:25 | Validates phone with country | `(phone: string, country?: string)` | | `isURL()` | utils/validate.ts:42 | Validates URL format | `(url: string)` | | `isUUID()` | utils/validate.ts:55 | Validates UUID v4 | `(id: string)` | | `sanitizeHTML()` | utils/sanitize.ts:12 | Strips XSS from input | `(html: string)` | | `sanitizeSQL()` | utils/sanitize.ts:28 | Escapes SQL special chars | `(input: string)` | --- ## String Operations | Function | Location | Does What | Params | |----------|----------|-----------|--------| | `slugify()` | utils/strings.ts:8 | Converts to URL slug | `(str: string)` | | `truncate()` | utils/strings.ts:20 | Truncates with ellipsis | `(str: string, len: number)` | | `capitalize()` | utils/strings.ts:32 | Capitalizes first letter | `(str: string)` | | `pluralize()` | utils/strings.ts:40 | Adds s/es correctly | `(word: string, count: number)` | --- ## API Clients | Function | Location | Does What | Returns | |----------|----------|-----------|---------| | `fetchUser()` | api/users.ts:15 | GET /users/:id | `Promise<User>` | | `fetchUsers()` | api/users.ts:28 | GET /users with pagination | `Promise<User[]>` | | `createUser()` | api/users.ts:45 | POST /users | `Promise<User>` | | `updateUser()` | api/users.ts:62 | PATCH /users/:id | `Promise<User>` | | `deleteUser()` | api/users.ts:78 | DELETE /users/:id | `Promise<void>` | --- ## Error Handling | Function/Class | Location | Does What | |----------------|----------|-----------| | `AppError` | utils/errors.ts:5 | Base error class with code | | `ValidationError` | utils/errors.ts:20 | Input validation failures | | `NotFoundError` | utils/errors.ts:32 | Resource not found | | `handleAsync()` | utils/errors.ts:45 | Wraps async route handlers | | `errorMiddleware()` | middleware/error.ts:10 | Express error handler | --- ## Hooks (React) | Hook | Location | Does What | |------|----------|-----------| | `useAuth()` | 
hooks/useAuth.ts | Auth state + login/logout | | `useUser()` | hooks/useUser.ts | Current user data | | `useDebounce()` | hooks/useDebounce.ts | Debounces value changes | | `useLocalStorage()` | hooks/useLocalStorage.ts | Persisted state | | `useFetch()` | hooks/useFetch.ts | Data fetching with loading/error | --- ## Components (React) | Component | Location | Does What | |-----------|----------|-----------| | `Button` | components/Button.tsx | Styled button with variants | | `Input` | components/Input.tsx | Form input with validation | | `Modal` | components/Modal.tsx | Dialog overlay | | `Toast` | components/Toast.tsx | Notification popup | | `Spinner` | components/Spinner.tsx | Loading indicator | ``` --- ## File Header Format Every file should have a summary header: ### TypeScript/JavaScript ```typescript /** * @file User authentication utilities * @description Handles login, logout, session management, and token refresh. * * Key exports: * - login(email, password) - Authenticates user, returns tokens * - logout() - Clears session and tokens * - refreshToken() - Gets new access token * - validateSession() - Checks if session is valid * * @see src/api/auth.ts for API endpoints * @see src/hooks/useAuth.ts for React hook */ import { ... } from '...'; ``` ### Python ```python """ User authentication utilities. Handles login, logout, session management, and token refresh. Key exports: - login(email, password) - Authenticates user, returns tokens - logout() - Clears session and tokens - refresh_token() - Gets new access token - validate_session() - Checks if session is valid See Also: - src/api/auth.py for API endpoints - src/services/user.py for user operations """ from typing import ... ``` --- ## Function Documentation Every function needs a one-line summary: ### TypeScript ```typescript /** * Formats a date into a human-readable relative string. * Examples: "2 minutes ago", "yesterday", "3 months ago" */ export function formatRelative(date: Date): string { // ... 
} /** * Validates email format and checks for disposable domains. * Returns true for valid non-disposable emails. */ export function isValidEmail(email: string): boolean { // ... } ``` ### Python ```python def format_relative(date: datetime) -> str: """Formats a date into a human-readable relative string. Examples: "2 minutes ago", "yesterday", "3 months ago" """ ... def is_valid_email(email: str) -> bool: """Validates email format and checks for disposable domains. Returns True for valid non-disposable emails. """ ... ``` --- ## Check Before Write Process ### Before Creating ANY New Function ``` ┌─────────────────────────────────────────────────────────────────┐ │ BEFORE WRITING NEW CODE │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. DESCRIBE what you need in plain English │ │ "I need to format a date as relative time" │ │ │ │ 2. CHECK CODE_INDEX.md │ │ Search for: date, time, format, relative │ │ → Found: formatRelative() in utils/dates.ts │ │ │ │ 3. EVALUATE if existing code works │ │ - Does it do what I need? → Use it │ │ - Close but not quite? → Extend it │ │ - Nothing suitable? → Create new, update index │ │ │ │ 4. If extending, check for breaking changes │ │ - Add optional params, don't change existing behavior │ │ - Update tests for new functionality │ └─────────────────────────────────────────────────────────────────┘ ``` ### Decision Tree ``` Need new functionality │ ▼ Check CODE_INDEX.md for similar │ ├─► Found exact match ──────► USE IT │ ├─► Found similar ──────────► Can it be extended? 
│ │ │ ┌──────────────┴──────────────┐ │ ▼ ▼ │ Yes: Extend No: Create new │ (add params) (update index) │ └─► Nothing found ──────────► Create new (update index) ``` --- ## Common Duplication Patterns ### Pattern 1: Utility Function Reimplementation ❌ **Bad:** Creating `validateEmail()` when `isEmail()` exists ```typescript // DON'T: This already exists as isEmail() function validateEmail(email: string): boolean { return /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email); } ``` ✅ **Good:** Check index first, use existing ```typescript import { isEmail } from '@/utils/validate'; if (isEmail(userInput)) { ... } ``` ### Pattern 2: Slightly Different Versions ❌ **Bad:** Multiple date formatters with slight variations ```typescript // In file A function formatDate(d: Date) { return d.toLocaleDateString(); } // In file B function displayDate(d: Date) { return d.toLocaleDateString('en-US'); } // In file C function showDate(d: Date) { return d.toLocaleDateString('en-US', { month: 'short' }); } ``` ✅ **Good:** One function with options ```typescript // utils/dates.ts function formatDate(d: Date, options?: { locale?: string; format?: 'short' | 'long' }) { const locale = options?.locale ?? 'en-US'; const formatOpts = options?.format === 'short' ? { month: 'short', day: 'numeric' } : { month: 'long', day: 'numeric', year: 'numeric' }; return d.toLocaleDateString(locale, formatOpts); } ``` ### Pattern 3: Inline Logic That Should Be Extracted ❌ **Bad:** Same validation logic scattered across files ```typescript // In signup.ts if (!email || !email.includes('@') || email.length < 5) { ... } // In profile.ts if (!email || !email.includes('@') || email.length < 5) { ... } // In invite.ts if (!email || !email.includes('@') || email.length < 5) { ... 
} ``` ✅ **Good:** Extract once, import everywhere ```typescript // utils/validate.ts export const isEmail = (email: string) => email && email.includes('@') && email.length >= 5; // Everywhere else import { isEmail } from '@/utils/validate'; if (!isEmail(email)) { ... } ``` --- ## Periodic Audit Run `/audit-duplicates` periodically to catch semantic overlap: ### Audit Checklist - [ ] **Utility functions**: Any functions doing similar things? - [ ] **API calls**: Multiple ways to fetch same data? - [ ] **Validation**: Scattered inline validation logic? - [ ] **Error handling**: Inconsistent error patterns? - [ ] **Components**: Similar UI components that could merge? - [ ] **Hooks**: Custom hooks with overlapping logic? ### Audit Output Format ```markdown ## Duplicate Audit - [DATE] ### 🔴 High Priority (Merge These) 1. **Date formatting** - 3 similar functions found - `formatDate()` in utils/dates.ts - `displayDate()` in components/Header.tsx - `showDate()` in pages/Profile.tsx - **Action:** Consolidate into utils/dates.ts 2. **Email validation** - Inline logic in 5 files - signup.ts:42 - profile.ts:28 - invite.ts:15 - settings.ts:67 - admin.ts:33 - **Action:** Extract to utils/validate.ts ### 🟡 Medium Priority (Consider Merging) 1. **User fetching** - 2 different patterns - `fetchUser()` in api/users.ts - `getUser()` in services/user.ts - **Action:** Decide on one pattern ### 🟢 Low Priority (Monitor) 1. 
**Button components** - 3 variants exist - May be intentional for different use cases - **Action:** Document the differences ``` --- ## Vector DB Integration (Optional) For large codebases (200+ files), add vector search: ### Setup with ChromaDB ```python # scripts/index_codebase.py import chromadb from chromadb.utils import embedding_functions # Initialize client = chromadb.PersistentClient(path="./.chroma") ef = embedding_functions.DefaultEmbeddingFunction() collection = client.get_or_create_collection("code_index", embedding_function=ef) # Index a function collection.add( documents=["Formats a date into human-readable relative string like '2 days ago'"], metadatas=[{"function": "formatRelative", "file": "utils/dates.ts", "line": 32}], ids=["formatRelative"] ) # Search before writing results = collection.query( query_texts=["format date as relative time"], n_results=5 ) # Returns: formatRelative in utils/dates.ts - 0.92 similarity ``` ### Setup with LanceDB (Lighter) ```python # scripts/index_codebase.py import lancedb db = lancedb.connect("./.lancedb") # Create table data = [ {"function": "formatRelative", "file": "utils/dates.ts", "description": "Formats date as relative time"}, {"function": "isEmail", "file": "utils/validate.ts", "description": "Validates email format"}, ] table = db.create_table("code_index", data) # Search results = table.search("validate email address").limit(5).to_list() ``` ### When to Use Vector DB | Codebase Size | Recommendation | |---------------|----------------| | < 50 files | Markdown index only | | 50-200 files | Markdown + periodic audit | | 200+ files | Add vector DB | | 500+ files | Vector DB essential | --- ## Claude Instructions ### At Session Start 1. Read `CODE_INDEX.md` if it exists 2. Note the categories and key functions available 3. Keep this context for the session ### Before Writing New Code 1. **Pause and check**: "Does something like this exist?" 2. Search CODE_INDEX.md for similar capabilities 3. 
If unsure, search the codebase: `grep -r "functionName\|similar_term" src/` 4. Only create new if confirmed nothing suitable exists ### After Writing New Code 1. **Immediately update CODE_INDEX.md** 2. Add file header if new file 3. Add function docstring 4. Commit index update with code ### When User Says "Add X functionality" ``` Before implementing, let me check if we already have something similar... [Checks CODE_INDEX.md] Found: `existingFunction()` in utils/file.ts does something similar. Options: 1. Use existing function as-is 2. Extend it with new capability 3. Create new (if truly different use case) Which approach would you prefer? ``` --- ## Quick Reference ### Update Index Command ```bash /update-code-index ``` ### Audit Command ```bash /audit-duplicates ``` ### File Header Template ```typescript /** * @file [Short description] * @description [What this file does] * * Key exports: * - function1() - [what it does] * - function2() - [what it does] */ ``` ### Function Template ```typescript /** * [One line description of what it does] */ export function name(params): ReturnType { ``` ### Index Entry Template ```markdown | `functionName()` | path/file.ts:line | Does what in plain English | `(params)` | ``` ================================================ FILE: skills/code-graph/SKILL.md ================================================ --- name: code-graph description: AST-based code graph for fast symbol lookup, dependency analysis, and blast radius via codebase-memory-mcp MCP server when-to-use: "Before reading files — query the graph first for symbol lookup, call tracing, and blast radius" user-invocable: false effort: medium --- # Code Graph Skill **Purpose:** Use the code graph (codebase-memory-mcp) for sub-millisecond symbol lookup, function search, dependency analysis, and blast radius detection. This replaces brute-force grep and file reading for code navigation. 
--- ## Core Principle **Graph first, file second.** Before reading files or grepping, query the code graph. Only read full files when you need to modify them or need context beyond what the graph provides. **Consider graph when planning.** When planning any change — feature, refactor, bug fix — start by querying the graph to understand scope, dependencies, and blast radius. This applies to thinking and planning phases, not just implementation. Grep is still the right tool for searching string literals, log messages, config values, and content that lives outside code structure. ``` ┌────────────────────────────────────────────────────────────────┐ │ GRAPH FIRST, FILE SECOND │ │ ─────────────────────────────────────────────────────────────│ │ The code graph indexes your entire codebase as a persistent │ │ knowledge graph. Claude queries it via MCP for instant │ │ symbol lookup, dependency chains, and blast radius — instead │ │ of reading hundreds of files. │ │ │ │ 14 MCP tools │ 64 languages │ sub-ms queries │ zero deps │ │ ~99% fewer tokens for navigation vs brute-force file reads │ ├────────────────────────────────────────────────────────────────┤ │ AUTO-UPDATED │ │ ─────────────────────────────────────────────────────────────│ │ File watcher keeps graph in sync. Post-commit hook ensures │ │ freshness. No manual rebuild needed. │ └────────────────────────────────────────────────────────────────┘ ``` --- ## When to Use Graph vs Direct Read | Task | Use Graph Tool | Use Direct Read? 
| |------|---------------|------------------| | Find function/class definition | `search_graph` | No | | Get function signature + docs | `get_code_snippet` | No | | Find all callers of a function | `trace_call_path` | No | | Trace dependency chain | `query_graph` | No | | Determine blast radius of change | `detect_changes` | No | | Understand project architecture | `get_architecture` | No | | Search for code patterns | `search_code` | No | | Read full implementation to modify | `search_graph` to locate, then Read file | Yes | | Understand business logic context | `get_code_snippet` for overview, then Read | Yes | **Rule:** If a graph tool can answer the question, use it. Only open files when you need the full source to make edits. --- ## Available MCP Tools ### Indexing & Status | Tool | Purpose | When to Use | |------|---------|-------------| | `index_repository` | Build/rebuild graph for a project | First setup, or after major restructure | | `index_status` | Check if graph is current | Before querying, if unsure of freshness | | `list_projects` | List all indexed projects | Multi-project navigation | ### Querying & Navigation | Tool | Purpose | When to Use | |------|---------|-------------| | `search_graph` | Find symbols by name (fuzzy) | "Find auth-related functions" | | `search_code` | Text search across indexed codebase | "Find TODO comments", pattern matching | | `get_code_snippet` | Get source code for a specific symbol | Need signature, docstring, implementation | | `get_graph_schema` | Understand graph structure and relationships | Exploring what data is available | | `query_graph` | Run structured graph queries | Complex dependency/relationship queries | ### Analysis | Tool | Purpose | When to Use | |------|---------|-------------| | `trace_call_path` | Trace caller/callee chains | "Who calls sendEmail?", "What does init() trigger?" 
| | `detect_changes` | Identify changed files and blast radius | Before/after code changes, PR review | | `get_architecture` | High-level module/package structure | Onboarding, understanding project layout | ### Management | Tool | Purpose | When to Use | |------|---------|-------------| | `delete_project` | Remove a project from the graph | Cleanup, project restructure | | `manage_adr` | Architecture decision records | Document architectural decisions | | `ingest_traces` | Import runtime traces | Performance analysis, dead code detection | --- ## Workflow: Before Any Code Change ``` 0. PLAN → get_architecture + search_graph to understand scope before planning 1. LOCATE → search_graph to find the symbol 2. UNDERSTAND → get_code_snippet for context 3. BLAST → detect_changes to assess impact 4. TRACE → trace_call_path to find all affected callers 5. CHANGE → Read file, make edit 6. VERIFY → detect_changes again to confirm scope ``` **Step 0 applies to planning, not just coding.** When the user asks you to plan a feature, refactor, or fix — query the graph first to understand what exists, what depends on what, and what the scope looks like. This prevents plans based on wrong assumptions about the codebase. **Never skip step 3.** Blast radius analysis prevents unexpected breakage from changes to shared code. 
--- ## Graph Data & Freshness The graph stays fresh automatically through 3 layers — no manual rebuild needed: | Layer | Trigger | What Happens | |-------|---------|-------------| | **File watcher** | Every file save | codebase-memory-mcp detects changes and re-indexes affected files in real-time | | **Auto-index** | Session start | `auto_index: true` ensures graph is current when Claude Code starts | | **Post-commit hook** | Every `git commit` | Touches `.code-graph/.needs-update` marker — file watcher picks it up (~10ms, non-blocking) | **You do NOT need to manually re-index** unless you do a major restructure (rename entire directories, switch branches with massive diffs). In that case: `index_repository` once, then the 3 layers keep it fresh. - **Storage**: `.code-graph/` directory (auto-created, gitignored) - **MCP config**: `.mcp.json` at project root (committed, shared with team) --- ## MCP Configuration The code graph MCP server is configured in `.mcp.json` at project root: ```json { "mcpServers": { "codebase-memory": { "command": "codebase-memory-mcp", "args": [] } } } ``` **Installation:** `~/.claude/install-graph-tools.sh` --- ## Decision Framework ``` Need to find a symbol/function? → search_graph (sub-ms, structured result) → NOT: grep -r "functionName" (slow, unstructured) Need to understand dependencies? → query_graph or trace_call_path (complete, traversable) → NOT: manually reading import statements Need to assess change impact? → detect_changes (comprehensive, instant) → NOT: searching for usages manually across files Need to understand architecture? → get_architecture (high-level overview) → NOT: reading every directory listing Need to read/modify code? 
→ search_graph to locate, then Read the specific file → NOT: reading entire directories hoping to find it ``` --- ## Anti-Patterns | Anti-Pattern | Do This Instead | |-------------|-----------------| | Grepping for function names | `search_graph` with the function name | | Reading entire files to find a signature | `get_code_snippet` for the specific symbol | | Manually tracing import chains | `trace_call_path` or `query_graph` | | Making changes without checking impact | `detect_changes` before every edit to shared code | | Reading all files in a directory | `get_architecture` for structure, `search_graph` for specifics | | Ignoring graph staleness warnings | Check `index_status`, re-index if needed | | Re-indexing on every query | Trust the file watcher; only manual re-index after major restructure | ================================================ FILE: skills/code-review/SKILL.md ================================================ --- name: code-review description: Mandatory code reviews via /code-review before commits and deploys when-to-use: When user asks to review code, before commits, or when /code-review is invoked user-invocable: true allowed-tools: [Read, Glob, Grep, Bash] effort: high --- # Code Review Skill **Purpose:** Enforce automated code reviews as a mandatory guardrail before every commit and deployment. Choose between Claude, OpenAI Codex, Google Gemini, or multiple engines for comprehensive analysis. 
--- ## Review Engine Choice When running `/code-review`, users can choose their preferred review engine: ``` ┌─────────────────────────────────────────────────────────────────┐ │ CODE REVIEW - Choose Your Engine │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ ○ Claude (default) │ │ Built-in, no extra setup, full conversation context │ │ │ │ ○ OpenAI Codex CLI │ │ GPT-5.2-Codex specialized for code review, 88% detection │ │ Requires: npm install -g @openai/codex │ │ │ │ ○ Google Gemini CLI │ │ Gemini 2.5 Pro with 1M token context, free tier available │ │ Requires: npm install -g @google/gemini-cli │ │ │ │ ○ Dual Engine (any two) │ │ Run two engines, compare findings, catch more issues │ │ │ │ ○ All Three (maximum coverage) │ │ Run Claude + Codex + Gemini for critical/security code │ │ │ └─────────────────────────────────────────────────────────────────┘ ``` ### Engine Comparison | Aspect | Claude | Codex | Gemini | Multi-Engine | |--------|--------|-------|--------|--------------| | **Setup** | None | npm + OpenAI API | npm + Google Account | All setups | | **Speed** | Fast | Fast | Fast | 2-3x time | | **Context** | Conversation | Fresh per review | 1M tokens | N/A | | **Detection** | Good | 88% (best) | 63.8% SWE-Bench | Combined | | **Free Tier** | N/A | Limited | 1,000/day | Varies | | **Best for** | Quick reviews | High accuracy | Large codebases | Critical code | ### Set Default Engine ```toml # ~/.claude/settings.toml or project CLAUDE.md [code-review] default_engine = "claude" # Options: claude, codex, gemini, dual, all ``` ### Usage Examples ```bash # Use default engine /code-review # Explicitly choose engine /code-review --engine claude /code-review --engine codex /code-review --engine gemini # Dual engine (pick any two) /code-review --engine claude,codex /code-review --engine claude,gemini /code-review --engine codex,gemini # All three engines /code-review --engine all # Quick shortcuts /code-review # Uses default /code-review 
--codex # Use Codex /code-review --gemini # Use Gemini /code-review --all # All three engines ``` --- ## Multi-Engine Output When using multiple engines, findings are compared and deduplicated: ### Dual Engine Example ``` ┌─────────────────────────────────────────────────────────────────┐ │ CODE REVIEW RESULTS - DUAL ENGINE (Claude + Codex) │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ ✅ AGREED (Found by both): │ │ 🔴 SQL injection in auth.ts:45 │ │ 🟡 Missing error handling in api.ts:112 │ │ │ │ 🔷 CLAUDE ONLY: │ │ 🟠 Potential race condition in worker.ts:89 │ │ 🟢 Consider extracting helper function │ │ │ │ 🔶 CODEX ONLY: │ │ 🟠 Memory leak - unclosed stream in upload.ts:34 │ │ 🟡 N+1 query pattern in orders.ts:156 │ │ │ ├─────────────────────────────────────────────────────────────────┤ │ SUMMARY │ │ Agreed: 2 | Claude only: 2 | Codex only: 2 │ │ Critical: 1 | High: 2 | Medium: 2 | Low: 1 │ │ Status: ❌ BLOCKED - Fix critical/high issues │ └─────────────────────────────────────────────────────────────────┘ ``` ### Triple Engine Example (All Three) ``` ┌─────────────────────────────────────────────────────────────────┐ │ CODE REVIEW RESULTS - TRIPLE ENGINE │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ ✅ UNANIMOUS (All 3 found): │ │ 🔴 SQL injection in auth.ts:45 │ │ │ │ ✅ MAJORITY (2 of 3 found): │ │ 🟠 Memory leak - unclosed stream in upload.ts:34 (Codex+Gemini)│ │ 🟡 Missing error handling in api.ts:112 (Claude+Codex) │ │ │ │ 🔷 CLAUDE ONLY: │ │ 🟠 Potential race condition in worker.ts:89 │ │ │ │ 🔶 CODEX ONLY: │ │ 🟡 N+1 query pattern in orders.ts:156 │ │ │ │ 🟢 GEMINI ONLY: │ │ 🟡 Consider using batch API for better performance │ │ 🟢 Type could be more specific in types.ts:23 │ │ │ ├─────────────────────────────────────────────────────────────────┤ │ SUMMARY │ │ Unanimous: 1 | Majority: 2 | Single: 4 │ │ Critical: 1 | High: 2 | Medium: 3 | Low: 1 │ │ Status: ❌ BLOCKED - Fix critical/high issues │ 
└─────────────────────────────────────────────────────────────────┘ ``` ### When to Use Each Mode | Mode | Use When | |------|----------| | **Single (Claude)** | Quick in-flow reviews, exploration | | **Single (Codex)** | CI/CD automation, high accuracy needed | | **Single (Gemini)** | Large codebases (100+ files), free tier | | **Dual** | Important PRs, pre-merge reviews | | **Triple (All)** | Security-critical code, payment systems, auth | --- ## Core Philosophy ``` ┌─────────────────────────────────────────────────────────────────┐ │ CODE REVIEW IS NON-NEGOTIABLE │ │ ───────────────────────────────────────────────────────────── │ │ │ │ Every commit must pass code review. │ │ Every PR must be reviewed before merge. │ │ Every deployment must include review sign-off. │ │ │ │ AI catches what humans miss. Humans catch what AI misses. │ │ Together: fewer bugs, cleaner code, better security. │ ├─────────────────────────────────────────────────────────────────┤ │ INVOKE: /code-review │ │ PLUGIN: code-review@claude-plugins-official │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## When to Run Code Review ### Mandatory Review Points | Trigger | Action | Command | |---------|--------|---------| | **Before commit** | Review staged changes | `/code-review` | | **Before PR** | Review all changes vs base | `/code-review` | | **Before merge** | Final review of PR | `/code-review` | | **Before deploy** | Review deployment diff | `/code-review` | ### Automatic Integration **Run code review automatically before every commit:** ``` ┌─────────────────────────────────────────────────────────────────┐ │ COMMIT WORKFLOW │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. Write code │ │ 2. Run tests (TDD - must pass) │ │ 3. Run /code-review ← MANDATORY │ │ 4. Address critical/high issues │ │ 5. Commit │ │ 6. Push │ │ │ │ Skip step 3? 
❌ NO COMMIT ALLOWED │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Using the Code Review Plugin ### Basic Usage ```bash # Review current changes /code-review # Review specific files /code-review src/auth/*.ts # Review a PR /code-review --pr 123 # Review with specific focus /code-review --focus security /code-review --focus performance /code-review --focus architecture ``` ### Review Categories The code review plugin analyzes: | Category | What It Checks | |----------|----------------| | **Security** | Vulnerabilities, injection risks, auth issues, secrets | | **Performance** | N+1 queries, memory leaks, inefficient algorithms | | **Architecture** | Design patterns, SOLID principles, coupling | | **Code Quality** | Readability, complexity, duplication | | **Best Practices** | Language idioms, framework conventions | | **Testing** | Coverage gaps, test quality, edge cases | | **Documentation** | Missing docs, outdated comments | ### Severity Levels | Level | Action Required | Can Commit? | |-------|-----------------|-------------| | 🔴 **Critical** | Must fix immediately | ❌ NO | | 🟠 **High** | Should fix before commit | ❌ NO | | 🟡 **Medium** | Fix soon, can commit | ✅ YES | | 🟢 **Low** | Nice to have | ✅ YES | | ℹ️ **Info** | Suggestions only | ✅ YES | --- ## Pre-Commit Hook Integration ### Install Pre-Commit Hook ```bash #!/bin/bash # .git/hooks/pre-commit echo "🔍 Running code review..." # Run Claude code review on staged files STAGED_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep -E '\.(ts|tsx|js|jsx|py|go|rs)$') if [ -n "$STAGED_FILES" ]; then # Invoke code review (requires claude CLI) claude --print "/code-review $STAGED_FILES" > /tmp/code-review-result.txt 2>&1 # Check for critical/high issues if grep -q "🔴\|Critical\|🟠\|High" /tmp/code-review-result.txt; then echo "❌ Code review found critical/high issues:" cat /tmp/code-review-result.txt echo "" echo "Fix these issues before committing." 
exit 1 fi echo "✅ Code review passed" fi exit 0 ``` ### Make Hook Executable ```bash chmod +x .git/hooks/pre-commit ``` --- ## Codex CLI Setup (For Codex/Multi-Engine Modes) If you want to use Codex or multi-engine modes, install the Codex CLI: ```bash # Prerequisites: Node.js 22+ node --version # Must be 22+ # Install Codex CLI npm install -g @openai/codex # Authenticate (choose one): # Option 1: ChatGPT subscription (Plus, Pro, Team, Enterprise) codex # Follow prompts to sign in # Option 2: API key export OPENAI_API_KEY=sk-proj-... ``` ### Verify Installation ```bash # Check Codex is installed codex --version # Test review codex > /review ``` See `codex-review.md` skill for full Codex documentation. --- ## Gemini CLI Setup (For Gemini/Multi-Engine Modes) If you want to use Gemini or multi-engine modes, install the Gemini CLI: ```bash # Prerequisites: Node.js 20+ node --version # Must be 20+ # Install Gemini CLI npm install -g @google/gemini-cli # Or via Homebrew (macOS) brew install gemini-cli # Install Code Review extension gemini extensions install https://github.com/gemini-cli-extensions/code-review ``` ### Authenticate ```bash # Option 1: Google Account (recommended, 1000 req/day free) gemini # Follow browser login prompts # Option 2: API key (100 req/day free) export GEMINI_API_KEY="your-key-from-aistudio.google.com" ``` ### Verify Installation ```bash # Check Gemini is installed gemini --version # List extensions gemini extensions list # Test review gemini > /code-review ``` See `gemini-review.md` skill for full Gemini documentation. 
--- ## CI/CD Integration ### GitHub Actions - Claude Only ```yaml # .github/workflows/code-review.yml name: Code Review on: pull_request: types: [opened, synchronize, reopened] jobs: code-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Get changed files id: changed-files run: | echo "files=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | tr '\n' ' ')" >> $GITHUB_OUTPUT - name: Run Claude Code Review env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | npx @anthropic-ai/claude-code --print "/code-review ${{ steps.changed-files.outputs.files }}" > review.md - name: Post Review Comment uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `## 🔍 Claude Code Review\n\n${review}` }); - name: Check for Critical Issues run: | if grep -q "Critical\|🔴" review.md; then echo "❌ Critical issues found" exit 1 fi ``` ### GitHub Actions - Codex Only ```yaml # .github/workflows/codex-review.yml name: Codex Code Review on: pull_request: jobs: review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Codex Review uses: openai/codex-action@main with: openai_api_key: ${{ secrets.OPENAI_API_KEY }} model: gpt-5.2-codex safety_strategy: drop-sudo ``` ### GitHub Actions - Both Engines ```yaml # .github/workflows/dual-review.yml name: Dual Code Review on: pull_request: jobs: claude-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Claude Review env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | npx @anthropic-ai/claude-code --print "/code-review" > claude-review.md - uses: actions/upload-artifact@v4 with: name: claude-review path: claude-review.md codex-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: 
fetch-depth: 0 - uses: actions/setup-node@v4 with: node-version: '22' - name: Install Codex run: npm install -g @openai/codex - name: Codex Review env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | codex exec --full-auto --sandbox read-only \ --output-last-message codex-review.md \ "Review this code for bugs, security issues, and quality problems" - uses: actions/upload-artifact@v4 with: name: codex-review path: codex-review.md combine-reviews: needs: [claude-review, codex-review] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v4 - name: Combine Reviews run: | echo "## 🔍 Dual Code Review Results" > combined-review.md echo "" >> combined-review.md echo "### Claude Findings" >> combined-review.md cat claude-review/claude-review.md >> combined-review.md echo "" >> combined-review.md echo "### Codex Findings" >> combined-review.md cat codex-review/codex-review.md >> combined-review.md - name: Post Combined Review uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('combined-review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: review }); ``` ### GitHub Actions - Gemini Only ```yaml # .github/workflows/gemini-review.yml name: Gemini Code Review on: pull_request: types: [opened, synchronize] jobs: review: runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: '20' - name: Install Gemini CLI run: npm install -g @google/gemini-cli - name: Run Review env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} run: | # Get diff git diff origin/${{ github.base_ref }}...HEAD > diff.txt # Run Gemini review gemini -p "Review this pull request diff for bugs, security issues, and code quality problems. Be specific about file names and line numbers. 
$(cat diff.txt)" > review.md - name: Post Review Comment uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `## 🤖 Gemini Code Review\n\n${review}` }); - name: Check for Critical Issues run: | if grep -qi "critical\|security vulnerability\|injection" review.md; then echo "❌ Critical issues found" exit 1 fi ``` ### GitHub Actions - All Three Engines ```yaml # .github/workflows/triple-review.yml name: Triple Engine Code Review on: pull_request: jobs: claude-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Claude Review env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} run: | npx @anthropic-ai/claude-code --print "/code-review" > claude-review.md - uses: actions/upload-artifact@v4 with: name: claude-review path: claude-review.md codex-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - uses: actions/setup-node@v4 with: node-version: '22' - name: Install Codex run: npm install -g @openai/codex - name: Codex Review env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | codex exec --full-auto --sandbox read-only \ --output-last-message codex-review.md \ "Review this code for bugs, security issues, and quality problems" - uses: actions/upload-artifact@v4 with: name: codex-review path: codex-review.md gemini-review: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - uses: actions/setup-node@v4 with: node-version: '20' - name: Install Gemini CLI run: npm install -g @google/gemini-cli - name: Gemini Review env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} run: | git diff origin/${{ github.base_ref }}...HEAD > diff.txt gemini -p "Review this code diff for bugs, security, and quality issues: $(cat diff.txt)" > gemini-review.md - uses: 
actions/upload-artifact@v4 with: name: gemini-review path: gemini-review.md combine-reviews: needs: [claude-review, codex-review, gemini-review] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v4 - name: Combine Reviews run: | echo "## 🔍 Triple Engine Code Review Results" > combined-review.md echo "" >> combined-review.md echo "### 🟣 Claude Findings" >> combined-review.md cat claude-review/claude-review.md >> combined-review.md echo "" >> combined-review.md echo "---" >> combined-review.md echo "### 🟢 Codex Findings" >> combined-review.md cat codex-review/codex-review.md >> combined-review.md echo "" >> combined-review.md echo "---" >> combined-review.md echo "### 🔵 Gemini Findings" >> combined-review.md cat gemini-review/gemini-review.md >> combined-review.md - name: Post Combined Review uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('combined-review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: review }); - name: Check Critical Issues run: | # Fail if any engine found critical issues if grep -qi "critical\|🔴" combined-review.md; then echo "❌ Critical issues found by at least one engine" exit 1 fi ``` --- ## Review Checklist ### Before Every Commit - [ ] Run `/code-review` on staged changes - [ ] No critical (🔴) issues - [ ] No high (🟠) issues - [ ] Security concerns addressed - [ ] Performance issues considered ### Before Every PR - [ ] Full code review of all changes - [ ] All critical/high issues resolved - [ ] Tests added for new functionality - [ ] Documentation updated if needed ### Before Every Deployment - [ ] Final review of deployment diff - [ ] Security scan passed - [ ] No new vulnerabilities introduced - [ ] Rollback plan documented --- ## Common Review Findings ### Security Issues (Always Fix) | Issue | Example | Fix | |-------|---------|-----| | SQL Injection | `query = f"SELECT * 
FROM users WHERE id = {id}"` | Use parameterized queries | | XSS | `innerHTML = userInput` | Sanitize or use textContent | | Secrets in code | `apiKey = "sk-xxx"` | Use environment variables | | Missing auth | Unprotected endpoints | Add authentication middleware | | Insecure crypto | MD5/SHA1 for passwords | Use bcrypt/argon2 | ### Performance Issues (Should Fix) | Issue | Example | Fix | |-------|---------|-----| | N+1 queries | Loop with individual queries | Use batch/eager loading | | Memory leak | Unclosed connections | Use connection pooling | | Missing index | Slow queries | Add database indexes | | Large payload | Fetching unused fields | Select only needed fields | | No pagination | Loading all records | Implement pagination | ### Code Quality (Nice to Fix) | Issue | Example | Fix | |-------|---------|-----| | Long function | 100+ lines | Extract into smaller functions | | Deep nesting | 5+ levels | Early returns, extract methods | | Magic numbers | `if (status === 3)` | Use named constants | | Duplicate code | Copy-pasted blocks | Extract shared function | | Missing types | `any` everywhere | Add proper TypeScript types | --- ## Integration with TDD Workflow ``` ┌─────────────────────────────────────────────────────────────────┐ │ TDD + CODE REVIEW WORKFLOW │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. RED: Write failing tests │ │ 2. GREEN: Write code to pass tests │ │ 3. REFACTOR: Clean up code │ │ 4. REVIEW: Run /code-review ← NEW STEP │ │ 5. FIX: Address critical/high issues │ │ 6. VALIDATE: Lint + TypeCheck + Coverage │ │ 7. 
COMMIT: Only after review passes │ │ │ │ Review catches what tests miss: │ │ - Security vulnerabilities │ │ - Performance issues │ │ - Architecture problems │ │ - Code maintainability │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Review Response Template When code review finds issues, respond with: ```markdown ## Code Review Results ### 🔴 Critical Issues (Must Fix) 1. **SQL Injection in userController.ts:45** - Issue: User input directly interpolated into query - Fix: Use parameterized query - Code: `db.query('SELECT * FROM users WHERE id = $1', [userId])` ### 🟠 High Issues (Should Fix) 1. **Missing authentication on /api/admin endpoints** - Issue: Admin routes accessible without auth - Fix: Add auth middleware ### 🟡 Medium Issues (Fix Soon) 1. **N+1 query in getOrders function** - Consider eager loading or batch query ### 🟢 Low Issues (Nice to Have) 1. **Consider extracting validation logic to separate file** ### ✅ Strengths - Good test coverage - Clear function names - Proper error handling ### 📊 Summary - Critical: 1 | High: 1 | Medium: 1 | Low: 1 - **Status: ❌ BLOCKED** - Fix critical/high issues before commit ``` --- ## Claude Instructions ### When to Invoke Code Review Claude should automatically suggest or run code review: 1. **After completing a feature** → "Let me run a code review before we commit" 2. **Before creating a PR** → "Running code review on all changes" 3. **When user says "commit"** → "First, let me review the changes" 4. 
**After fixing bugs** → "Reviewing the fix for any issues" ### Review Focus Areas Prioritize review based on change type: | Change Type | Focus Areas | |-------------|-------------| | Auth/Security code | Security, input validation, crypto | | Database code | SQL injection, N+1, transactions | | API endpoints | Auth, rate limiting, validation | | Frontend code | XSS, state management, performance | | Infrastructure | Secrets, permissions, logging | --- ## Quick Reference ### Commands ```bash # Basic review /code-review # Review specific files /code-review src/auth.ts src/users.ts # Review with focus /code-review --focus security # Review PR /code-review --pr 123 ``` ### Severity Actions ``` 🔴 Critical → STOP. Fix now. No commit. 🟠 High → STOP. Fix now. No commit. 🟡 Medium → Note it. Fix soon. Can commit. 🟢 Low → Optional. Nice to have. ℹ️ Info → FYI only. ``` ### Workflow ``` Code → Test → Review → Fix → Commit → Push → PR → Review → Merge → Deploy ↑ ↑ ↑ /code-review /code-review /code-review ``` ================================================ FILE: skills/codex-review/SKILL.md ================================================ --- name: codex-review description: OpenAI Codex CLI code review with GPT-5.2-Codex, CI/CD integration when-to-use: When user requests Codex-powered code review or multi-engine review user-invocable: true effort: medium --- # OpenAI Codex Code Review Skill Use OpenAI's Codex CLI for specialized code review with GPT-5.2-Codex - trained specifically for detecting bugs, security flaws, and code quality issues. **Sources:** [Codex CLI](https://developers.openai.com/codex/cli/) | [GitHub](https://github.com/openai/codex) | [Code Review Cookbook](https://cookbook.openai.com/examples/codex/build_code_review_with_codex_sdk) --- ## Why Codex for Code Review? 
| Feature | Benefit | |---------|---------| | **GPT-5.2-Codex** | Specialized training for code review | | **88% detection rate** | Bugs, security flaws, style issues (LiveCodeBench) | | **Structured output** | JSON schema for consistent findings | | **GitHub native** | `@codex review` in PR comments | | **Headless mode** | CI/CD automation without TUI | --- ## Installation ### Prerequisites ```bash # Check Node.js version (requires 22+) node --version # Install Node.js 22 if needed # macOS brew install node@22 # Or via nvm nvm install 22 nvm use 22 ``` ### Install Codex CLI ```bash # Via npm (recommended) npm install -g @openai/codex # Via Homebrew (macOS) brew install --cask codex # Verify installation codex --version ``` ### Authentication **Option 1: ChatGPT Subscription** (Plus, Pro, Team, Edu, Enterprise) ```bash codex # Follow prompts to sign in with ChatGPT account ``` **Option 2: OpenAI API Key** ```bash # Set environment variable export OPENAI_API_KEY=sk-proj-... # Or add to shell profile echo 'export OPENAI_API_KEY=sk-proj-...' >> ~/.zshrc # Run Codex codex ``` ### Shell Completions (Optional) ```bash # Bash codex completion bash >> ~/.bashrc # Zsh codex completion zsh >> ~/.zshrc # Fish codex completion fish > ~/.config/fish/completions/codex.fish ``` --- ## Interactive Code Review ### Launch Review Mode ```bash # Start Codex codex # In the TUI, type: /review ``` ### Review Presets | Preset | Use Case | |--------|----------| | **Review against base branch** | Before opening PR - diffs against upstream | | **Review uncommitted changes** | Before committing - staged + unstaged + untracked | | **Review a commit** | Analyze specific SHA from history | | **Custom instructions** | e.g., "Focus on security vulnerabilities" | ### Example Session ``` $ codex > /review Select review type: ❯ Review against a base branch Review uncommitted changes Review a commit Custom review instructions Select base branch: main Reviewing changes... 
┌─────────────────────────────────────────────────────────────┐ │ CODE REVIEW FINDINGS │ ├─────────────────────────────────────────────────────────────┤ │ 🔴 CRITICAL: SQL Injection vulnerability │ │ File: src/api/users.ts:45 │ │ Issue: User input directly interpolated in query │ │ Fix: Use parameterized queries │ ├─────────────────────────────────────────────────────────────┤ │ 🟠 HIGH: Missing authentication check │ │ File: src/api/admin.ts:23 │ │ Issue: Admin endpoint accessible without auth │ │ Fix: Add requireAuth middleware │ ├─────────────────────────────────────────────────────────────┤ │ 🟡 MEDIUM: Inefficient database query │ │ File: src/services/orders.ts:89 │ │ Issue: N+1 query pattern in loop │ │ Fix: Use batch query or JOIN │ └─────────────────────────────────────────────────────────────┘ ``` --- ## Headless Mode (Automation) ### Basic Usage ```bash # Simple review codex exec "review the code for bugs and security issues" # Review with JSON output codex exec --json "review uncommitted changes" > review.json # Save final message to file codex exec --output-last-message review.txt "review the diff against main" ``` ### Full Automation (CI/CD) ```bash # Full auto mode (use only in isolated runners!) 
codex exec \ --full-auto \ --json \ --output-last-message findings.txt \ --sandbox read-only \ -m gpt-5.2-codex \ "Review this code for bugs, security issues, and performance problems" ``` ### Structured Output with Schema ```bash # Define output schema cat > review-schema.json << 'EOF' { "type": "object", "properties": { "findings": { "type": "array", "items": { "type": "object", "properties": { "severity": { "enum": ["critical", "high", "medium", "low"] }, "title": { "type": "string" }, "file": { "type": "string" }, "line": { "type": "integer" }, "description": { "type": "string" }, "suggestion": { "type": "string" } }, "required": ["severity", "title", "file", "description"] } }, "summary": { "type": "string" }, "approved": { "type": "boolean" } }, "required": ["findings", "summary", "approved"] } EOF # Run with schema validation codex exec \ --output-schema review-schema.json \ --output-last-message review.json \ "Review the staged changes and output findings" ``` --- ## GitHub Integration ### Option 1: PR Comment Trigger In any pull request, add a comment: ``` @codex review ``` Codex will respond with a standard GitHub code review. 
### Option 2: GitHub Action ```yaml # .github/workflows/codex-review.yml name: Codex Code Review on: pull_request: types: [opened, synchronize] jobs: review: runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Codex Review uses: openai/codex-action@main with: openai-api-key: ${{ secrets.OPENAI_API_KEY }} prompt: "Review this pull request for bugs, security issues, and code quality problems" model: gpt-5.2-codex safety-strategy: drop-sudo ``` ### Option 3: Manual Headless in CI ```yaml # .github/workflows/codex-review.yml name: Codex Code Review on: pull_request: jobs: review: runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - uses: actions/setup-node@v4 with: node-version: '22' - name: Install Codex CLI run: npm install -g @openai/codex - name: Run Review env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | # Get diff git diff origin/${{ github.base_ref }}...HEAD > diff.txt # Run Codex review codex exec \ --full-auto \ --sandbox read-only \ --output-last-message review.md \ "Review this git diff for bugs, security issues, and code quality: $(cat diff.txt)" - name: Post Review Comment uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `## 🤖 Codex Code Review\n\n${review}` }); ``` --- ## GitLab CI/CD ```yaml # .gitlab-ci.yml codex-review: image: node:22 stage: review script: - npm install -g @openai/codex - | codex exec \ --full-auto \ --sandbox read-only \ --output-last-message review.md \ "Review the merge request changes for bugs and security issues" - cat review.md artifacts: paths: - review.md rules: - if: $CI_PIPELINE_SOURCE == "merge_request_event" ``` --- ## Jenkins Pipeline ```groovy pipeline { agent any environment { OPENAI_API_KEY = credentials('openai-api-key') } stages { stage('Install Codex') { steps { sh 'npm install -g
@openai/codex' } } stage('Code Review') { steps { sh ''' codex exec \ --full-auto \ --sandbox read-only \ --output-last-message review.md \ "Review the code changes for bugs and security issues" ''' } } stage('Publish Results') { steps { archiveArtifacts artifacts: 'review.md' script { def review = readFile('review.md') echo "Code Review Results:\n${review}" } } } } } ``` --- ## Configuration ### Config File ```toml # ~/.codex/config.toml [model] default = "gpt-5.2-codex" # Best for code review [sandbox] default = "read-only" # Safe for reviews [review] # Custom review instructions applied to all reviews instructions = """ Focus on: 1. Security vulnerabilities (OWASP Top 10) 2. Performance issues (N+1 queries, memory leaks) 3. Error handling gaps 4. Type safety issues """ ``` ### Per-Project Config ```toml # .codex/config.toml (in project root) [review] instructions = """ This is a Python FastAPI project. Focus on: - Async/await correctness - Pydantic model validation - SQL injection via SQLAlchemy - Authentication/authorization gaps """ ``` --- ## CLI Quick Reference ```bash # Interactive codex # Start TUI /review # Open review presets # Headless codex exec "prompt" # Non-interactive execution codex exec --json "prompt" # JSON output codex exec --full-auto "prompt" # No approval prompts # Key Flags --output-last-message FILE # Save response to file --output-schema FILE # Validate against JSON schema --sandbox read-only # Restrict file access -m gpt-5.2-codex # Use best review model --json # Machine-readable output # Resume codex exec resume SESSION_ID # Continue previous session ``` --- ## Comparison: Claude vs Codex Review | Aspect | Claude (Built-in) | Codex CLI | |--------|-------------------|-----------| | **Setup** | None (already in Claude Code) | Install CLI + auth | | **Model** | Claude | GPT-5.2-Codex (specialized) | | **Context** | Full conversation context | Fresh context per review | | **Integration** | Native | GitHub, GitLab, Jenkins | | **Output** | 
Markdown | JSON schema support | | **Best for** | Quick reviews, in-flow | CI/CD, critical PRs | --- ## Security Considerations ### CI/CD Safety ```yaml # Always use these settings in CI/CD: --sandbox read-only # Codex CLI flag: prevent file modifications safety-strategy: drop-sudo # openai/codex-action input (not a CLI flag): revoke elevated permissions ``` ### API Key Protection ```yaml # GitHub Actions - use secrets env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} # Never hardcode keys # Never echo keys in logs ``` ### Public Repositories For public repos, use the `drop-sudo` safety strategy to prevent Codex from reading its own API key during execution. --- ## Troubleshooting | Issue | Solution | |-------|----------| | `codex: command not found` | Run `npm install -g @openai/codex` | | `Node.js version error` | Upgrade to Node.js 22+ | | `Authentication failed` | Re-run `codex` and sign in again | | `API key invalid` | Check `OPENAI_API_KEY` env var | | `Timeout in CI` | Add `--timeout 300` flag | | `Rate limited` | Reduce frequency or upgrade plan | --- ## Anti-Patterns - **Using `--dangerously-bypass-approvals-and-sandbox` casually** - Only in isolated CI runners - **Exposing API keys in logs** - Use secrets management - **Skipping sandbox in CI** - Always use `--sandbox read-only` - **Ignoring findings** - Review and address or document exceptions - **Running on every commit** - Use on PRs only to save costs ================================================ FILE: skills/commit-hygiene/SKILL.md ================================================ --- name: commit-hygiene description: Atomic commits, PR size limits, commit thresholds, stacked PRs when-to-use: When committing code, creating PRs, or when change set is growing large user-invocable: false effort: low --- # Commit Hygiene Skill **Purpose:** Keep commits atomic, PRs reviewable, and git history clean. Advise when it's time to commit before changes become too large.
--- ## Core Philosophy ``` ┌─────────────────────────────────────────────────────────────────┐ │ ATOMIC COMMITS │ │ ───────────────────────────────────────────────────────────── │ │ One logical change per commit. │ │ Each commit should be self-contained and deployable. │ │ If you need "and" to describe it, split it. │ ├─────────────────────────────────────────────────────────────────┤ │ SMALL PRS WIN │ │ ───────────────────────────────────────────────────────────── │ │ < 400 lines changed = reviewed in < 1 hour │ │ > 1000 lines = likely rubber-stamped or abandoned │ │ Smaller PRs = faster reviews, fewer bugs, easier reverts │ ├─────────────────────────────────────────────────────────────────┤ │ COMMIT EARLY, COMMIT OFTEN │ │ ───────────────────────────────────────────────────────────── │ │ Working code? Commit it. │ │ Test passing? Commit it. │ │ Don't wait for "done" - commit at every stable point. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Commit Size Thresholds ### Warning Thresholds (Time to Commit!) 
| Metric | Yellow Zone | Red Zone | Action | |--------|-------------|----------|--------| | **Files changed** | 5-10 files | > 10 files | Commit NOW | | **Lines added** | 150-300 lines | > 300 lines | Commit NOW | | **Lines deleted** | 100-200 lines | > 200 lines | Commit NOW | | **Total changes** | 250-400 lines | > 400 lines | Commit NOW | | **Time since last commit** | 30-60 min | > 60 min | Consider committing | ### Ideal Commit Size ``` ┌─────────────────────────────────────────────────────────────────┐ │ IDEAL COMMIT │ │ ───────────────────────────────────────────────────────────── │ │ Files: 1-5 │ │ Lines: 50-200 total changes │ │ Scope: Single logical unit of work │ │ Message: Describes ONE thing │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Check Current State (Run Frequently) ### Quick Status Check ```bash # See what's changed (staged + unstaged) git status --short # Count files and lines changed git diff --stat git diff --cached --stat # Staged only # Get totals git diff --shortstat # Example output: 8 files changed, 245 insertions(+), 32 deletions(-) ``` ### Detailed Change Analysis ```bash # Full diff summary with file names git diff --stat HEAD # Just the numbers git diff --numstat HEAD | awk '{add+=$1; del+=$2} END {print "+"add" -"del" total:"add+del}' # Files changed count git status --porcelain | wc -l ``` ### Pre-Commit Check Script ```bash #!/bin/bash # scripts/check-commit-size.sh # Thresholds MAX_FILES=10 MAX_LINES=400 WARN_FILES=5 WARN_LINES=200 # Get stats FILES=$(git status --porcelain | wc -l | tr -d ' ') STATS=$(git diff --shortstat HEAD 2>/dev/null) INSERTIONS=$(echo "$STATS" | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo 0) DELETIONS=$(echo "$STATS" | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo 0) TOTAL=$((INSERTIONS + DELETIONS)) echo "📊 Current changes: $FILES files, +$INSERTIONS -$DELETIONS ($TOTAL total lines)" # Check thresholds if [ "$FILES" -gt "$MAX_FILES" ] || [ "$TOTAL" 
-gt "$MAX_LINES" ]; then echo "🔴 RED ZONE: Commit immediately! Changes are too large." echo " Consider splitting into multiple commits." exit 1 elif [ "$FILES" -gt "$WARN_FILES" ] || [ "$TOTAL" -gt "$WARN_LINES" ]; then echo "🟡 WARNING: Changes getting large. Commit soon." exit 0 else echo "🟢 OK: Changes are within healthy limits." exit 0 fi ``` --- ## When to Commit ### Commit Triggers (Any One = Commit) | Trigger | Example | |---------|---------| | **Test passes** | Just got a test green → commit | | **Feature complete** | Finished a function → commit | | **Refactor done** | Renamed variable across files → commit | | **Bug fixed** | Fixed the issue → commit | | **Before switching context** | About to work on something else → commit | | **Clean compile** | Code compiles/lints clean → commit | | **Threshold hit** | > 5 files or > 200 lines → commit | ### Commit Immediately If - ✅ Tests are passing after being red - ✅ You're about to make a "big change" - ✅ You've been coding for 30+ minutes - ✅ You're about to try something risky - ✅ The current state is "working" ### Don't Wait For - ❌ "Perfect" code - ❌ All features done - ❌ Full test coverage - ❌ Code review from yourself - ❌ Documentation complete --- ## Atomic Commit Patterns ### Good Atomic Commits ``` ✅ "Add email validation to signup form" - 3 files: validator.ts, signup.tsx, signup.test.ts - 120 lines changed - Single purpose: email validation ✅ "Fix null pointer in user lookup" - 2 files: userService.ts, userService.test.ts - 25 lines changed - Single purpose: fix one bug ✅ "Refactor: Extract PaymentProcessor class" - 4 files: payment.ts → paymentProcessor.ts + types - 180 lines changed - Single purpose: refactoring ``` ### Bad Commits (Too Large) ``` ❌ "Add authentication, fix bugs, update styles" - 25 files changed - 800 lines changed - Multiple purposes mixed ❌ "WIP" - Unknown scope - No clear purpose - Hard to review/revert ❌ "Updates" - 15 files changed - Mix of features, fixes, refactors - 
Impossible to review properly ``` --- ## Splitting Large Changes ### Strategy 1: By Layer ``` Instead of one commit with: - API endpoint + database migration + frontend + tests Split into: 1. "Add users table migration" 2. "Add User model and repository" 3. "Add GET /users endpoint" 4. "Add UserList component" 5. "Add integration tests for user flow" ``` ### Strategy 2: By Feature Slice ``` Instead of one commit with: - All CRUD operations for users Split into: 1. "Add create user functionality" 2. "Add read user functionality" 3. "Add update user functionality" 4. "Add delete user functionality" ``` ### Strategy 3: Refactor First ``` Instead of: - Feature + refactoring mixed Split into: 1. "Refactor: Extract validation helpers" (no behavior change) 2. "Add email validation using new helpers" (new feature) ``` ### Strategy 4: By Risk Level ``` Instead of: - Safe changes + risky changes together Split into: 1. "Update dependencies" (safe, isolated) 2. "Migrate to new API version" (risky, separate) ``` --- ## PR Size Guidelines ### Optimal PR Size | Metric | Optimal | Acceptable | Too Large | |--------|---------|------------|-----------| | **Files** | 1-10 | 10-20 | > 20 | | **Lines changed** | 50-200 | 200-400 | > 400 | | **Commits** | 1-5 | 5-10 | > 10 | | **Review time** | < 30 min | 30-60 min | > 60 min | ### PR Size vs Defect Rate ``` ┌─────────────────────────────────────────────────────────────────┐ │ RESEARCH FINDINGS (Google, Microsoft studies) │ │ ───────────────────────────────────────────────────────────── │ │ PRs < 200 lines: 15% defect rate │ │ PRs 200-400 lines: 23% defect rate │ │ PRs > 400 lines: 40%+ defect rate │ │ │ │ Review quality drops sharply after 200-400 lines. │ │ Large PRs get "LGTM" rubber stamps, not real reviews. │ └─────────────────────────────────────────────────────────────────┘ ``` ### When PR is Too Large ```bash # Check PR size before creating git diff main --stat git diff main --shortstat # If too large, consider: # 1. 
Split into multiple PRs (stacked PRs) # 2. Create feature flag and merge incrementally # 3. Use draft PR for early feedback ``` --- ## Commit Message Format ### Structure ``` <type>: <description> (50 chars max) [optional body - wrap at 72 chars] [optional footer] ``` ### Types | Type | Use For | |------|---------| | `feat` | New feature | | `fix` | Bug fix | | `refactor` | Code change that neither fixes a bug nor adds a feature | | `test` | Adding/updating tests | | `docs` | Documentation only | | `style` | Formatting only, no logic change | | `chore` | Build, config, dependencies | ### Examples ``` feat: Add email validation to signup form fix: Prevent null pointer in user lookup refactor: Extract PaymentProcessor class test: Add integration tests for checkout flow chore: Update dependencies to latest versions ``` --- ## Git Workflow Integration ### Pre-Commit Hook for Size Check ```bash #!/bin/bash # .git/hooks/pre-commit MAX_LINES=400 MAX_FILES=15 FILES=$(git diff --cached --name-only | wc -l | tr -d ' ') STATS=$(git diff --cached --shortstat) INSERTIONS=$(echo "$STATS" | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo 0) DELETIONS=$(echo "$STATS" | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo 0) TOTAL=$((INSERTIONS + DELETIONS)) if [ "$TOTAL" -gt "$MAX_LINES" ]; then echo "❌ Commit too large: $TOTAL lines (max: $MAX_LINES)" echo " Consider splitting into smaller commits." echo " Use 'git add -p' for partial staging." exit 1 fi if [ "$FILES" -gt "$MAX_FILES" ]; then echo "❌ Too many files: $FILES (max: $MAX_FILES)" echo " Consider splitting into smaller commits."
exit 1 fi echo "✅ Commit size OK: $FILES files, $TOTAL lines" ``` ### Partial Staging (Split Large Changes) ```bash # Stage specific hunks interactively git add -p # Stage specific files git add path/to/specific/file.ts # Stage with preview git add -N file.ts # Intent to add git diff # See what would be added git add file.ts # Actually add ``` ### Unstage If Too Large ```bash # Unstage everything git reset HEAD # Unstage specific files git reset HEAD path/to/file.ts # Stage just what you need for THIS commit git add -p ``` --- ## Claude Integration ### Periodic Check During Development **Claude should run this check after every significant change:** ```bash # Quick status git diff --shortstat HEAD ``` **Thresholds for Claude to advise committing:** | Condition | Claude Action | |-----------|---------------| | > 5 files changed | Suggest: "Consider committing current changes" | | > 200 lines changed | Suggest: "Changes are getting large, commit recommended" | | > 10 files OR > 400 lines | Warn: "⚠️ Commit now before changes become unmanageable" | | Test just passed | Suggest: "Good checkpoint - commit these passing tests" | | Refactoring complete | Suggest: "Refactoring done - commit before adding features" | ### Claude Commit Reminder Messages ``` 📊 Status: 7 files changed, +180 -45 (225 total) 💡 Approaching commit threshold. Consider committing current work. --- 📊 Status: 12 files changed, +320 -80 (400 total) ⚠️ Changes are large! Commit now to keep PRs reviewable. Suggested commit: "feat: Add user authentication flow" --- 📊 Status: 3 files changed, +85 -10 (95 total) ✅ Tests passing. Good time to commit! 
Suggested commit: "fix: Validate email format on signup" ``` --- ## Stacked PRs (For Large Features) When a feature is genuinely large, use stacked PRs: ``` ┌─────────────────────────────────────────────────────────────────┐ │ STACKED PR PATTERN │ │ ───────────────────────────────────────────────────────────── │ │ │ │ main ─────────────────────────────────────────────────────────│ │ └── PR #1: Database schema (200 lines) ← Review first │ │ └── PR #2: API endpoints (250 lines) ← Review second │ │ └── PR #3: Frontend (300 lines) ← Review third │ │ │ │ Each PR is reviewable independently. │ │ Merge in order: #1 → #2 → #3 │ └─────────────────────────────────────────────────────────────────┘ ``` ### Creating Stacked PRs ```bash # Create base branch git checkout -b feature/auth-schema # ... make changes ... git commit -m "feat: Add users table schema" git push -u origin feature/auth-schema gh pr create --base main --title "feat: Add users table schema" # Create next branch FROM the first git checkout -b feature/auth-api # ... make changes ... git commit -m "feat: Add authentication API endpoints" git push -u origin feature/auth-api gh pr create --base feature/auth-schema --title "feat: Add auth API endpoints" # And so on... 
``` --- ## Checklist ### Before Every Commit - [ ] Changes are for ONE logical purpose - [ ] Tests pass (if applicable) - [ ] Lint/typecheck pass - [ ] < 10 files changed - [ ] < 400 lines total - [ ] Commit message describes ONE thing ### Before Creating PR - [ ] Total lines < 400 (ideal < 200) - [ ] All commits are atomic - [ ] No "WIP" or "fixup" commits - [ ] PR title describes the change - [ ] Description explains why, not just what ### Red Flags (Stop and Split) - ❌ Commit message needs "and" - ❌ > 10 files in one commit - ❌ > 400 lines in one commit - ❌ Mix of features, fixes, and refactors - ❌ "I'll clean this up later" --- ## Quick Reference ### Thresholds ``` Files: ≤ 5 = 🟢 | 6-10 = 🟡 | > 10 = 🔴 Lines: ≤ 200 = 🟢 | 201-400 = 🟡 | > 400 = 🔴 Time: ≤ 30min = 🟢 | 30-60min = 🟡 | > 60min = 🔴 ``` ### Commands ```bash # Quick status git diff --shortstat HEAD # Detailed file list git diff --stat HEAD # Partial staging git add -p # Check before PR git diff main --shortstat ``` ### Commit Now If - ✅ Tests just passed - ✅ > 200 lines changed - ✅ > 5 files changed - ✅ About to switch tasks - ✅ Current state is "working" ================================================ FILE: skills/cpg-analysis/SKILL.md ================================================ --- name: cpg-analysis description: Deep code property graph analysis with Joern CPG (AST+CFG+PDG) and CodeQL for control flow, data flow, taint analysis, and security auditing when-to-use: "When deep code analysis is needed — control flow, data flow, taint tracking, or security auditing" user-invocable: true effort: high --- # CPG Analysis Skill **Purpose:** Deep code analysis beyond AST. Use Joern for full Code Property Graph (control flow, data flow, program dependencies) and CodeQL for interprocedural taint analysis and vulnerability detection. **These are opt-in tools.** They require Docker/JVM (Joern) or CodeQL CLI. Use codebase-memory-mcp (Tier 1, always-on) for everyday navigation. 
Use these for deep analysis when Tier 1 is not enough. ``` ┌────────────────────────────────────────────────────────────────┐ │ CODE PROPERTY GRAPH = AST + CFG + CDG + DDG + PDG │ │ ─────────────────────────────────────────────────────────────│ │ AST = Abstract Syntax Tree (structure) │ │ CFG = Control Flow Graph (execution paths) │ │ CDG = Control Dependency Graph (conditional dependencies) │ │ DDG = Data Dependency Graph (data flow between statements) │ │ PDG = Program Dependency Graph (CDG + DDG combined) │ │ │ │ Tier 2 (Joern): Full CPG with 40+ query tools │ │ Tier 3 (CodeQL): Interprocedural taint + security queries │ └────────────────────────────────────────────────────────────────┘ ``` --- ## Tier Selection Guide ``` Simple symbol lookup, dependency trace, blast radius? → Tier 1: codebase-memory-mcp (always on, sub-ms) Control flow paths, data flow, dead code, complex refactoring? → Tier 2: Joern CPG (on-demand, seconds) Security audit, taint analysis, vulnerability detection? → Tier 3: CodeQL (on-demand, seconds to minutes) Full security review before release? 
→ All three tiers in sequence ``` --- ## Tier 2: Joern CPG (CodeBadger MCP) ### When to Use Joern | Scenario | Why Joern | Tier 1 Can't Do This | |----------|-----------|---------------------| | Trace data flow through functions | Full DDG traversal | Tier 1 has no data flow | | Understanding control flow paths | CFG analysis with branch conditions | Tier 1 has no CFG | | Finding dead/unreachable code | PDG reachability analysis | Tier 1 only detects unused exports | | Complex refactoring impact | Cross-function dependency chains | Tier 1 limited to call graph | | Auditing third-party library usage | Deep call chain traversal | Tier 1 stops at import boundary | | Understanding exception flow | CFG includes throw/catch paths | Tier 1 ignores exceptions | ### Key MCP Tools (Joern/CodeBadger) | Tool | Purpose | Example Query | |------|---------|---------------| | `generate_cpg` | Build CPG for project | First-time setup or after major changes | | `get_cpg_status` | Check CPG build status | Verify CPG is ready before querying | | `run_cpgql_query` | Run arbitrary CPGQL queries | `cpg.method("login").callOut.code.l` | | `get_cpgql_syntax_help` | Query language reference | When unsure about query syntax | | `get_cfg` | Control flow graph for a method | Understand execution paths in a function | | `list_methods` | List all methods in project | Overview of available functions | | `get_method_source` | Get source code of a method | Read specific function source | | `list_calls` | List calls from/to a method | Caller/callee analysis | | `get_call_graph` | Full call graph visualization | Understand call chains | | `get_type_definition` | Type/class definitions | Understand type hierarchy | ### Supported Languages (Joern) Java, Scala, C/C++, Python, JavaScript, TypeScript, PHP, Ruby, Go, Kotlin, Swift, Lua **Not supported:** Rust (use CodeQL for Rust) ### MCP Configuration (Joern) ```json { "mcpServers": { "codebadger": { "url": "http://localhost:4242/mcp", "type": "http" } } 
} ``` ### Prerequisites - Docker (for Joern backend) - Python 3.10+ (for MCP server) - Install: `~/.claude/install-graph-tools.sh --joern` ### Common CPGQL Queries ```scala // Find all methods that handle user input cpg.method.where(_.parameter.name(".*input.*|.*request.*")).name.l // Trace data flow from parameter to return (sink.reachableBy(source)) cpg.method("processPayment").methodReturn.reachableBy(cpg.method("processPayment").parameter).l // Find methods with high cyclomatic complexity cpg.method.filter(_.controlStructure.size > 10).name.l // Dead code: methods with no callers cpg.method.filter(_.callIn.size == 0).filter(_.name != "main").name.l // Exception flow: methods that can throw but callers don't catch cpg.method.filter(_.ast.isThrow.size > 0).callIn.method.filter(_.ast.isTry.size == 0).name.l ``` --- ## Tier 3: CodeQL ### When to Use CodeQL | Scenario | Why CodeQL | Other Tiers Can't Do This | |----------|-----------|--------------------------| | Security audit before release | Interprocedural taint analysis | Joern has basic taint, CodeQL is deeper | | Reviewing auth/payment code | Data flow from source to sink | Cross-function, cross-file taint | | PR security review | Targeted vulnerability scan | Pre-built OWASP query packs | | Compliance checking | CWE/OWASP pattern matching | Curated security query suites | | Rust security analysis | Full Rust support | Joern doesn't support Rust | ### Key MCP Tools (CodeQL) | Tool | Purpose | |------|---------| | `run_query` | Execute a CodeQL query against the database | | `find_definitions` | Locate symbol definitions | | `find_references` | Find all references to a symbol | | `get_results` | Parse BQRS (Binary Query Result Sets) | ### Supported Languages (CodeQL) C/C++, C#, Go, Java, Kotlin, JavaScript, TypeScript, Python, Ruby, Swift, **Rust** ### MCP Configuration (CodeQL) ```json { "mcpServers": { "codeql": { "command": "codeql-mcp", "args": ["--database", ".code-graph/codeql-db"] } } } ``` ### Prerequisites - CodeQL CLI (`brew
install codeql` on macOS) - Install: `~/.claude/install-graph-tools.sh --codeql` ### Common CodeQL Patterns ```ql // SQL injection: user input flows to SQL query import python from DataFlow::PathNode source, DataFlow::PathNode sink where TaintTracking::hasFlowPath(source, sink) and source instanceof RemoteFlowSource and sink instanceof SqlExecution select sink, source, sink, "SQL injection from $@.", source, "user input" // Unvalidated redirect from DataFlow::PathNode source, DataFlow::PathNode sink where source instanceof RemoteFlowSource and sink instanceof RedirectSink select sink, "Unvalidated redirect from user input" ``` --- ## Combined Workflow: Deep Analysis When performing security review or complex refactoring, use all tiers: ``` 1. SCOPE → Tier 1: detect_changes / get_architecture Identify files and modules in scope 2. STRUCTURE → Tier 1: search_graph / trace_call_path Map the call graph and dependencies 3. FLOW → Tier 2: get_cfg / run_cpgql_query Analyze control flow and data flow paths 4. SECURITY → Tier 3: run_query with taint analysis Check for vulnerabilities in data paths 5. 
REPORT → Combine findings from all tiers Prioritize: Critical > High > Medium > Low ``` --- ## Anti-Patterns | Anti-Pattern | Do This Instead | |-------------|-----------------| | Using Joern/CodeQL for simple symbol lookup | Use Tier 1 `search_graph` (sub-ms vs seconds) | | Running full CPG build on every commit | Build CPG on-demand; use Tier 1 for continuous monitoring | | Querying Joern without checking `get_cpg_status` | Always verify CPG is built and current before querying | | Running CodeQL without a specific security question | Have a hypothesis first; CodeQL queries are expensive | | Ignoring Tier 1 blast radius before deep analysis | Always scope with Tier 1 first, then go deep on flagged areas | | Using CodeQL for non-security structural queries | Use Joern CPGQL for structural/flow queries; CodeQL for security | ================================================ FILE: skills/credentials/SKILL.md ================================================ --- name: credentials description: Centralized API key management from Access.txt when-to-use: When setting up a new project that needs API keys or environment variables user-invocable: false effort: low --- # Credentials Management Skill For securely loading API keys from a centralized access file and configuring project environments. --- ## Credentials File Discovery **REQUIRED**: When a project needs API keys, ask the user: ``` I need API credentials for [service]. Do you have a centralized access keys file? Please provide the path (e.g., ~/Documents/Access.txt) or type 'manual' to enter keys directly. 
``` ### Default Locations to Check ```bash ~/Documents/Access.txt ~/Access.txt ~/.secrets/keys.txt ~/.credentials.txt ``` --- ## Supported File Formats The credentials file can use any of these formats: ### Format 1: Colon-separated ``` Render API: rnd_xxxxx OpenAI API: sk-proj-xxxxx Claude API: sk-ant-xxxxx Reddit client id: xxxxx Reddit secret: xxxxx ``` ### Format 2: Key=Value ``` RENDER_API_KEY=rnd_xxxxx OPENAI_API_KEY=sk-proj-xxxxx ANTHROPIC_API_KEY=sk-ant-xxxxx ``` ### Format 3: Mixed/Informal ``` Reddit api access: client id EXAMPLE_CLIENT_ID_xxxxx and secret is EXAMPLE_SECRET_xxxxxxxxxxxxxxx ``` --- ## Key Identification Patterns Use these patterns to identify keys in the file: | Service | Pattern | Env Variable | |---------|---------|--------------| | OpenAI | `sk-proj-*` or `sk-*` | `OPENAI_API_KEY` | | Claude/Anthropic | `sk-ant-*` | `ANTHROPIC_API_KEY` | | Render | `rnd_*` | `RENDER_API_KEY` | | Eleven Labs | `sk_*` (not sk-ant/sk-proj) | `ELEVEN_LABS_API_KEY` | | Replicate | `r8_*` | `REPLICATE_API_TOKEN` | | Supabase | URL + `eyJ*` (JWT) | `SUPABASE_URL`, `SUPABASE_ANON_KEY`, `SUPABASE_SERVICE_ROLE_KEY` | | Reddit | client_id + secret pair | `REDDIT_CLIENT_ID`, `REDDIT_CLIENT_SECRET` | | GitHub | `ghp_*` or `github_pat_*` | `GITHUB_TOKEN` | | Vercel | `*_*` (from vercel.com) | `VERCEL_TOKEN` | | Stripe (Test) | `sk_test_*`, `pk_test_*` | `STRIPE_SECRET_KEY`, `STRIPE_PUBLISHABLE_KEY` | | Stripe (Live) | `sk_live_*`, `pk_live_*` | `STRIPE_SECRET_KEY`, `STRIPE_PUBLISHABLE_KEY` | | Stripe Webhook | `whsec_*` | `STRIPE_WEBHOOK_SECRET` | | Twilio | `SK*` + Account SID | `TWILIO_API_KEY`, `TWILIO_ACCOUNT_SID` | | SendGrid | `SG.*` | `SENDGRID_API_KEY` | | AWS | `AKIA*` + secret | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` | | PostHog | `phc_*` | `POSTHOG_API_KEY`, `NEXT_PUBLIC_POSTHOG_KEY` | --- ## Parsing Credentials File When reading the user's access file, extract keys using these rules: ```python # Python parsing logic import re from pathlib import Path
def parse_credentials_file(file_path: str) -> dict[str, str]: """Parse various credential file formats.""" content = Path(file_path).expanduser().read_text() credentials = {} # Pattern matching for known key formats patterns = { 'OPENAI_API_KEY': r'sk-proj-[A-Za-z0-9_-]+', 'ANTHROPIC_API_KEY': r'sk-ant-[A-Za-z0-9_-]+', 'RENDER_API_KEY': r'rnd_[A-Za-z0-9]+', 'REPLICATE_API_TOKEN': r'r8_[A-Za-z0-9]+', 'ELEVEN_LABS_API_KEY': r'sk_[a-f0-9]{40,}', 'GITHUB_TOKEN': r'ghp_[A-Za-z0-9]+|github_pat_[A-Za-z0-9_]+', 'STRIPE_SECRET_KEY': r'sk_(live|test)_[A-Za-z0-9]+', 'STRIPE_PUBLISHABLE_KEY': r'pk_(live|test)_[A-Za-z0-9]+', 'STRIPE_WEBHOOK_SECRET': r'whsec_[A-Za-z0-9]+', 'POSTHOG_API_KEY': r'phc_[A-Za-z0-9]+', } # Supabase requires special handling (URL + JWT tokens) supabase_url = re.search(r'https://[a-z0-9]+\.supabase\.co', content) anon_key = re.search(r'anon[^:]*:\s*(eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+)', content, re.I) service_role = re.search(r'service.?role[^:]*:\s*(eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+)', content, re.I) if supabase_url: credentials['SUPABASE_URL'] = supabase_url.group(0) if anon_key: credentials['SUPABASE_ANON_KEY'] = anon_key.group(1) if service_role: credentials['SUPABASE_SERVICE_ROLE_KEY'] = service_role.group(1) for env_var, pattern in patterns.items(): match = re.search(pattern, content) if match: credentials[env_var] = match.group(0) # Reddit requires special handling (client_id + secret pair) reddit_id = re.search(r'client.?id[:\s]+([A-Za-z0-9_-]+)', content, re.I) # Optional "is" covers the informal "secret is <value>" phrasing (Format 3) reddit_secret = re.search(r'secret(?:\s+is)?[:\s]+([A-Za-z0-9_-]+)', content, re.I) if reddit_id: credentials['REDDIT_CLIENT_ID'] = reddit_id.group(1) if reddit_secret: credentials['REDDIT_CLIENT_SECRET'] = reddit_secret.group(1) return credentials ``` ```typescript // TypeScript parsing logic function parseCredentialsFile(content: string): Record<string, string> { const credentials: Record<string, string> = {}; const patterns: Record<string, RegExp> = {
OPENAI_API_KEY: /sk-proj-[A-Za-z0-9_-]+/, ANTHROPIC_API_KEY: /sk-ant-[A-Za-z0-9_-]+/, RENDER_API_KEY: /rnd_[A-Za-z0-9]+/, REPLICATE_API_TOKEN: /r8_[A-Za-z0-9]+/, ELEVEN_LABS_API_KEY: /sk_[a-f0-9]{40,}/, GITHUB_TOKEN: /ghp_[A-Za-z0-9]+|github_pat_[A-Za-z0-9_]+/, STRIPE_SECRET_KEY: /sk_(live|test)_[A-Za-z0-9]+/, STRIPE_PUBLISHABLE_KEY: /pk_(live|test)_[A-Za-z0-9]+/, STRIPE_WEBHOOK_SECRET: /whsec_[A-Za-z0-9]+/, POSTHOG_API_KEY: /phc_[A-Za-z0-9]+/, }; for (const [envVar, pattern] of Object.entries(patterns)) { const match = content.match(pattern); if (match) credentials[envVar] = match[0]; } // Reddit pair (optional "is" covers the informal "secret is <value>" phrasing) const redditId = content.match(/client.?id[:\s]+([A-Za-z0-9_-]+)/i); const redditSecret = content.match(/secret(?:\s+is)?[:\s]+([A-Za-z0-9_-]+)/i); if (redditId) credentials.REDDIT_CLIENT_ID = redditId[1]; if (redditSecret) credentials.REDDIT_CLIENT_SECRET = redditSecret[1]; return credentials; } ``` --- ## Validation Commands After extracting keys, validate them: ### OpenAI ```bash curl -s -o /dev/null -w "%{http_code}" \ -H "Authorization: Bearer $OPENAI_API_KEY" \ https://api.openai.com/v1/models # 200 = valid ``` ### Anthropic/Claude ```bash curl -s -o /dev/null -w "%{http_code}" \ -H "x-api-key: $ANTHROPIC_API_KEY" \ -H "anthropic-version: 2023-06-01" \ https://api.anthropic.com/v1/models # 200 = valid ``` ### Render ```bash curl -s -o /dev/null -w "%{http_code}" \ -H "Authorization: Bearer $RENDER_API_KEY" \ https://api.render.com/v1/services # 200 = valid ``` ### Reddit ```bash # Get OAuth token first TOKEN=$(curl -s -X POST \ -u "$REDDIT_CLIENT_ID:$REDDIT_CLIENT_SECRET" \ -d "grant_type=client_credentials" \ -A "CredentialTest/1.0" \ https://www.reddit.com/api/v1/access_token | jq -r '.access_token') # Non-null token = valid ``` ### Replicate ```bash curl -s -o /dev/null -w "%{http_code}" \ -H "Authorization: Token $REPLICATE_API_TOKEN" \ https://api.replicate.com/v1/models # 200 = valid ``` --- ## Project Setup Workflow When initializing a project that needs API keys:
### Step 1: Ask for Credentials File ``` This project needs the following API keys: - ANTHROPIC_API_KEY (for Claude) - SUPABASE_URL and SUPABASE_ANON_KEY Do you have an access keys file? Please provide the path: ``` ### Step 2: Read and Parse ```python # Read the file credentials = parse_credentials_file("~/Documents/Access.txt") # Show what was found (short values are fully masked so they are never printed whole) print("Found credentials:") for key, value in credentials.items(): masked = value[:8] + "..." + value[-4:] if len(value) > 16 else "****" print(f" {key}: {masked}") ``` ### Step 3: Validate Keys ``` Validating credentials... ✓ ANTHROPIC_API_KEY: Valid ✓ REDDIT_CLIENT_ID: Valid ✗ SUPABASE_URL: Not found in file ``` ### Step 4: Create .env File ```bash # Write to project .env cat > .env << EOF # Auto-generated from ~/Documents/Access.txt ANTHROPIC_API_KEY=sk-ant-xxx... REDDIT_CLIENT_ID=xxx... REDDIT_CLIENT_SECRET=xxx... EOF # Add to .gitignore if not present grep -qxF ".env" .gitignore 2>/dev/null || echo ".env" >> .gitignore ``` ### Step 5: Report Missing Keys ``` Missing credentials that need manual setup: - SUPABASE_URL: Get from supabase.com/dashboard/project/[ref]/settings/api - SUPABASE_ANON_KEY: Same location as above Would you like me to open these URLs? ``` --- ## Service-Specific Setup Guides ### Reddit (from Access.txt) ``` Found in your access file: - REDDIT_CLIENT_ID: Y1FgKA... - REDDIT_CLIENT_SECRET: -QLoYd... Also needed (add to Access.txt or enter manually): - REDDIT_USER_AGENT: YourApp/1.0 by YourUsername ``` ### Supabase (typically not in file) ``` Supabase credentials are project-specific.
Get them from: https://supabase.com/dashboard/project/[your-ref]/settings/api Required: - SUPABASE_URL - SUPABASE_ANON_KEY - SUPABASE_SERVICE_ROLE_KEY (for admin operations) ``` --- ## Security Rules - **NEVER** commit Access.txt or its path to git - **NEVER** log full API keys - always mask middle characters - **ALWAYS** add `.env` to `.gitignore` - **ALWAYS** use environment variables, never hardcode keys - **VALIDATE** keys before using them in production setup --- ## Quick Reference ```bash # Check if credentials file exists ls -la ~/Documents/Access.txt # Common env var names OPENAI_API_KEY ANTHROPIC_API_KEY RENDER_API_KEY REDDIT_CLIENT_ID REDDIT_CLIENT_SECRET REPLICATE_API_TOKEN ELEVEN_LABS_API_KEY SUPABASE_URL SUPABASE_ANON_KEY GITHUB_TOKEN ``` ### Prompt Template ``` I need API credentials for this project. Do you have a centralized access keys file (like ~/Documents/Access.txt)? If yes, provide the path and I'll: 1. Read and parse your keys 2. Validate they're working 3. Set up your project's .env file 4. Tell you which keys are missing ``` ================================================ FILE: skills/cross-agent-delegation/SKILL.md ================================================ --- name: cross-agent-delegation description: Cross-agent task routing — Codex auto-review, Kimi delegation by complexity score (iCPG + Claude reasoning), iCPG + Mnemos mandatory for all agents when-to-use: Always loaded when multiple AI CLI tools are available (Claude, Kimi, Codex) user-invocable: false effort: medium --- # Cross-Agent Delegation Claude Code orchestrates task routing to Kimi and Codex. The user interacts with Claude only — delegation happens behind the scenes. 
--- ## Tool Detection At session start, detect available tools: ```bash command -v kimi &>/dev/null && HAS_KIMI=true || HAS_KIMI=false command -v codex &>/dev/null && HAS_CODEX=true || HAS_CODEX=false ``` --- ## Codex Auto-Review (Stop Hook — Automatic) When Codex is installed, a Stop hook reviews code after tests pass: 1. TDD loop check runs tests 2. `codex-auto-review.sh` runs Codex on the diff 3. Critical/High findings feed back to Claude (exit 2) 4. Clean reviews pass through (exit 0) **Fully automatic.** No user or Claude action needed. --- ## Kimi Delegation (Claude Orchestrates) When Kimi is installed and the task complexity is bounded, Claude delegates directly — the user does not need to run anything. ### Step 1: Score complexity, not file count File count is a poor proxy for delegation risk. A 1-file change to an authz path is harder than a 12-file rename. Score the task on five dimensions, each 0-2, sourced from iCPG signals plus Claude's semantic reasoning: | Dimension | 0 (low) | 1 (medium) | 2 (high) | Source | |---|---|---|---|---| | **Cyclomatic / surface depth** | <10 LOC, no branches | 10-50 LOC, ≤3 branches | 50+ LOC or nested control flow | iCPG `query_graph` over function bodies | | **Fan-out (consumer blast radius)** | 0-2 callers | 3-10 callers | 11+ callers | iCPG `trace_path(<symbol>, mode=callers)` | | **Crosses a security boundary** (SEC-006, auth, PII, RLS, org-scope, billing, payments) | None | Tangential | Direct read or write | iCPG SEC-* / R-063 tags + grep for `org_id`, `user_id`, `auth`, `pii` | | **Concurrency / transactional** | Pure / sync | Async only | Locks, transactions, atomic claims, `FOR UPDATE`, `asyncio.Lock`, `session.begin` | iCPG concurrency flags + grep | | **Domain invariants required** | None / well-documented inline | Some implicit (need to read 1-2 files) | Heavy (cross-doc, ADR-bound, RFC-bound) | Claude reasoning + iCPG ADR linkage | ```bash # Auto-collect signals icpg query blast <scope> --format json # 
fan-out, async flags, sec tags grep -rE "org_id|user_id|auth|pii" <file> # cheap sec heuristic if iCPG flags absent grep -rE "asyncio.Lock|FOR UPDATE|session.begin" <file> # concurrency heuristic ``` ### Step 2: Sum → routing | Total score | Route | Rationale | |---|---|---| | **0-3** | Kimi solo | Bounded surface, no security/concurrency/cross-doc concerns | | **4-6** | Kimi → Codex auto-review (no user prompt) | Real risk, but not so high that we need full Claude context — Codex catches what Kimi might miss | | **7-10** | Claude handles directly | Cross-cutting / security-critical / concurrency-heavy — needs full context | ### Step 3: Floor — trivial-case shortcut To skip iCPG-query cost on truly trivial work: ```bash # If <2 files changed AND no SEC/auth/PII/concurrency keyword in diff, # → auto-Kimi without scoring. FILES=$(git diff --name-only | wc -l) HAS_RISK_KEYWORDS=$(git diff | grep -ciE "org_id|auth|pii|asyncio|FOR UPDATE|transaction|session\.begin" || true) if [ "$FILES" -lt 2 ] && [ "$HAS_RISK_KEYWORDS" -eq 0 ]; then AUTO_KIMI=true fi ``` This handles the trivial-rename / typo-fix case without paying the iCPG round-trip. ### When NOT to Delegate (overrides scoring) - User explicitly asked Claude to do it - Cross-service changes (API + frontend + database) — needs full context regardless of score - Production hotfix on a release branch — cross-tool review latency is too high - Score of 2 (the maximum) in any single dimension (one critical axis is enough to keep Claude in the loop) ### Step 4: Delegate via Bash Claude writes a mnemos checkpoint, then runs Kimi headless: ```bash # 1. Save current context to disk mnemos checkpoint --force # 2. Get context summary for Kimi CONTEXT=$(mnemos resume 2>/dev/null) # 3. Get constraints for target files CONSTRAINTS=$(icpg query constraints <target-file> 2>/dev/null) # 4. Run Kimi headless with full context kimi --print -y -w .
-p " ## Context (from mnemos checkpoint) $CONTEXT ## Constraints (from iCPG) $CONSTRAINTS ## Task <specific task description> ## Rules - Run tests after changes - Record changes: icpg record --base main - Write checkpoint when done: mnemos checkpoint --force " ``` ### Step 5: Read Results After Kimi finishes, Claude reviews what it did: ```bash # Read what Kimi did mnemos resume # Kimi's checkpoint icpg status # Kimi's recorded symbols git diff # Kimi's file changes ``` ### When NOT to Delegate (summary) - Security-sensitive code (auth, crypto, payments) - Cross-service changes (API + frontend + database) - Refactors that touch shared interfaces - User explicitly asked Claude to do it --- ## iCPG — Mandatory for All Agents Before ANY code change, Claude runs these (and includes results when delegating): ### Pre-Task Queries ```bash # 1. Duplicate check — already done? icpg query prior "<goal>" # 2. Constraints — what invariants apply? icpg query constraints <file-path> # 3. Risk — is this symbol fragile? icpg query risk <symbol-name> ``` ### After Code Changes ```bash icpg record --reason <id> --base main icpg drift check ``` --- ## Mnemos — Mandatory for All Agents ### At Task Start ```bash mnemos add goal "<task description>" ``` ### At Sub-Goal Boundaries ```bash mnemos checkpoint ``` ### At Task End (auto-handled by Stop hook) ```bash mnemos checkpoint --force ``` ### Context Transfer Between Tools The checkpoint is the bridge. Claude writes it, Kimi reads it: ```bash # Claude saves state mnemos checkpoint --force # Kimi (or Codex) reads state mnemos resume ``` The checkpoint contains: goal, constraints, recent files, git state, fatigue level. --- ## Full Orchestration Flow ``` TASK ARRIVES (user tells Claude) | v [1] Claude: icpg query prior "<goal>" ← Already done? [2] Claude: trivial-case shortcut ← <2 files & no risk keywords?
| +-- YES + Kimi installed -----> AUTO-KIMI (no scoring) | +-- NO ↓ v [3] Claude: score complexity (5 dims × 0-2, iCPG + reasoning) | +-- score 0-3 ----> KIMI SOLO PATH | [a] mnemos checkpoint --force | [b] kimi --print -y -p "..." | [c] mnemos resume + git diff | [d] Continue in Claude | +-- score 4-6 ----> KIMI + CODEX REVIEW PATH | [a] mnemos checkpoint --force | [b] kimi --print -y -p "..." | [c] codex review --uncommitted ← Auto-review the diff | [d] If P0/P1 findings: re-prompt Kimi with findings | [e] Once clean: continue in Claude | +-- score 7-10 ----> CLAUDE DIRECT PATH (full context) | v [4] icpg query constraints <files> ← Invariants [5] icpg query risk <symbols> ← Fragility [6] mnemos add goal "<task>" ← Track in memory | v [7] IMPLEMENT (TDD: RED -> GREEN) | v [8] Stop: tdd-loop-check.sh ← Tests pass? [9] Stop: codex-auto-review.sh ← Codex reviews diff [10] Stop: icpg-stop-record.sh ← Record symbols [11] Stop: mnemos-checkpoint.sh ← Save memory ``` ================================================ FILE: skills/database-schema/SKILL.md ================================================ --- name: database-schema description: Schema awareness - read before coding, type generation, prevent column errors when-to-use: Before writing any database queries or modifying data models user-invocable: false paths: ["**/schema.*", "**/migrations/**", "**/models/**", "**/*.prisma", "**/drizzle/**"] effort: medium --- # Database Schema Awareness Skill **Problem:** Claude forgets schema details mid-session - wrong column names, missing fields, incorrect types. TDD catches this at runtime, but we can prevent it earlier. --- ## Core Rule: Read Schema Before Writing Database Code **MANDATORY: Before writing ANY code that touches the database:** ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. READ the schema file (see locations below) │ │ 2. VERIFY columns/types you're about to use exist │ │ 3. 
REFERENCE schema in your response when writing queries │ │ 4. TYPE-CHECK using generated types (Drizzle/Prisma/etc) │ └─────────────────────────────────────────────────────────────┘ ``` **If schema file doesn't exist → CREATE IT before proceeding.** --- ## Schema File Locations (By Stack) | Stack | Schema Location | Type Generation | |-------|-----------------|-----------------| | **Drizzle** | `src/db/schema.ts` or `drizzle/schema.ts` | Built-in TypeScript | | **Prisma** | `prisma/schema.prisma` | `npx prisma generate` | | **Supabase** | `supabase/migrations/*.sql` + types | `supabase gen types typescript` | | **SQLAlchemy** | `app/models/*.py` or `src/models.py` | Pydantic models | | **TypeORM** | `src/entities/*.ts` | Decorators = types | | **Raw SQL** | `schema.sql` or `migrations/` | Manual types required | ### Schema Reference File (Recommended) Create `_project_specs/schema-reference.md` for quick lookup: ```markdown # Database Schema Reference *Auto-generated or manually maintained. 
Claude: READ THIS before database work.* ## Tables ### users | Column | Type | Nullable | Default | Notes | |--------|------|----------|---------|-------| | id | uuid | NO | gen_random_uuid() | PK | | email | text | NO | - | Unique | | name | text | YES | - | Display name | | created_at | timestamptz | NO | now() | - | | updated_at | timestamptz | NO | now() | - | ### orders | Column | Type | Nullable | Default | Notes | |--------|------|----------|---------|-------| | id | uuid | NO | gen_random_uuid() | PK | | user_id | uuid | NO | - | FK → users.id | | status | text | NO | 'pending' | enum: pending/paid/shipped/delivered | | total_cents | integer | NO | - | Amount in cents | | created_at | timestamptz | NO | now() | - | ## Relationships - users 1:N orders (user_id) ## Enums - order_status: pending, paid, shipped, delivered ``` --- ## Pre-Code Checklist (Database Work) Before writing any database code, Claude MUST: ```markdown ### Schema Verification Checklist - [ ] Read schema file: `[path to schema]` - [ ] Columns I'm using exist: [list columns] - [ ] Types match my code: [list type mappings] - [ ] Relationships are correct: [list FKs] - [ ] Nullable fields handled: [list nullable columns] ``` **Example in practice:** ```markdown ### Schema Verification for TODO-042 (Add order history endpoint) - [x] Read schema: `src/db/schema.ts` - [x] Columns exist: orders.id, orders.user_id, orders.status, orders.total_cents, orders.created_at - [x] Types: id=uuid→string, total_cents=integer→number, status=text→OrderStatus enum - [x] Relationships: orders.user_id → users.id (many-to-one) - [x] Nullable: none of these columns are nullable ``` --- ## Type Generation Commands ### Drizzle (TypeScript) ```typescript // Schema defines types automatically // src/db/schema.ts import { pgTable, uuid, text, integer, timestamp } from 'drizzle-orm/pg-core'; export const users = pgTable('users', { id: uuid('id').primaryKey().defaultRandom(), email: text('email').notNull().unique(), 
name: text('name'), createdAt: timestamp('created_at').notNull().defaultNow(), }); export const orders = pgTable('orders', { id: uuid('id').primaryKey().defaultRandom(), userId: uuid('user_id').notNull().references(() => users.id), status: text('status').notNull().default('pending'), totalCents: integer('total_cents').notNull(), createdAt: timestamp('created_at').notNull().defaultNow(), }); // Inferred types - USE THESE export type User = typeof users.$inferSelect; export type NewUser = typeof users.$inferInsert; export type Order = typeof orders.$inferSelect; export type NewOrder = typeof orders.$inferInsert; ``` ### Prisma ```prisma // prisma/schema.prisma model User { id String @id @default(uuid()) email String @unique name String? orders Order[] createdAt DateTime @default(now()) @map("created_at") @@map("users") } model Order { id String @id @default(uuid()) userId String @map("user_id") user User @relation(fields: [userId], references: [id]) status String @default("pending") totalCents Int @map("total_cents") createdAt DateTime @default(now()) @map("created_at") @@map("orders") } ``` ```bash # Generate types after schema changes npx prisma generate ``` ### Supabase ```bash # Generate TypeScript types from live database supabase gen types typescript --local > src/types/database.ts # Or from remote supabase gen types typescript --project-id your-project-id > src/types/database.ts ``` ```typescript // Use generated types import { Database } from '@/types/database'; type User = Database['public']['Tables']['users']['Row']; type NewUser = Database['public']['Tables']['users']['Insert']; type Order = Database['public']['Tables']['orders']['Row']; ``` ### SQLAlchemy (Python) ```python # app/models/user.py from sqlalchemy import Column, String, DateTime from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import relationship from sqlalchemy.sql import func from app.db import Base import uuid class User(Base): __tablename__ = "users" id = Column(UUID(as_uuid=True), primary_key=True,
default=uuid.uuid4) email = Column(String, nullable=False, unique=True) name = Column(String, nullable=True) created_at = Column(DateTime(timezone=True), server_default=func.now()) # Relationships orders = relationship("Order", back_populates="user") ``` ```python # app/schemas/user.py - Pydantic for API validation from pydantic import BaseModel, EmailStr from uuid import UUID from datetime import datetime class UserBase(BaseModel): email: EmailStr name: str | None = None class UserCreate(UserBase): pass class User(UserBase): id: UUID created_at: datetime class Config: from_attributes = True ``` --- ## Schema-Aware TDD Workflow Extend the standard TDD workflow for database work: ``` ┌─────────────────────────────────────────────────────────────┐ │ 0. SCHEMA: Read and verify schema before anything else │ │ └─ Read schema file │ │ └─ Complete Schema Verification Checklist │ │ └─ Note any missing columns/tables needed │ ├─────────────────────────────────────────────────────────────┤ │ 1. RED: Write tests that use correct column names │ │ └─ Import generated types │ │ └─ Use type-safe queries in tests │ │ └─ Tests should fail on logic, NOT schema errors │ ├─────────────────────────────────────────────────────────────┤ │ 2. GREEN: Implement with type-safe queries │ │ └─ Use ORM types, not raw strings │ │ └─ TypeScript/mypy catches column mismatches │ ├─────────────────────────────────────────────────────────────┤ │ 3. 
VALIDATE: Type check catches schema drift │ │ └─ tsc --noEmit / mypy catches wrong columns │ │ └─ Tests validate runtime behavior │ └─────────────────────────────────────────────────────────────┘ ``` --- ## Common Schema Mistakes (And How to Prevent) | Mistake | Example | Prevention | |---------|---------|------------| | Wrong column name | `user.userName` vs `user.name` | Read schema, use generated types | | Wrong type | `totalCents` as string | Type generation catches this | | Missing nullable check | `user.name!` when nullable | Schema shows nullable fields | | Wrong FK relationship | `order.userId` vs `order.user_id` | Check schema column names | | Missing column | Using `user.avatar` that doesn't exist | Read schema before coding | | Wrong enum value | `status: 'complete'` vs `'completed'` | Document enums in schema reference | ### Type-Safe Query Examples **Drizzle (catches errors at compile time):** ```typescript // ✅ Correct - uses schema-defined columns const user = await db.select().from(users).where(eq(users.email, email)); // ❌ Wrong - TypeScript error: 'userName' doesn't exist const user = await db.select().from(users).where(eq(users.userName, email)); ``` **Prisma (catches errors at compile time):** ```typescript // ✅ Correct const user = await prisma.user.findUnique({ where: { email } }); // ❌ Wrong - TypeScript error const user = await prisma.user.findUnique({ where: { userName: email } }); ``` **Raw SQL (NO protection - avoid):** ```typescript // ❌ Dangerous - no type checking, easy to get wrong const result = await db.query('SELECT * FROM users WHERE user_name = $1', [email]); // Should be 'email' not 'user_name' - won't catch until runtime ``` --- ## Migration Workflow When schema changes are needed: ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. Update schema file (Drizzle/Prisma/SQLAlchemy) │ ├─────────────────────────────────────────────────────────────┤ │ 2. 
Generate migration │ │ └─ Drizzle: npx drizzle-kit generate │ │ └─ Prisma: npx prisma migrate dev --name add_column │ │ └─ Supabase: supabase migration new add_column │ ├─────────────────────────────────────────────────────────────┤ │ 3. Regenerate types │ │ └─ Prisma: npx prisma generate │ │ └─ Supabase: supabase gen types typescript │ ├─────────────────────────────────────────────────────────────┤ │ 4. Update schema-reference.md │ ├─────────────────────────────────────────────────────────────┤ │ 5. Run type check - find all broken code │ │ └─ npm run typecheck │ ├─────────────────────────────────────────────────────────────┤ │ 6. Fix type errors, update tests, run full validation │ └─────────────────────────────────────────────────────────────┘ ``` --- ## Session Start Protocol **When starting a session that involves database work:** 1. Read schema file immediately 2. Read `_project_specs/schema-reference.md` if exists 3. Note in session state what tables/columns are relevant 4. Reference schema explicitly when writing code **Session state example:** ```markdown ## Current Session - Database Context **Schema read:** ✓ src/db/schema.ts **Tables in scope:** users, orders, order_items **Key columns:** - users: id, email, name, created_at - orders: id, user_id, status, total_cents - order_items: id, order_id, product_id, quantity, price_cents ``` --- ## Anti-Patterns - ❌ **Guessing column names** - Always read schema first - ❌ **Using raw SQL strings** - Use ORM with type generation - ❌ **Hardcoding without verification** - Check schema before using any column - ❌ **Ignoring type errors** - Schema drift shows up as type errors - ❌ **Not regenerating types** - After migration, always regenerate - ❌ **Assuming nullable** - Check schema for nullable columns --- ## Checklist ### Setup - [ ] Schema file exists in standard location - [ ] Type generation configured - [ ] `_project_specs/schema-reference.md` created - [ ] Types regenerate on schema change ### Per-Task - [ ] 
Schema read before writing database code - [ ] Schema Verification Checklist completed - [ ] Using generated types (not raw strings) - [ ] Type check passes (catches column errors) - [ ] Tests use correct schema ================================================ FILE: skills/existing-repo/SKILL.md ================================================ --- name: existing-repo description: Analyze existing repositories, maintain structure, setup guardrails and best practices when-to-use: When working with an existing codebase for the first time or adding guardrails user-invocable: true allowed-tools: [Read, Glob, Grep, Bash] effort: high --- # Existing Repository Skill For working with existing codebases - analyze structure, respect conventions, and set up proper guardrails without breaking anything. **Sources:** [Husky](https://typicode.github.io/husky/) | [lint-staged](https://github.com/lint-staged/lint-staged) | [pre-commit](https://pre-commit.com/) | [commitlint](https://commitlint.js.org/) --- ## Core Principle **Understand before modifying.** Existing repos have conventions, patterns, and history. Your job is to work within them, not reorganize them. 
--- ## Phase 1: Repository Analysis **ALWAYS run this analysis first when joining an existing repo.** ### 1.1 Basic Detection ```bash # Check git status git remote -v 2>/dev/null git branch -a 2>/dev/null git log --oneline -5 2>/dev/null # Check for existing configs ls -la .* 2>/dev/null | head -20 ls *.json *.toml *.yaml *.yml 2>/dev/null ``` ### 1.2 Tech Stack Detection ```bash # JavaScript/TypeScript ls package.json tsconfig.json 2>/dev/null # Python ls pyproject.toml setup.py requirements*.txt 2>/dev/null # Mobile ls pubspec.yaml 2>/dev/null # Flutter ls android/build.gradle 2>/dev/null # Android ls ios/*.xcodeproj 2>/dev/null # iOS # Other ls Cargo.toml 2>/dev/null # Rust ls go.mod 2>/dev/null # Go ls Gemfile 2>/dev/null # Ruby ``` ### 1.3 Repo Structure Type | Pattern | Detection | Meaning | |---------|-----------|---------| | **Monorepo** | `packages/`, `apps/`, `workspaces` in package.json | Multiple projects, shared tooling | | **Full-Stack Monolith** | `frontend/` + `backend/` in same repo | Single team, tightly coupled | | **Separate Concerns** | Only frontend OR backend code | Split repos, separate deploys | | **Microservices** | Multiple `service-*` or domain dirs | Distributed architecture | ```bash # Detect repo structure type if [ -d "packages" ] || [ -d "apps" ]; then echo "MONOREPO detected" elif [ -d "frontend" ] && [ -d "backend" ]; then echo "FULL-STACK MONOLITH detected" elif [ -d "src" ] || [ -d "app" ]; then # Check if it's frontend or backend grep -q "react\|vue\|angular" package.json 2>/dev/null && echo "FRONTEND detected" grep -q "fastapi\|express\|django" package.json pyproject.toml 2>/dev/null && echo "BACKEND detected" fi ``` ### 1.4 Directory Mapping ```bash # Get directory structure (max 3 levels) find . 
-type d -maxdepth 3 \ -not -path "*/node_modules/*" \ -not -path "*/.git/*" \ -not -path "*/venv/*" \ -not -path "*/__pycache__/*" \ -not -path "*/dist/*" \ -not -path "*/build/*" \ 2>/dev/null | head -50 # Identify key directories for dir in src app lib core services api routes components pages hooks utils models; do [ -d "$dir" ] && echo "Found: $dir/" done ``` ### 1.5 Entry Points ```bash # Find main entry points ls index.ts index.js main.ts main.py app.py server.ts server.js 2>/dev/null cat package.json 2>/dev/null | grep -A1 '"main"' cat pyproject.toml 2>/dev/null | grep -A1 'scripts' ``` --- ## Phase 2: Convention Detection **Identify and document existing patterns before making changes.** ### 2.1 Code Style ```bash # Check for formatters ls .prettierrc* .editorconfig .eslintrc* biome.json 2>/dev/null # JS/TS ls pyproject.toml | xargs grep -l "ruff\|black\|isort" 2>/dev/null # Python # Check indent style from existing files head -20 src/**/*.ts 2>/dev/null | grep "^\s" | head -1 # tabs vs spaces ``` ### 2.2 Testing Setup ```bash # JS/TS testing grep -l "jest\|vitest\|mocha\|playwright" package.json 2>/dev/null ls jest.config.* vitest.config.* playwright.config.* 2>/dev/null # Python testing grep -l "pytest\|unittest" pyproject.toml 2>/dev/null ls pytest.ini conftest.py 2>/dev/null # Test directories ls -d tests/ test/ __tests__/ spec/ 2>/dev/null ``` ### 2.3 CI/CD Setup ```bash # Check existing workflows ls -la .github/workflows/ 2>/dev/null ls .gitlab-ci.yml Jenkinsfile .circleci/ 2>/dev/null # Check deploy configs ls vercel.json render.yaml fly.toml railway.json Dockerfile 2>/dev/null ``` ### 2.4 Documentation Style ```bash # Find README pattern head -30 README.md 2>/dev/null # Find existing docs ls -la docs/ documentation/ wiki/ 2>/dev/null ls CONTRIBUTING.md CHANGELOG.md 2>/dev/null ``` --- ## Phase 3: Guardrails Audit **Check what guardrails exist and what's missing.** ### 3.1 Pre-commit Hooks Status ```bash # Check for hook managers ls .husky/ 
2>/dev/null && echo "Husky installed" ls .pre-commit-config.yaml 2>/dev/null && echo "pre-commit framework installed" ls .git/hooks/pre-commit 2>/dev/null && echo "Manual pre-commit hook exists" # Check what hooks run cat .husky/pre-commit 2>/dev/null cat .pre-commit-config.yaml 2>/dev/null ``` ### 3.2 Linting Status ```bash # JS/TS linting grep -q "eslint" package.json && echo "ESLint configured" grep -q "biome" package.json && echo "Biome configured" ls .eslintrc* biome.json 2>/dev/null # Python linting grep -q "ruff" pyproject.toml && echo "Ruff configured" grep -q "flake8" pyproject.toml setup.cfg && echo "Flake8 configured" ``` ### 3.3 Type Checking Status ```bash # TypeScript ls tsconfig.json 2>/dev/null && echo "TypeScript configured" grep "strict" tsconfig.json 2>/dev/null # Python type checking grep -q "mypy" pyproject.toml && echo "mypy configured" grep -q "pyright" pyproject.toml && echo "pyright configured" ls py.typed 2>/dev/null ``` ### 3.4 Commit Message Enforcement ```bash # commitlint ls commitlint.config.* 2>/dev/null && echo "commitlint configured" cat .husky/commit-msg 2>/dev/null grep "conventional" package.json 2>/dev/null ``` ### 3.5 Security Scanning ```bash # Check for security tools grep -q "detect-secrets\|trufflehog" .pre-commit-config.yaml package.json 2>/dev/null ls .github/workflows/*.yml | xargs grep -l "security\|audit" 2>/dev/null ``` --- ## Phase 4: Guardrails Setup **Only add missing guardrails. Never overwrite existing configurations.** ### 4.1 JavaScript/TypeScript Projects #### Husky + lint-staged (if not present) ```bash # Check if already installed if [ ! 
-d ".husky" ]; then # Install Husky npm install -D husky lint-staged npx husky init # Create pre-commit hook echo 'npx lint-staged' > .husky/pre-commit chmod +x .husky/pre-commit fi ``` **lint-staged config** (add to package.json if missing): ```json { "lint-staged": { "*.{ts,tsx,js,jsx}": [ "eslint --fix", "prettier --write" ], "*.{json,md,yml,yaml}": [ "prettier --write" ] } } ``` #### ESLint (if not present) ```bash # Check if eslint exists if ! grep -q "eslint" package.json; then npm install -D eslint @eslint/js typescript-eslint fi ``` **eslint.config.js** (ESLint 9+ flat config): ```javascript import eslint from '@eslint/js' import tseslint from 'typescript-eslint' export default tseslint.config( eslint.configs.recommended, ...tseslint.configs.recommended, { rules: { '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }], '@typescript-eslint/explicit-function-return-type': 'off', 'no-console': ['warn', { allow: ['warn', 'error'] }] } }, { ignores: ['dist/', 'node_modules/', 'coverage/'] } ) ``` #### Prettier (if not present) ```bash if ! grep -q "prettier" package.json; then npm install -D prettier fi ``` **.prettierrc** (respect existing style or use sensible defaults): ```json { "semi": false, "singleQuote": true, "trailingComma": "es5", "tabWidth": 2, "printWidth": 100 } ``` #### commitlint (if not present) ```bash if [ ! 
-f "commitlint.config.js" ]; then npm install -D @commitlint/cli @commitlint/config-conventional echo "npx commitlint --edit \$1" > .husky/commit-msg chmod +x .husky/commit-msg fi ``` **commitlint.config.js**: ```javascript export default { extends: ['@commitlint/config-conventional'], rules: { 'type-enum': [ 2, 'always', ['feat', 'fix', 'docs', 'style', 'refactor', 'test', 'chore', 'ci', 'perf', 'revert'] ], 'subject-case': [2, 'always', 'lower-case'], 'subject-max-length': [2, 'always', 72] } } ``` ### 4.2 Python Projects #### pre-commit framework (if not present) ```bash # Install pre-commit if [ ! -f ".pre-commit-config.yaml" ]; then pip install pre-commit pre-commit install fi ``` **.pre-commit-config.yaml**: ```yaml repos: # Ruff - linting and formatting (replaces black, isort, flake8) - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.13 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - id: ruff-format # Type checking - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.16.0 hooks: - id: mypy additional_dependencies: [types-requests] args: [--ignore-missing-imports] # Security - repo: https://github.com/Yelp/detect-secrets rev: v1.5.0 hooks: - id: detect-secrets args: ['--baseline', '.secrets.baseline'] # General - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - id: check-merge-conflict # Commit messages - repo: https://github.com/compilerla/conventional-pre-commit rev: v4.0.0 hooks: - id: conventional-pre-commit stages: [commit-msg] ``` #### pyproject.toml additions (if not present) ```toml [tool.ruff] target-version = "py312" line-length = 100 [tool.ruff.lint] select = [ "E", # pycodestyle errors "F", # pyflakes "I", # isort "B", # flake8-bugbear "UP", # pyupgrade "S", # flake8-bandit (security) ] ignore = ["E501"] # line length handled by formatter [tool.mypy] python_version = "3.12" strict = true 
ignore_missing_imports = true [tool.pytest.ini_options] testpaths = ["tests"] addopts = "-v --cov=src --cov-report=term-missing --cov-fail-under=80" ``` ### 4.3 Branch Protection (Document for User) Recommend these GitHub branch protection rules: ```markdown ## Recommended Branch Protection (main branch) 1. **Require pull request before merging** - Require 1 approval - Dismiss stale reviews on new commits 2. **Require status checks** - Lint - Type check - Tests - Security scan 3. **Require signed commits** (optional but recommended) 4. **Do not allow bypassing above settings** ``` --- ## Phase 5: Structure Preservation Rules ### NEVER Do These - **Don't reorganize directory structure** - Work within existing patterns - **Don't rename files for "consistency"** - Match existing naming conventions - **Don't add new patterns** - Use patterns already in the codebase - **Don't change import styles** - Match existing (relative vs absolute, etc.) - **Don't change formatting** - Match existing style or use existing formatter config - **Don't add new dependencies lightly** - Check if equivalent exists ### ALWAYS Do These - **Read existing code first** - Understand patterns before writing new code - **Match existing conventions** - Naming, structure, error handling - **Use existing utilities** - Don't reinvent what exists - **Follow existing test patterns** - Match test file naming and structure - **Preserve existing configs** - Only add, don't modify unless fixing bugs ### Convention Detection Checklist Before writing any code, identify: | Convention | Example | Where to Check | |------------|---------|----------------| | Naming | camelCase vs snake_case | Existing file names | | File structure | feature/ vs type/ | Directory layout | | Export style | default vs named | Existing modules | | Error handling | throw vs return Error | Existing functions | | Logging | console vs logger | Existing code | | Testing | describe/it vs test() | Existing tests | | Comments | JSDoc vs 
inline | Existing code | --- ## Phase 6: Analysis Report Template After running analysis, generate this report: ```markdown # Repository Analysis Report ## Overview - **Repo Type**: [Monorepo | Full-Stack | Frontend | Backend | Microservices] - **Primary Language**: [TypeScript | Python | ...] - **Framework**: [React | FastAPI | ...] - **Age**: [X commits, Y contributors] ## Directory Structure ``` [tree output] ``` ## Tech Stack | Category | Technology | Config File | |----------|------------|-------------| | Language | TypeScript | tsconfig.json | | Framework | React | - | | Testing | Vitest | vitest.config.ts | | Linting | ESLint | eslint.config.js | | Formatting | Prettier | .prettierrc | ## Guardrails Status ### Present - [x] ESLint configured - [x] Prettier configured - [x] TypeScript strict mode ### Missing (Recommended) - [ ] Pre-commit hooks (Husky + lint-staged) - [ ] Commit message validation (commitlint) - [ ] Security scanning in CI ## Conventions Detected | Pattern | Observed | Example | |---------|----------|---------| | Naming | camelCase | `getUserById.ts` | | Imports | Absolute | `@/components/Button` | | Testing | Colocated | `Button.test.tsx` | | Exports | Named | `export { Button }` | ## Recommendations 1. Add Husky + lint-staged for pre-commit hooks 2. Add commitlint for conventional commits 3. Add security workflow to GitHub Actions ## Files to Review First - `src/index.ts` - Main entry point - `src/utils/` - Shared utilities - `tests/setup.ts` - Test configuration ``` --- ## Gradual Implementation Strategy Don't add all guardrails at once. 
Follow this timeline: | Week | Focus | Why | |------|-------|-----| | 1 | Formatting (Prettier/Ruff) | Non-breaking, easy wins | | 2 | Linting (ESLint/Ruff) | Catches obvious issues | | 3 | Pre-commit hooks | Automates week 1-2 | | 4 | Commit message validation | Team consistency | | 5 | Type checking strictness | Catches runtime errors | | 6 | Security scanning | Catches vulnerabilities | --- ## Working with Separate Repos When frontend and backend are in separate repos: ### Frontend Repo Setup ```bash # Clone and analyze git clone [frontend-repo] cd frontend # Run analysis # Expect: React/Vue/Angular, no backend code # Add frontend-specific guardrails # - Husky + lint-staged # - ESLint + Prettier # - Component testing (Vitest/Jest) ``` ### Backend Repo Setup ```bash # Clone and analyze git clone [backend-repo] cd backend # Run analysis # Expect: FastAPI/Express/Django, no frontend code # Add backend-specific guardrails # - pre-commit framework # - Ruff + mypy # - API testing (pytest/Jest) ``` ### Cross-Repo Coordination | Concern | Solution | |---------|----------| | Shared types | Generate from OpenAPI spec | | API contracts | Contract testing (Pact) | | Deployments | Coordinate via CI/CD triggers | | Versioning | Semantic versioning on both | --- ## Anti-Patterns - **Adding unused guardrails** - Only add what the team will use - **Strict rules on day 1** - Introduce gradually - **Blocking on warnings** - Start permissive, tighten over time - **Ignoring existing patterns** - Work with what exists - **Over-engineering** - Simple rules > complex systems - **Skipping the analysis phase** - Always understand before changing --- ## Quick Reference: Detection Commands ```bash # One-liner repo analysis echo "=== Repo Type ===" && \ ls -d packages apps frontend backend 2>/dev/null || echo "Standard repo" && \ echo "=== Tech Stack ===" && \ ls *.json *.toml *.yaml 2>/dev/null; \ echo "=== Existing Guardrails ===" && \ ls .husky .pre-commit-config.yaml .eslintrc* 
2>/dev/null || echo "None detected" && \ echo "=== Entry Points ===" && \ ls index.* main.* app.* server.* 2>/dev/null ``` ================================================ FILE: skills/firebase/SKILL.md ================================================ --- name: firebase description: Firebase Firestore, Auth, Storage, real-time listeners, security rules when-to-use: When working with Firebase services user-invocable: false paths: ["**/firebase*", "firestore.rules", "storage.rules", "firebase.json"] effort: medium --- # Firebase Skill Firebase/Firestore patterns for web and mobile applications with real-time data, offline support, and security rules. **Sources:** [Firebase Docs](https://firebase.google.com/docs) | [Firestore Best Practices](https://firebase.google.com/docs/firestore/best-practices) | [Security Rules](https://firebase.google.com/docs/rules) --- ## Core Principle **Denormalize with purpose, secure with rules, scale horizontally.** Firestore is a document database - embrace denormalization for read efficiency. Security rules are your server-side validation. Design for your access patterns. --- ## Firebase Stack | Service | Purpose | |---------|---------| | **Firestore** | NoSQL document database with real-time sync | | **Authentication** | User auth, OAuth, anonymous sessions | | **Storage** | File uploads with security rules | | **Functions** | Serverless backend (Node.js) | | **Hosting** | Static site + CDN | | **Extensions** | Pre-built solutions (Stripe, Algolia, etc.) 
| --- ## Project Setup ### Install Firebase CLI ```bash # Install globally npm install -g firebase-tools # Login firebase login # Initialize in project firebase init ``` ### Initialize with Emulators ```bash firebase init emulators # Start local development firebase emulators:start ``` ### Project Structure ``` project/ ├── firebase.json # Firebase config ├── firestore.rules # Security rules ├── firestore.indexes.json # Composite indexes ├── storage.rules # Storage security rules └── functions/ # Cloud Functions ├── src/ ├── package.json └── tsconfig.json ``` --- ## Firestore Data Modeling ### Document Structure ```typescript // Good: Flat documents with all needed data interface Post { id: string; title: string; content: string; authorId: string; authorName: string; // Denormalized for display authorAvatar: string; // Denormalized tags: string[]; likeCount: number; // Aggregated counter createdAt: Timestamp; updatedAt: Timestamp; } // Collection: posts/{postId} ``` ### When to Use Subcollections ```typescript // Use subcollections for: // 1. Unbounded lists (comments, messages) // 2. Data with different access patterns // 3. 
Data that grows independently // posts/{postId}/comments/{commentId} interface Comment { id: string; text: string; authorId: string; authorName: string; createdAt: Timestamp; } ``` ### Data Model Patterns ```typescript // Pattern 1: Embedded data (bounded, always needed) interface User { id: string; email: string; profile: { displayName: string; bio: string; avatar: string; }; settings: { notifications: boolean; theme: 'light' | 'dark'; }; } // Pattern 2: Reference with denormalization interface Order { id: string; userId: string; userEmail: string; // Denormalized for display items: OrderItem[]; // Embedded (bounded) total: number; status: 'pending' | 'paid' | 'shipped'; } // Pattern 3: Aggregation documents // Keep counters in parent document interface Channel { id: string; name: string; memberCount: number; // Updated via Cloud Function messageCount: number; } ``` --- ## TypeScript SDK (Modular v9+) ### Initialize Firebase ```typescript // lib/firebase.ts import { initializeApp, getApps } from 'firebase/app'; import { getFirestore, connectFirestoreEmulator } from 'firebase/firestore'; import { getAuth, connectAuthEmulator } from 'firebase/auth'; import { getStorage, connectStorageEmulator } from 'firebase/storage'; const firebaseConfig = { apiKey: process.env.NEXT_PUBLIC_FIREBASE_API_KEY, authDomain: process.env.NEXT_PUBLIC_FIREBASE_AUTH_DOMAIN, projectId: process.env.NEXT_PUBLIC_FIREBASE_PROJECT_ID, storageBucket: process.env.NEXT_PUBLIC_FIREBASE_STORAGE_BUCKET, messagingSenderId: process.env.NEXT_PUBLIC_FIREBASE_MESSAGING_SENDER_ID, appId: process.env.NEXT_PUBLIC_FIREBASE_APP_ID }; // Initialize only once const app = getApps().length === 0 ? 
initializeApp(firebaseConfig) : getApps()[0]; export const db = getFirestore(app); export const auth = getAuth(app); export const storage = getStorage(app); // Connect to emulators in development if (process.env.NODE_ENV === 'development') { connectFirestoreEmulator(db, 'localhost', 8080); connectAuthEmulator(auth, 'http://localhost:9099'); connectStorageEmulator(storage, 'localhost', 9199); } ``` ### CRUD Operations ```typescript import { collection, doc, getDoc, getDocs, addDoc, setDoc, updateDoc, deleteDoc, query, where, orderBy, limit, startAfter, serverTimestamp, Timestamp } from 'firebase/firestore'; import { db } from './firebase'; // Create async function createPost(data: Omit<Post, 'id' | 'createdAt' | 'updatedAt'>) { const docRef = await addDoc(collection(db, 'posts'), { ...data, createdAt: serverTimestamp(), updatedAt: serverTimestamp() }); return docRef.id; } // Read single document async function getPost(postId: string): Promise<Post | null> { const docSnap = await getDoc(doc(db, 'posts', postId)); if (!docSnap.exists()) return null; return { id: docSnap.id, ...docSnap.data() } as Post; } // Query with filters async function getPostsByAuthor(authorId: string, pageSize = 10) { const q = query( collection(db, 'posts'), where('authorId', '==', authorId), orderBy('createdAt', 'desc'), limit(pageSize) ); const snapshot = await getDocs(q); return snapshot.docs.map(doc => ({ id: doc.id, ...doc.data() } as Post)); } // Pagination async function getNextPage(lastDoc: Post, pageSize = 10) { const q = query( collection(db, 'posts'), orderBy('createdAt', 'desc'), startAfter(lastDoc.createdAt), limit(pageSize) ); const snapshot = await getDocs(q); return snapshot.docs.map(doc => ({ id: doc.id, ...doc.data() } as Post)); } // Update async function updatePost(postId: string, data: Partial<Post>) { await updateDoc(doc(db, 'posts', postId), { ...data, updatedAt: serverTimestamp() }); } // Delete async function deletePost(postId: string) { await deleteDoc(doc(db, 
'posts', postId)); } ``` ### Real-time Listeners ```typescript import { onSnapshot, QuerySnapshot, DocumentSnapshot } from 'firebase/firestore'; // Listen to single document function subscribeToPost( postId: string, onData: (post: Post | null) => void, onError: (error: Error) => void ) { return onSnapshot( doc(db, 'posts', postId), (snapshot: DocumentSnapshot) => { if (!snapshot.exists()) { onData(null); return; } onData({ id: snapshot.id, ...snapshot.data() } as Post); }, onError ); } // Listen to collection with query function subscribeToPosts( authorId: string, onData: (posts: Post[]) => void, onError: (error: Error) => void ) { const q = query( collection(db, 'posts'), where('authorId', '==', authorId), orderBy('createdAt', 'desc') ); return onSnapshot( q, (snapshot: QuerySnapshot) => { const posts = snapshot.docs.map(doc => ({ id: doc.id, ...doc.data() } as Post)); onData(posts); }, onError ); } // React hook example function usePost(postId: string) { const [post, setPost] = useState<Post | null>(null); const [loading, setLoading] = useState(true); const [error, setError] = useState<Error | null>(null); useEffect(() => { const unsubscribe = subscribeToPost( postId, (data) => { setPost(data); setLoading(false); }, (err) => { setError(err); setLoading(false); } ); return unsubscribe; }, [postId]); return { post, loading, error }; } ``` ### Offline Persistence (Web) ```typescript import { enableIndexedDbPersistence, enableMultiTabIndexedDbPersistence } from 'firebase/firestore'; // Enable offline persistence (call once at startup) async function enableOffline() { try { // Single tab await enableIndexedDbPersistence(db); // OR multi-tab (recommended) await enableMultiTabIndexedDbPersistence(db); } catch (err: any) { if (err.code === 'failed-precondition') { // Multiple tabs open, only works in one console.warn('Persistence only available in one tab'); } else if (err.code === 'unimplemented') { // Browser doesn't support console.warn('Persistence not supported'); } 
} } // Check if data is from cache onSnapshot(docRef, (snapshot) => { const source = snapshot.metadata.fromCache ? 'cache' : 'server'; console.log(`Data from ${source}`); if (snapshot.metadata.hasPendingWrites) { console.log('Local changes pending sync'); } }); ``` --- ## Security Rules ### Basic Rules Structure ```javascript // firestore.rules rules_version = '2'; service cloud.firestore { match /databases/{database}/documents { // Helper functions function isAuthenticated() { return request.auth != null; } function isOwner(userId) { return request.auth.uid == userId; } function isAdmin() { return request.auth.token.admin == true; } // Posts collection match /posts/{postId} { // Anyone can read published posts allow read: if resource.data.status == 'published'; // Only authenticated users can create allow create: if isAuthenticated() && request.resource.data.authorId == request.auth.uid && request.resource.data.keys().hasAll(['title', 'content', 'authorId']); // Only author can update allow update: if isOwner(resource.data.authorId) && request.resource.data.authorId == resource.data.authorId; // Can't change author // Only author or admin can delete allow delete: if isOwner(resource.data.authorId) || isAdmin(); // Comments subcollection match /comments/{commentId} { allow read: if true; allow create: if isAuthenticated(); allow update, delete: if isOwner(resource.data.authorId); } } // User profiles match /users/{userId} { allow read: if true; allow create: if isAuthenticated() && isOwner(userId); allow update: if isOwner(userId); allow delete: if false; // Never allow delete } // Private user data match /users/{userId}/private/{document=**} { allow read, write: if isOwner(userId); } } } ``` ### Data Validation in Rules ```javascript match /posts/{postId} { function isValidPost() { let data = request.resource.data; return data.title is string && data.title.size() >= 3 && data.title.size() <= 100 && data.content is string && data.content.size() <= 50000 && 
data.tags is list && data.tags.size() <= 5; } allow create: if isAuthenticated() && isValidPost(); allow update: if isOwner(resource.data.authorId) && isValidPost(); } ``` ### Test Rules Locally ```bash # Start emulators firebase emulators:start # Run rules tests npm test ``` ```typescript // tests/firestore.rules.test.ts import * as fs from 'fs'; import { assertFails, assertSucceeds, initializeTestEnvironment, RulesTestEnvironment } from '@firebase/rules-unit-testing'; import { doc, setDoc, updateDoc } from 'firebase/firestore'; describe('Firestore Rules', () => { let testEnv: RulesTestEnvironment; beforeAll(async () => { testEnv = await initializeTestEnvironment({ projectId: 'test-project', firestore: { rules: fs.readFileSync('firestore.rules', 'utf8') } }); }); test('unauthenticated users cannot write', async () => { const unauthedDb = testEnv.unauthenticatedContext().firestore(); await assertFails( setDoc(doc(unauthedDb, 'posts/test'), { title: 'Test' }) ); }); test('users can only update own posts', async () => { const aliceDb = testEnv.authenticatedContext('alice').firestore(); const bobDb = testEnv.authenticatedContext('bob').firestore(); // Create as Alice await assertSucceeds( setDoc(doc(aliceDb, 'posts/test'), { title: 'Test', content: 'Hello', authorId: 'alice' }) ); // Bob cannot update await assertFails( updateDoc(doc(bobDb, 'posts/test'), { title: 'Hacked' }) ); }); }); ``` --- ## Authentication ### Email/Password Auth ```typescript import { createUserWithEmailAndPassword, signInWithEmailAndPassword, signOut, onAuthStateChanged, User } from 'firebase/auth'; import { auth } from './firebase'; // Sign up async function signUp(email: string, password: string) { const credential = await createUserWithEmailAndPassword(auth, email, password); return credential.user; } // Sign in async function signIn(email: string, password: string) { const credential = await signInWithEmailAndPassword(auth, email, password); return credential.user; } // Sign out async function logout() { await signOut(auth); } // Auth state listener function onAuthChange(callback: (user: User | null) => void) { 
return onAuthStateChanged(auth, callback); } ``` ### OAuth Providers ```typescript import { GoogleAuthProvider, signInWithPopup, signInWithRedirect } from 'firebase/auth'; const googleProvider = new GoogleAuthProvider(); async function signInWithGoogle() { try { const result = await signInWithPopup(auth, googleProvider); return result.user; } catch (error) { // Handle errors throw error; } } ``` --- ## Cloud Functions ### Basic HTTP Function ```typescript // functions/src/index.ts import { onRequest } from 'firebase-functions/v2/https'; import { onDocumentCreated } from 'firebase-functions/v2/firestore'; import { initializeApp } from 'firebase-admin/app'; import { getFirestore, FieldValue } from 'firebase-admin/firestore'; initializeApp(); const db = getFirestore(); // HTTP endpoint export const helloWorld = onRequest((request, response) => { response.json({ message: 'Hello from Firebase!' }); }); // Firestore trigger export const onPostCreated = onDocumentCreated('posts/{postId}', async (event) => { const snapshot = event.data; if (!snapshot) return; const post = snapshot.data(); // Update author's post count await db.doc(`users/${post.authorId}`).update({ postCount: FieldValue.increment(1) }); }); ``` ### Callable Functions ```typescript // Backend import { onCall, HttpsError } from 'firebase-functions/v2/https'; export const createPost = onCall(async (request) => { // Auth check if (!request.auth) { throw new HttpsError('unauthenticated', 'Must be logged in'); } const { title, content } = request.data; // Validation if (!title || title.length < 3) { throw new HttpsError('invalid-argument', 'Title must be at least 3 characters'); } // Create post const postRef = await db.collection('posts').add({ title, content, authorId: request.auth.uid, createdAt: FieldValue.serverTimestamp() }); return { postId: postRef.id }; }); // Frontend import { getFunctions, httpsCallable } from 'firebase/functions'; const functions = getFunctions(); const createPostFn = httpsCallable(functions, 
'createPost'); async function createPost(title: string, content: string) { const result = await createPostFn({ title, content }); return result.data as { postId: string }; } ``` --- ## Batch Operations & Transactions ### Batch Writes ```typescript import { writeBatch, doc } from 'firebase/firestore'; async function batchUpdate(updates: { id: string; data: any }[]) { const batch = writeBatch(db); updates.forEach(({ id, data }) => { batch.update(doc(db, 'posts', id), data); }); await batch.commit(); // Atomic } ``` ### Transactions ```typescript import { runTransaction, doc, increment, serverTimestamp } from 'firebase/firestore'; async function likePost(postId: string, userId: string) { await runTransaction(db, async (transaction) => { const postRef = doc(db, 'posts', postId); const likeRef = doc(db, 'posts', postId, 'likes', userId); const postSnap = await transaction.get(postRef); if (!postSnap.exists()) throw new Error('Post not found'); const likeSnap = await transaction.get(likeRef); if (likeSnap.exists()) throw new Error('Already liked'); transaction.set(likeRef, { createdAt: serverTimestamp() }); transaction.update(postRef, { likeCount: increment(1) }); }); } ``` --- ## Indexes ### Composite Indexes ```json // firestore.indexes.json { "indexes": [ { "collectionGroup": "posts", "queryScope": "COLLECTION", "fields": [ { "fieldPath": "authorId", "order": "ASCENDING" }, { "fieldPath": "createdAt", "order": "DESCENDING" } ] }, { "collectionGroup": "posts", "queryScope": "COLLECTION", "fields": [ { "fieldPath": "tags", "arrayConfig": "CONTAINS" }, { "fieldPath": "createdAt", "order": "DESCENDING" } ] } ] } ``` ```bash # Deploy indexes firebase deploy --only firestore:indexes ``` --- ## CLI Quick Reference ```bash # Project setup firebase login # Authenticate firebase init # Initialize project firebase projects:list # List projects # Emulators firebase emulators:start # Start all emulators firebase emulators:start --only firestore,auth # Specific emulators # Deploy firebase deploy # 
Deploy everything firebase deploy --only firestore # Deploy rules + indexes firebase deploy --only functions # Deploy functions firebase deploy --only hosting # Deploy hosting # Functions cd functions && npm run build # Build TypeScript firebase functions:log # View logs ``` --- ## Anti-Patterns - **No security rules** - Always write rules, never use test mode in production - **Deep nesting** - Keep documents flat, max 2-3 levels - **Large documents** - Max 1MB, split if larger - **Unbounded arrays** - Use subcollections for lists that grow - **No offline handling** - Enable persistence for mobile/PWA - **Reading all fields** - Use field masks or Firestore Lite - **Ignoring indexes** - Check console for missing index errors - **No emulator testing** - Always test rules before deploy ================================================ FILE: skills/flutter/SKILL.md ================================================ --- name: flutter description: Flutter development with Riverpod state management, Freezed, go_router, and mocktail testing when-to-use: When working on Flutter/Dart code user-invocable: false paths: ["**/*.dart", "pubspec.yaml", "lib/**", "test/**"] effort: medium --- # Flutter Skill --- ## Project Structure ``` project/ ├── lib/ │ ├── core/ # Core utilities │ │ ├── constants/ # App constants │ │ ├── extensions/ # Dart extensions │ │ ├── router/ # go_router configuration │ │ │ └── app_router.dart │ │ └── theme/ # App theme │ │ └── app_theme.dart │ ├── data/ # Data layer │ │ ├── models/ # Freezed data models │ │ ├── repositories/ # Repository implementations │ │ └── services/ # API services │ ├── domain/ # Domain layer │ │ ├── entities/ # Business entities │ │ └── repositories/ # Repository interfaces │ ├── presentation/ # UI layer │ │ ├── common/ # Shared widgets │ │ ├── features/ # Feature modules │ │ │ └── feature_name/ │ │ │ ├── providers/ # Riverpod providers │ │ │ ├── widgets/ # Feature-specific widgets │ │ │ └── feature_screen.dart │ │ └── providers/ # 
Global providers │ ├── main.dart │ └── app.dart ├── test/ │ ├── unit/ # Unit tests │ ├── widget/ # Widget tests │ └── integration/ # Integration tests ├── pubspec.yaml ├── analysis_options.yaml └── CLAUDE.md ``` --- ## Riverpod State Management ### Provider Types ```dart // Simple value provider final appNameProvider = Provider<String>((ref) => 'My App'); // StateProvider for simple mutable state final counterProvider = StateProvider<int>((ref) => 0); // NotifierProvider for complex state logic final userProvider = NotifierProvider<UserNotifier, User?>(() => UserNotifier()); // AsyncNotifierProvider for async operations final usersProvider = AsyncNotifierProvider<UsersNotifier, List<User>>( () => UsersNotifier(), ); // FutureProvider for simple async data final configProvider = FutureProvider<Config>((ref) async { return await ref.watch(configServiceProvider).loadConfig(); }); // StreamProvider for real-time data final messagesProvider = StreamProvider<List<Message>>((ref) { return ref.watch(messageServiceProvider).watchMessages(); }); // Family providers for parameterized data final userByIdProvider = FutureProvider.family<User, String>((ref, userId) async { return await ref.watch(userRepositoryProvider).getUser(userId); }); ``` ### Notifier Pattern ```dart @riverpod class Users extends _$Users { @override Future<List<User>> build() async { return await _fetchUsers(); } Future<List<User>> _fetchUsers() async { final repository = ref.read(userRepositoryProvider); return await repository.getUsers(); } Future<void> refresh() async { state = const AsyncLoading(); state = await AsyncValue.guard(() => _fetchUsers()); } Future<void> addUser(User user) async { final repository = ref.read(userRepositoryProvider); await repository.addUser(user); ref.invalidateSelf(); } } ``` ### AsyncValue Handling ```dart class UsersScreen extends ConsumerWidget { const UsersScreen({super.key}); @override Widget build(BuildContext context, WidgetRef ref) { final usersAsync = 
ref.watch(usersProvider); return usersAsync.when( data: (users) => UsersList(users: users), loading: () => const Center(child: CircularProgressIndicator()), error: (error, stack) => ErrorDisplay( error: error, onRetry: () => ref.invalidate(usersProvider), ), ); } } // Pattern matching alternative Widget build(BuildContext context, WidgetRef ref) { final usersAsync = ref.watch(usersProvider); return switch (usersAsync) { AsyncData(:final value) => UsersList(users: value), AsyncLoading() => const LoadingIndicator(), AsyncError(:final error) => ErrorDisplay(error: error), }; } ``` ### ref Methods ```dart // watch - rebuilds when provider changes final users = ref.watch(usersProvider); // read - one-time read, no rebuild void onButtonPressed() { ref.read(counterProvider.notifier).state++; } // listen - react to changes without rebuild ref.listen(authProvider, (previous, next) { if (next == null) { context.go('/login'); } }); // invalidate - force refresh ref.invalidate(usersProvider); // keepAlive - prevent auto-dispose final link = ref.keepAlive(); // Later: link.close() to allow disposal ``` --- ## Freezed Data Models ### Model Definition ```dart import 'package:freezed_annotation/freezed_annotation.dart'; part 'user.freezed.dart'; part 'user.g.dart'; @freezed class User with _$User { const factory User({ required String id, required String name, required String email, @Default(false) bool isActive, DateTime? 
createdAt, }) = _User; factory User.fromJson(Map<String, dynamic> json) => _$UserFromJson(json); } // Union types for states @freezed sealed class AuthState with _$AuthState { const factory AuthState.initial() = _Initial; const factory AuthState.loading() = _Loading; const factory AuthState.authenticated(User user) = _Authenticated; const factory AuthState.unauthenticated() = _Unauthenticated; const factory AuthState.error(String message) = _Error; } ``` ### Using Freezed Unions ```dart Widget build(BuildContext context, WidgetRef ref) { final authState = ref.watch(authProvider); return authState.when( initial: () => const SplashScreen(), loading: () => const LoadingScreen(), authenticated: (user) => HomeScreen(user: user), unauthenticated: () => const LoginScreen(), error: (message) => ErrorScreen(message: message), ); } ``` --- ## go_router Navigation ### Router Configuration ```dart final routerProvider = Provider<GoRouter>((ref) { final authState = ref.watch(authProvider); return GoRouter( initialLocation: '/', redirect: (context, state) { final isLoggedIn = authState.valueOrNull != null; final isLoggingIn = state.matchedLocation == '/login'; if (!isLoggedIn && !isLoggingIn) return '/login'; if (isLoggedIn && isLoggingIn) return '/'; return null; }, routes: [ GoRoute( path: '/', builder: (context, state) => const HomeScreen(), routes: [ GoRoute( path: 'user/:id', builder: (context, state) => UserScreen( userId: state.pathParameters['id']!, ), ), ], ), GoRoute( path: '/login', builder: (context, state) => const LoginScreen(), ), ], errorBuilder: (context, state) => ErrorScreen(error: state.error), ); }); ``` ### Navigation ```dart // Navigate to route context.go('/user/123'); // Push onto stack context.push('/user/123'); // Pop current route context.pop(); // Replace current route context.pushReplacement('/home'); // Named routes context.goNamed('user', pathParameters: {'id': '123'}); ``` --- ## Widget Patterns ### ConsumerWidget vs 
ConsumerStatefulWidget ```dart // Stateless with Riverpod class UserCard extends ConsumerWidget { const UserCard({super.key, required this.userId}); final String userId; @override Widget build(BuildContext context, WidgetRef ref) { final user = ref.watch(userByIdProvider(userId)); return user.when( data: (user) => Card(child: Text(user.name)), loading: () => const CardSkeleton(), error: (e, _) => ErrorCard(error: e), ); } } // Stateful with Riverpod class SearchScreen extends ConsumerStatefulWidget { const SearchScreen({super.key}); @override ConsumerState<SearchScreen> createState() => _SearchScreenState(); } class _SearchScreenState extends ConsumerState<SearchScreen> { final _controller = TextEditingController(); @override void dispose() { _controller.dispose(); super.dispose(); } @override Widget build(BuildContext context) { final results = ref.watch(searchProvider(_controller.text)); return Column( children: [ TextField( controller: _controller, onChanged: (_) => setState(() {}), ), Expanded(child: SearchResults(results: results)), ], ); } } ``` ### HookConsumerWidget (with flutter_hooks) ```dart class AnimatedCounter extends HookConsumerWidget { const AnimatedCounter({super.key}); @override Widget build(BuildContext context, WidgetRef ref) { final controller = useAnimationController(duration: const Duration(milliseconds: 300)); final count = ref.watch(counterProvider); useEffect(() { controller.forward(from: 0); return null; }, [count]); return ScaleTransition( scale: controller, child: Text('$count'), ); } } ``` --- ## Testing with Mocktail ### Unit Tests ```dart import 'package:flutter_test/flutter_test.dart'; import 'package:mocktail/mocktail.dart'; import 'package:riverpod/riverpod.dart'; class MockUserRepository extends Mock implements UserRepository {} void main() { late MockUserRepository mockRepository; late ProviderContainer container; setUp(() { mockRepository = MockUserRepository(); container = ProviderContainer( overrides: [ 
userRepositoryProvider.overrideWithValue(mockRepository), ], ); }); tearDown(() { container.dispose(); }); test('usersProvider returns list of users', () async { final users = [User(id: '1', name: 'John', email: 'john@example.com')]; when(() => mockRepository.getUsers()).thenAnswer((_) async => users); final result = await container.read(usersProvider.future); expect(result, equals(users)); verify(() => mockRepository.getUsers()).called(1); }); } ``` ### Widget Tests ```dart void main() { testWidgets('UserCard displays user name', (tester) async { final user = User(id: '1', name: 'John', email: 'john@example.com'); await tester.pumpWidget( ProviderScope( overrides: [ userByIdProvider('1').overrideWith((_) => AsyncData(user)), ], child: const MaterialApp(home: UserCard(userId: '1')), ), ); expect(find.text('John'), findsOneWidget); }); testWidgets('UserCard shows loading indicator', (tester) async { await tester.pumpWidget( ProviderScope( overrides: [ userByIdProvider('1').overrideWith((_) => const AsyncLoading()), ], child: const MaterialApp(home: UserCard(userId: '1')), ), ); expect(find.byType(CardSkeleton), findsOneWidget); }); } ``` --- ## pubspec.yaml ```yaml name: my_app description: A Flutter application publish_to: 'none' version: 1.0.0+1 environment: sdk: '>=3.2.0 <4.0.0' dependencies: flutter: sdk: flutter # State management flutter_riverpod: ^2.4.9 riverpod_annotation: ^2.3.3 # Data models freezed_annotation: ^2.4.1 json_annotation: ^4.8.1 # Navigation go_router: ^13.0.0 # Networking dio: ^5.4.0 # Storage shared_preferences: ^2.2.2 # Utils intl: ^0.19.0 dev_dependencies: flutter_test: sdk: flutter # Code generation build_runner: ^2.4.8 freezed: ^2.4.6 json_serializable: ^6.7.1 riverpod_generator: ^2.3.9 # Testing mocktail: ^1.0.2 # Linting flutter_lints: ^3.0.1 ``` --- ## GitHub Actions ```yaml name: Flutter CI on: push: branches: [main] pull_request: branches: [main] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - 
uses: subosito/flutter-action@v2 with: flutter-version: '3.16.0' channel: 'stable' cache: true - name: Install dependencies run: flutter pub get - name: Generate code run: dart run build_runner build --delete-conflicting-outputs - name: Analyze run: flutter analyze --fatal-infos - name: Run tests run: flutter test --coverage - name: Build APK run: flutter build apk --release ``` --- ## analysis_options.yaml ```yaml include: package:flutter_lints/flutter.yaml analyzer: exclude: - "**/*.g.dart" - "**/*.freezed.dart" errors: invalid_annotation_target: ignore language: strict-casts: true strict-inference: true strict-raw-types: true linter: rules: - always_declare_return_types - avoid_dynamic_calls - avoid_print - avoid_type_to_string - cancel_subscriptions - close_sinks - prefer_const_constructors - prefer_const_declarations - prefer_final_locals - require_trailing_commas - unawaited_futures - use_super_parameters ``` --- ## Flutter Anti-Patterns - ❌ **Provider without autoDispose** - Use `.autoDispose` to prevent memory leaks - ❌ **watch in callbacks** - Use `ref.read()` in onPressed/callbacks, not `ref.watch()` - ❌ **Business logic in widgets** - Move to Notifiers/providers - ❌ **Mutable state in providers** - Use Freezed for immutable models - ❌ **Not using AsyncValue** - Handle loading/error states with `when()` - ❌ **setState with Riverpod** - Use providers for shared state - ❌ **Passing ref to functions** - Keep ref usage within widgets/providers - ❌ **Deeply nested Consumer** - Use ConsumerWidget instead - ❌ **Not using family for params** - Use `.family` for parameterized providers - ❌ **Global GoRouter instance** - Use Provider for router with redirect logic - ❌ **BuildContext across async** - Store values before await, not context - ❌ **Ignoring dispose** - Clean up controllers in ConsumerStatefulWidget ================================================ FILE: skills/gemini-review/SKILL.md ================================================ --- name: gemini-review 
description: Google Gemini CLI code review with Gemini 2.5 Pro, 1M token context, CI/CD integration when-to-use: When user requests Gemini-powered code review or needs large-context review user-invocable: true effort: medium --- # Google Gemini Code Review Skill Use Google's Gemini CLI for code review with Gemini 2.5 Pro - featuring a massive 1M token context window that can analyze entire repositories at once. **Sources:** [Gemini CLI](https://github.com/google-gemini/gemini-cli) | [Code Review Extension](https://github.com/gemini-cli-extensions/code-review) | [Gemini Code Assist](https://codeassist.google/) | [GitHub Action](https://github.com/google-github-actions/run-gemini-cli) --- ## Why Gemini for Code Review? | Feature | Benefit | |---------|---------| | **Gemini 2.5 Pro** | State-of-the-art reasoning for code | | **1M token context** | Entire repositories fit - no chunking needed | | **Free tier** | 1,000 requests/day with Google account | | **Consistent output** | Clean formatting, predictable structure | | **GitHub native** | Gemini Code Assist app for auto PR reviews | ### Benchmark Performance | Benchmark | Score | Notes | |-----------|-------|-------| | SWE-Bench Verified | 63.8% | Agentic coding benchmark | | Qodo PR Benchmark | 56.3% | PR review quality | | LiveCodeBench v5 | 70.4% | Code generation | | WebDev Arena | #1 | Web development | --- ## Installation ### Prerequisites ```bash # Check Node.js version (requires 20+) node --version # Install Node.js 20 if needed # macOS brew install node@20 # Or via nvm nvm install 20 nvm use 20 ``` ### Install Gemini CLI ```bash # Via npm (recommended) npm install -g @google/gemini-cli # Via Homebrew (macOS) brew install gemini-cli # Or run without installing npx @google/gemini-cli # Verify installation gemini --version ``` ### Install Code Review Extension ```bash # Requires Gemini CLI v0.4.0+ gemini extensions install https://github.com/gemini-cli-extensions/code-review # Verify extension gemini extensions 
list ``` --- ## Authentication ### Option 1: Google Account (Recommended) **Free tier: 1,000 requests/day, 60 requests/min** ```bash # Run gemini and follow browser login gemini # Select: "Login with Google Account" # Opens browser for OAuth ``` This gives you access to Gemini 2.5 Pro with the full 1M token context window. ### Option 2: Gemini API Key **Free tier: 100 requests/day** ```bash # Get API key from https://aistudio.google.com/apikey # Set environment variable export GEMINI_API_KEY="your-api-key" # Or add to shell profile echo 'export GEMINI_API_KEY="your-api-key"' >> ~/.zshrc # Run Gemini gemini ``` ### Option 3: Vertex AI (Enterprise) ```bash # For Google Cloud projects export GOOGLE_API_KEY="your-api-key" export GOOGLE_GENAI_USE_VERTEXAI=true export GOOGLE_CLOUD_PROJECT="your-project-id" gemini ``` --- ## Interactive Code Review ### Using the Code Review Extension ```bash # Start Gemini CLI gemini # Run code review on current branch /code-review ``` The extension analyzes: - Code changes on your current branch - Identifies quality issues - Suggests fixes ### Manual Review Prompts ```bash # In interactive mode gemini # Then ask: > Review the changes in this branch for bugs and security issues > Analyze src/api/users.ts for potential vulnerabilities > What are the code quality issues in the last 3 commits? ``` --- ## Headless Mode (Automation) ### Basic Usage ```bash # Simple prompt execution gemini -p "Review the code changes for bugs and security issues" # With JSON output (for parsing) gemini -p "Review the changes" --output-format json # Stream JSON events (real-time) gemini -p "Review and fix issues" --output-format stream-json # Specify model gemini -m gemini-2.5-pro -p "Deep code review of this PR" ``` ### Full CI/CD Example ```bash # Get diff and review git diff origin/main...HEAD > diff.txt gemini -p "Review this code diff for: 1. Security vulnerabilities 2. Performance issues 3. Code quality problems 4. 
Missing error handling Diff: $(cat diff.txt) " --output-format json > review.json ``` ### Session Tracking ```bash # Track token usage and costs gemini -p "Review changes" --session-summary metrics.json # View metrics cat metrics.json ``` --- ## GitHub Integration ### Option 1: Gemini Code Assist App (Easiest) Install from [GitHub Marketplace](https://github.com/marketplace/gemini-code-assist): 1. Go to GitHub Marketplace → Gemini Code Assist 2. Click "Install" and select repositories 3. PRs automatically get reviewed when opened **Commands in PR comments:** ``` /gemini review # Request code review /gemini summary # Get PR summary /gemini help # Show available commands ``` **Quota:** - Free: 33 PRs/day - Enterprise: 100+ PRs/day ### Option 2: GitHub Action ```yaml # .github/workflows/gemini-review.yml name: Gemini Code Review on: pull_request: types: [opened, synchronize] jobs: review: runs-on: ubuntu-latest permissions: contents: read pull-requests: write steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: '20' - name: Install Gemini CLI run: npm install -g @google/gemini-cli - name: Run Review env: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} run: | # Get diff git diff origin/${{ github.base_ref }}...HEAD > diff.txt # Run Gemini review gemini -p "Review this pull request diff for bugs, security issues, and code quality problems. Be specific about file names and line numbers. 
$(cat diff.txt)" > review.md - name: Post Review Comment uses: actions/github-script@v7 with: script: | const fs = require('fs'); const review = fs.readFileSync('review.md', 'utf8'); github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body: `## 🤖 Gemini Code Review\n\n${review}` }); ``` ### Option 3: Official GitHub Action ```yaml # .github/workflows/gemini-review.yml name: Gemini Code Review on: pull_request: types: [opened, synchronize] issue_comment: types: [created] jobs: review: runs-on: ubuntu-latest permissions: contents: read pull-requests: write issues: write steps: - uses: actions/checkout@v4 - name: Run Gemini CLI uses: google-github-actions/run-gemini-cli@v1 with: gemini_api_key: ${{ secrets.GEMINI_API_KEY }} prompt: "Review this pull request for code quality, security issues, and potential bugs." ``` **On-demand commands in comments:** ``` @gemini-cli /review @gemini-cli explain this code change @gemini-cli write unit tests for this component ``` --- ## GitLab CI/CD ```yaml # .gitlab-ci.yml gemini-review: image: node:20 stage: review script: - npm install -g @google/gemini-cli - | gemini -p "Review the merge request changes for bugs, security issues, and code quality" > review.md - cat review.md artifacts: paths: - review.md rules: - if: $CI_PIPELINE_SOURCE == "merge_request_event" variables: GEMINI_API_KEY: $GEMINI_API_KEY ``` --- ## Configuration ### Global Config ```bash # ~/.gemini/settings.json { "model": "gemini-2.5-pro", "theme": "dark", "sandbox": true } ``` ### Project Config (GEMINI.md) Create a `GEMINI.md` file in your project root for project-specific context: ```markdown # Project Context for Gemini ## Tech Stack - TypeScript with strict mode - React 18 with hooks - FastAPI backend - PostgreSQL database ## Code Review Focus Areas 1. Type safety - ensure proper TypeScript types 2. React hooks rules - check for dependency array issues 3. 
SQL injection - verify parameterized queries 4. Authentication - check all endpoints have proper auth ## Conventions - Use camelCase for variables - Use PascalCase for components - All API errors should use AppError class ``` --- ## CLI Quick Reference ```bash # Interactive gemini # Start interactive mode /code-review # Run code review extension # Headless gemini -p "prompt" # Single prompt, exit gemini -p "prompt" --output-format json # JSON output gemini -m gemini-2.5-flash -p "prompt" # Use faster model # Extensions gemini extensions list # List installed gemini extensions install URL # Install extension gemini extensions update # Update all # Key Flags --output-format json # Structured output --output-format stream-json # Real-time events --session-summary FILE # Track metrics -m MODEL # Select model ``` --- ## Comparison: Claude vs Codex vs Gemini | Aspect | Claude | Codex CLI | Gemini CLI | |--------|--------|-----------|------------| | **Setup** | None (built-in) | npm + OpenAI API | npm + Google Account | | **Model** | Claude | GPT-5.2-Codex | Gemini 2.5 Pro | | **Context** | Conversation | Fresh per review | 1M tokens (huge!) 
| | **Free Tier** | N/A | Limited | 1,000/day | | **Best For** | Quick reviews | High accuracy | Large codebases | | **GitHub Native** | No | @codex | Gemini Code Assist | ### When to Use Each | Scenario | Recommended Engine | |----------|-------------------| | Quick in-flow review | Claude | | Critical security review | Codex (88% detection) | | Large codebase (100+ files) | Gemini (1M context) | | Free automated reviews | Gemini | | Multiple perspectives | All three (dual/triple engine) | --- ## Troubleshooting | Issue | Solution | |-------|----------| | `gemini: command not found` | `npm install -g @google/gemini-cli` | | `Node.js version error` | Upgrade to Node.js 20+ | | `Authentication failed` | Re-run `gemini` and login again | | `Extension not found` | `gemini extensions install https://github.com/gemini-cli-extensions/code-review` | | `Rate limited` | Wait or upgrade to Vertex AI | | `Hangs in CI` | Ensure `DEBUG` env var is not set | --- ## Anti-Patterns - **Skipping authentication setup** - Always configure before CI/CD - **Using API key in logs** - Use secrets management - **Ignoring context limits** - Even 1M tokens has limits for huge monorepos - **Running on every commit** - Use on PRs only to save quota - **Not setting project context** - Add GEMINI.md for better reviews ================================================ FILE: skills/icpg/SKILL.md ================================================ --- name: icpg description: Intent-Augmented Code Property Graph — tracks WHY code exists via ReasonNodes with formal contracts, 6-dimension drift detection, and 3 canonical pre-task queries for autonomous development when-to-use: "Before any code change — query the reason graph for intent, constraints, and risk" user-invocable: false effort: high --- # iCPG Skill (Intent-Augmented Code Property Graph) **Purpose:** Add a Reason Graph layer on top of code structure so every function, class, and module is traceable to the goal that created it, the agent or 
human that owns it, and whether it's still doing what it was supposed to do. ``` ┌────────────────────────────────────────────────────────────────┐ │ iCPG = AST + CFG + PDG + RG (Reason Graph) │ │ ─────────────────────────────────────────────────────────────│ │ AST = Abstract Syntax Tree (structure) ← existing │ │ CFG = Control Flow Graph (execution paths) ← existing │ │ PDG = Program Dependency Graph ← existing │ │ RG = Reason Graph (WHY layer) ← THIS SKILL │ │ │ │ The RG stores ReasonNodes (goals/tasks), links them to code │ │ symbols via typed edges, enforces contracts (DbC), and │ │ detects when code drifts from its original purpose. │ │ │ │ Storage: .icpg/reason.db (SQLite, per-project, gitignored) │ │ CLI: icpg init | create | record | query | drift | bootstrap │ └────────────────────────────────────────────────────────────────┘ ``` --- ## Core Principle **Intent first, code second.** Before writing or modifying code, query the reason graph to understand WHY existing code was written, WHAT constraints it must preserve, and WHETHER your change duplicates prior work. --- ## The 3 Canonical Pre-Task Queries **Every agent MUST run these before writing code:** | # | Query | Command | What It Answers | |---|-------|---------|-----------------| | 1 | **search_prior_work** | `icpg query prior "<goal>"` | Has this been attempted before? Prevents duplication. | | 2 | **get_constraints** | `icpg query constraints <file>` | What invariants apply to files I'll touch? Prevents breakage. | | 3 | **get_risk_profile** | `icpg query risk <symbol>` | Is this symbol fragile? Drift history, ownership changes. 
| --- ## ReasonNode — The Core Primitive Each ReasonNode captures a stated purpose with a formal contract: ``` id UUID goal Natural language: what is this trying to achieve decision_type business_goal | arch_decision | task | workaround | constraint | patch scope Files/modules expected to be touched owner Human or agent accountable status proposed | executing | fulfilled | drifted | abandoned source manual | commit | inferred | agent-session FORMAL CONTRACT (Design by Contract): preconditions What must be true before this intent executes postconditions What must be true when fulfilled invariants What must remain true throughout and after ``` **Drift = predicate failure.** A symbol has drifted when its current behavior no longer satisfies the postconditions of the ReasonNode that created it, or when an invariant is violated. --- ## Six Edge Types ``` CREATES Reason → Symbol (this intent created this function) MODIFIES Reason → Symbol (this intent changed this function) REQUIRES Reason → Reason (B depends on A being done first) DUPLICATES Reason → Reason (these two goals overlap) VALIDATED_BY Reason → Test (this test proves the intent was satisfied) DRIFTS_FROM Symbol → Reason (this symbol no longer does what it was made for) ``` --- ## 6-Dimension Drift Model | Dimension | What It Means | Detection | |-----------|--------------|-----------| | **Spec drift** | Symbol checksum changed without a MODIFIES edge | Compare stored vs current checksum | | **Decision drift** | Postconditions no longer hold | Evaluate predicates against codebase | | **Ownership drift** | >3 different owners without coherent oversight | Count unique owners on edges | | **Test drift** | VALIDATED_BY tests missing or failing | Check test file existence + run | | **Usage drift** | Symbol used outside original scope | Grep for imports beyond scope | | **Dependency drift** | Downstream REQUIRES reasons have drifted | Traverse REQUIRES edges | Run `icpg drift check` to scan all dimensions. 
Each produces a 0-1 severity score. --- ## CLI Reference ### Setup ```bash icpg init # Create .icpg/ and database icpg bootstrap --days 90 # Infer ReasonNodes from git history icpg bootstrap --days 90 --no-llm # Without LLM (commit-message only) ``` ### Create & Record ```bash icpg create "Add JWT auth" --scope src/auth/ --owner feature-auth --type task icpg record --reason <id> --base main # Record symbols from git diff icpg record --reason <id> --edge-type MODIFIES # Record as modifications ``` ### Query (the 3 canonical queries) ```bash icpg query prior "user authentication" # 1. Duplicate detection icpg query constraints src/auth/service.ts # 2. Invariants for file icpg query risk validateToken # 3. Symbol risk profile icpg query context src/auth/service.ts # All intents for a file icpg query blast <reason-id> # Full blast radius ``` ### Drift ```bash icpg drift check # Full scan across all dimensions icpg drift resolve <id> # Mark drift event resolved ``` ### Status ```bash icpg status # Stats: reasons, symbols, edges, drift ``` --- ## Storage Per-project, gitignored, zero infrastructure: ``` .icpg/ reason.db SQLite database (4 tables: reasons, symbols, edges, drift_events) .gitignore Contains: * chroma/ ChromaDB vectors (if chromadb installed) tfidf_cache.json TF-IDF fallback cache .current-intent Marker file for active intent (used by Stop hook) ``` Install options: ```bash pip install ./scripts/icpg # Core (zero deps) pip install "./scripts/icpg[vectors]" # + ChromaDB for duplicate detection pip install "./scripts/icpg[all]" # + ChromaDB + scikit-learn + openai ``` --- ## Workflow: Before Any Code Change ``` 0. INTENT → icpg create (or identify existing intent) 1. DEDUP → icpg query prior (check for duplicate work) 2. CONSTRAINTS → icpg query constraints (understand invariants) 3. RISK → icpg query risk (check fragile symbols) 4. LOCATE → search_graph to find symbols (code-graph skill) 5. CHANGE → Make the edit (PreToolUse hook shows context) 6. 
RECORD → icpg record (link symbols to intent) 7. DRIFT CHECK → icpg drift check (verify no unintended drift) 8. VERIFY → Run tests, lint, typecheck ``` **Step 0 is non-negotiable for autonomous agents.** Every change must be linked to a stated purpose. Without an intent, there's nothing to measure drift against. --- ## Hook Integration ### PreToolUse Hook (automatic context injection) Add to `.claude/settings.json`: ```json { "hooks": { "PreToolUse": [{ "matcher": "Edit|Write", "hooks": [{ "type": "command", "command": "scripts/icpg-pre-edit.sh", "timeout": 3, "statusMessage": "Checking intent context..." }] }] } } ``` Before every file edit, agents see: ``` ═══ iCPG CONTEXT ═══ INTENTS for src/auth/service.ts: [>] a1b2c3d4 — User authentication with JWT tokens Owner: feature-auth | Status: executing Invariants: 2 CONSTRAINTS for src/auth/service.ts: From intent: User authentication with JWT tokens INV: file_exists("src/auth/middleware.ts") POST: test_exists("src/auth/__tests__/service.test.ts") PRESERVE function signatures unless your task requires changing them. ═══════════════════ ``` ### Stop Hook (automatic symbol recording) After implementation passes tests, auto-records symbols: ```json { "hooks": { "Stop": [{ "hooks": [ {"type": "command", "command": "scripts/tdd-loop-check.sh", "timeout": 60}, {"type": "command", "command": "scripts/icpg-stop-record.sh", "timeout": 5} ] }] } } ``` --- ## Agent Teams Integration ### Updated Pipeline (agent-teams + iCPG) ``` 0. INTENT Team lead creates ReasonNode from feature spec 0b. DEDUP icpg query prior — check for duplicate intents 1. SPEC Feature agent writes spec 2. SPEC-REVIEW Quality agent reviews spec + intent alignment 3. TESTS (RED) Feature agent writes tests 4. RED-VERIFY Quality agent verifies tests fail 5. IMPLEMENT Feature agent codes (PreEdit hook shows context) 5b. RECORD Auto-record symbols → intent (Stop hook) 5c. DRIFT-CHECK Quality agent verifies no scope drift 6. 
GREEN-VERIFY Quality agent verifies tests pass + coverage 7. VALIDATE Lint + typecheck + full suite 8. CODE-REVIEW Review agent (sees intent context per file) 9. SECURITY Security agent 10. BRANCH-PR Merger agent (PR includes intent traceability) ``` ### Agent Responsibilities | Agent | iCPG Action | |-------|-------------| | **Team Lead** | `icpg create` when creating task chains. `icpg query prior` to check duplicates. | | **Feature Agent** | `icpg query constraints` before implementing. Writes `.icpg/.current-intent` for auto-recording. | | **Quality Agent** | `icpg drift check` during GREEN verify. Verifies scope alignment. | | **Review Agent** | Sees intent context via PreToolUse hook when reviewing files. | | **Merger Agent** | Includes intent traceability in PR description. | --- ## Bootstrapping from Git History For existing codebases, infer ReasonNodes from commit history: ```bash icpg bootstrap --days 90 --verbose ``` This will: 1. Get commits from last 90 days 2. Cluster by temporal proximity (2-hour window) 3. Infer intent via LLM (Claude or OpenAI) or commit message parsing 4. Create ReasonNodes with `source: "inferred"`, `confidence: 0.6-0.8` 5. Extract symbols from changed files, create CREATES edges 6. Run duplicate detection against existing ReasonNodes **Quality note:** Inferred intents are marked low-confidence. Review and promote high-value ones manually. 
--- ## Contract Predicates Predicates are structured assertions over codebase state: ``` file_exists("src/auth/middleware.ts") test_exists("src/auth/__tests__/service.test.ts") symbol_count("src/auth/") <= 15 function_signature("validateToken") == "(token: string) => Promise<User>" ``` Contracts can be: - **Hand-authored** for high-risk ReasonNodes - **LLM-inferred** via `icpg create --infer-contracts` - **Heuristic** (scope → file_exists, test → test_exists) --- ## Anti-Patterns | Anti-Pattern | Do This Instead | |-------------|-----------------| | Coding without stating intent | `icpg create` before every non-trivial change | | Assuming your change is isolated | `icpg query constraints` + `icpg query risk` first | | Rebuilding what already exists | `icpg query prior` to check for prior work | | Leaving intent in 'executing' forever | Update status to 'fulfilled' when done | | Ignoring drift events | `icpg drift check` weekly, resolve or create new intents | | Storing full source in symbols | Store signature + checksum only — read source from files | | Skipping bootstrap on existing repos | `icpg bootstrap --days 90` to build initial graph | ================================================ FILE: skills/iterative-development/SKILL.md ================================================ --- name: iterative-development description: TDD iteration loops using Claude Code Stop hooks - runs tests after each response, feeds failures back automatically when-to-use: When setting up or configuring TDD loops via Stop hooks user-invocable: false effort: medium --- # Iterative Development Skill (Stop Hook TDD Loops) **Concept:** Claude Code's Stop hook fires right before Claude finishes a response. Exit code 2 feeds stderr back to the model and continues the conversation. This creates a real TDD loop without any plugins. --- ## How It Actually Works Claude Code has a **Stop hook** that runs when Claude is about to conclude its response. 
If the hook script exits with code 2, its stderr is shown to the model and the conversation continues automatically. ``` ┌─────────────────────────────────────────────────────────────┐ │ 1. User asks Claude to implement a feature │ ├─────────────────────────────────────────────────────────────┤ │ 2. Claude writes tests + implementation │ ├─────────────────────────────────────────────────────────────┤ │ 3. Claude finishes its response │ ├─────────────────────────────────────────────────────────────┤ │ 4. Stop hook runs: executes tests, lint, typecheck │ ├─────────────────────────────────────────────────────────────┤ │ 5a. All pass (exit 0) → Claude stops, work is done │ │ 5b. Failures (exit 2) → stderr fed back to Claude │ ├─────────────────────────────────────────────────────────────┤ │ 6. Claude sees failures, fixes code, response ends │ ├─────────────────────────────────────────────────────────────┤ │ 7. Stop hook runs again → repeat until green or max tries │ └─────────────────────────────────────────────────────────────┘ ``` **Key insight:** No fake plugins, no `/ralph-loop` command. The hook is real Claude Code infrastructure that runs automatically. --- ## Setup: Stop Hook Configuration Add this to your project's `.claude/settings.json`: ```json { "hooks": { "Stop": [ { "hooks": [ { "type": "command", "command": "scripts/tdd-loop-check.sh", "timeout": 60, "statusMessage": "Running tests..." 
} ] } ] } } ``` ### The TDD Loop Check Script Create `scripts/tdd-loop-check.sh` in your project: ```bash #!/bin/bash # TDD Loop Check - runs after each Claude response # Exit 0 = all good, Claude stops # Exit 2 = failures, stderr fed back to Claude to fix MAX_ITERATIONS=25 ITERATION_FILE=".claude/.tdd-iteration-count" # Track iteration count if [ -f "$ITERATION_FILE" ]; then count=$(cat "$ITERATION_FILE") count=$((count + 1)) else count=1 fi echo "$count" > "$ITERATION_FILE" # Safety: stop after max iterations if [ "$count" -ge "$MAX_ITERATIONS" ]; then rm -f "$ITERATION_FILE" echo "Max iterations ($MAX_ITERATIONS) reached. Stopping loop." >&2 exit 0 fi # Skip if no test files exist yet if ! find . -name "*.test.*" -o -name "*.spec.*" -o -name "test_*" 2>/dev/null | grep -q .; then rm -f "$ITERATION_FILE" exit 0 fi # Run tests TEST_OUTPUT=$(npm test 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Tests failing:" >&2 echo "$TEST_OUTPUT" | tail -30 >&2 echo "" >&2 echo "Fix the failing tests and try again." >&2 exit 2 } # Run lint (if configured) if [ -f "package.json" ] && grep -q '"lint"' package.json; then LINT_OUTPUT=$(npm run lint 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Lint errors:" >&2 echo "$LINT_OUTPUT" | tail -20 >&2 echo "" >&2 echo "Fix lint errors and try again." >&2 exit 2 } fi # Run typecheck (if configured) if [ -f "tsconfig.json" ]; then TYPE_OUTPUT=$(npx tsc --noEmit 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Type errors:" >&2 echo "$TYPE_OUTPUT" | tail -20 >&2 echo "" >&2 echo "Fix type errors and try again." 
>&2 exit 2 } fi # All green - reset counter and let Claude stop rm -f "$ITERATION_FILE" exit 0 ``` ### Python Variant ```bash #!/bin/bash # Python TDD Loop Check MAX_ITERATIONS=25 ITERATION_FILE=".claude/.tdd-iteration-count" if [ -f "$ITERATION_FILE" ]; then count=$(cat "$ITERATION_FILE") count=$((count + 1)) else count=1 fi echo "$count" > "$ITERATION_FILE" if [ "$count" -ge "$MAX_ITERATIONS" ]; then rm -f "$ITERATION_FILE" echo "Max iterations ($MAX_ITERATIONS) reached." >&2 exit 0 fi if ! find . -name "test_*" -o -name "*_test.py" 2>/dev/null | grep -q .; then rm -f "$ITERATION_FILE" exit 0 fi TEST_OUTPUT=$(pytest -v 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Tests failing:" >&2 echo "$TEST_OUTPUT" | tail -30 >&2 exit 2 } if command -v ruff &>/dev/null; then LINT_OUTPUT=$(ruff check . 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Lint errors:" >&2 echo "$LINT_OUTPUT" | tail -20 >&2 exit 2 } fi if command -v mypy &>/dev/null; then TYPE_OUTPUT=$(mypy . 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Type errors:" >&2 echo "$TYPE_OUTPUT" | tail -20 >&2 exit 2 } fi rm -f "$ITERATION_FILE" exit 0 ``` --- ## Additional Hooks for Quality Enforcement ### PreToolUse Hook: Lint Before File Writes Runs a linter before any Write/Edit lands: ```json { "hooks": { "PreToolUse": [ { "matcher": "Write|Edit", "hooks": [ { "type": "command", "command": "scripts/pre-write-lint.sh", "timeout": 10, "statusMessage": "Checking code quality..." } ] } ] } } ``` ### SessionStart Hook: Auto-Inject Context Runs at session start to inject project info: ```json { "hooks": { "SessionStart": [ { "hooks": [ { "type": "command", "command": "echo 'TDD loop active. Tests run automatically after each response. Fix failures to continue.'", "statusMessage": "Loading project context..." 
} ] } ] } } ``` --- ## Core Philosophy ``` ┌─────────────────────────────────────────────────────────────┐ │ ITERATION > PERFECTION │ │ ───────────────────────────────────────────────────────── │ │ Don't aim for perfect on first try. │ │ Let the loop refine the work. Each iteration builds on │ │ previous attempts visible in files and git history. │ ├─────────────────────────────────────────────────────────────┤ │ FAILURES ARE DATA │ │ ───────────────────────────────────────────────────────── │ │ Failed tests, lint errors, type mismatches are signals. │ │ The Stop hook feeds them directly to Claude as context. │ ├─────────────────────────────────────────────────────────────┤ │ CLEAR COMPLETION CRITERIA │ │ ───────────────────────────────────────────────────────── │ │ The hook defines "done": tests pass, lint clean, types ok. │ │ No ambiguity about when to stop. │ └─────────────────────────────────────────────────────────────┘ ``` --- ## Error Classification Not all failures should loop. The hook script should distinguish: | Type | Examples | Action | |------|----------|--------| | **Code Error** | Logic bug, wrong assertion, type mismatch | Exit 2 → loop continues | | **Access Error** | Missing API key, DB connection refused | Exit 0 → stop, report to user | | **Environment Error** | Missing package, wrong runtime version | Exit 0 → stop, report to user | The sample scripts above do not make this distinction yet: they exit 2 whenever the test, lint, or typecheck command returns non-zero, so a missing package or API key will also keep the loop going until `MAX_ITERATIONS` is reached. Extend the script to detect access/environment failures in the captured output and exit 0 for those.
--- ## When to Use TDD Loops ### Good For | Use Case | Why | |----------|-----| | Feature development | Tests provide clear pass/fail signal | | Bug fixes | Write failing test, fix, loop until green | | Refactoring | Existing tests catch regressions | | API development | Each endpoint independently testable | ### Not Good For | Use Case | Why | |----------|-----| | UI/UX work | Requires human judgment | | One-shot operations | No iteration needed | | Unclear requirements | No clear "done" criteria | | Subjective design | No objective success metric | --- ## Disabling the Loop To temporarily disable the TDD loop for a session: 1. Remove or rename the Stop hook in `.claude/settings.json` 2. Or set `MAX_ITERATIONS=1` in the script 3. Or delete `scripts/tdd-loop-check.sh` The hook only fires if the script exists and is configured. --- ## Gitignore Additions ```gitignore # TDD loop state .claude/.tdd-iteration-count ``` ================================================ FILE: skills/klaviyo/SKILL.md ================================================ --- name: klaviyo description: Klaviyo email/SMS marketing - profiles, events, flows, segmentation when-to-use: When integrating Klaviyo for email/SMS marketing user-invocable: false effort: medium --- # Klaviyo E-Commerce Marketing Skill For integrating Klaviyo email/SMS marketing - customer profiles, event tracking, campaigns, flows, and segmentation. 
**Sources:** [Klaviyo API Docs](https://developers.klaviyo.com/en/docs) | [API Reference](https://developers.klaviyo.com/en/reference/api-overview) --- ## Why Klaviyo | Feature | Benefit | |---------|---------| | **E-commerce Native** | Built for online stores, deep integrations | | **Event-Based** | Trigger flows from any customer action | | **Segmentation** | Advanced filtering on behavior + properties | | **Email + SMS** | Unified platform for both channels | | **Analytics** | Revenue attribution per campaign | --- ## API Basics ### Base URLs | Type | URL | |------|-----| | Server-side (Private) | `https://a.klaviyo.com/api` | | Client-side (Public) | `https://a.klaviyo.com/client` | ### Authentication ```typescript // Server-side: Private API Key const headers = { "Authorization": "Klaviyo-API-Key pk_xxxxxxxxxxxxxxxxxxxxxxxx", "Content-Type": "application/json", "revision": "2024-10-15", // API version }; // Client-side: Public API Key (6 characters) const publicKey = "XXXXXX"; // Company ID // Use as query param: ?company_id=XXXXXX ``` ### API Key Scopes | Scope | Access | |-------|--------| | Read-only | View data only | | Full | Read + write (default) | | Custom | Specific permissions | --- ## Installation ### Node.js ```bash npm install klaviyo-api ``` ```typescript // lib/klaviyo.ts import { ApiClient, EventsApi, ProfilesApi, ListsApi } from "klaviyo-api"; const client = new ApiClient(); client.setApiKey(process.env.KLAVIYO_PRIVATE_KEY!); export const eventsApi = new EventsApi(client); export const profilesApi = new ProfilesApi(client); export const listsApi = new ListsApi(client); ``` ### Python ```bash pip install klaviyo-api ``` ```python # lib/klaviyo.py from klaviyo_api import KlaviyoAPI klaviyo = KlaviyoAPI( api_key=os.environ["KLAVIYO_PRIVATE_KEY"], max_delay=60, max_retries=3 ) ``` ### Direct HTTP (Any Language) ```typescript // lib/klaviyo.ts const KLAVIYO_BASE_URL = "https://a.klaviyo.com/api"; async function klaviyoRequest( endpoint: string, 
method: "GET" | "POST" | "PATCH" | "DELETE" = "GET", body?: object ) { const response = await fetch(`${KLAVIYO_BASE_URL}${endpoint}`, { method, headers: { Authorization: `Klaviyo-API-Key ${process.env.KLAVIYO_PRIVATE_KEY}`, "Content-Type": "application/json", revision: "2024-10-15", }, body: body ? JSON.stringify(body) : undefined, }); if (!response.ok) { const error = await response.json(); throw new Error(`Klaviyo API error: ${JSON.stringify(error)}`); } return response.json(); } ``` --- ## Profiles (Customers) ### Create/Update Profile ```typescript // Upsert profile (create or update) async function upsertProfile(data: ProfileInput) { return klaviyoRequest("/profiles", "POST", { data: { type: "profile", attributes: { email: data.email, phone_number: data.phone, // E.164 format: +1234567890 first_name: data.firstName, last_name: data.lastName, properties: { // Custom properties lifetime_value: data.ltv, plan: data.plan, signup_source: data.source, }, location: { city: data.city, region: data.state, country: data.country, zip: data.zip, }, }, }, }); } ``` ```python # Python def upsert_profile(data): return klaviyo.Profiles.create_or_update_profile({ "data": { "type": "profile", "attributes": { "email": data["email"], "first_name": data["first_name"], "last_name": data["last_name"], "properties": { "plan": data.get("plan"), } } } }) ``` ### Get Profile ```typescript async function getProfileByEmail(email: string) { const response = await klaviyoRequest( `/profiles?filter=equals(email,"${email}")` ); return response.data[0]; } async function getProfileById(profileId: string) { return klaviyoRequest(`/profiles/${profileId}`); } ``` ### Update Profile Properties ```typescript async function updateProfileProperties( profileId: string, properties: Record<string, any> ) { return klaviyoRequest(`/profiles/${profileId}`, "PATCH", { data: { type: "profile", id: profileId, attributes: { properties, }, }, }); } // Usage await updateProfileProperties("profile_id", { 
last_purchase_date: new Date().toISOString(), total_orders: 5, vip_status: true, }); ``` --- ## Events (Tracking) ### Track Event (Server-Side) ```typescript async function trackEvent(data: EventInput) { return klaviyoRequest("/events", "POST", { data: { type: "event", attributes: { profile: { data: { type: "profile", attributes: { email: data.email, // or phone_number, or external_id }, }, }, metric: { data: { type: "metric", attributes: { name: data.eventName, }, }, }, properties: data.properties, value: data.value, // For revenue tracking unique_id: data.uniqueId, // Deduplication time: data.timestamp || new Date().toISOString(), }, }, }); } ``` ### Common E-Commerce Events ```typescript // Viewed Product await trackEvent({ email: customer.email, eventName: "Viewed Product", properties: { ProductID: product.id, ProductName: product.name, ProductURL: product.url, ImageURL: product.image, Price: product.price, Categories: product.categories, }, }); // Added to Cart await trackEvent({ email: customer.email, eventName: "Added to Cart", properties: { ProductID: product.id, ProductName: product.name, Quantity: quantity, Price: product.price, CartTotal: cart.total, ItemNames: cart.items.map(i => i.name), }, value: product.price * quantity, }); // Started Checkout await trackEvent({ email: customer.email, eventName: "Started Checkout", properties: { CheckoutURL: checkout.url, ItemCount: cart.itemCount, Categories: cart.categories, ItemNames: cart.items.map(i => i.name), }, value: cart.total, }); // Placed Order await trackEvent({ email: customer.email, eventName: "Placed Order", properties: { OrderId: order.id, ItemCount: order.itemCount, Categories: order.categories, ItemNames: order.items.map(i => i.name), Items: order.items.map(i => ({ ProductID: i.productId, ProductName: i.name, Quantity: i.quantity, Price: i.price, ImageURL: i.image, ProductURL: i.url, })), BillingAddress: order.billingAddress, ShippingAddress: order.shippingAddress, }, value: order.total, 
uniqueId: order.id, // Prevent duplicate orders }); // Fulfilled Order await trackEvent({ email: customer.email, eventName: "Fulfilled Order", properties: { OrderId: order.id, TrackingNumber: fulfillment.trackingNumber, TrackingURL: fulfillment.trackingUrl, Carrier: fulfillment.carrier, }, }); // Cancelled Order await trackEvent({ email: customer.email, eventName: "Cancelled Order", properties: { OrderId: order.id, Reason: cancellation.reason, }, value: -order.total, // Negative value for refunds }); ``` ### Client-Side Tracking (JavaScript) ```html <!-- Add to your site --> <script async src="https://static.klaviyo.com/onsite/js/klaviyo.js?company_id=XXXXXX"></script> <script> // Identify user klaviyo.identify({ email: "customer@example.com", first_name: "John", last_name: "Doe", }); // Track event klaviyo.track("Viewed Product", { ProductID: "prod_123", ProductName: "Blue T-Shirt", Price: 29.99, }); // Track with value klaviyo.track("Added to Cart", { ProductID: "prod_123", ProductName: "Blue T-Shirt", Price: 29.99, $value: 29.99, // Revenue tracking }); </script> ``` --- ## Lists & Segments ### Add Profile to List ```typescript async function addToList(listId: string, emails: string[]) { return klaviyoRequest(`/lists/${listId}/relationships/profiles`, "POST", { data: emails.map(email => ({ type: "profile", attributes: { email }, })), }); } // By profile ID async function addProfileToList(listId: string, profileId: string) { return klaviyoRequest(`/lists/${listId}/relationships/profiles`, "POST", { data: [{ type: "profile", id: profileId }], }); } ``` ### Remove from List ```typescript async function removeFromList(listId: string, profileId: string) { return klaviyoRequest( `/lists/${listId}/relationships/profiles`, "DELETE", { data: [{ type: "profile", id: profileId }], } ); } ``` ### Get List Members ```typescript async function getListMembers(listId: string, cursor?: string) { const params = new URLSearchParams({ "page[size]": "100", }); if (cursor) { 
params.set("page[cursor]", cursor); } return klaviyoRequest(`/lists/${listId}/profiles?${params}`); } ``` ### Create List ```typescript async function createList(name: string) { return klaviyoRequest("/lists", "POST", { data: { type: "list", attributes: { name }, }, }); } ``` --- ## Campaigns ### Get Campaigns ```typescript async function getCampaigns(status?: "draft" | "scheduled" | "sent") { const params = new URLSearchParams(); if (status) { params.set("filter", `equals(status,"${status}")`); } return klaviyoRequest(`/campaigns?${params}`); } ``` ### Get Campaign Performance ```typescript async function getCampaignMetrics(campaignId: string) { return klaviyoRequest( `/campaign-recipient-estimations/${campaignId}`, "GET" ); } ``` --- ## Flows (Automations) ### Get Flows ```typescript async function getFlows() { return klaviyoRequest("/flows"); } async function getFlowById(flowId: string) { return klaviyoRequest(`/flows/${flowId}`); } ``` ### Common Flow Triggers | Flow Type | Trigger Event | |-----------|---------------| | Welcome Series | Added to List | | Abandoned Cart | Added to Cart + No Purchase | | Browse Abandon | Viewed Product + No Cart | | Post-Purchase | Placed Order | | Winback | No Order in X Days | | Review Request | Fulfilled Order | --- ## Webhooks ### Create Webhook ```typescript async function createWebhook(data: WebhookInput) { return klaviyoRequest("/webhooks", "POST", { data: { type: "webhook", attributes: { name: data.name, endpoint_url: data.url, secret_key: data.secret, topics: data.topics, // e.g., ["profile.created", "event.created"] }, }, }); } ``` ### Webhook Topics | Topic | Trigger | |-------|---------| | `profile.created` | New profile created | | `profile.updated` | Profile properties changed | | `profile.merged` | Profiles merged | | `event.created` | New event tracked | | `list.member.added` | Profile added to list | | `list.member.removed` | Profile removed from list | ### Verify Webhook Signature ```typescript import crypto 
from "crypto"; function verifyKlaviyoWebhook( payload: string, signature: string, secret: string ): boolean { const expectedSignature = crypto .createHmac("sha256", secret) .update(payload) .digest("base64"); return crypto.timingSafeEqual( Buffer.from(signature), Buffer.from(expectedSignature) ); } // Express handler app.post("/webhooks/klaviyo", (req, res) => { const signature = req.headers["klaviyo-webhook-signature"] as string; if (!verifyKlaviyoWebhook(JSON.stringify(req.body), signature, WEBHOOK_SECRET)) { return res.status(401).json({ error: "Invalid signature" }); } const { type, data } = req.body; switch (type) { case "profile.created": handleNewProfile(data); break; case "event.created": handleNewEvent(data); break; } res.status(200).json({ received: true }); }); ``` --- ## Rate Limits | Window | Limit | |--------|-------| | Burst | 75 requests/second | | Steady | 700 requests/minute | ### Handle Rate Limiting ```typescript async function klaviyoRequestWithRetry( endpoint: string, method: "GET" | "POST" | "PATCH" | "DELETE" = "GET", body?: object, retries = 3 ): Promise<any> { for (let attempt = 0; attempt < retries; attempt++) { const response = await fetch(`${KLAVIYO_BASE_URL}${endpoint}`, { method, headers: { Authorization: `Klaviyo-API-Key ${process.env.KLAVIYO_PRIVATE_KEY}`, "Content-Type": "application/json", revision: "2024-10-15", }, body: body ? 
JSON.stringify(body) : undefined, }); if (response.status === 429) { const retryAfter = parseInt(response.headers.get("Retry-After") || "5"); await new Promise(r => setTimeout(r, retryAfter * 1000)); continue; } if (!response.ok) { throw new Error(`Klaviyo error: ${response.status}`); } return response.json(); } throw new Error("Max retries exceeded"); } ``` --- ## Pagination ```typescript async function getAllProfiles() { const profiles = []; let cursor: string | undefined; do { const params = new URLSearchParams({ "page[size]": "100" }); if (cursor) { params.set("page[cursor]", cursor); } const response = await klaviyoRequest(`/profiles?${params}`); profiles.push(...response.data); cursor = response.links?.next ? new URL(response.links.next).searchParams.get("page[cursor]") : undefined; } while (cursor); return profiles; } ``` --- ## Filtering & Sorting ```typescript // Filter by date const recentEvents = await klaviyoRequest( `/events?filter=greater-than(datetime,2024-01-01T00:00:00Z)` ); // Filter by property const vipProfiles = await klaviyoRequest( `/profiles?filter=equals(properties.vip_status,true)` ); // Multiple filters (AND) const filtered = await klaviyoRequest( `/profiles?filter=and(equals(properties.plan,"pro"),greater-than(properties.ltv,1000))` ); // Sorting const sorted = await klaviyoRequest( `/profiles?sort=-created` // Descending by created date ); // Sparse fieldsets (only return specific fields) const sparse = await klaviyoRequest( `/profiles?fields[profile]=email,first_name,properties` ); ``` --- ## Integration Patterns ### E-Commerce Order Sync ```typescript // After order is placed async function syncOrderToKlaviyo(order: Order) { // 1. Upsert customer profile await upsertProfile({ email: order.customerEmail, firstName: order.customerFirstName, lastName: order.customerLastName, phone: order.customerPhone, }); // 2. 
Update lifetime metrics await updateProfileProperties( (await getProfileByEmail(order.customerEmail)).id, { last_order_date: new Date().toISOString(), total_orders: order.customerOrderCount, lifetime_value: order.customerLifetimeValue, } ); // 3. Track order event await trackEvent({ email: order.customerEmail, eventName: "Placed Order", properties: { OrderId: order.id, Items: order.items, // ... other properties }, value: order.total, uniqueId: order.id, }); } ``` ### Subscription Status Sync ```typescript // When subscription changes async function syncSubscriptionStatus(user: User, status: string) { await updateProfileProperties(user.klaviyoProfileId, { subscription_status: status, subscription_plan: user.plan, subscription_updated_at: new Date().toISOString(), }); await trackEvent({ email: user.email, eventName: `Subscription ${status}`, properties: { plan: user.plan, mrr: user.mrr, }, value: status === "cancelled" ? -user.mrr : user.mrr, }); } ``` --- ## Environment Variables ```bash # .env KLAVIYO_PRIVATE_KEY=pk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx KLAVIYO_PUBLIC_KEY=XXXXXX KLAVIYO_WEBHOOK_SECRET=your_webhook_secret ``` Add to `credentials.md`: ```python 'KLAVIYO_PRIVATE_KEY': r'pk_[a-f0-9]{32}', 'KLAVIYO_PUBLIC_KEY': r'[A-Z0-9]{6}', ``` --- ## Checklist ### Setup - [ ] Klaviyo account created - [ ] Private API key generated - [ ] Public API key noted (company ID) - [ ] API revision set in headers ### Integration - [ ] Profile sync on signup/update - [ ] Key events tracked (view, cart, order) - [ ] Order events include Items array - [ ] Revenue tracked with $value - [ ] Unique IDs for deduplication ### Testing - [ ] Test profile creation - [ ] Test event tracking - [ ] Verify events in Klaviyo dashboard - [ ] Test webhook delivery - [ ] Test rate limit handling --- ## Anti-Patterns - **Missing email/phone** - Every profile needs at least one identifier - **Duplicate events** - Use unique_id for orders/transactions - **Missing Items array** - Required for product
recommendations - **Client-side only** - Server-side tracking is more reliable - **Ignoring rate limits** - Implement exponential backoff - **Hardcoded API keys** - Use environment variables - **Missing revenue tracking** - Include $value for ROI attribution ================================================ FILE: skills/llm-patterns/SKILL.md ================================================ --- name: llm-patterns description: AI-first application patterns, LLM testing, prompt management when-to-use: When building apps where LLMs handle core logic - classification, extraction, generation user-invocable: false effort: medium --- # LLM Patterns Skill For AI-first applications where LLMs handle logical operations. --- ## Core Principle **LLM for logic, code for plumbing.** Use LLMs for: - Classification, extraction, summarization - Decision-making with natural language reasoning - Content generation and transformation - Complex conditional logic that would be brittle in code Use traditional code for: - Data validation (Zod/Pydantic) - API routing and HTTP handling - Database operations - Authentication/authorization - Orchestration and error handling --- ## Project Structure ``` project/ ├── src/ │ ├── core/ │ │ ├── prompts/ # Prompt templates │ │ │ ├── classify.ts │ │ │ └── extract.ts │ │ ├── llm/ # LLM client and utilities │ │ │ ├── client.ts # LLM client wrapper │ │ │ ├── schemas.ts # Response schemas (Zod) │ │ │ └── index.ts │ │ └── services/ # Business logic using LLM │ ├── infra/ │ └── ... 
├── tests/ │ ├── unit/ │ ├── integration/ │ └── llm/ # LLM-specific tests │ ├── fixtures/ # Saved responses for deterministic tests │ ├── evals/ # Evaluation test suites │ └── mocks/ # Mock LLM responses └── _project_specs/ └── prompts/ # Prompt specifications ``` --- ## LLM Client Pattern ### Typed LLM Wrapper ```typescript // core/llm/client.ts import Anthropic from '@anthropic-ai/sdk'; import { z } from 'zod'; const client = new Anthropic(); interface LLMCallOptions<T> { prompt: string; schema: z.ZodSchema<T>; model?: string; maxTokens?: number; } export async function llmCall<T>({ prompt, schema, model = 'claude-sonnet-4-20250514', maxTokens = 1024, }: LLMCallOptions<T>): Promise<T> { const response = await client.messages.create({ model, max_tokens: maxTokens, messages: [{ role: 'user', content: prompt }], }); const text = response.content[0].type === 'text' ? response.content[0].text : ''; // Parse and validate response const parsed = JSON.parse(text); return schema.parse(parsed); } ``` ### Structured Outputs ```typescript // core/llm/schemas.ts import { z } from 'zod'; export const ClassificationSchema = z.object({ category: z.enum(['support', 'sales', 'feedback', 'other']), confidence: z.number().min(0).max(1), reasoning: z.string(), }); export type Classification = z.infer<typeof ClassificationSchema>; ``` --- ## Prompt Patterns ### Template Functions ```typescript // core/prompts/classify.ts export function classifyTicketPrompt(ticket: string): string { return `Classify this support ticket into one of these categories: - support: Technical issues or help requests - sales: Pricing, plans, or purchase inquiries - feedback: Suggestions or complaints - other: Anything else Respond with JSON: { "category": "...", "confidence": 0.0-1.0, "reasoning": "brief explanation" } Ticket: ${ticket}`; } ``` ### Prompt Versioning ```typescript // core/prompts/index.ts export const PROMPTS = { classify: { v1: classifyTicketPromptV1, v2: classifyTicketPromptV2, // improved 
accuracy current: classifyTicketPromptV2, }, } as const; ``` --- ## Testing LLM Calls ### 1. Unit Tests with Mocks (Fast, Deterministic) ```typescript // tests/llm/mocks/classify.mock.ts export const mockClassifyResponse = { category: 'support', confidence: 0.95, reasoning: 'User is asking for help with login', }; // tests/unit/services/ticket.test.ts import { classifyTicket } from '../../../src/core/services/ticket'; import { mockClassifyResponse } from '../../llm/mocks/classify.mock'; // Mock the LLM client vi.mock('../../../src/core/llm/client', () => ({ llmCall: vi.fn().mockResolvedValue(mockClassifyResponse), })); describe('classifyTicket', () => { it('returns classification for ticket', async () => { const result = await classifyTicket('I cannot log in'); expect(result.category).toBe('support'); expect(result.confidence).toBeGreaterThan(0.9); }); }); ``` ### 2. Fixture Tests (Deterministic, Tests Parsing) ```typescript // tests/llm/fixtures/classify.fixtures.json { "support_ticket": { "input": "I can't reset my password", "expected_category": "support", "raw_response": "{\"category\":\"support\",\"confidence\":0.98,\"reasoning\":\"Password reset is a support issue\"}" } } // tests/llm/classify.fixture.test.ts import fixtures from './fixtures/classify.fixtures.json'; import { ClassificationSchema } from '../../src/core/llm/schemas'; describe('Classification Response Parsing', () => { Object.entries(fixtures).forEach(([name, fixture]) => { it(`parses ${name} correctly`, () => { const parsed = JSON.parse(fixture.raw_response); const result = ClassificationSchema.parse(parsed); expect(result.category).toBe(fixture.expected_category); }); }); }); ``` ### 3. 
Evaluation Tests (Slow, Run in CI nightly) ```typescript // tests/llm/evals/classify.eval.test.ts import { classifyTicket } from '../../../src/core/services/ticket'; const TEST_CASES = [ { input: 'How much does the pro plan cost?', expected: 'sales' }, { input: 'The app crashes when I click save', expected: 'support' }, { input: 'You should add dark mode', expected: 'feedback' }, { input: 'What time is it in Tokyo?', expected: 'other' }, ]; describe('Classification Accuracy (Eval)', () => { // Skip in regular CI, run nightly const runEvals = process.env.RUN_LLM_EVALS === 'true'; it.skipIf(!runEvals)('achieves >90% accuracy on test set', async () => { let correct = 0; for (const testCase of TEST_CASES) { const result = await classifyTicket(testCase.input); if (result.category === testCase.expected) correct++; } const accuracy = correct / TEST_CASES.length; expect(accuracy).toBeGreaterThan(0.9); }, 60000); // 60s timeout for LLM calls }); ``` --- ## GitHub Actions for LLM Tests ```yaml # .github/workflows/quality.yml (add to existing) jobs: quality: # ... existing steps ... 
- name: Run Tests (with LLM mocks) run: npm run test:coverage llm-evals: runs-on: ubuntu-latest # Run nightly or on-demand if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' steps: - uses: actions/checkout@v4 - name: Setup Node uses: actions/setup-node@v4 with: node-version: '20' - name: Install dependencies run: npm ci - name: Run LLM Evals run: npm run test:evals env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} RUN_LLM_EVALS: 'true' ``` --- ## Cost & Performance Tracking ```typescript // core/llm/client.ts - add tracking interface LLMMetrics { model: string; inputTokens: number; outputTokens: number; latencyMs: number; cost: number; } export async function llmCallWithMetrics<T>( options: LLMCallOptions<T> ): Promise<{ result: T; metrics: LLMMetrics }> { const start = Date.now(); const response = await client.messages.create({...}); const metrics: LLMMetrics = { model: options.model, inputTokens: response.usage.input_tokens, outputTokens: response.usage.output_tokens, latencyMs: Date.now() - start, cost: calculateCost(response.usage, options.model), }; // Log or send to monitoring console.log('[LLM]', metrics); return { result: parsed, metrics }; } ``` --- ## LLM Anti-Patterns - ❌ Hardcoded prompts in business logic - use prompt templates - ❌ No schema validation on LLM responses - always use Zod - ❌ Testing with live LLM calls in CI - use mocks for unit tests - ❌ No cost tracking - monitor token usage - ❌ Ignoring latency - LLM calls are slow, design for async - ❌ No fallback for LLM failures - handle timeouts and errors - ❌ Prompts without version control - track prompt changes - ❌ No evaluation suite - measure accuracy over time - ❌ Using LLM for deterministic logic - use code for validation, auth, math - ❌ Giant monolithic prompts - compose smaller focused prompts ================================================ FILE: skills/maggy/SKILL.md ================================================ --- name: maggy description: Maggy 
is a local AI engineering command center. AI-prioritized inbox across issue trackers (GitHub Issues/Asana), one-click TDD execute with iCPG context enrichment, daily competitor intelligence briefing. when-to-use: "When you want a persistent dashboard to triage tickets and spawn Claude Code runs against any repo" user-invocable: true effort: medium --- # Maggy Skill **Maggy** is a generic, local AI engineering command center. Install once, point it at your team's issue tracker and codebases, and get: - **AI-prioritized inbox** — ranks open issues by urgency, OKR alignment, and recency - **One-click Execute** — spawns Claude Code locally with iCPG context injected - **Competitor intelligence** — daily AI briefing on your competitive landscape - **No hardcoding** — works for any team, any stack, any issue tracker ### ⚠️ Execute permission model (important) Execute currently runs `claude -p --dangerously-skip-permissions` so the TDD pipeline isn't blocked waiting on approval prompts (subprocess has no terminal). That flag **grants Claude full permission to write/edit files and run shell commands** inside the target codebase, and the prompt it receives includes content from the issue tracker (which any team member can author). **Hardening already in place:** - `working_dir` is validated against the list of codebase roots in `~/.maggy/config.yaml` — Claude can't be pointed at arbitrary filesystem paths. - Only tickets from your configured trackers reach Execute; no public-internet input flows into the prompt. **Roadmap:** move the unconditional flag behind per-codebase config (`auto_approve: true|false`) so privileged execution becomes opt-in. Until then, treat Execute like `git pull && make` on any ticket you push the button for — only run it on repos you own, against tickets from authors you trust. 
``` ┌──────────────────────────────────────────────────────────────┐ │ maggy ──────────────┐ │ │ ├── skills/ ← installed globally → ~/.claude/ │ │ ├── commands/ ← installed globally → ~/.claude/ │ │ ├── scripts/icpg/ ← used by Maggy for context enrichment │ │ └── maggy/ ← dashboard: run `./install.sh` to use │ │ ├── src/ │ │ │ ├── providers/ ← GitHub / Asana / Linear │ │ │ ├── services/ ← inbox, competitor, executor │ │ │ └── api/ ← FastAPI routes │ │ └── install.sh │ └──────────────────────────────────────────────────────────────┘ ``` --- ## When Maggy Helps | Scenario | How Maggy helps | |------------------------------------------|-----------------------------------------------| | Morning triage of 50 open issues | AI ranks them; top items stay top | | Implementing a ticket | `Execute` → iCPG-enriched TDD pipeline | | "What are competitors shipping?" | Daily briefing + filterable news feed | | Multiple repos per team | Auto-picks right repo based on ticket content | | New team onboarding | Configure via `/maggy-init`, no code writing | --- ## Install and Configure ```bash # One-time install cd $(cat ~/.claude/.bootstrap-dir)/maggy ./install.sh # Configure # Edit ~/.maggy/config.yaml — see maggy/config.example.yaml for the schema # Credentials export GITHUB_TOKEN=ghp_... export ANTHROPIC_API_KEY=sk-ant-... # Run python3 -m src.main # Or from Claude Code: # /maggy-init # interactive wizard # /maggy # launch dashboard ``` --- ## Provider Abstraction Maggy services never see GitHub/Asana directly — they talk to an `IssueTrackerProvider` Protocol. Drop-in swap between: - `GitHubIssuesProvider` — scans multiple repos, aggregates open issues, maps "done" → closed - `AsanaProvider` — queries projects, respects workspace scope - `LinearProvider` — stub for future The same inbox, Execute pipeline, and Competitor features work with any provider. --- ## Execute Pipeline When you click Execute on a ticket: 1. 
Maggy queries the configured iCPG for relevant symbols, blast radius, and prior intents 2. Picks the right working directory based on ticket keywords + configured codebases 3. Spawns `claude -p --dangerously-skip-permissions` in that directory 4. Runs analyze → write failing tests → implement 5. Captures output in a session you can follow in the Sessions tab Because the spawned Claude Code runs in the target repo, it picks up: - That repo's `CLAUDE.md` - Your global `~/.claude/CLAUDE.md` - All bootstrap skills - `.claude/hooks/`, `.mcp.json` So Execute gets the full bootstrap experience — not a stripped-down version. --- ## Competitor Intelligence Generic — works for any domain: 1. Configure `competitors.categories: ["fintech", "embedded-finance"]` in `~/.maggy/config.yaml` 2. Click Discover — Claude identifies 12-18 competitors (market leaders, AI-first challengers, vertical specialists) 3. Maggy monitors their RSS blogs + Google News daily 4. Daily briefing is generated once per day (cached), regeneratable on demand --- ## Not Included The Maggy MVP is deliberately narrow in scope. The following are not shipped: - Meeting bot (voice) - Slack integration - P2P network + session handoff - Self-improvement (`/improve-maggy`) - Linear provider (stub only) These are planned for v2.
--- ## Files - `maggy/PLAN.md` — architecture rationale - `maggy/README.md` — user docs - `maggy/src/providers/base.py` — IssueTrackerProvider Protocol - `maggy/src/services/executor.py` — TDD pipeline - `maggy/src/services/competitor.py` — discovery + briefing - `maggy/src/services/inbox.py` — AI prioritization - `commands/maggy.md` — `/maggy` launcher - `commands/maggy-init.md` — `/maggy-init` setup wizard ================================================ FILE: skills/medusa/SKILL.md ================================================ --- name: medusa description: Medusa headless commerce - modules, workflows, API routes, admin UI when-to-use: When building with Medusa commerce platform user-invocable: false effort: medium --- # Medusa E-Commerce Skill For building headless e-commerce with Medusa - open-source, Node.js native, fully customizable. **Sources:** [Medusa Docs](https://docs.medusajs.com) | [API Reference](https://docs.medusajs.com/api/store) | [GitHub](https://github.com/medusajs/medusa) --- ## Why Medusa | Feature | Benefit | |---------|---------| | **Open Source** | Self-host, no vendor lock-in, MIT license | | **Node.js Native** | TypeScript, familiar stack, easy to customize | | **Headless** | Any frontend (Next.js, Remix, mobile) | | **Modular** | Use only what you need, extend anything | | **Built-in Admin** | Dashboard included, customizable | --- ## Quick Start ### Prerequisites ```bash # Required node --version # v20+ LTS git --version # PostgreSQL running locally or remote ``` ### Create New Project ```bash # Scaffold new Medusa application npx create-medusa-app@latest my-store # This creates: # - Medusa backend # - PostgreSQL database (auto-configured) # - Admin dashboard # - Optional: Next.js storefront cd my-store npm run dev ``` ### Access Points | URL | Purpose | |-----|---------| | `http://localhost:9000` | Backend API | | `http://localhost:9000/app` | Admin dashboard | | `http://localhost:8000` | Storefront (if installed) | ### Create 
Admin User ```bash npx medusa user -e admin@example.com -p supersecret ``` --- ## Project Structure ``` medusa-store/ ├── src/ │ ├── admin/ # Admin UI customizations │ │ ├── widgets/ # Dashboard widgets │ │ └── routes/ # Custom admin pages │ ├── api/ # Custom API routes │ │ ├── store/ # Public storefront APIs │ │ │ └── custom/ │ │ │ └── route.ts │ │ └── admin/ # Admin APIs │ │ └── custom/ │ │ └── route.ts │ ├── jobs/ # Scheduled tasks │ ├── modules/ # Custom business logic │ ├── workflows/ # Multi-step processes │ ├── subscribers/ # Event listeners │ └── links/ # Module relationships ├── .medusa/ # Auto-generated (don't edit) ├── medusa-config.ts # Configuration ├── package.json └── tsconfig.json ``` --- ## Configuration ### medusa-config.ts ```typescript import { defineConfig, loadEnv } from "@medusajs/framework/utils"; loadEnv(process.env.NODE_ENV || "development", process.cwd()); export default defineConfig({ projectConfig: { databaseUrl: process.env.DATABASE_URL, http: { storeCors: process.env.STORE_CORS || "http://localhost:8000", adminCors: process.env.ADMIN_CORS || "http://localhost:9000", authCors: process.env.AUTH_CORS || "http://localhost:9000", }, redisUrl: process.env.REDIS_URL, }, admin: { disable: false, backendUrl: process.env.MEDUSA_BACKEND_URL || "http://localhost:9000", }, modules: [ // Add custom modules here ], }); ``` ### Environment Variables ```bash # .env DATABASE_URL=postgresql://user:pass@localhost:5432/medusa REDIS_URL=redis://localhost:6379 # CORS (comma-separated for multiple origins) STORE_CORS=http://localhost:8000 ADMIN_CORS=http://localhost:9000 # Backend URL MEDUSA_BACKEND_URL=http://localhost:9000 # JWT Secrets JWT_SECRET=your-super-secret-jwt-key COOKIE_SECRET=your-super-secret-cookie-key ``` --- ## Custom API Routes ### Store API (Public) ```typescript // src/api/store/hello/route.ts import type { MedusaRequest, MedusaResponse } from "@medusajs/framework/http"; export async function GET( req: MedusaRequest, res: MedusaResponse ) 
{ res.json({ message: "Hello from custom store API!", }); } // Accessible at: GET /store/hello ``` ### Admin API (Protected) ```typescript // src/api/admin/analytics/route.ts import type { MedusaRequest, MedusaResponse } from "@medusajs/framework/http"; import { Modules } from "@medusajs/framework/utils"; export async function GET( req: MedusaRequest, res: MedusaResponse ) { const orderService = req.scope.resolve(Modules.ORDER); const orders = await orderService.listOrders({ created_at: { $gte: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000), // Last 30 days }, }); const totalRevenue = orders.reduce( (sum, order) => sum + (order.total || 0), 0 ); res.json({ orderCount: orders.length, totalRevenue, }); } // Accessible at: GET /admin/analytics (requires auth) ``` ### Route with Parameters ```typescript // src/api/store/products/[id]/reviews/route.ts import type { MedusaRequest, MedusaResponse } from "@medusajs/framework/http"; export async function GET( req: MedusaRequest, res: MedusaResponse ) { const { id } = req.params; // Fetch reviews for product const reviews = await getReviewsForProduct(id); res.json({ reviews }); } export async function POST( req: MedusaRequest, res: MedusaResponse ) { const { id } = req.params; const { rating, comment, customerId } = req.body; const review = await createReview({ productId: id, rating, comment, customerId, }); res.status(201).json({ review }); } // Accessible at: // GET /store/products/:id/reviews // POST /store/products/:id/reviews ``` ### Middleware ```typescript // src/api/middlewares.ts import { defineMiddlewares } from "@medusajs/framework/http"; import { authenticate } from "@medusajs/framework/http"; export default defineMiddlewares({ routes: [ { matcher: "/store/protected/*", middlewares: [authenticate("customer", ["session", "bearer"])], }, { matcher: "/admin/*", middlewares: [authenticate("user", ["session", "bearer"])], }, ], }); ``` --- ## Modules (Custom Business Logic) ### Create Custom Module ```typescript // 
src/modules/reviews/index.ts import { Module } from "@medusajs/framework/utils"; import ReviewModuleService from "./service"; export const REVIEW_MODULE = "reviewModuleService"; export default Module(REVIEW_MODULE, { service: ReviewModuleService, }); ``` ```typescript // src/modules/reviews/service.ts import { MedusaService } from "@medusajs/framework/utils"; class ReviewModuleService extends MedusaService({}) { async createReview(data: CreateReviewInput) { // Implementation } async getProductReviews(productId: string) { // Implementation } async getAverageRating(productId: string) { // Implementation } } export default ReviewModuleService; ``` ### Register Module ```typescript // medusa-config.ts import { REVIEW_MODULE } from "./src/modules/reviews"; export default defineConfig({ // ... modules: [ { resolve: "./src/modules/reviews", options: {}, }, ], }); ``` ### Use Module in API ```typescript // src/api/store/products/[id]/reviews/route.ts import { REVIEW_MODULE } from "../../../../../modules/reviews"; export async function GET(req: MedusaRequest, res: MedusaResponse) { const { id } = req.params; const reviewService = req.scope.resolve(REVIEW_MODULE); const reviews = await reviewService.getProductReviews(id); const averageRating = await reviewService.getAverageRating(id); res.json({ reviews, averageRating }); } ``` --- ## Workflows ### Define Workflow ```typescript // src/workflows/create-order-with-notification/index.ts import { createWorkflow, createStep, StepResponse, } from "@medusajs/framework/workflows-sdk"; import { Modules } from "@medusajs/framework/utils"; const createOrderStep = createStep( "create-order", async (input: CreateOrderInput, { container }) => { const orderService = container.resolve(Modules.ORDER); const order = await orderService.createOrders(input); return new StepResponse(order, order.id); }, // Compensation (rollback) function async (orderId, { container }) => { const orderService = container.resolve(Modules.ORDER); await
orderService.deleteOrders([orderId]); } ); const sendNotificationStep = createStep( "send-notification", async (order: Order, { container }) => { const notificationService = container.resolve("notificationService"); await notificationService.send({ to: order.email, template: "order-confirmation", data: { order }, }); return new StepResponse({ sent: true }); } ); export const createOrderWithNotificationWorkflow = createWorkflow( "create-order-with-notification", (input: CreateOrderInput) => { const order = createOrderStep(input); const notification = sendNotificationStep(order); return { order, notification }; } ); ``` ### Execute Workflow ```typescript // In an API route import { createOrderWithNotificationWorkflow } from "../../../workflows/create-order-with-notification"; export async function POST(req: MedusaRequest, res: MedusaResponse) { const { result } = await createOrderWithNotificationWorkflow(req.scope).run({ input: req.body, }); res.json(result); } ``` --- ## Subscribers (Event Listeners) ### Create Subscriber ```typescript // src/subscribers/order-placed.ts import type { SubscriberArgs, SubscriberConfig } from "@medusajs/framework"; export default async function orderPlacedHandler({ event, container, }: SubscriberArgs<{ id: string }>) { const orderId = event.data.id; console.log(`Order placed: ${orderId}`); // Send notification, update analytics, etc. 
const notificationService = container.resolve("notificationService"); await notificationService.sendOrderConfirmation(orderId); } export const config: SubscriberConfig = { event: "order.placed", }; ``` ### Common Events | Event | Trigger | |-------|---------| | `order.placed` | New order created | | `order.updated` | Order modified | | `order.canceled` | Order cancelled | | `order.completed` | Order fulfilled | | `customer.created` | New customer registered | | `product.created` | New product added | | `product.updated` | Product modified | | `inventory.updated` | Stock changed | --- ## Scheduled Jobs ```typescript // src/jobs/sync-inventory.ts import type { MedusaContainer } from "@medusajs/framework"; export default async function syncInventoryJob(container: MedusaContainer) { const inventoryService = container.resolve("inventoryService"); console.log("Running inventory sync..."); await inventoryService.syncFromExternalSource(); console.log("Inventory sync complete"); } export const config = { name: "sync-inventory", schedule: "0 */6 * * *", // Every 6 hours }; ``` --- ## Admin UI Customization ### Custom Widget ```tsx // src/admin/widgets/sales-overview.tsx import { defineWidgetConfig } from "@medusajs/admin-sdk"; import { Container, Heading, Text } from "@medusajs/ui"; const SalesOverviewWidget = () => { return ( <Container> <Heading level="h2">Sales Overview</Heading> <Text>Your custom sales data here...</Text> </Container> ); }; export const config = defineWidgetConfig({ zone: "order.list.before", // Where to show the widget }); export default SalesOverviewWidget; ``` ### Widget Zones | Zone | Location | |------|----------| | `order.list.before` | Before order list | | `order.details.after` | After order details | | `product.list.before` | Before product list | | `product.details.after` | After product details | | `customer.list.before` | Before customer list | ### Custom Admin Route ```tsx // src/admin/routes/analytics/page.tsx import { defineRouteConfig } 
from "@medusajs/admin-sdk"; import { Container, Heading } from "@medusajs/ui"; import { ChartBar } from "@medusajs/icons"; const AnalyticsPage = () => { return ( <Container> <Heading level="h1">Analytics Dashboard</Heading> {/* Your analytics charts */} </Container> ); }; export const config = defineRouteConfig({ label: "Analytics", icon: ChartBar, }); export default AnalyticsPage; ``` --- ## Store API (Built-in) ### Products ```typescript // Frontend: Fetch products const response = await fetch("http://localhost:9000/store/products"); const { products } = await response.json(); // With filters const response = await fetch( "http://localhost:9000/store/products?" + new URLSearchParams({ category_id: "cat_123", limit: "20", offset: "0", }) ); ``` ### Cart ```typescript // Create cart const { cart } = await fetch("http://localhost:9000/store/carts", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ region_id: "reg_123", }), }).then(r => r.json()); // Add item await fetch(`http://localhost:9000/store/carts/${cart.id}/line-items`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ variant_id: "variant_123", quantity: 1, }), }); // Complete cart (create order) const { order } = await fetch( `http://localhost:9000/store/carts/${cart.id}/complete`, { method: "POST" } ).then(r => r.json()); ``` ### Customer Authentication ```typescript // Register await fetch("http://localhost:9000/store/customers", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ email: "customer@example.com", password: "password123", first_name: "John", last_name: "Doe", }), }); // Login const { token } = await fetch("http://localhost:9000/store/auth/token", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ email: "customer@example.com", password: "password123", }), }).then(r => r.json()); // Authenticated request await 
fetch("http://localhost:9000/store/customers/me", { headers: { Authorization: `Bearer ${token}`, }, }); ``` --- ## Payment Integration ### Stripe Setup ```bash npm install @medusajs/payment-stripe ``` ```typescript // medusa-config.ts export default defineConfig({ modules: [ { resolve: "@medusajs/payment-stripe", options: { apiKey: process.env.STRIPE_API_KEY, }, }, ], }); ``` ### In Admin 1. Go to Settings → Regions 2. Add Stripe as payment provider 3. Configure for each region --- ## Deployment ### Railway ```bash # Install Railway CLI npm install -g @railway/cli # Login and deploy railway login railway init railway up ``` ### Render ```yaml # render.yaml services: - type: web name: medusa-backend runtime: node plan: starter buildCommand: npm install && npm run build startCommand: npm run start envVars: - key: NODE_ENV value: production - key: DATABASE_URL fromDatabase: name: medusa-db property: connectionString - key: JWT_SECRET generateValue: true - key: COOKIE_SECRET generateValue: true databases: - name: medusa-db plan: starter ``` ### Docker ```dockerfile FROM node:20-alpine WORKDIR /app COPY package*.json ./ RUN npm ci --only=production COPY . . 
RUN npm run build EXPOSE 9000 CMD ["npm", "run", "start"] ``` --- ## CLI Commands ```bash # Development npm run dev # Start dev server # Database npx medusa db:migrate # Run migrations npx medusa db:sync-links # Sync module link definitions # Users npx medusa user -e email -p pass # Create admin user # Build npm run build # Build for production npm run start # Start production server ``` --- ## Checklist ### Setup - [ ] PostgreSQL database configured - [ ] Redis configured (optional but recommended) - [ ] Admin user created - [ ] CORS origins configured - [ ] JWT/Cookie secrets set ### Customization - [ ] Custom modules for business logic - [ ] Custom API routes for frontend - [ ] Subscribers for event handling - [ ] Workflows for complex operations ### Deployment - [ ] Environment variables configured - [ ] Database migrations run - [ ] HTTPS enabled - [ ] Admin URL secured --- ## Anti-Patterns - **Editing .medusa folder** - Auto-generated, will be overwritten - **Direct database access** - Use services and modules - **Skipping workflows for complex ops** - Workflows provide rollback - **Hardcoding URLs** - Use environment variables - **Ignoring TypeScript errors** - Framework relies on types ================================================ FILE: skills/mnemos/SKILL.md ================================================ --- name: mnemos description: Task-scoped memory lifecycle — typed MnemoGraph prevents lossy context compaction by treating facts/decisions/code-refs/handoffs as distinct node types with per-type eviction policies when-to-use: "When you need durable working memory across compactions — checkpoint decisions, preserve task handoffs, or audit what was remembered" user-invocable: false effort: high --- # Mnemos — Task-Scoped Memory Lifecycle ## What It Does Mnemos prevents lossy context compaction from destroying the structured knowledge you need most.
It treats your working memory as a **typed graph** (MnemoGraph) where different types of knowledge have different eviction policies: - **GoalNodes** and **ConstraintNodes** are NEVER evicted — they survive all compaction - **ResultNodes** are compressed (summary kept) before eviction - **ContextNodes** are evictable when their activation weight drops - **CheckpointNodes** persist to disk for session resume ## Fatigue Model Mnemos monitors 4 dimensions of "agent fatigue" — all passively observed from hook data, no manual input needed: | Dimension | Weight | Signal Source | What It Measures | |-----------|--------|--------------|-----------------| | Token utilization | 0.40 | Statusline JSON | How full the context window is | | Scope scatter | 0.25 | PreToolUse file paths | How many directories the agent is bouncing between | | Re-read ratio | 0.20 | PreToolUse Read calls | How often the agent re-reads files it already read (context loss) | | Error density | 0.15 | PostToolUse outcomes | What fraction of tool calls are failing (agent struggling) | Fatigue states and actions: | State | Score | Action | |-------|-------|--------| | FLOW | 0.0–0.4 | Normal operation | | COMPRESS | 0.4–0.6 | Micro-consolidation runs (compress 3 ResultNodes, evict 1 cold ContextNode) | | PRE-SLEEP | 0.6–0.75 | Checkpoint written, consolidation runs | | REM | 0.75–0.9 | Emergency checkpoint, consider wrapping up | | EMERGENCY | 0.9+ | Checkpoint written, hand off immediately | ## How To Use ### Automatic (hooks handle everything): 1. **Statusline** writes `fatigue.json` on every API call 2. **PreToolUse** hook reads fatigue before every edit, auto-checkpoints at 0.60+ 3. **PreCompact** hook writes emergency checkpoint, compaction marker, and tells summarizer what to preserve 4. **Post-Compaction Injection** (PreToolUse, no matcher) detects the compaction marker on the first tool call after compaction and re-injects the full checkpoint into context 5. 
**SessionStart** hook loads last checkpoint on new session resume ### Post-Compaction Recovery (Two-Layer Defense): When Claude Code compacts the context (~83% full), Mnemos uses two layers: - **Layer 1**: PreCompact outputs strong preservation instructions with inline checkpoint content for the summarizer - **Layer 2**: After compaction, the first tool call triggers `mnemos-post-compact-inject.sh` which detects the `.mnemos/just-compacted` marker and re-injects the full checkpoint. This is the guaranteed path — it doesn't depend on the summarizer. The result: after compaction, you'll see a "CONTEXT RESTORED AFTER COMPACTION" block with your goal, constraints, what you were working on, and progress. Resume from there. ### Manual CLI: ```bash mnemos init # Initialize .mnemos/ mnemos status # Show node counts + fatigue mnemos fatigue # Detailed fatigue breakdown mnemos checkpoint --force # Write checkpoint now mnemos resume # Output checkpoint for context mnemos consolidate # Run micro-consolidation mnemos nodes --type goal # List active GoalNodes mnemos add goal "Build auth" # Add a GoalNode mnemos bridge-icpg # Import iCPG ReasonNodes ``` ## Agent Instructions When working on a task: 1. **Create a GoalNode** at the start: `mnemos add goal "what you're trying to achieve" --task-id session-1` 2. **Add ConstraintNodes** for invariants: `mnemos add constraint "API backward compatibility" --scope src/api/` 3. **Check fatigue** before long operations: `mnemos fatigue` 4. **Checkpoint at sub-goal boundaries**: `mnemos checkpoint` 5. 
**On session resume**: the SessionStart hook automatically loads your checkpoint ## iCPG Integration Mnemos bridges with iCPG (Intent-Augmented Code Property Graph): - `mnemos bridge-icpg` imports active ReasonNodes as GoalNodes - Postconditions/invariants become ConstraintNodes - Checkpoint includes iCPG state (active intent, unresolved drift) ## Storage Everything lives in `.mnemos/` (gitignored): - `mnemo.db` — SQLite MnemoGraph - `fatigue.json` — Live token metrics (updated per API call by statusline) - `signals.jsonl` — Behavioral signal log (appended by PreToolUse + PostToolUse hooks) - `checkpoint-latest.json` — Most recent checkpoint - `checkpoints/` — Archived checkpoints ================================================ FILE: skills/ms-teams-apps/SKILL.md ================================================ --- name: ms-teams-apps description: Microsoft Teams bots and AI agents - Claude/OpenAI, Adaptive Cards, Graph API when-to-use: When building Microsoft Teams bots, tabs, or message extensions user-invocable: false effort: medium --- # Microsoft Teams Apps Skill **Purpose:** Build AI-powered agents and apps for Microsoft Teams. Create conversational bots, message extensions, and intelligent assistants that integrate with LLMs like OpenAI and Claude. --- ## Architecture Overview ``` ┌─────────────────────────────────────────────────────────────────┐ │ TEAMS APP TYPES │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. AI AGENTS (Bots) │ │ Conversational apps powered by LLMs │ │ Handle messages, commands, and actions │ │ │ │ 2. MESSAGE EXTENSIONS │ │ Search external systems, insert cards into messages │ │ Action commands with modal dialogs │ │ │ │ 3. TABS │ │ Embedded web applications inside Teams │ │ Personal, channel, or meeting tabs │ │ │ │ 4. 
WEBHOOKS & CONNECTORS │ │ Incoming: Post messages to channels │ │ Outgoing: Respond to @mentions │ ├─────────────────────────────────────────────────────────────────┤ │ SDK LANDSCAPE (2025) │ │ ───────────────────────────────────────────────────────────── │ │ Teams SDK v2: Primary SDK for Teams-only apps │ │ M365 Agents SDK: Multi-channel (Teams, Outlook, Copilot) │ │ Teams Toolkit: VS Code extension for development │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Quick Start ### Install Teams CLI ```bash npm install -g @microsoft/teams.cli ``` ### Create New Project ```bash # TypeScript (Recommended) npx @microsoft/teams.cli new typescript my-agent --template echo # Python npx @microsoft/teams.cli new python my-agent --template echo # C# npx @microsoft/teams.cli new csharp my-agent --template echo ``` ### Project Structure ``` my-agent/ ├── src/ │ ├── index.ts # Entry point │ ├── app.ts # App configuration │ └── handlers/ │ ├── message.ts # Message handlers │ └── commands.ts # Command handlers ├── appPackage/ │ ├── manifest.json # App manifest │ ├── color.png # App icon (192x192) │ └── outline.png # Outline icon (32x32) ├── .env # Environment variables ├── teamsapp.yml # Teams Toolkit config └── package.json ``` --- ## App Manifest ### Basic Manifest Structure ```json { "$schema": "https://developer.microsoft.com/json-schemas/teams/v1.17/MicrosoftTeams.schema.json", "manifestVersion": "1.17", "version": "1.0.0", "id": "{{APP_ID}}", "developer": { "name": "Your Company", "websiteUrl": "https://yourcompany.com", "privacyUrl": "https://yourcompany.com/privacy", "termsOfUseUrl": "https://yourcompany.com/terms" }, "name": { "short": "AI Assistant", "full": "AI Assistant for Teams" }, "description": { "short": "Your AI-powered assistant", "full": "An intelligent assistant that helps you with tasks using AI." 
}, "icons": { "color": "color.png", "outline": "outline.png" }, "accentColor": "#5558AF", "bots": [ { "botId": "{{BOT_ID}}", "scopes": ["personal", "team", "groupChat"], "supportsFiles": false, "isNotificationOnly": false, "commandLists": [ { "scopes": ["personal", "team", "groupChat"], "commands": [ { "title": "help", "description": "Show available commands" }, { "title": "ask", "description": "Ask the AI a question" } ] } ] } ], "permissions": ["identity", "messageTeamMembers"], "validDomains": ["*.azurewebsites.net"] } ``` ### Manifest with Message Extensions ```json { "composeExtensions": [ { "botId": "{{BOT_ID}}", "commands": [ { "id": "searchQuery", "type": "query", "title": "Search", "description": "Search for information", "initialRun": true, "parameters": [ { "name": "query", "title": "Search query", "description": "Enter your search terms", "inputType": "text" } ] }, { "id": "createTask", "type": "action", "title": "Create Task", "description": "Create a new task", "fetchTask": true, "context": ["compose", "commandBox", "message"] } ] } ] } ``` --- ## AI Agent Development ### Basic Bot with Teams SDK v2 ```typescript // src/app.ts import { App, HttpPlugin, DevtoolsPlugin } from '@microsoft/teams.ai'; import { OpenAIModel, ActionPlanner, PromptManager } from '@microsoft/teams.ai'; // Configure the AI model const model = new OpenAIModel({ azureApiKey: process.env.AZURE_OPENAI_API_KEY!, azureDefaultDeployment: process.env.AZURE_OPENAI_DEPLOYMENT!, azureEndpoint: process.env.AZURE_OPENAI_ENDPOINT!, // Or use OpenAI directly: // apiKey: process.env.OPENAI_API_KEY!, // defaultModel: 'gpt-4' }); // Configure prompts const prompts = new PromptManager({ promptsFolder: './src/prompts' }); // Create action planner const planner = new ActionPlanner({ model, prompts, defaultPrompt: 'chat' }); // Create the app const app = new App({ plugins: [ new HttpPlugin(), new DevtoolsPlugin() ], ai: { planner } }); // Handle messages app.on('message', async (context, state) => { 
// AI automatically handles the conversation // The planner uses the 'chat' prompt to generate responses }); // Handle specific commands app.message('/help', async (context, state) => { await context.sendActivity({ type: 'message', text: 'Available commands:\n- /help - Show this message\n- /ask [question] - Ask me anything' }); }); // Start the app app.start(); ``` ### Prompt Configuration ```yaml # src/prompts/chat/config.json { "schema": 1.1, "description": "AI Assistant for Teams", "type": "completion", "completion": { "model": "gpt-4", "max_tokens": 1000, "temperature": 0.7, "top_p": 1 } } ``` ```text # src/prompts/chat/skprompt.txt You are an AI assistant for Microsoft Teams. You help users with their questions and tasks. Current conversation: {{$history}} User: {{$input}} Assistant: ``` --- ## Integrating Claude/Anthropic ### Claude-Powered Teams Bot ```typescript // src/claude-bot.ts import { App, HttpPlugin } from '@microsoft/teams.ai'; import Anthropic from '@anthropic-ai/sdk'; const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY! }); const app = new App({ plugins: [new HttpPlugin()] }); // Conversation history store const conversations = new Map<string, Anthropic.MessageParam[]>(); app.on('message', async (context, state) => { const userId = context.activity.from.id; const userMessage = context.activity.text; // Get or initialize conversation history if (!conversations.has(userId)) { conversations.set(userId, []); } const history = conversations.get(userId)!; // Add user message to history history.push({ role: 'user', content: userMessage }); // Show typing indicator await context.sendActivity({ type: 'typing' }); try { // Call Claude API const response = await anthropic.messages.create({ model: 'claude-sonnet-4-20250514', max_tokens: 1024, system: `You are an AI assistant integrated into Microsoft Teams. Help users with their questions and tasks. Be concise and helpful. Use markdown formatting when appropriate. 
Current user: ${context.activity.from.name}`, messages: history }); const assistantMessage = response.content[0].type === 'text' ? response.content[0].text : ''; // Add assistant response to history history.push({ role: 'assistant', content: assistantMessage }); // Keep history manageable (last 20 messages) if (history.length > 20) { history.splice(0, history.length - 20); } // Send response await context.sendActivity({ type: 'message', text: assistantMessage }); } catch (error) { console.error('Claude API error:', error); await context.sendActivity({ type: 'message', text: 'Sorry, I encountered an error processing your request.' }); } }); // Clear conversation command app.message('/clear', async (context, state) => { const userId = context.activity.from.id; conversations.delete(userId); await context.sendActivity('Conversation cleared. Starting fresh!'); }); app.start(); ``` ### Claude with Tools/Function Calling ```typescript // src/claude-agent.ts import Anthropic from '@anthropic-ai/sdk'; const anthropic = new Anthropic(); // Define tools the agent can use const tools: Anthropic.Tool[] = [ { name: 'search_knowledge_base', description: 'Search the company knowledge base for information', input_schema: { type: 'object' as const, properties: { query: { type: 'string', description: 'The search query' } }, required: ['query'] } }, { name: 'create_task', description: 'Create a new task in the task management system', input_schema: { type: 'object' as const, properties: { title: { type: 'string', description: 'Task title' }, description: { type: 'string', description: 'Task description' }, assignee: { type: 'string', description: 'Person to assign the task to' }, due_date: { type: 'string', description: 'Due date in YYYY-MM-DD format' } }, required: ['title'] } }, { name: 'get_calendar', description: 'Get calendar events for a user', input_schema: { type: 'object' as const, properties: { user: { type: 'string', description: 'User email or name' }, days: { type: 
'number', description: 'Number of days to look ahead' } }, required: ['user'] } } ]; // Tool implementations async function executeTools(toolName: string, toolInput: any): Promise<string> { switch (toolName) { case 'search_knowledge_base': // Implement your search logic return `Found 3 results for "${toolInput.query}":\n1. Document A\n2. Document B\n3. Document C`; case 'create_task': // Implement task creation (e.g., call Microsoft Graph API) return `Task created: "${toolInput.title}"`; case 'get_calendar': // Implement calendar lookup return `Calendar for ${toolInput.user}: 2 meetings today`; default: return 'Unknown tool'; } } // Agent loop with tool use async function runAgent(userMessage: string): Promise<string> { let messages: Anthropic.MessageParam[] = [ { role: 'user', content: userMessage } ]; while (true) { const response = await anthropic.messages.create({ model: 'claude-sonnet-4-20250514', max_tokens: 1024, system: 'You are a helpful Teams assistant. Use tools when needed to help users.', tools, messages }); // Check if we need to use tools if (response.stop_reason === 'tool_use') { const toolResults: Anthropic.MessageParam[] = []; for (const content of response.content) { if (content.type === 'tool_use') { const result = await executeTools(content.name, content.input); toolResults.push({ role: 'user', content: [{ type: 'tool_result', tool_use_id: content.id, content: result }] }); } } messages.push({ role: 'assistant', content: response.content }); messages.push(...toolResults); continue; } // Return final text response const textContent = response.content.find(c => c.type === 'text'); return textContent?.text || 'No response'; } } ``` --- ## Adaptive Cards ### Basic Adaptive Card ```typescript // src/cards/welcome-card.ts import { CardFactory } from 'botbuilder'; export function createWelcomeCard(userName: string) { return CardFactory.adaptiveCard({ type: 'AdaptiveCard', $schema: 'http://adaptivecards.io/schemas/adaptive-card.json', version: '1.5', 
body: [ { type: 'TextBlock', text: `Welcome, ${userName}!`, size: 'Large', weight: 'Bolder' }, { type: 'TextBlock', text: 'I\'m your AI assistant. How can I help you today?', wrap: true }, { type: 'ActionSet', actions: [ { type: 'Action.Submit', title: 'Get Started', data: { action: 'getStarted' } }, { type: 'Action.Submit', title: 'View Help', data: { action: 'help' } } ] } ] }); } ``` ### AI Response Card with Actions ```typescript // src/cards/ai-response-card.ts export function createAIResponseCard( question: string, answer: string, sources?: string[] ) { return { type: 'AdaptiveCard', $schema: 'http://adaptivecards.io/schemas/adaptive-card.json', version: '1.5', body: [ { type: 'Container', style: 'emphasis', items: [ { type: 'TextBlock', text: 'Your Question', size: 'Small', weight: 'Bolder' }, { type: 'TextBlock', text: question, wrap: true } ] }, { type: 'Container', items: [ { type: 'TextBlock', text: 'AI Response', size: 'Small', weight: 'Bolder' }, { type: 'TextBlock', text: answer, wrap: true } ] }, ...(sources && sources.length > 0 ? 
[{ type: 'Container', items: [ { type: 'TextBlock', text: 'Sources', size: 'Small', weight: 'Bolder' }, ...sources.map(source => ({ type: 'TextBlock', text: `• ${source}`, size: 'Small' })) ] }] : []) ], actions: [ { type: 'Action.Submit', title: '👍 Helpful', data: { action: 'feedback', value: 'positive' } }, { type: 'Action.Submit', title: '👎 Not Helpful', data: { action: 'feedback', value: 'negative' } }, { type: 'Action.Submit', title: 'Ask Follow-up', data: { action: 'followUp' } } ] }; } ``` ### Form Card for User Input ```typescript // src/cards/task-form-card.ts export function createTaskFormCard() { return { type: 'AdaptiveCard', $schema: 'http://adaptivecards.io/schemas/adaptive-card.json', version: '1.5', body: [ { type: 'TextBlock', text: 'Create New Task', size: 'Large', weight: 'Bolder' }, { type: 'Input.Text', id: 'taskTitle', label: 'Task Title', isRequired: true, placeholder: 'Enter task title' }, { type: 'Input.Text', id: 'taskDescription', label: 'Description', isMultiline: true, placeholder: 'Enter task description' }, { type: 'Input.ChoiceSet', id: 'priority', label: 'Priority', choices: [ { title: 'High', value: 'high' }, { title: 'Medium', value: 'medium' }, { title: 'Low', value: 'low' } ], value: 'medium' }, { type: 'Input.Date', id: 'dueDate', label: 'Due Date' } ], actions: [ { type: 'Action.Submit', title: 'Create Task', data: { action: 'createTask' } }, { type: 'Action.Submit', title: 'Cancel', data: { action: 'cancel' } } ] }; } ``` --- ## Microsoft Graph Integration ### Setup Graph Client ```typescript // src/graph/client.ts import { Client } from '@microsoft/microsoft-graph-client'; import { TokenCredentialAuthenticationProvider } from '@microsoft/microsoft-graph-client/authProviders/azureTokenCredentials'; import { ClientSecretCredential } from '@azure/identity'; export function createGraphClient() { const credential = new ClientSecretCredential( process.env.AZURE_TENANT_ID!, process.env.AZURE_CLIENT_ID!, 
process.env.AZURE_CLIENT_SECRET! ); const authProvider = new TokenCredentialAuthenticationProvider(credential, { scopes: ['https://graph.microsoft.com/.default'] }); return Client.initWithMiddleware({ authProvider }); } ``` ### Common Graph Operations ```typescript // src/graph/operations.ts import { Client } from '@microsoft/microsoft-graph-client'; export class GraphOperations { constructor(private client: Client) {} // Get user profile async getUserProfile(userId: string) { return this.client.api(`/users/${userId}`).get(); } // Get user's calendar events async getCalendarEvents(userId: string, days: number = 7) { const startDate = new Date().toISOString(); const endDate = new Date(Date.now() + days * 24 * 60 * 60 * 1000).toISOString(); return this.client .api(`/users/${userId}/calendarView`) .query({ startDateTime: startDate, endDateTime: endDate }) .select('subject,start,end,location') .orderby('start/dateTime') .get(); } // Send email async sendEmail( fromUserId: string, to: string, subject: string, body: string ) { return this.client.api(`/users/${fromUserId}/sendMail`).post({ message: { subject, body: { contentType: 'HTML', content: body }, toRecipients: [{ emailAddress: { address: to } }] } }); } // Create Teams meeting async createMeeting( userId: string, subject: string, startTime: string, endTime: string, attendees: string[] ) { return this.client.api(`/users/${userId}/onlineMeetings`).post({ subject, startDateTime: startTime, endDateTime: endTime, participants: { attendees: attendees.map(email => ({ upn: email, role: 'attendee' })) } }); } // Post message to channel async postToChannel(teamId: string, channelId: string, message: string) { return this.client .api(`/teams/${teamId}/channels/${channelId}/messages`) .post({ body: { content: message } }); } } ``` --- ## Authentication ### SSO with Teams SDK ```typescript // src/auth.ts import { App } from '@microsoft/teams.ai'; const app = new App({ // ... 
other config }); app.on('message', async ({ userGraph, isSignedIn, send, signin }) => { // Check if user is signed in if (!isSignedIn) { // Initiate sign-in flow await signin(); return; } // User is signed in, access Graph API const me = await userGraph.call({ method: 'GET', path: '/me' }); await send(`Hello, ${me.displayName}!`); }); ``` ### Manual OAuth Flow ```typescript // src/auth/oauth.ts import { CardFactory } from 'botbuilder'; import { OAuthPrompt, OAuthPromptSettings } from 'botbuilder-dialogs'; const oauthSettings: OAuthPromptSettings = { connectionName: process.env.OAUTH_CONNECTION_NAME!, text: 'Please sign in to continue', title: 'Sign In', timeout: 300000 // 5 minutes }; // In your dialog async function handleAuth(context, state) { const tokenResponse = await context.adapter.getUserToken( context, oauthSettings.connectionName ); if (!tokenResponse?.token) { // No token, show sign-in card await context.sendActivity({ attachments: [ CardFactory.oauthCard( oauthSettings.connectionName, oauthSettings.title, oauthSettings.text ) ] }); return null; } return tokenResponse.token; } ``` --- ## RAG (Retrieval-Augmented Generation) ### Vector Search with Azure AI Search ```typescript // src/rag/azure-search.ts import { SearchClient, AzureKeyCredential } from '@azure/search-documents'; const searchClient = new SearchClient( process.env.AZURE_SEARCH_ENDPOINT!, process.env.AZURE_SEARCH_INDEX!, new AzureKeyCredential(process.env.AZURE_SEARCH_KEY!)
); export async function searchKnowledgeBase( query: string, topK: number = 5 ): Promise<string[]> { const results = await searchClient.search(query, { top: topK, select: ['content', 'title', 'source'], queryType: 'semantic', semanticConfiguration: 'default' }); const documents: string[] = []; for await (const result of results.results) { documents.push(`${result.document.title}: ${result.document.content}`); } return documents; } ``` ### RAG-Enhanced Claude Response ```typescript // src/rag/claude-rag.ts import Anthropic from '@anthropic-ai/sdk'; import { searchKnowledgeBase } from './azure-search'; const anthropic = new Anthropic(); export async function getRAGResponse(userQuery: string): Promise<string> { // 1. Search knowledge base const relevantDocs = await searchKnowledgeBase(userQuery); // 2. Build context const context = relevantDocs.join('\n\n---\n\n'); // 3. Generate response with context const response = await anthropic.messages.create({ model: 'claude-sonnet-4-20250514', max_tokens: 1024, system: `You are a helpful assistant for Teams. Answer questions based on the provided context. If the context doesn't contain relevant information, say so and provide a general response. Always cite your sources when using information from the context.`, messages: [ { role: 'user', content: `Context:\n${context}\n\nQuestion: ${userQuery}` } ] }); return response.content[0].type === 'text' ? 
response.content[0].text : ''; } ``` --- ## Deployment ### Azure Bot Service Setup ```bash # Create resource group az group create --name rg-teams-bot --location eastus # Create App Service plan az appservice plan create \ --name asp-teams-bot \ --resource-group rg-teams-bot \ --sku B1 \ --is-linux # Create Web App az webapp create \ --name my-teams-bot \ --resource-group rg-teams-bot \ --plan asp-teams-bot \ --runtime "NODE:18-lts" # Create Bot Channels Registration az bot create \ --resource-group rg-teams-bot \ --name my-teams-bot \ --kind registration \ --endpoint https://my-teams-bot.azurewebsites.net/api/messages \ --sku F0 # Enable Teams channel az bot msteams create \ --name my-teams-bot \ --resource-group rg-teams-bot ``` ### Environment Variables ```bash # .env # Azure Bot BOT_ID=your-bot-id BOT_PASSWORD=your-bot-password BOT_TENANT_ID=your-tenant-id # Azure OpenAI AZURE_OPENAI_API_KEY=your-key AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com AZURE_OPENAI_DEPLOYMENT=gpt-4 # Or OpenAI OPENAI_API_KEY=sk-xxx # Or Anthropic ANTHROPIC_API_KEY=sk-ant-xxx # Microsoft Graph AZURE_CLIENT_ID=your-client-id AZURE_CLIENT_SECRET=your-client-secret AZURE_TENANT_ID=your-tenant-id # Azure AI Search (for RAG) AZURE_SEARCH_ENDPOINT=https://your-search.search.windows.net AZURE_SEARCH_KEY=your-key AZURE_SEARCH_INDEX=knowledge-base ``` ### Docker Deployment ```dockerfile # Dockerfile FROM node:18-alpine WORKDIR /app COPY package*.json ./ # Install all dependencies: the build step needs devDependencies (e.g. the TypeScript compiler) RUN npm ci COPY . . # Build, then drop devDependencies from the final image RUN npm run build && npm prune --omit=dev EXPOSE 3978 CMD ["node", "dist/index.js"] ``` ```yaml # docker-compose.yml version: '3.8' services: teams-bot: build: .
ports: - "3978:3978" environment: - BOT_ID=${BOT_ID} - BOT_PASSWORD=${BOT_PASSWORD} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} restart: unless-stopped ``` ### Teams Toolkit Deployment ```bash # Login to Azure npx teamsfx account login azure # Provision resources npx teamsfx provision --env dev # Deploy npx teamsfx deploy --env dev # Publish to Teams npx teamsfx publish --env dev ``` --- ## Testing ### Local Testing with ngrok ```bash # Start ngrok tunnel ngrok http 3978 # Update manifest with ngrok URL # Bot endpoint: https://xxxx.ngrok.io/api/messages ``` ### Teams Toolkit Local Debug ```bash # Start local debugging (opens Teams with your app) npx teamsfx preview --local ``` ### Unit Testing ```typescript // tests/bot.test.ts import { TestAdapter, TurnContext } from 'botbuilder'; import { createWelcomeCard } from '../src/cards/welcome-card'; describe('Bot Tests', () => { let adapter: TestAdapter; beforeEach(() => { adapter = new TestAdapter(); }); test('should respond to hello', async () => { await adapter .send('hello') .assertReply((activity) => { expect(activity.text).toContain('Hello'); }); }); test('should create welcome card', () => { const card = createWelcomeCard('John'); expect(card.content.body[0].text).toContain('John'); }); }); ``` --- ## Best Practices ### Conversation Design ``` ┌─────────────────────────────────────────────────────────────────┐ │ CONVERSATION UX GUIDELINES │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. GREET INTELLIGENTLY │ │ - Welcome new users with onboarding card │ │ - Return users get quick access to recent actions │ │ │ │ 2. HANDLE ERRORS GRACEFULLY │ │ - Never show stack traces to users │ │ - Provide clear recovery options │ │ - Log errors for debugging │ │ │ │ 3. USE CARDS FOR RICH CONTENT │ │ - Adaptive Cards for forms and structured data │ │ - Hero Cards for simple actions │ │ - Keep cards concise and actionable │ │ │ │ 4. 
TYPING INDICATORS │ │ - Show typing for long operations │ │ - Provide progress updates for very long tasks │ │ │ │ 5. CONTEXT AWARENESS │ │ - Remember conversation history │ │ - Personalize based on user preferences │ │ - Respect team/channel context │ └─────────────────────────────────────────────────────────────────┘ ``` ### Security Checklist - [ ] Validate all incoming messages - [ ] Use App-Only auth for Graph API when possible - [ ] Never log sensitive user data - [ ] Implement rate limiting - [ ] Use managed identity in Azure - [ ] Rotate secrets regularly - [ ] Enable audit logging ### Performance Tips | Tip | Description | |-----|-------------| | Cache Graph tokens | Token refresh is expensive | | Stream long responses | Use typing indicator + chunked responses | | Index knowledge base | Pre-embed documents for RAG | | Use connection pooling | Reuse HTTP connections | | Compress payloads | Gzip large card responses | --- ## Project Templates ### AI Assistant Template ```typescript // Complete AI assistant with Claude import { App, HttpPlugin } from '@microsoft/teams.ai'; import Anthropic from '@anthropic-ai/sdk'; import { createWelcomeCard } from './cards/welcome-card'; import { createAIResponseCard } from './cards/ai-response-card'; const anthropic = new Anthropic(); const app = new App({ plugins: [new HttpPlugin()] }); const conversations = new Map<string, Anthropic.MessageParam[]>(); // Welcome new users app.conversationUpdate('membersAdded', async (context) => { for (const member of context.activity.membersAdded || []) { if (member.id !== context.activity.recipient.id) { await context.sendActivity({ attachments: [createWelcomeCard(member.name || 'User')] }); } } }); // Handle messages app.on('message', async (context) => { const userId = context.activity.from.id; const userMessage = context.activity.text; // Initialize or get conversation if (!conversations.has(userId)) { conversations.set(userId, []); } const history = conversations.get(userId)!; 
history.push({ role: 'user', content: userMessage }); // Show typing await context.sendActivity({ type: 'typing' }); // Get AI response const response = await anthropic.messages.create({ model: 'claude-sonnet-4-20250514', max_tokens: 1024, system: 'You are a helpful Teams assistant.', messages: history }); const answer = response.content[0].type === 'text' ? response.content[0].text : ''; history.push({ role: 'assistant', content: answer }); // Send rich card response await context.sendActivity({ attachments: [{ contentType: 'application/vnd.microsoft.card.adaptive', content: createAIResponseCard(userMessage, answer) }] }); }); // Handle card actions app.on('adaptiveCard/action', async (context) => { const action = context.activity.value?.action; switch (action) { case 'feedback': // Log feedback console.log('Feedback:', context.activity.value); await context.sendActivity('Thanks for your feedback!'); break; case 'followUp': await context.sendActivity('What would you like to know more about?'); break; } }); app.start(); ``` --- ## Troubleshooting | Issue | Cause | Fix | |-------|-------|-----| | Bot not responding | Endpoint unreachable | Check ngrok/Azure URL in manifest | | Auth failures | Token expired/invalid | Refresh OAuth connection | | Cards not rendering | Invalid schema | Validate at adaptivecards.io/designer | | Graph 403 errors | Missing permissions | Check app registration permissions | | Slow responses | API latency | Add typing indicator, consider streaming | --- ## Resources - [Teams SDK Documentation](https://microsoft.github.io/teams-sdk/) - [Teams Platform Docs](https://learn.microsoft.com/en-us/microsoftteams/platform/) - [Adaptive Cards Designer](https://adaptivecards.io/designer/) - [Microsoft Graph Explorer](https://developer.microsoft.com/en-us/graph/graph-explorer) - [Teams Toolkit](https://learn.microsoft.com/en-us/microsoftteams/platform/toolkit/teams-toolkit-fundamentals) - [Bot Framework 
Emulator](https://github.com/Microsoft/BotFramework-Emulator) ================================================ FILE: skills/nodejs-backend/SKILL.md ================================================ --- name: nodejs-backend description: Node.js backend patterns with Express/Fastify, repositories when-to-use: When working on Node.js backend code - API routes, middleware, server setup user-invocable: false paths: ["src/api/**", "src/routes/**", "src/server/**", "src/middleware/**", "server/**", "api/**"] effort: medium --- # Node.js Backend Skill --- ## Project Structure ``` project/ ├── src/ │ ├── core/ # Pure business logic │ │ ├── types.ts # Domain types │ │ ├── errors.ts # Domain errors │ │ └── services/ # Pure functions │ │ ├── user.ts │ │ └── order.ts │ ├── infra/ # Side effects │ │ ├── http/ # HTTP layer │ │ │ ├── server.ts # Server setup │ │ │ ├── routes/ # Route handlers │ │ │ └── middleware/ # Express middleware │ │ ├── db/ # Database │ │ │ ├── client.ts # DB connection │ │ │ ├── repositories/ # Data access │ │ │ └── migrations/ # Schema migrations │ │ └── external/ # Third-party APIs │ ├── config/ # Configuration │ │ └── index.ts # Env vars, validated │ └── index.ts # Entry point ├── tests/ │ ├── unit/ │ └── integration/ ├── package.json └── CLAUDE.md ``` --- ## API Design ### Route Handler Pattern ```typescript // routes/users.ts import { Router } from 'express'; import { z } from 'zod'; import { createUser } from '../../core/services/user'; import { UserRepository } from '../db/repositories/user'; const CreateUserSchema = z.object({ email: z.string().email(), name: z.string().min(1).max(100), }); export function createUserRoutes(userRepo: UserRepository): Router { const router = Router(); router.post('/', async (req, res, next) => { try { const input = CreateUserSchema.parse(req.body); const user = await createUser(input, userRepo); res.status(201).json(user); } catch (error) { next(error); } }); return router; } ``` ### Dependency Injection at Composition 
Root ```typescript // index.ts import { createApp } from './infra/http/server'; import { createDbClient } from './infra/db/client'; import { UserRepository } from './infra/db/repositories/user'; import { createUserRoutes } from './infra/http/routes/users'; async function main(): Promise<void> { const db = await createDbClient(); const userRepo = new UserRepository(db); const app = createApp({ userRoutes: createUserRoutes(userRepo), }); app.listen(3000); } ``` --- ## Error Handling ### Domain Errors ```typescript // core/errors.ts export class DomainError extends Error { constructor( message: string, public readonly code: string, public readonly statusCode: number = 400 ) { super(message); this.name = 'DomainError'; } } export class NotFoundError extends DomainError { constructor(resource: string, id: string) { super(`${resource} with id ${id} not found`, 'NOT_FOUND', 404); } } export class ValidationError extends DomainError { constructor(message: string) { super(message, 'VALIDATION_ERROR', 400); } } ``` ### Global Error Handler ```typescript // middleware/errorHandler.ts import { ErrorRequestHandler } from 'express'; import { DomainError } from '../../core/errors'; import { ZodError } from 'zod'; export const errorHandler: ErrorRequestHandler = (err, req, res, next) => { if (err instanceof DomainError) { return res.status(err.statusCode).json({ error: { code: err.code, message: err.message }, }); } if (err instanceof ZodError) { return res.status(400).json({ error: { code: 'VALIDATION_ERROR', details: err.errors }, }); } console.error('Unexpected error:', err); return res.status(500).json({ error: { code: 'INTERNAL_ERROR', message: 'Something went wrong' }, }); }; ``` --- ## Database Patterns ### Repository Pattern ```typescript // db/repositories/user.ts import { Kysely } from 'kysely'; import { Database, User } from '../types'; export class UserRepository { constructor(private db: Kysely<Database>) {} async findById(id: string): Promise<User | null> { return 
(await this.db .selectFrom('users') .where('id', '=', id) .selectAll() .executeTakeFirst()) ?? null; } async create(data: Omit<User, 'id' | 'createdAt'>): Promise<User> { return this.db .insertInto('users') .values(data) .returningAll() .executeTakeFirstOrThrow(); } } ``` ### Transactions ```typescript async function transferFunds( fromId: string, toId: string, amount: number, db: Kysely<Database> ): Promise<void> { await db.transaction().execute(async (trx) => { await trx .updateTable('accounts') .set((eb) => ({ balance: eb('balance', '-', amount) })) .where('id', '=', fromId) .execute(); await trx .updateTable('accounts') .set((eb) => ({ balance: eb('balance', '+', amount) })) .where('id', '=', toId) .execute(); }); } ``` --- ## Configuration ### Validated Config ```typescript // config/index.ts import { z } from 'zod'; const ConfigSchema = z.object({ NODE_ENV: z.enum(['development', 'production', 'test']), PORT: z.coerce.number().default(3000), DATABASE_URL: z.string().url(), API_KEY: z.string().min(1), }); export type Config = z.infer<typeof ConfigSchema>; export function loadConfig(): Config { return ConfigSchema.parse(process.env); } ``` --- ## Testing ### Unit Tests (Core) ```typescript // tests/unit/services/user.test.ts import { createUser } from '../../../src/core/services/user'; describe('createUser', () => { it('creates user with valid data', async () => { const mockRepo = { create: jest.fn().mockResolvedValue({ id: '1', email: 'test@example.com' }), findByEmail: jest.fn().mockResolvedValue(null), }; const result = await createUser({ email: 'test@example.com', name: 'Test' }, mockRepo); expect(result.email).toBe('test@example.com'); expect(mockRepo.create).toHaveBeenCalledTimes(1); }); }); ``` ### Integration Tests (API) ```typescript // tests/integration/users.test.ts import request from 'supertest'; import { createTestApp, createTestDb } from '../helpers'; describe('POST /users', () => { let app: Express; let db: TestDb; beforeAll(async () => { db = await
createTestDb(); app = createTestApp(db); }); afterAll(async () => { await db.destroy(); }); it('creates user and returns 201', async () => { const response = await request(app) .post('/users') .send({ email: 'new@example.com', name: 'New User' }); expect(response.status).toBe(201); expect(response.body.email).toBe('new@example.com'); }); }); ``` --- ## Node.js Anti-Patterns - ❌ Callback hell - use async/await - ❌ Unhandled promise rejections - always catch or let error handler catch - ❌ Blocking the event loop - offload heavy computation - ❌ Secrets in code - use environment variables - ❌ SQL string concatenation - use parameterized queries - ❌ No input validation - validate at API boundary - ❌ Console.log in production - use proper logger - ❌ No graceful shutdown - handle SIGTERM - ❌ Monolithic route files - split by resource ================================================ FILE: skills/playwright-testing/SKILL.md ================================================ --- name: playwright-testing description: E2E testing with Playwright - Page Objects, cross-browser, CI/CD when-to-use: When writing or debugging E2E tests with Playwright user-invocable: true paths: ["**/e2e/**", "**/*.spec.ts", "**/playwright/**", "playwright.config.*"] effort: medium --- # Playwright E2E Testing Skill For end-to-end testing of web applications with Playwright - cross-browser, fast, reliable. 
**Sources:** [Playwright Best Practices](https://playwright.dev/docs/best-practices) | [Playwright Docs](https://playwright.dev/docs/intro) | [Better Stack Guide](https://betterstack.com/community/guides/testing/playwright-best-practices/) --- ## Setup ### Installation ```bash # New project npm init playwright@latest # Existing project npm install -D @playwright/test npx playwright install ``` ### Configuration ```typescript // playwright.config.ts import { defineConfig, devices } from '@playwright/test'; export default defineConfig({ testDir: './e2e', fullyParallel: true, forbidOnly: !!process.env.CI, retries: process.env.CI ? 2 : 0, workers: process.env.CI ? 1 : undefined, reporter: [ ['html'], ['list'], process.env.CI ? ['github'] : ['line'], ], use: { baseURL: process.env.BASE_URL || 'http://localhost:3000', trace: 'on-first-retry', screenshot: 'only-on-failure', video: 'retain-on-failure', }, projects: [ // Auth setup - runs once before all tests { name: 'setup', testMatch: /.*\.setup\.ts/ }, { name: 'chromium', use: { ...devices['Desktop Chrome'] }, dependencies: ['setup'], }, { name: 'firefox', use: { ...devices['Desktop Firefox'] }, dependencies: ['setup'], }, { name: 'webkit', use: { ...devices['Desktop Safari'] }, dependencies: ['setup'], }, // Mobile viewports { name: 'mobile-chrome', use: { ...devices['Pixel 5'] }, dependencies: ['setup'], }, { name: 'mobile-safari', use: { ...devices['iPhone 12'] }, dependencies: ['setup'], }, ], // Start dev server before tests webServer: { command: 'npm run dev', url: 'http://localhost:3000', reuseExistingServer: !process.env.CI, timeout: 120 * 1000, }, }); ``` --- ## Project Structure ``` project/ ├── e2e/ │ ├── fixtures/ │ │ ├── auth.fixture.ts # Auth fixtures │ │ └── test.fixture.ts # Extended test with fixtures │ ├── pages/ │ │ ├── base.page.ts # Base page object │ │ ├── login.page.ts # Login page object │ │ ├── dashboard.page.ts # Dashboard page object │ │ └── index.ts # Export all pages │ ├── tests/ │ │ ├── 
auth.spec.ts # Auth tests │ │ ├── dashboard.spec.ts # Dashboard tests │ │ └── checkout.spec.ts # Checkout flow tests │ ├── utils/ │ │ ├── helpers.ts # Test helpers │ │ └── test-data.ts # Test data factories │ └── auth.setup.ts # Global auth setup ├── playwright.config.ts └── .auth/ # Stored auth state (gitignored) ``` --- ## Locator Strategy (Priority Order) Use locators that mirror how users interact with the page: ```typescript // ✅ BEST: Role-based (accessible, resilient) page.getByRole('button', { name: 'Submit' }) page.getByRole('textbox', { name: 'Email' }) page.getByRole('link', { name: 'Sign up' }) page.getByRole('heading', { name: 'Welcome' }) // ✅ GOOD: User-facing text page.getByLabel('Email address') page.getByPlaceholder('Enter your email') page.getByText('Welcome back') page.getByTitle('Profile settings') // ✅ GOOD: Test IDs (stable, explicit) page.getByTestId('submit-button') page.getByTestId('user-avatar') // ⚠️ AVOID: CSS selectors (brittle) page.locator('.btn-primary') page.locator('#submit') // ❌ NEVER: XPath (extremely brittle) page.locator('//div[@class="container"]/button[1]') ``` ### Chaining Locators ```typescript // Narrow down to specific section const form = page.getByRole('form', { name: 'Login' }); await form.getByRole('textbox', { name: 'Email' }).fill('user@example.com'); await form.getByRole('button', { name: 'Submit' }).click(); // Filter within a list const productCard = page.getByTestId('product-card') .filter({ hasText: 'Pro Plan' }); await productCard.getByRole('button', { name: 'Buy' }).click(); ``` --- ## Page Object Model ### Base Page ```typescript // e2e/pages/base.page.ts import { Page, Locator } from '@playwright/test'; export abstract class BasePage { constructor(protected page: Page) {} async navigate(path: string = '/') { await this.page.goto(path); } async waitForPageLoad() { await this.page.waitForLoadState('networkidle'); } // Common elements get header() { return this.page.getByRole('banner'); } get footer() { 
return this.page.getByRole('contentinfo'); } // Common actions async clickNavLink(name: string) { await this.header.getByRole('link', { name }).click(); } } ``` ### Page Implementation ```typescript // e2e/pages/login.page.ts import { Page, Locator, expect } from '@playwright/test'; import { BasePage } from './base.page'; export class LoginPage extends BasePage { readonly emailInput: Locator; readonly passwordInput: Locator; readonly submitButton: Locator; readonly errorMessage: Locator; constructor(page: Page) { super(page); this.emailInput = page.getByLabel('Email'); this.passwordInput = page.getByLabel('Password'); this.submitButton = page.getByRole('button', { name: 'Sign in' }); this.errorMessage = page.getByRole('alert'); } async goto() { await this.navigate('/login'); } async login(email: string, password: string) { await this.emailInput.fill(email); await this.passwordInput.fill(password); await this.submitButton.click(); } async expectError(message: string) { await expect(this.errorMessage).toContainText(message); } async expectLoggedIn() { await expect(this.page).toHaveURL(/.*dashboard/); } } ``` ```typescript // e2e/pages/dashboard.page.ts import { Page, Locator, expect } from '@playwright/test'; import { BasePage } from './base.page'; export class DashboardPage extends BasePage { readonly welcomeHeading: Locator; readonly userMenu: Locator; readonly logoutButton: Locator; constructor(page: Page) { super(page); this.welcomeHeading = page.getByRole('heading', { name: /welcome/i }); this.userMenu = page.getByTestId('user-menu'); this.logoutButton = page.getByRole('button', { name: 'Logout' }); } async goto() { await this.navigate('/dashboard'); } async logout() { await this.userMenu.click(); await this.logoutButton.click(); } async expectWelcome(name: string) { await expect(this.welcomeHeading).toContainText(name); } } ``` ### Export All Pages ```typescript // e2e/pages/index.ts export { BasePage } from './base.page'; export { LoginPage } from './login.page';
export { DashboardPage } from './dashboard.page'; ``` --- ## Authentication ### Global Auth Setup ```typescript // e2e/auth.setup.ts import { test as setup, expect } from '@playwright/test'; import path from 'path'; const authFile = path.join(__dirname, '../.auth/user.json'); setup('authenticate', async ({ page }) => { // Go to login page await page.goto('/login'); // Login with test credentials await page.getByLabel('Email').fill(process.env.TEST_USER_EMAIL!); await page.getByLabel('Password').fill(process.env.TEST_USER_PASSWORD!); await page.getByRole('button', { name: 'Sign in' }).click(); // Wait for auth to complete await expect(page).toHaveURL(/.*dashboard/); // Save auth state for reuse await page.context().storageState({ path: authFile }); }); ``` ### Using Auth in Tests ```typescript // playwright.config.ts export default defineConfig({ projects: [ { name: 'setup', testMatch: /.*\.setup\.ts/ }, { name: 'chromium', use: { ...devices['Desktop Chrome'], storageState: '.auth/user.json', }, dependencies: ['setup'], }, ], }); ``` ### Tests Without Auth ```typescript // e2e/tests/public.spec.ts import { test, expect } from '@playwright/test'; // Override to skip auth test.use({ storageState: { cookies: [], origins: [] } }); test('homepage loads for anonymous users', async ({ page }) => { await page.goto('/'); await expect(page.getByRole('heading', { name: 'Welcome' })).toBeVisible(); }); ``` --- ## Writing Tests ### Basic Test Structure ```typescript // e2e/tests/auth.spec.ts import { test, expect } from '@playwright/test'; import { LoginPage } from '../pages'; test.describe('Authentication', () => { test.beforeEach(async ({ page }) => { // Skip stored auth for login tests await page.context().clearCookies(); }); test('successful login redirects to dashboard', async ({ page }) => { const loginPage = new LoginPage(page); await loginPage.goto(); await loginPage.login('user@example.com', 'password123'); await loginPage.expectLoggedIn(); }); test('invalid credentials show
error', async ({ page }) => { const loginPage = new LoginPage(page); await loginPage.goto(); await loginPage.login('wrong@example.com', 'wrongpass'); await loginPage.expectError('Invalid email or password'); }); test('empty form shows validation errors', async ({ page }) => { const loginPage = new LoginPage(page); await loginPage.goto(); await loginPage.submitButton.click(); await expect(page.getByText('Email is required')).toBeVisible(); await expect(page.getByText('Password is required')).toBeVisible(); }); }); ``` ### User Flow Tests ```typescript // e2e/tests/checkout.spec.ts import { test, expect } from '@playwright/test'; test.describe('Checkout Flow', () => { test('complete purchase flow', async ({ page }) => { // 1. Browse products await page.goto('/products'); await page.getByTestId('product-card') .filter({ hasText: 'Pro Plan' }) .getByRole('button', { name: 'Add to cart' }) .click(); // 2. View cart await page.getByRole('link', { name: 'Cart' }).click(); await expect(page.getByText('Pro Plan')).toBeVisible(); await expect(page.getByTestId('cart-total')).toContainText('$29.99'); // 3. Checkout await page.getByRole('button', { name: 'Checkout' }).click(); // 4. Fill payment (use Stripe test card) const stripeFrame = page.frameLocator('iframe[name*="stripe"]'); await stripeFrame.getByPlaceholder('Card number').fill('4242424242424242'); await stripeFrame.getByPlaceholder('MM / YY').fill('12/30'); await stripeFrame.getByPlaceholder('CVC').fill('123'); // 5. Complete purchase await page.getByRole('button', { name: 'Pay now' }).click(); // 6. 
Verify success await expect(page).toHaveURL(/.*success/); await expect(page.getByRole('heading', { name: 'Thank you' })).toBeVisible(); }); }); ``` --- ## Assertions ### Web-First Assertions (Auto-Wait) ```typescript // ✅ These wait and retry automatically await expect(page.getByRole('button')).toBeVisible(); await expect(page.getByRole('button')).toBeEnabled(); await expect(page.getByRole('button')).toHaveText('Submit'); await expect(page).toHaveURL('/dashboard'); await expect(page).toHaveTitle(/Dashboard/); // ❌ Avoid manual waits await page.waitForTimeout(3000); // NEVER do this ``` ### Soft Assertions ```typescript // Continue test even if assertion fails await expect.soft(page.getByTestId('price')).toHaveText('$29.99'); await expect.soft(page.getByTestId('stock')).toHaveText('In Stock'); // Fail at end if any soft assertions failed ``` ### Common Assertions ```typescript // Visibility await expect(locator).toBeVisible(); await expect(locator).toBeHidden(); await expect(locator).toBeAttached(); // Text content await expect(locator).toHaveText('exact text'); await expect(locator).toContainText('partial'); await expect(locator).toHaveValue('input value'); // State await expect(locator).toBeEnabled(); await expect(locator).toBeDisabled(); await expect(locator).toBeChecked(); await expect(locator).toBeFocused(); // Count await expect(locator).toHaveCount(5); // Page await expect(page).toHaveURL('/dashboard'); await expect(page).toHaveTitle('Dashboard | App'); await expect(page).toHaveScreenshot('dashboard.png'); ``` --- ## Mocking & Network ### Mock API Responses ```typescript test('shows error when API fails', async ({ page }) => { // Mock API to return error await page.route('**/api/users', (route) => { route.fulfill({ status: 500, body: JSON.stringify({ error: 'Server error' }), }); }); await page.goto('/users'); await expect(page.getByText('Failed to load users')).toBeVisible(); }); test('displays user data from API', async ({ page }) => { // Mock successful 
response await page.route('**/api/users', (route) => { route.fulfill({ status: 200, contentType: 'application/json', body: JSON.stringify([ { id: 1, name: 'John Doe', email: 'john@example.com' }, { id: 2, name: 'Jane Doe', email: 'jane@example.com' }, ]), }); }); await page.goto('/users'); await expect(page.getByText('John Doe')).toBeVisible(); await expect(page.getByText('Jane Doe')).toBeVisible(); }); ``` ### Wait for API Calls ```typescript test('submits form and shows success', async ({ page }) => { await page.goto('/contact'); // Fill form await page.getByLabel('Name').fill('John'); await page.getByLabel('Email').fill('john@example.com'); await page.getByLabel('Message').fill('Hello!'); // Wait for API call on submit const responsePromise = page.waitForResponse('**/api/contact'); await page.getByRole('button', { name: 'Send' }).click(); const response = await responsePromise; expect(response.status()).toBe(200); await expect(page.getByText('Message sent!')).toBeVisible(); }); ``` --- ## Visual Testing ```typescript // Full page screenshot await expect(page).toHaveScreenshot('homepage.png'); // Element screenshot await expect(page.getByTestId('chart')).toHaveScreenshot('chart.png'); // With options await expect(page).toHaveScreenshot('dashboard.png', { maxDiffPixels: 100, mask: [page.getByTestId('timestamp')], // Ignore dynamic content }); ``` --- ## CI/CD Integration ### GitHub Actions ```yaml # .github/workflows/e2e.yml name: E2E Tests on: push: branches: [main] pull_request: branches: [main] jobs: test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: node-version: 20 cache: 'npm' - name: Install dependencies run: npm ci - name: Install Playwright browsers run: npx playwright install --with-deps chromium - name: Run E2E tests run: npx playwright test --project=chromium env: BASE_URL: ${{ secrets.STAGING_URL }} TEST_USER_EMAIL: ${{ secrets.TEST_USER_EMAIL }} TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - 
uses: actions/upload-artifact@v4 if: failure() with: name: playwright-report path: playwright-report/ retention-days: 7 ``` ### Run Specific Tests ```bash # Run all tests npx playwright test # Run specific file npx playwright test e2e/tests/auth.spec.ts # Run tests with tag npx playwright test --grep @critical # Run in headed mode (debug) npx playwright test --headed # Run specific browser npx playwright test --project=chromium # Debug mode npx playwright test --debug # Show HTML report npx playwright show-report ``` --- ## Test Data ### Factories ```typescript // e2e/utils/test-data.ts import { faker } from '@faker-js/faker'; export const createUser = (overrides = {}) => ({ email: faker.internet.email(), password: faker.internet.password({ length: 12 }), name: faker.person.fullName(), ...overrides, }); export const createProduct = (overrides = {}) => ({ name: faker.commerce.productName(), price: faker.commerce.price({ min: 10, max: 100 }), description: faker.commerce.productDescription(), ...overrides, }); ``` ### Environment Variables ```bash # .env.test BASE_URL=http://localhost:3000 TEST_USER_EMAIL=test@example.com TEST_USER_PASSWORD=testpassword123 ``` --- ## Debugging ### Trace Viewer ```typescript // Enable in config for failures use: { trace: 'on-first-retry', } // View traces npx playwright show-trace trace.zip ``` ### Debug Mode ```bash # Step through test npx playwright test --debug # Pause at specific point await page.pause(); // In test code ``` ### VS Code Extension Install "Playwright Test for VS Code" for: - Run tests from editor - Debug with breakpoints - Pick locators visually - Watch mode --- ## Dead Link Detection (REQUIRED) **Every project MUST include dead link detection tests.** Run these on every deployment. 
### Link Validator Test ```typescript // e2e/tests/links.spec.ts import { test, expect } from '@playwright/test'; const PAGES_TO_CHECK = ['/', '/about', '/pricing', '/blog', '/contact']; test.describe('Dead Link Detection', () => { for (const pagePath of PAGES_TO_CHECK) { test(`no dead links on ${pagePath}`, async ({ page, request }) => { await page.goto(pagePath); // Get all links on the page const links = await page.locator('a[href]').all(); const hrefs = await Promise.all( links.map(link => link.getAttribute('href')) ); // De-duplicate hrefs and drop empty values const uniqueLinks = [...new Set(hrefs.filter(Boolean))] as string[]; for (const href of uniqueLinks) { // Skip mailto, tel, and anchor links if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('#')) { continue; } // Build full URL const url = href.startsWith('http') ? href : new URL(href, page.url()).href; // Check link status const response = await request.get(url, { timeout: 10000, ignoreHTTPSErrors: true, }); expect( response.ok(), `Dead link found on ${pagePath}: ${href} returned ${response.status()}` ).toBeTruthy(); } }); } }); ``` ### Comprehensive Link Crawler ```typescript // e2e/tests/site-links.spec.ts import { test, expect, Page, APIRequestContext } from '@playwright/test'; interface LinkResult { url: string; status: number; foundOn: string; } async function checkAllLinks( page: Page, request: APIRequestContext, startUrl: string ): Promise<LinkResult[]> { const visited = new Set<string>(); const results: LinkResult[] = []; const toVisit = [startUrl]; const baseUrl = new URL(startUrl).origin; while (toVisit.length > 0) { const currentUrl = toVisit.pop()!; if (visited.has(currentUrl)) continue; visited.add(currentUrl); try { await page.goto(currentUrl); const links = await page.locator('a[href]').all(); for (const link of links) { const href = await link.getAttribute('href'); if (!href || href.startsWith('#') || href.startsWith('mailto:') ||
href.startsWith('tel:')) { continue; } const fullUrl = href.startsWith('http') ? href : new URL(href, currentUrl).href; // Check link const response = await request.get(fullUrl, { timeout: 10000, ignoreHTTPSErrors: true, }); results.push({ url: fullUrl, status: response.status(), foundOn: currentUrl, }); // Add internal links to queue if (fullUrl.startsWith(baseUrl) && !visited.has(fullUrl)) { toVisit.push(fullUrl); } } } catch (error) { results.push({ url: currentUrl, status: 0, foundOn: 'navigation', }); } } return results; } test('no dead links on entire site', async ({ page, request, baseURL }) => { const results = await checkAllLinks(page, request, baseURL!); const deadLinks = results.filter(r => r.status >= 400 || r.status === 0); if (deadLinks.length > 0) { console.error('Dead links found:'); deadLinks.forEach(link => { console.error(` ${link.url} (${link.status}) - found on ${link.foundOn}`); }); } expect(deadLinks, `Found ${deadLinks.length} dead links`).toHaveLength(0); }); ``` ### Image Link Validation ```typescript // e2e/tests/images.spec.ts import { test, expect } from '@playwright/test'; test('no broken images on homepage', async ({ page, request }) => { await page.goto('/'); const images = await page.locator('img[src]').all(); for (const img of images) { const src = await img.getAttribute('src'); if (!src) continue; const url = src.startsWith('http') ? 
src : new URL(src, page.url()).href; // Skip data URLs if (url.startsWith('data:')) continue; const response = await request.get(url); expect( response.ok(), `Broken image: ${src}` ).toBeTruthy(); // Verify it's actually an image const contentType = response.headers()['content-type']; expect( contentType?.startsWith('image/'), `${src} is not an image (${contentType})` ).toBeTruthy(); } }); ``` ### CI Integration for Link Checking ```yaml # .github/workflows/link-check.yml name: Link Check on: schedule: - cron: '0 6 * * 1' # Weekly on Monday push: branches: [main] jobs: link-check: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 with: node-version: 20 - run: npm ci - run: npx playwright install chromium - run: npx playwright test e2e/tests/links.spec.ts --project=chromium env: BASE_URL: ${{ secrets.PRODUCTION_URL }} ``` --- ## Anti-Patterns - **Hardcoded waits** - Use auto-waiting assertions instead - **CSS/XPath selectors** - Use role/text/testid locators - **Testing third-party sites** - Mock external dependencies - **Shared state between tests** - Each test must be isolated - **Missing awaits** - Use ESLint rule `no-floating-promises` - **Flaky time-based tests** - Mock dates/times - **Testing implementation details** - Test user-visible behavior - **Huge test files** - Split by feature/page --- ## Quick Reference ```bash # Install npm init playwright@latest # Run tests npx playwright test npx playwright test --headed npx playwright test --project=chromium npx playwright test --grep @smoke # Debug npx playwright test --debug npx playwright show-report npx playwright show-trace trace.zip # Generate tests npx playwright codegen localhost:3000 ``` ### Package.json Scripts ```json { "scripts": { "test:e2e": "playwright test", "test:e2e:headed": "playwright test --headed", "test:e2e:debug": "playwright test --debug", "test:e2e:report": "playwright show-report", "test:e2e:codegen": "playwright codegen" } } ``` 
================================================ FILE: skills/polyphony/SKILL.md ================================================ --- name: polyphony description: Multi-agent orchestration with container-isolated workspaces — each agent session runs in its own Docker container with independent git branches when-to-use: Always loaded when container isolation is available (Docker/OrbStack installed). Default for /spawn-team. user-invocable: false effort: high --- # Polyphony — Multi-Agent Orchestration Container-isolated workspaces for parallel agent execution. Each agent gets its own Docker container with a full git clone on its own branch. No conflicts, independent tests, clean PRs. --- ## Architecture (6 Layers) 1. **Work Source** — Tasks from GitHub Issues (`gh api`) or local SQLite queue 2. **Orchestrator** — Supervisor loop: discover -> claim -> route -> provision -> run -> verify -> land 3. **Router** — Pure function: Task x Policy -> RunSpec (5-dimension complexity scoring) 4. **Identity Broker** — Resolves named credentials to volume mounts + env overlays 5. **Workspace Manager** — Per-task `git clone --reference`, branch checkout, cleanup 6. 
**Worker Runtime** — Docker container create/start/stop/logs lifecycle --- ## Task Lifecycle ``` DISCOVERED -> CLAIMED -> ROUTED -> PROVISIONED -> RUNNING -> VERIFYING -> LANDED | | v v FAILED --> BLOCKED | v CLAIMED (retry) ``` --- ## Prerequisites - Docker or OrbStack installed and running - At least one agent CLI available (Claude, Codex, or Kimi) - CLI subscriptions configured (not API keys) Check: ```bash command -v docker &>/dev/null || command -v orbctl &>/dev/null ``` --- ## Configuration All config lives in `~/.polyphony/`: | File | Purpose | |------|---------| | `config.yaml` | Workspace root, poll interval, max concurrency | | `identities.yaml` | Named credential bundles with volume paths | | `agents.yaml` | Agent profiles (CLI commands, strengths) | | `routing.yaml` | Routing rules and fallback chains | Initialize with: `polyphony init` --- ## Routing Rules Rules are evaluated top-down; first match wins. Each rule has a `match` predicate and an `agent` target. ```yaml rules: - match: { task_type: docs, risk: low } agent: kimi - match: { task_type: bugfix } agent: codex - match: { risk: high } agent: claude default: agent: claude fallback: [codex, kimi] ``` --- ## Complexity Scoring (5 Dimensions) Each dimension scores 0-2. Total 0-10. 
| Dimension | Source | |-----------|--------| | Cyclomatic depth | LOC + scope size | | Fan-out | Number of callers | | Security boundary | Auth/PII keywords | | Concurrency | Lock/transaction keywords | | Domain invariants | Risk level + task type | Routing thresholds: - **0-3**: Delegate to Kimi solo - **4-6**: Kimi + Codex review - **7-10**: Claude direct --- ## Container Isolation Each task gets: - Its own Docker container from `polyphony-worker:latest` - A full git clone at `/workspace` (not a worktree) - Auth volumes mounted read-only (e.g., `~/.claude:/home/worker/.claude:ro`) - Independent test execution - Its own branch for PRs --- ## CLI Commands ```bash polyphony init # Create ~/.polyphony/ with config templates polyphony spawn "Fix auth bug" # Create and route a task polyphony status # Show task states polyphony cleanup # Remove completed workspaces ``` --- ## Integration with Existing Skills - **cross-agent-delegation**: Uses Polyphony's complexity scoring for routing decisions - **agent-teams**: Uses Polyphony's workspace isolation instead of shared directories - **spawn-team**: Uses Polyphony's container provisioning for feature agents ================================================ FILE: skills/posthog-analytics/SKILL.md ================================================ --- name: posthog-analytics description: PostHog analytics, event tracking, feature flags, dashboards when-to-use: When adding analytics, feature flags, or event tracking with PostHog user-invocable: false effort: medium --- # PostHog Analytics Skill For implementing product analytics with PostHog - event tracking, user identification, feature flags, and project-specific dashboards. 
**Sources:** [PostHog Docs](https://posthog.com/docs) | [Product Analytics](https://posthog.com/docs/product-analytics) | [Feature Flags](https://posthog.com/docs/feature-flags) --- ## Philosophy **Measure what matters, not everything.** Analytics should answer specific questions: - Are users getting value? (activation, retention) - Where do users struggle? (funnels, drop-offs) - What features drive engagement? (feature usage) - Is the product growing? (acquisition, referrals) Don't track everything. Track what informs decisions. --- ## Installation ### Next.js (App Router) ```bash npm install posthog-js ``` ```typescript // lib/posthog.ts import posthog from 'posthog-js'; export function initPostHog() { if (typeof window !== 'undefined' && !posthog.__loaded) { posthog.init(process.env.NEXT_PUBLIC_POSTHOG_KEY!, { api_host: process.env.NEXT_PUBLIC_POSTHOG_HOST || 'https://us.i.posthog.com', person_profiles: 'identified_only', // Only create profiles for identified users capture_pageview: false, // We'll handle this manually for SPA capture_pageleave: true, loaded: (posthog) => { if (process.env.NODE_ENV === 'development') { posthog.debug(); } }, }); } return posthog; } export { posthog }; ``` ```typescript // app/providers.tsx 'use client'; import { useEffect } from 'react'; import { usePathname, useSearchParams } from 'next/navigation'; import { initPostHog, posthog } from '@/lib/posthog'; export function PostHogProvider({ children }: { children: React.ReactNode }) { const pathname = usePathname(); const searchParams = useSearchParams(); useEffect(() => { initPostHog(); }, []); // Track pageviews useEffect(() => { if (pathname) { let url = window.origin + pathname; if (searchParams.toString()) { url += `?${searchParams.toString()}`; } posthog.capture('$pageview', { $current_url: url }); } }, [pathname, searchParams]); return <>{children}</>; } ``` ```typescript // app/layout.tsx import { PostHogProvider } from './providers'; export default function RootLayout({ 
children }: { children: React.ReactNode }) { return ( <html lang="en"> <body> <PostHogProvider> {children} </PostHogProvider> </body> </html> ); } ``` ### React (Vite/CRA) ```typescript // src/posthog.ts import posthog from 'posthog-js'; posthog.init(import.meta.env.VITE_POSTHOG_KEY, { api_host: import.meta.env.VITE_POSTHOG_HOST || 'https://us.i.posthog.com', person_profiles: 'identified_only', }); export { posthog }; ``` ```typescript // src/main.tsx import { PostHogProvider } from 'posthog-js/react'; import { posthog } from './posthog'; ReactDOM.createRoot(document.getElementById('root')!).render( <PostHogProvider client={posthog}> <App /> </PostHogProvider> ); ``` ### Python (FastAPI/Flask) ```bash pip install posthog ``` ```python # analytics/posthog_client.py import os import posthog from functools import lru_cache @lru_cache() def get_posthog(): posthog.project_api_key = os.environ["POSTHOG_API_KEY"] posthog.host = os.environ.get("POSTHOG_HOST", "https://us.i.posthog.com") posthog.debug = os.environ.get("ENV") == "development" return posthog # Usage def track_event(user_id: str, event: str, properties: dict = None): ph = get_posthog() ph.capture( distinct_id=user_id, event=event, properties=properties or {} ) def identify_user(user_id: str, properties: dict): ph = get_posthog() ph.identify(user_id, properties) ``` ### Node.js (Express/Hono) ```bash npm install posthog-node ``` ```typescript // lib/posthog.ts import { PostHog } from 'posthog-node'; const posthog = new PostHog(process.env.POSTHOG_API_KEY!, { host: process.env.POSTHOG_HOST || 'https://us.i.posthog.com', }); // Flush on shutdown process.on('SIGTERM', () => posthog.shutdown()); export { posthog }; // Usage export function trackEvent(userId: string, event: string, properties?: Record<string, any>) { posthog.capture({ distinctId: userId, event, properties, }); } export function identifyUser(userId: string, properties: Record<string, any>) { posthog.identify({ distinctId: userId, properties, }); } ``` --- ##
Environment Variables ```bash # .env.local (Next.js) - SAFE: These are meant to be public NEXT_PUBLIC_POSTHOG_KEY=phc_xxxxxxxxxxxxxxxxxxxx NEXT_PUBLIC_POSTHOG_HOST=https://us.i.posthog.com # .env (Backend) - Keep private POSTHOG_API_KEY=phc_xxxxxxxxxxxxxxxxxxxx POSTHOG_HOST=https://us.i.posthog.com ``` Add to `credentials.md` patterns: ```python 'POSTHOG_API_KEY': r'phc_[A-Za-z0-9]+', ``` --- ## User Identification ### When to Identify ```typescript // Identify on signup async function handleSignup(email: string, name: string) { const user = await createUser(email, name); posthog.identify(user.id, { email: user.email, name: user.name, created_at: user.createdAt, plan: 'free', }); posthog.capture('user_signed_up', { signup_method: 'email', }); } // Identify on login async function handleLogin(email: string) { const user = await authenticateUser(email); posthog.identify(user.id, { email: user.email, name: user.name, plan: user.plan, last_login: new Date().toISOString(), }); posthog.capture('user_logged_in'); } // Reset on logout function handleLogout() { posthog.capture('user_logged_out'); posthog.reset(); // Clears identity } ``` ### User Properties ```typescript // Standard properties to track interface UserProperties { // Identity email: string; name: string; // Lifecycle created_at: string; plan: 'free' | 'pro' | 'enterprise'; // Engagement onboarding_completed: boolean; feature_count: number; // Business company_name?: string; company_size?: string; industry?: string; } // Update properties when they change posthog.capture('$set', { $set: { plan: 'pro' }, }); ``` --- ## Event Tracking Patterns ### Event Naming Convention ```typescript // Format: [object]_[action] // Use snake_case, past tense for actions // ✅ Good event names 'user_signed_up' 'feature_created' 'subscription_upgraded' 'onboarding_completed' 'invite_sent' 'file_uploaded' 'search_performed' 'checkout_started' 'payment_completed' // ❌ Bad event names 'click' // Too vague 'ButtonClick' // Not 
snake_case 'user signup' // Spaces 'creatingFeature' // Not past tense ``` ### Core Events by Category ```typescript // === AUTHENTICATION === posthog.capture('user_signed_up', { signup_method: 'google' | 'email' | 'github', referral_source: 'organic' | 'paid' | 'referral', }); posthog.capture('user_logged_in', { login_method: 'google' | 'email' | 'magic_link', }); posthog.capture('user_logged_out'); posthog.capture('password_reset_requested'); // === ONBOARDING === posthog.capture('onboarding_started'); posthog.capture('onboarding_step_completed', { step_name: 'profile' | 'preferences' | 'first_action', step_number: 1, total_steps: 3, }); posthog.capture('onboarding_completed', { duration_seconds: 120, steps_skipped: 0, }); posthog.capture('onboarding_skipped', { skipped_at_step: 2, }); // === FEATURE USAGE === posthog.capture('feature_used', { feature_name: 'export' | 'share' | 'duplicate', context: 'dashboard' | 'editor', }); posthog.capture('[resource]_created', { resource_type: 'project' | 'document' | 'team', // Resource-specific properties }); posthog.capture('[resource]_updated', { resource_type: 'project', fields_changed: ['name', 'description'], }); posthog.capture('[resource]_deleted', { resource_type: 'project', }); // === BILLING === posthog.capture('pricing_page_viewed', { current_plan: 'free', }); posthog.capture('checkout_started', { plan: 'pro', billing_period: 'monthly' | 'annual', price: 29, }); posthog.capture('subscription_upgraded', { from_plan: 'free', to_plan: 'pro', mrr_change: 29, }); posthog.capture('subscription_downgraded', { from_plan: 'pro', to_plan: 'free', reason: 'too_expensive' | 'missing_features' | 'not_using', }); posthog.capture('subscription_cancelled', { plan: 'pro', reason: 'string', feedback: 'string', }); // === ERRORS === posthog.capture('error_occurred', { error_type: 'api_error' | 'validation_error' | 'network_error', error_message: 'string', error_code: 'string', page: '/dashboard', }); ``` ### React Hook for Tracking 
```typescript // hooks/useTrack.ts import { useCallback } from 'react'; import { posthog } from '@/lib/posthog'; export function useTrack() { const track = useCallback((event: string, properties?: Record<string, any>) => { posthog.capture(event, { ...properties, timestamp: new Date().toISOString(), }); }, []); return { track }; } // Usage function CreateProjectButton() { const { track } = useTrack(); const handleCreate = async () => { track('project_creation_started'); try { const project = await createProject(); track('project_created', { project_id: project.id, template_used: project.template, }); } catch (error) { track('project_creation_failed', { error_message: error.message, }); } }; return <button onClick={handleCreate}>Create Project</button>; } ``` --- ## Feature Flags ### Setup ```typescript // Check feature flag (client-side) import { useFeatureFlagEnabled } from 'posthog-js/react'; function NewFeature() { const showNewUI = useFeatureFlagEnabled('new-dashboard-ui'); if (showNewUI) { return <NewDashboard />; } return <OldDashboard />; } // With payload import { useFeatureFlagPayload } from 'posthog-js/react'; function PricingPage() { const pricingConfig = useFeatureFlagPayload('pricing-experiment'); // pricingConfig = { price: 29, showAnnual: true } return <Pricing config={pricingConfig} />; } ``` ### Server-Side (Next.js) ```typescript // app/dashboard/page.tsx import { PostHog } from 'posthog-node'; import { cookies } from 'next/headers'; async function getFeatureFlags(userId: string) { const posthog = new PostHog(process.env.POSTHOG_API_KEY!); const flags = await posthog.getAllFlags(userId); await posthog.shutdown(); return flags; } export default async function Dashboard() { const cookieStore = cookies(); const userId = cookieStore.get('user_id')?.value; const flags = await getFeatureFlags(userId); return ( <div> {flags['new-dashboard'] && <NewFeature />} </div> ); } ``` ### A/B Testing ```typescript // Track experiment exposure function 
ExperimentComponent() { const variant = useFeatureFlagEnabled('checkout-experiment'); useEffect(() => { posthog.capture('experiment_viewed', { experiment: 'checkout-experiment', variant: variant ? 'test' : 'control', }); }, [variant]); return variant ? <NewCheckout /> : <OldCheckout />; } ``` --- ## Project-Specific Dashboards ### SaaS Product ```markdown ## Essential SaaS Dashboards ### 1. Acquisition Dashboard **Questions answered:** Where do users come from? What converts? Insights to create: - [ ] Signups by source (daily/weekly trend) - [ ] Signup conversion rate by landing page - [ ] Time from first visit to signup - [ ] Signup funnel: Visit → Signup Page → Form Start → Complete ### 2. Activation Dashboard **Questions answered:** Are new users getting value? Insights to create: - [ ] Onboarding completion rate - [ ] Time to first key action - [ ] Activation rate (% reaching "aha moment" in first 7 days) - [ ] Drop-off by onboarding step - [ ] Feature adoption in first session ### 3. Engagement Dashboard **Questions answered:** How are users using the product? Insights to create: - [ ] DAU/WAU/MAU trends - [ ] Feature usage heatmap - [ ] Session duration distribution - [ ] Actions per session - [ ] Power users vs casual users ### 4. Retention Dashboard **Questions answered:** Are users coming back? Insights to create: - [ ] Retention cohorts (D1, D7, D30) - [ ] Churn rate by plan - [ ] Reactivation rate - [ ] Last action before churn - [ ] Features correlated with retention ### 5. Revenue Dashboard **Questions answered:** Is the business growing? Insights to create: - [ ] MRR trend - [ ] Upgrades vs downgrades - [ ] Trial to paid conversion - [ ] Revenue by plan - [ ] LTV by acquisition source ``` ### E-Commerce ```markdown ## Essential E-Commerce Dashboards ### 1. 
Conversion Funnel Insights to create: - [ ] Full funnel: Browse → PDP → Add to Cart → Checkout → Purchase - [ ] Cart abandonment rate - [ ] Checkout drop-off by step - [ ] Payment failure rate ### 2. Product Performance Insights to create: - [ ] Product views → purchases (by product) - [ ] Add to cart rate by category - [ ] Search → purchase correlation - [ ] Cross-sell effectiveness ### 3. Customer Dashboard Insights to create: - [ ] Repeat purchase rate - [ ] Average order value trend - [ ] Customer lifetime value - [ ] Purchase frequency distribution ``` ### Content/Media ```markdown ## Essential Content Dashboards ### 1. Consumption Dashboard Insights to create: - [ ] Content views by type - [ ] Read/watch completion rate - [ ] Time on content - [ ] Scroll depth distribution ### 2. Engagement Dashboard Insights to create: - [ ] Shares by content - [ ] Comments per article - [ ] Save/bookmark rate - [ ] Return visits to same content ### 3. Growth Dashboard Insights to create: - [ ] New vs returning visitors - [ ] Email signup rate - [ ] Referral traffic sources ``` ### AI/LLM Application ```markdown ## Essential AI App Dashboards ### 1. Usage Dashboard Insights to create: - [ ] Queries per user per day - [ ] Token usage distribution - [ ] Response time p50/p95 - [ ] Error rate by query type ### 2. Quality Dashboard Insights to create: - [ ] User feedback (thumbs up/down) - [ ] Regeneration rate (user asked for new response) - [ ] Edit rate (user modified AI output) - [ ] Follow-up query rate ### 3. Cost Dashboard Insights to create: - [ ] Token cost per user - [ ] Cost by model - [ ] Cost by feature - [ ] Efficiency trends (value/cost) ``` --- ## Creating Dashboards ### Using PostHog MCP ```markdown When setting up analytics for a project: 1. First, check existing dashboards: - Use `dashboards-get-all` to list current dashboards 2. Create project-appropriate dashboards: - Use `dashboard-create` with descriptive name 3. 
Create insights for each dashboard: - Use `query-run` to test queries - Use `insight-create-from-query` to save - Use `add-insight-to-dashboard` to organize 4. Set up key funnels: - Signup funnel - Onboarding funnel - Purchase/conversion funnel ``` ### Dashboard Creation Workflow ```typescript // Example: Creating SaaS dashboards via MCP // 1. Create dashboard const dashboard = await mcp_posthog_dashboard_create({ name: "Activation Metrics", description: "Track new user activation and onboarding", tags: ["saas", "activation"], }); // 2. Create insights const signupFunnel = await mcp_posthog_query_run({ query: { kind: "InsightVizNode", source: { kind: "FunnelsQuery", series: [ { kind: "EventsNode", event: "user_signed_up", name: "Signed Up" }, { kind: "EventsNode", event: "onboarding_started", name: "Started Onboarding" }, { kind: "EventsNode", event: "onboarding_completed", name: "Completed Onboarding" }, { kind: "EventsNode", event: "first_value_action", name: "First Value" }, ], dateRange: { date_from: "-30d" }, }, }, }); // 3. 
Save and add to dashboard const insight = await mcp_posthog_insight_create_from_query({ name: "Signup to Activation Funnel", query: signupFunnel.query, favorited: true, }); await mcp_posthog_add_insight_to_dashboard({ insightId: insight.id, dashboardId: dashboard.id, }); ``` --- ## Privacy & Compliance ### GDPR Compliance ```typescript // Opt-out handling export function handleCookieConsent(consent: boolean) { if (consent) { posthog.opt_in_capturing(); } else { posthog.opt_out_capturing(); } } // Check consent status const hasConsent = posthog.has_opted_in_capturing(); // Initialize with consent check posthog.init(key, { opt_out_capturing_by_default: true, // Require explicit opt-in respect_dnt: true, // Respect Do Not Track }); ``` ### Data to Never Track ```typescript // ❌ NEVER track these posthog.capture('event', { password: '...', // Credentials credit_card: '...', // Payment info ssn: '...', // Government IDs medical_info: '...', // Health data full_address: '...', // Detailed location }); // ✅ OK to track posthog.capture('event', { country: 'US', // General location plan: 'pro', // Product info feature_used: 'export', // Usage }); ``` ### Property Sanitization ```typescript // lib/analytics.ts const SENSITIVE_KEYS = ['password', 'token', 'secret', 'credit', 'ssn']; function sanitizeProperties(props: Record<string, any>): Record<string, any> { return Object.fromEntries( Object.entries(props).filter(([key]) => !SENSITIVE_KEYS.some(sensitive => key.toLowerCase().includes(sensitive)) ) ); } export function safeCapture(event: string, properties?: Record<string, any>) { posthog.capture(event, sanitizeProperties(properties || {})); } ``` --- ## Testing Analytics ### Development Mode ```typescript // Disable in development if (process.env.NODE_ENV === 'development') { posthog.opt_out_capturing(); // Or use debug mode posthog.debug(); } ``` ### E2E Testing ```typescript // playwright/fixtures.ts import { test as base } from '@playwright/test'; export const test = 
base.extend({ page: async ({ page }, use) => { // Mock PostHog to capture events await page.addInitScript(() => { window.capturedEvents = []; window.posthog = { capture: (event, props) => { window.capturedEvents.push({ event, props }); }, identify: () => {}, reset: () => {}, }; }); await use(page); }, }); // In tests test('tracks signup event', async ({ page }) => { await page.goto('/signup'); await page.fill('[name=email]', 'test@example.com'); await page.click('button[type=submit]'); const events = await page.evaluate(() => window.capturedEvents); expect(events).toContainEqual({ event: 'user_signed_up', props: expect.objectContaining({ signup_method: 'email' }), }); }); ``` --- ## Debugging ### PostHog Toolbar ```typescript // Enable toolbar for debugging posthog.init(key, { // ... loaded: (posthog) => { if (process.env.NODE_ENV === 'development') { posthog.debug(); // Toolbar available via PostHog dashboard } }, }); ``` ### Event Debugging ```typescript // Log all events in development posthog.init(key, { _onCapture: (eventName, eventData) => { if (process.env.NODE_ENV === 'development') { console.log('PostHog Event:', eventName, eventData); } }, }); ``` --- ## Quick Reference ### Event Checklist by User Lifecycle ```markdown ## Must-Track Events ### Acquisition - [ ] `page_viewed` (automatic with capture_pageview) - [ ] `user_signed_up` - [ ] `user_logged_in` ### Activation - [ ] `onboarding_started` - [ ] `onboarding_step_completed` - [ ] `onboarding_completed` - [ ] `first_[key_action]` (your "aha moment") ### Engagement - [ ] `[feature]_used` - [ ] `[resource]_created` - [ ] `search_performed` - [ ] `invite_sent` ### Revenue - [ ] `pricing_page_viewed` - [ ] `checkout_started` - [ ] `subscription_upgraded` - [ ] `subscription_cancelled` ### Retention - [ ] `session_started` - [ ] `feature_[x]_used` (power features) ``` ### Dashboard Templates | Project Type | Key Dashboards | |--------------|----------------| | **SaaS** | Acquisition, Activation, Engagement, 
Retention, Revenue | | **E-Commerce** | Conversion Funnel, Product Performance, Customer LTV | | **Content** | Consumption, Engagement, Growth | | **AI/LLM** | Usage, Quality, Cost | | **Mobile App** | Installs, Onboarding, DAU/MAU, Crashes | ### Properties to Always Include ```typescript // Auto-enriched by PostHog $current_url $browser $device_type $os // Add these yourself user_plan // 'free' | 'pro' | 'enterprise' user_role // 'admin' | 'member' company_id // For B2B feature_context // Where in the app ``` ================================================ FILE: skills/project-tooling/SKILL.md ================================================ --- name: project-tooling description: gh, vercel, supabase, render CLI and deployment platform setup when-to-use: When setting up deployment, CI/CD, or when CLI tools are needed user-invocable: false effort: low --- # Project Tooling Skill Standard CLI tools for project infrastructure management. --- ## Required CLI Tools Before starting any project, verify these tools are installed and authenticated: ### 1. GitHub CLI (gh) ```bash # Verify installation gh --version # Verify authentication gh auth status # If not authenticated: gh auth login ``` ### 2. Vercel CLI ```bash # Verify installation vercel --version # Verify authentication vercel whoami # If not authenticated: vercel login ``` ### 3. Supabase CLI ```bash # Verify installation supabase --version # Verify authentication (check if linked to a project or logged in) supabase projects list # If not authenticated: supabase login ``` ### 4. Render CLI (optional - for Render deployments) ```bash # Verify installation render --version # If using Render API instead: # Ensure RENDER_API_KEY is set in environment ``` --- ## Validation Script Run this at project initialization to verify all tools: ```bash #!/bin/bash # scripts/verify-tooling.sh set -e echo "Verifying project tooling..." 
# GitHub CLI if command -v gh &> /dev/null; then if gh auth status &> /dev/null; then echo "✓ GitHub CLI authenticated" else echo "✗ GitHub CLI not authenticated. Run: gh auth login" exit 1 fi else echo "✗ GitHub CLI not installed. Run: brew install gh" exit 1 fi # Vercel CLI if command -v vercel &> /dev/null; then if vercel whoami &> /dev/null; then echo "✓ Vercel CLI authenticated" else echo "✗ Vercel CLI not authenticated. Run: vercel login" exit 1 fi else echo "✗ Vercel CLI not installed. Run: npm i -g vercel" exit 1 fi # Supabase CLI if command -v supabase &> /dev/null; then if supabase projects list &> /dev/null; then echo "✓ Supabase CLI authenticated" else echo "✗ Supabase CLI not authenticated. Run: supabase login" exit 1 fi else echo "✗ Supabase CLI not installed. Run: brew install supabase/tap/supabase" exit 1 fi echo "" echo "All tools verified!" ``` --- ## GitHub Repository Setup ### Create New Repository ```bash # Create and push in one command gh repo create <repo-name> --private --source=. --remote=origin --push # Or public: gh repo create <repo-name> --public --source=. 
--remote=origin --push ``` ### Connect Existing Repository ```bash # If repo exists on GitHub but not linked locally gh repo clone <owner>/<repo> # Or add remote to existing local project git remote add origin https://github.com/<owner>/<repo>.git git push -u origin main ``` ### Repository Settings ```bash # Enable branch protection on main gh api repos/{owner}/{repo}/branches/main/protection -X PUT \ -F "required_status_checks[strict]=true" \ -f "required_status_checks[contexts][]=quality" \ -F enforce_admins=false \ -F "required_pull_request_reviews[required_approving_review_count]=1" \ -F restrictions=null # Set default branch gh repo edit --default-branch main ``` --- ## Vercel Deployment ### Link Project ```bash # Link current directory to Vercel project vercel link # Or create new project vercel ``` ### Environment Variables ```bash # Add environment variable vercel env add ANTHROPIC_API_KEY production # Pull env vars to local .env vercel env pull .env.local ``` ### Deploy ```bash # Deploy to preview vercel # Deploy to production vercel --prod ``` --- ## Supabase Setup ### Create New Project ```bash # Create project (interactive) supabase projects create <project-name> --org-id <org-id> # Link local to remote supabase link --project-ref <project-ref> ``` ### Local Development ```bash # Start local Supabase supabase start # Stop local Supabase supabase stop # Reset database (apply all migrations fresh) supabase db reset ``` ### Migrations ```bash # Create new migration supabase migration new <migration-name> # Apply migrations to remote supabase db push # Pull remote schema to local supabase db pull ``` ### Generate Types ```bash # Generate TypeScript types from schema supabase gen types typescript --local > src/types/database.ts # Or from remote supabase gen types typescript --project-id <ref> > src/types/database.ts ``` --- ## Render Setup (API-based) ### Environment ```bash # Set API key export RENDER_API_KEY=<your-api-key> ``` ### Common Operations via API ```bash # List services curl -H
"Authorization: Bearer $RENDER_API_KEY" \ https://api.render.com/v1/services # Trigger deploy curl -X POST -H "Authorization: Bearer $RENDER_API_KEY" \ https://api.render.com/v1/services/<service-id>/deploys # Get deploy status curl -H "Authorization: Bearer $RENDER_API_KEY" \ https://api.render.com/v1/services/<service-id>/deploys/<deploy-id> ``` --- ## Package.json Scripts Add these scripts for common operations: ```json { "scripts": { "verify-tools": "./scripts/verify-tooling.sh", "deploy:preview": "vercel", "deploy:prod": "vercel --prod", "db:start": "supabase start", "db:stop": "supabase stop", "db:reset": "supabase db reset", "db:migrate": "supabase db push", "db:types": "supabase gen types typescript --local > src/types/database.ts" } } ``` --- ## CI/CD Integration ### GitHub Actions with Vercel ```yaml # .github/workflows/deploy.yml name: Deploy on: push: branches: [main] pull_request: branches: [main] jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Deploy to Vercel uses: amondnet/vercel-action@v25 with: vercel-token: ${{ secrets.VERCEL_TOKEN }} vercel-org-id: ${{ secrets.VERCEL_ORG_ID }} vercel-project-id: ${{ secrets.VERCEL_PROJECT_ID }} vercel-args: ${{ github.ref == 'refs/heads/main' && '--prod' || '' }} ``` ### GitHub Actions with Supabase ```yaml # .github/workflows/migrate.yml name: Migrate Database on: push: branches: [main] paths: - 'supabase/migrations/**' jobs: migrate: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup Supabase CLI uses: supabase/setup-cli@v1 with: version: latest - name: Push migrations run: supabase db push env: SUPABASE_ACCESS_TOKEN: ${{ secrets.SUPABASE_ACCESS_TOKEN }} SUPABASE_DB_PASSWORD: ${{ secrets.SUPABASE_DB_PASSWORD }} ``` --- ## Deployment Platform Setup **REQUIRED**: When initializing a project, always create todos for deployment platform connection based on the stack. 
### Platform Selection by Stack | Stack | Default Platform | Action Required | |-------|-----------------|-----------------| | Next.js / Node.js | **Vercel** | Connect Git repo to Vercel | | Python (FastAPI, Flask) | **Render** | Connect Git repo to Render, get API key | | Static sites | **Vercel** or **Cloudflare Pages** | Connect Git repo | ### Vercel: Connect Git Repository When Vercel is the deployment platform, create this todo: ``` TODO: Connect Git repository to Vercel for automatic deployments ``` Steps: ```bash # Option 1: Via CLI vercel link vercel git connect # Option 2: Via Dashboard (recommended for first setup) # 1. Go to vercel.com/new # 2. Import Git repository # 3. Configure project settings # 4. Deploy ``` After connecting: - Push to `main` → Production deploy - Push to other branches → Preview deploy - PRs get deploy previews automatically ### Render: Connect Git Repository (Python) When Render is the deployment platform for Python projects: **Step 1: Ask user for Render API key** ``` Before proceeding, please provide your Render API key. Get it from: https://dashboard.render.com/u/settings/api-keys Store it securely - we'll add it to your environment. ``` **Step 2: Create todos** ``` TODO: Get Render API key from user TODO: Connect Git repository to Render TODO: Configure Render service (web service or background worker) TODO: Set environment variables on Render ``` **Step 3: Connect via Dashboard (recommended)** ```bash # 1. Go to dashboard.render.com/create # 2. Select "Web Service" for APIs, "Background Worker" for async # 3. Connect your GitHub/GitLab repository # 4. 
Configure: # - Name: <project-name> # - Runtime: Python 3 # - Build Command: pip install -r requirements.txt # - Start Command: uvicorn main:app --host 0.0.0.0 --port $PORT ``` **Step 4: Store API key for CI/CD** ```bash # Add to GitHub secrets for CI/CD gh secret set RENDER_API_KEY # Or add to local env echo "RENDER_API_KEY=<your-key>" >> .env ``` **Step 5: Configure render.yaml (optional - Infrastructure as Code)** ```yaml # render.yaml services: - type: web name: <project-name>-api runtime: python buildCommand: pip install -r requirements.txt startCommand: uvicorn main:app --host 0.0.0.0 --port $PORT envVars: - key: PYTHON_VERSION value: "3.11" - key: DATABASE_URL fromDatabase: name: <project-name>-db property: connectionString databases: - name: <project-name>-db plan: free ``` ### Deployment Checklist Template Add to project todos when setting up deployment: ```markdown ## Deployment Setup - [ ] Create Git repository (gh repo create) - [ ] Choose deployment platform (Vercel/Render/other) - [ ] Connect Git to deployment platform - [ ] Configure environment variables - [ ] Set up CI/CD workflow - [ ] Verify preview deployments work - [ ] Configure production domain ``` --- ## Tooling Anti-Patterns - ❌ Hardcoded secrets - use CLI env management or GitHub secrets - ❌ Manual deployments - automate via CI/CD - ❌ Skipping local Supabase - always develop locally first - ❌ Direct production database changes - use migrations - ❌ No branch protection - require PR reviews and CI checks - ❌ Missing environment separation - keep dev/staging/prod separate ================================================ FILE: skills/pwa-development/SKILL.md ================================================ --- name: pwa-development description: Progressive Web Apps - service workers, caching strategies, offline, Workbox when-to-use: When building PWA features - service workers, caching, offline support user-invocable: false paths: ["**/sw.*", "**/service-worker.*", "**/workbox-config.*", 
"**/manifest.json"] effort: medium --- # PWA Development Skill **Purpose:** Build Progressive Web Apps that work offline, install like native apps, and deliver fast, reliable experiences across all devices. --- ## Core PWA Requirements ``` ┌─────────────────────────────────────────────────────────────────┐ │ THE THREE PILLARS OF PWA │ │ ───────────────────────────────────────────────────────────── │ │ │ │ 1. HTTPS │ │ Required for service workers and security. │ │ localhost allowed for development. │ │ │ │ 2. SERVICE WORKER │ │ JavaScript that runs in background. │ │ Enables offline, caching, push notifications. │ │ │ │ 3. WEB APP MANIFEST │ │ JSON file describing app metadata. │ │ Enables installation and app-like experience. │ ├─────────────────────────────────────────────────────────────────┤ │ INSTALLABILITY CRITERIA (Chrome) │ │ ───────────────────────────────────────────────────────────── │ │ • HTTPS (or localhost) │ │ • Service worker with fetch handler │ │ • Web app manifest with: name, icons (192px + 512px), │ │ start_url, display: standalone/fullscreen/minimal-ui │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Web App Manifest ### Required Fields ```json { "name": "My Progressive Web App", "short_name": "MyPWA", "description": "A description of what the app does", "start_url": "/", "display": "standalone", "background_color": "#ffffff", "theme_color": "#000000", "icons": [ { "src": "/icons/icon-192.png", "sizes": "192x192", "type": "image/png" }, { "src": "/icons/icon-512.png", "sizes": "512x512", "type": "image/png" }, { "src": "/icons/icon-512-maskable.png", "sizes": "512x512", "type": "image/png", "purpose": "maskable" } ] } ``` ### Enhanced Manifest (Full Features) ```json { "name": "My Progressive Web App", "short_name": "MyPWA", "description": "A full-featured PWA", "start_url": "/?source=pwa", "scope": "/", "display": "standalone", "orientation": "portrait-primary", "background_color": "#ffffff", "theme_color": 
"#3367D6", "dir": "ltr", "lang": "en", "categories": ["productivity", "utilities"], "icons": [ { "src": "/icons/icon-72.png", "sizes": "72x72", "type": "image/png" }, { "src": "/icons/icon-96.png", "sizes": "96x96", "type": "image/png" }, { "src": "/icons/icon-128.png", "sizes": "128x128", "type": "image/png" }, { "src": "/icons/icon-144.png", "sizes": "144x144", "type": "image/png" }, { "src": "/icons/icon-152.png", "sizes": "152x152", "type": "image/png" }, { "src": "/icons/icon-192.png", "sizes": "192x192", "type": "image/png" }, { "src": "/icons/icon-384.png", "sizes": "384x384", "type": "image/png" }, { "src": "/icons/icon-512.png", "sizes": "512x512", "type": "image/png" }, { "src": "/icons/icon-maskable.png", "sizes": "512x512", "type": "image/png", "purpose": "maskable" } ], "screenshots": [ { "src": "/screenshots/desktop.png", "sizes": "1280x720", "type": "image/png", "form_factor": "wide" }, { "src": "/screenshots/mobile.png", "sizes": "750x1334", "type": "image/png", "form_factor": "narrow" } ], "shortcuts": [ { "name": "New Item", "short_name": "New", "description": "Create a new item", "url": "/new?source=shortcut", "icons": [{ "src": "/icons/shortcut-new.png", "sizes": "192x192" }] } ], "share_target": { "action": "/share", "method": "POST", "enctype": "multipart/form-data", "params": { "title": "title", "text": "text", "url": "url", "files": [{ "name": "files", "accept": ["image/*"] }] } }, "protocol_handlers": [ { "protocol": "web+myapp", "url": "/handle?url=%s" } ], "file_handlers": [ { "action": "/open-file", "accept": { "text/plain": [".txt"] } } ] } ``` ### Manifest Checklist - [ ] `name` and `short_name` defined - [ ] `start_url` set (use query param for analytics) - [ ] `display` set to `standalone` or `fullscreen` - [ ] Icons: 192x192 and 512x512 minimum - [ ] Maskable icon included for Android adaptive icons - [ ] `theme_color` matches app design - [ ] `background_color` for splash screen - [ ] Screenshots for richer install UI (optional) - 
[ ] Shortcuts for quick actions (optional) --- ## Service Worker Patterns ### Basic Service Worker ```javascript // sw.js const CACHE_NAME = 'app-cache-v1'; const STATIC_ASSETS = [ '/', '/index.html', '/styles/main.css', '/scripts/app.js', '/offline.html' ]; // Install: Cache static assets self.addEventListener('install', (event) => { event.waitUntil( caches.open(CACHE_NAME) .then((cache) => cache.addAll(STATIC_ASSETS)) .then(() => self.skipWaiting()) ); }); // Activate: Clean old caches self.addEventListener('activate', (event) => { event.waitUntil( caches.keys() .then((keys) => Promise.all( keys .filter((key) => key !== CACHE_NAME) .map((key) => caches.delete(key)) )) .then(() => self.clients.claim()) ); }); // Fetch: Serve from cache, fall back to network self.addEventListener('fetch', (event) => { event.respondWith( caches.match(event.request) .then((cached) => cached || fetch(event.request)) .catch(() => caches.match('/offline.html')) ); }); ``` ### Registration ```javascript // main.js if ('serviceWorker' in navigator) { window.addEventListener('load', async () => { try { const registration = await navigator.serviceWorker.register('/sw.js', { scope: '/' }); console.log('SW registered:', registration.scope); } catch (error) { console.error('SW registration failed:', error); } }); } ``` --- ## Caching Strategies ### Strategy Selection Guide | Strategy | Use Case | Description | |----------|----------|-------------| | **Cache First** | Static assets (CSS, JS, images) | Check cache, fall back to network | | **Network First** | API responses, dynamic content | Try network, fall back to cache | | **Stale While Revalidate** | Semi-static content (avatars, articles) | Serve cache immediately, update in background | | **Network Only** | Non-cacheable requests (analytics) | Always use network | | **Cache Only** | Offline-only assets | Only serve from cache | ### Cache First (Offline First) ```javascript // Best for: Static assets that rarely change 
self.addEventListener('fetch', (event) => { if (event.request.destination === 'image' || event.request.destination === 'style' || event.request.destination === 'script') { event.respondWith( caches.match(event.request) .then((cached) => { if (cached) return cached; return fetch(event.request).then((response) => { const clone = response.clone(); caches.open(CACHE_NAME).then((cache) => { cache.put(event.request, clone); }); return response; }); }) ); } }); ``` ### Network First (Fresh First) ```javascript // Best for: API data, frequently updated content self.addEventListener('fetch', (event) => { if (event.request.url.includes('/api/')) { event.respondWith( fetch(event.request) .then((response) => { const clone = response.clone(); caches.open(CACHE_NAME).then((cache) => { cache.put(event.request, clone); }); return response; }) .catch(() => caches.match(event.request)) ); } }); ``` ### Stale While Revalidate ```javascript // Best for: Content that's okay to be slightly outdated self.addEventListener('fetch', (event) => { if (event.request.url.includes('/articles/')) { event.respondWith( caches.open(CACHE_NAME).then((cache) => { return cache.match(event.request).then((cached) => { const fetchPromise = fetch(event.request).then((response) => { cache.put(event.request, response.clone()); return response; }); return cached || fetchPromise; }); }) ); } }); ``` --- ## Workbox (Recommended) ### Why Workbox? 
- Battle-tested caching strategies - Precaching with revision management - Background sync for offline forms - Automatic cache cleanup - TypeScript support ### Installation ```bash npm install workbox-webpack-plugin # Webpack npm install -D vite-plugin-pwa # Vite ``` ### Workbox with Vite ```javascript // vite.config.js import { VitePWA } from 'vite-plugin-pwa'; export default { plugins: [ VitePWA({ registerType: 'autoUpdate', includeAssets: ['favicon.ico', 'robots.txt', 'apple-touch-icon.png'], manifest: { name: 'My App', short_name: 'App', theme_color: '#ffffff', icons: [ { src: 'pwa-192x192.png', sizes: '192x192', type: 'image/png' }, { src: 'pwa-512x512.png', sizes: '512x512', type: 'image/png' } ] }, workbox: { globPatterns: ['**/*.{js,css,html,ico,png,svg}'], runtimeCaching: [ { urlPattern: /^https:\/\/api\.example\.com\/.*/i, handler: 'NetworkFirst', options: { cacheName: 'api-cache', expiration: { maxEntries: 100, maxAgeSeconds: 60 * 60 * 24 // 24 hours } } }, { urlPattern: /\.(?:png|jpg|jpeg|svg|gif)$/, handler: 'CacheFirst', options: { cacheName: 'image-cache', expiration: { maxEntries: 50, maxAgeSeconds: 60 * 60 * 24 * 30 // 30 days } } } ] } }) ] }; ``` ### Workbox Manual Service Worker ```javascript // sw.js import { precacheAndRoute } from 'workbox-precaching'; import { registerRoute } from 'workbox-routing'; import { CacheFirst, NetworkFirst, StaleWhileRevalidate } from 'workbox-strategies'; import { ExpirationPlugin } from 'workbox-expiration'; import { CacheableResponsePlugin } from 'workbox-cacheable-response'; // Precache static assets (generated by build tool) precacheAndRoute(self.__WB_MANIFEST); // Cache images registerRoute( ({ request }) => request.destination === 'image', new CacheFirst({ cacheName: 'images', plugins: [ new CacheableResponsePlugin({ statuses: [0, 200] }), new ExpirationPlugin({ maxEntries: 60, maxAgeSeconds: 30 * 24 * 60 * 60 // 30 days }) ] }) ); // Cache API responses registerRoute( ({ url }) =>
url.pathname.startsWith('/api/'), new NetworkFirst({ cacheName: 'api-responses', plugins: [ new CacheableResponsePlugin({ statuses: [0, 200] }), new ExpirationPlugin({ maxEntries: 100, maxAgeSeconds: 24 * 60 * 60 // 24 hours }) ] }) ); // Cache page navigations registerRoute( ({ request }) => request.mode === 'navigate', new NetworkFirst({ cacheName: 'pages', plugins: [ new CacheableResponsePlugin({ statuses: [0, 200] }) ] }) ); ``` --- ## Offline Experience ### Offline Page ```html <!-- offline.html --> <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>Offline - App Name
</title>
</head>
<body>
  <div class="offline-icon">📡</div>
  <h1>You're offline</h1>
  <p>Check your connection and try again.</p>
</body>
</html>
``` ### Offline Detection ```javascript // Online/offline status handling function updateOnlineStatus() { const status = navigator.onLine ? 'online' : 'offline'; document.body.dataset.connectionStatus = status; if (!navigator.onLine) { showNotification('You are offline. Some features may be unavailable.'); } } window.addEventListener('online', updateOnlineStatus); window.addEventListener('offline', updateOnlineStatus); updateOnlineStatus(); ``` ### Background Sync (Queue Offline Actions) ```javascript // sw.js with Workbox import { BackgroundSyncPlugin } from 'workbox-background-sync'; import { registerRoute } from 'workbox-routing'; import { NetworkOnly } from 'workbox-strategies'; const bgSyncPlugin = new BackgroundSyncPlugin('formQueue', { maxRetentionTime: 24 * 60 // Retry for 24 hours }); registerRoute( ({ url }) => url.pathname === '/api/submit', new NetworkOnly({ plugins: [bgSyncPlugin] }), 'POST' ); ``` ```javascript // main.js - Queue form submission async function submitForm(data) { try { const response = await fetch('/api/submit', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(data) }); return response.json(); } catch (error) { // Will be retried by background sync when online showNotification('Saved offline. Will sync when connected.'); } } ``` --- ## App-Like Features ### Install Prompt ```javascript let deferredPrompt; window.addEventListener('beforeinstallprompt', (e) => { e.preventDefault(); deferredPrompt = e; showInstallButton(); }); async function installApp() { if (!deferredPrompt) return; deferredPrompt.prompt(); const { outcome } = await deferredPrompt.userChoice; console.log(`User ${outcome === 'accepted' ? 
'accepted' : 'dismissed'} install`); deferredPrompt = null; hideInstallButton(); } window.addEventListener('appinstalled', () => { console.log('App installed'); deferredPrompt = null; }); ``` ### Detecting Standalone Mode ```javascript // Check if running as installed PWA function isInstalledPWA() { return window.matchMedia('(display-mode: standalone)').matches || window.navigator.standalone === true; // iOS } // Listen for display mode changes window.matchMedia('(display-mode: standalone)') .addEventListener('change', (e) => { console.log('Display mode:', e.matches ? 'standalone' : 'browser'); }); ``` ### Push Notifications ```javascript // Request permission async function requestNotificationPermission() { const permission = await Notification.requestPermission(); if (permission === 'granted') { await subscribeToPush(); } return permission; } // Subscribe to push async function subscribeToPush() { const registration = await navigator.serviceWorker.ready; const subscription = await registration.pushManager.subscribe({ userVisibleOnly: true, applicationServerKey: urlBase64ToUint8Array(VAPID_PUBLIC_KEY) }); // Send subscription to server await fetch('/api/push/subscribe', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(subscription) }); } // sw.js - Handle push events self.addEventListener('push', (event) => { const data = event.data.json(); event.waitUntil( self.registration.showNotification(data.title, { body: data.body, icon: '/icons/icon-192.png', badge: '/icons/badge-72.png', data: { url: data.url } }) ); }); // Handle notification click self.addEventListener('notificationclick', (event) => { event.notification.close(); event.waitUntil( clients.openWindow(event.notification.data.url) ); }); ``` ### Share Target ```javascript // sw.js - Handle share target self.addEventListener('fetch', (event) => { if (event.request.url.endsWith('/share') && event.request.method === 'POST') { event.respondWith((async () => { const 
formData = await event.request.formData(); const title = formData.get('title'); const text = formData.get('text'); const url = formData.get('url'); // Store or process shared content // Redirect to app with shared data return Response.redirect(`/?shared=true&title=${encodeURIComponent(title)}`); })()); } }); ``` --- ## Performance Optimization ### Critical Rendering Path ```html ``` ### Image Optimization ```html Hero image Hero image ``` ### Code Splitting ```javascript // Dynamic imports for route-based splitting const routes = { '/': () => import('./pages/Home.js'), '/about': () => import('./pages/About.js'), '/settings': () => import('./pages/Settings.js') }; async function loadPage(path) { const loader = routes[path]; if (loader) { const module = await loader(); return module.default; } } ``` --- ## Testing PWA ### Lighthouse Audit ```bash # Run Lighthouse from CLI npx lighthouse https://your-app.com --view # Key metrics to check: # - PWA badge (installable, offline-ready) # - Performance score # - Best practices # - Accessibility ``` ### Manual Testing Checklist - [ ] **Installability** - [ ] Install prompt appears on desktop Chrome - [ ] Can be added to home screen on mobile - [ ] App opens in standalone mode after install - [ ] **Offline Support** - [ ] App loads when offline (airplane mode) - [ ] Cached pages display correctly - [ ] Offline fallback page shows for uncached routes - [ ] Background sync works when coming back online - [ ] **Performance** - [ ] First Contentful Paint < 1.8s - [ ] Largest Contentful Paint < 2.5s - [ ] Time to Interactive < 3.8s - [ ] Cumulative Layout Shift < 0.1 - [ ] **Service Worker** - [ ] SW registers successfully - [ ] Static assets cached on install - [ ] SW updates correctly (new version) - [ ] No stale cache issues - [ ] **Manifest** - [ ] All required fields present - [ ] Icons display correctly - [ ] Theme color applied - [ ] Splash screen shows on launch ### Testing Service Worker Updates ```javascript // Force 
update check if ('serviceWorker' in navigator) { navigator.serviceWorker.ready.then((registration) => { registration.update(); }); } // Listen for updates navigator.serviceWorker.addEventListener('controllerchange', () => { // New service worker activated window.location.reload(); }); ``` --- ## Project Structure ``` project/ ├── public/ │ ├── manifest.json # Web app manifest │ ├── sw.js # Service worker (if not bundled) │ ├── offline.html # Offline fallback page │ ├── robots.txt │ └── icons/ │ ├── icon-72.png │ ├── icon-96.png │ ├── icon-128.png │ ├── icon-144.png │ ├── icon-152.png │ ├── icon-192.png │ ├── icon-384.png │ ├── icon-512.png │ ├── icon-maskable.png # For adaptive icons │ ├── apple-touch-icon.png │ └── favicon.ico ├── src/ │ ├── sw.js # Service worker source (if bundled) │ ├── pwa/ │ │ ├── install.js # Install prompt handling │ │ ├── offline.js # Offline detection │ │ └── push.js # Push notification handling │ └── ... └── tests/ └── pwa/ ├── manifest.test.js ├── sw.test.js └── offline.test.js ``` --- ## Common Mistakes | Mistake | Fix | |---------|-----| | Missing maskable icon | Add icon with `"purpose": "maskable"` | | No offline fallback | Create `offline.html` and cache it | | Cache never expires | Use `ExpirationPlugin` with Workbox | | SW caches too aggressively | Use appropriate strategies per resource type | | No update mechanism | Implement `skipWaiting()` + reload prompt | | Broken install prompt | Ensure manifest meets all criteria | | No HTTPS in production | Configure SSL certificate | | Large cache size | Set `maxEntries` and `maxAgeSeconds` | | Stale API responses | Use `NetworkFirst` for dynamic data | | Missing start_url tracking | Add query param: `/?source=pwa` | --- ## PWA Development Checklist ### Before Launch - [ ] HTTPS configured (production) - [ ] Manifest complete with all required fields - [ ] Icons in all required sizes (192, 512, maskable) - [ ] Service worker registered and working - [ ] Offline page created and cached - 
[ ] Cache strategies defined for all resource types - [ ] Install prompt handling implemented - [ ] Lighthouse PWA audit passes ### After Launch - [ ] Monitor cache sizes - [ ] Test SW updates don't break app - [ ] Track PWA installs via analytics - [ ] Test on multiple devices/browsers - [ ] Monitor Core Web Vitals - [ ] Set up push notification flow (if needed) --- ## Framework-Specific Guides ### Next.js ```bash npm install next-pwa ``` ```javascript // next.config.js const withPWA = require('next-pwa')({ dest: 'public', disable: process.env.NODE_ENV === 'development' }); module.exports = withPWA({ // Your Next.js config }); ``` ### Create React App ```bash # CRA 4+ has PWA support built-in npx create-react-app my-pwa --template cra-template-pwa ``` ### Vite (Any Framework) ```bash npm install vite-plugin-pwa -D ``` See Workbox with Vite section above for configuration. --- ## Quick Reference ### Caching Strategy Cheat Sheet ``` Static Assets (CSS, JS, images) → Cache First API Responses → Network First User-generated content → Stale While Revalidate Analytics, non-cacheable → Network Only Offline-only assets → Cache Only ``` ### Manifest Minimum Requirements ```json { "name": "App Name", "short_name": "App", "start_url": "/", "display": "standalone", "icons": [ { "src": "/icon-192.png", "sizes": "192x192", "type": "image/png" }, { "src": "/icon-512.png", "sizes": "512x512", "type": "image/png" } ] } ``` ### Service Worker Lifecycle ``` 1. Register → 2. Install → 3. Activate → 4. 
Fetch ↓ ↓ ↓ ↓ Load app Cache assets Clean old Serve requests caches from cache/network ``` ================================================ FILE: skills/python/SKILL.md ================================================ --- name: python description: Python development with ruff, mypy, pytest - TDD and type safety when-to-use: When working on Python files user-invocable: false paths: ["**/*.py", "pyproject.toml", "setup.py", "requirements*.txt"] effort: medium --- # Python Skill --- ## Type Hints - Use type hints on all function signatures - Use `typing` module for complex types - Run `mypy --strict` in CI ```python def process_user(user_id: int, options: dict[str, Any] | None = None) -> User: ... ``` --- ## Project Structure ``` project/ ├── src/ │ └── package_name/ │ ├── __init__.py │ ├── core/ # Pure business logic │ │ ├── __init__.py │ │ ├── models.py # Pydantic models / dataclasses │ │ └── services.py # Pure functions │ ├── infra/ # Side effects │ │ ├── __init__.py │ │ ├── api.py # FastAPI routes │ │ └── db.py # Database operations │ └── utils/ # Shared utilities ├── tests/ │ ├── unit/ │ └── integration/ ├── pyproject.toml └── CLAUDE.md ``` --- ## Tooling (Required) ```toml # pyproject.toml [tool.ruff] line-length = 100 select = ["E", "F", "I", "N", "W", "UP"] [tool.mypy] strict = true [tool.pytest.ini_options] testpaths = ["tests"] addopts = "--cov=src --cov-report=term-missing --cov-fail-under=80" ``` --- ## Testing with Pytest ```python # tests/unit/test_services.py import pytest from package_name.core.services import calculate_total class TestCalculateTotal: def test_returns_sum_of_items(self): # Arrange items = [{"price": 10}, {"price": 20}] # Act result = calculate_total(items) # Assert assert result == 30 def test_returns_zero_for_empty_list(self): assert calculate_total([]) == 0 def test_raises_on_invalid_item(self): with pytest.raises(ValueError): calculate_total([{"invalid": "item"}]) ``` --- ## GitHub Actions ```yaml name: Python Quality Gate on: 
[push, pull_request] jobs: quality: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 with: python-version: '3.12' - name: Install dependencies run: | pip install -e ".[dev]" - name: Lint (Ruff) run: ruff check . - name: Format Check (Ruff) run: ruff format --check . - name: Type Check (mypy) run: mypy src/ - name: Test with Coverage run: pytest ``` --- ## Pre-Commit Hooks ```yaml # .pre-commit-config.yaml repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.8.0 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.13.0 hooks: - id: mypy additional_dependencies: [pydantic] args: [--strict] - repo: local hooks: - id: pytest name: pytest entry: pytest tests/unit -x --tb=short language: system pass_filenames: false always_run: true ``` Install and setup: ```bash pip install pre-commit pre-commit install ``` --- ## Patterns ### Pydantic for Data Validation ```python from pydantic import BaseModel, Field class CreateUserRequest(BaseModel): email: str = Field(..., min_length=5) name: str = Field(..., max_length=100) ``` ### Dependency Injection ```python # Don't import dependencies directly in business logic # Pass them in # Bad from .db import database def get_user(user_id: int) -> User: return database.fetch(user_id) # Good def get_user(user_id: int, db: Database) -> User: return db.fetch(user_id) ``` ### Result Pattern (No Exceptions in Core) ```python from dataclasses import dataclass @dataclass class Result[T]: value: T | None error: str | None @property def is_ok(self) -> bool: return self.error is None ``` --- ## Python Anti-Patterns - ❌ `from module import *` - ❌ Mutable default arguments - ❌ Bare `except:` clauses - ❌ Using `type: ignore` without explanation - ❌ Global variables for state - ❌ Classes when functions suffice ================================================ FILE: skills/react-native/SKILL.md 
================================================ --- name: react-native description: React Native mobile patterns, platform-specific code when-to-use: When working on React Native mobile app code user-invocable: false paths: ["**/*.tsx", "**/*.jsx", "ios/**", "android/**", "app.json"] effort: medium --- # React Native Skill --- ## Project Structure ``` project/ ├── src/ │ ├── core/ # Pure business logic (no React) │ │ ├── types.ts │ │ └── services/ │ ├── components/ # Reusable UI components │ │ ├── Button/ │ │ │ ├── Button.tsx │ │ │ ├── Button.test.tsx │ │ │ └── index.ts │ │ └── index.ts # Barrel export │ ├── screens/ # Screen components │ │ ├── Home/ │ │ │ ├── HomeScreen.tsx │ │ │ ├── useHome.ts # Screen-specific hook │ │ │ └── index.ts │ │ └── index.ts │ ├── navigation/ # Navigation configuration │ ├── hooks/ # Shared custom hooks │ ├── store/ # State management │ └── utils/ # Utilities ├── __tests__/ ├── android/ ├── ios/ └── CLAUDE.md ``` --- ## Component Patterns ### Functional Components Only ```typescript // Good - simple, testable interface ButtonProps { label: string; onPress: () => void; disabled?: boolean; } export function Button({ label, onPress, disabled = false }: ButtonProps): JSX.Element { return ( {label} ); } ``` ### Extract Logic to Hooks ```typescript // useHome.ts - all logic here export function useHome() { const [items, setItems] = useState([]); const [loading, setLoading] = useState(false); const refresh = useCallback(async () => { setLoading(true); const data = await fetchItems(); setItems(data); setLoading(false); }, []); return { items, loading, refresh }; } // HomeScreen.tsx - pure presentation export function HomeScreen(): JSX.Element { const { items, loading, refresh } = useHome(); return ( ); } ``` ### Props Interface Always Explicit ```typescript // Always define props interface, even if simple interface ItemCardProps { item: Item; onPress: (id: string) => void; } export function ItemCard({ item, onPress }: ItemCardProps): 
JSX.Element { ... } ``` --- ## State Management ### Local State First ```typescript // Start with useState, escalate only when needed const [value, setValue] = useState(''); ``` ### Zustand for Global State (if needed) ```typescript // store/useAppStore.ts import { create } from 'zustand'; interface AppState { user: User | null; setUser: (user: User | null) => void; } export const useAppStore = create((set) => ({ user: null, setUser: (user) => set({ user }), })); ``` ### React Query for Server State ```typescript // hooks/useItems.ts import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; export function useItems() { return useQuery({ queryKey: ['items'], queryFn: fetchItems, }); } export function useCreateItem() { const queryClient = useQueryClient(); return useMutation({ mutationFn: createItem, onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['items'] }); }, }); } ``` --- ## Testing ### Component Testing with React Native Testing Library ```typescript import { render, fireEvent } from '@testing-library/react-native'; import { Button } from './Button'; describe('Button', () => { it('calls onPress when pressed', () => { const onPress = jest.fn(); const { getByText } = render( ); } ``` ### Extract Logic to Hooks ```typescript // useHome.ts - all logic here export function useHome() { const [items, setItems] = useState([]); const [loading, setLoading] = useState(false); const refresh = useCallback(async () => { setLoading(true); const data = await fetchItems(); setItems(data); setLoading(false); }, []); useEffect(() => { refresh(); }, [refresh]); return { items, loading, refresh }; } // HomePage.tsx - pure presentation export function HomePage(): JSX.Element { const { items, loading, refresh } = useHome(); if (loading) return ; return ; } ``` ### Props Interface Always Explicit ```typescript // Always define props interface, even if simple interface ItemCardProps { item: Item; onClick: (id: string) => void; } export function 
ItemCard({ item, onClick }: ItemCardProps): JSX.Element { return (
    <div onClick={() => onClick(item.id)}>
      <h3>{item.title}</h3>
    </div>
); } ``` --- ## State Management ### Local State First ```typescript // Start with useState, escalate only when needed const [value, setValue] = useState(''); ``` ### Zustand for Global State (if needed) ```typescript // store/useAppStore.ts import { create } from 'zustand'; interface AppState { user: User | null; theme: 'light' | 'dark'; setUser: (user: User | null) => void; toggleTheme: () => void; } export const useAppStore = create((set) => ({ user: null, theme: 'light', setUser: (user) => set({ user }), toggleTheme: () => set((state) => ({ theme: state.theme === 'light' ? 'dark' : 'light' })), })); ``` ### React Query for Server State ```typescript // api/queries/useItems.ts import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { itemsApi } from '../client'; export function useItems() { return useQuery({ queryKey: ['items'], queryFn: itemsApi.getAll, staleTime: 5 * 60 * 1000, // 5 minutes }); } export function useCreateItem() { const queryClient = useQueryClient(); return useMutation({ mutationFn: itemsApi.create, onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['items'] }); }, }); } ``` --- ## Routing ### React Router (Vite/CRA) ```typescript // App.tsx import { BrowserRouter, Routes, Route } from 'react-router-dom'; export function App(): JSX.Element { return ( } /> } /> } /> ); } ``` ### Protected Routes ```typescript interface ProtectedRouteProps { children: JSX.Element; } function ProtectedRoute({ children }: ProtectedRouteProps): JSX.Element { const { user } = useAppStore(); const location = useLocation(); if (!user) { return ; } return children; } ``` --- ## Styling ### CSS Modules (Preferred) ```typescript // Button.module.css .primary { background: var(--color-primary); color: white; } .secondary { background: transparent; border: 1px solid var(--color-primary); } // Button.tsx import styles from './Button.module.css'; ``` ### Tailwind (Alternative) ```typescript // Use consistent patterns, extract repeated 
combinations const buttonVariants = { primary: 'bg-blue-500 text-white hover:bg-blue-600', secondary: 'bg-transparent border border-blue-500 text-blue-500', } as const; ``` --- ## Forms ### React Hook Form + Zod ```typescript import { useForm } from 'react-hook-form'; import { zodResolver } from '@hookform/resolvers/zod'; import { z } from 'zod'; const schema = z.object({ email: z.string().email('Invalid email'), password: z.string().min(8, 'Password must be at least 8 characters'), }); type FormData = z.infer; export function LoginForm(): JSX.Element { const { register, handleSubmit, formState: { errors } } = useForm({ resolver: zodResolver(schema), }); const onSubmit = (data: FormData) => { // handle submit }; return (
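    <form onSubmit={handleSubmit(onSubmit)}>
      <input type="email" {...register('email')} />
      {errors.email && <span>{errors.email.message}</span>}
      <input type="password" {...register('password')} />
      {errors.password && <span>{errors.password.message}</span>}
      <button type="submit">Sign in</button>
    </form>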
); } ``` --- ## Testing ### Component Testing with React Testing Library ```typescript import { render, screen, fireEvent } from '@testing-library/react'; import { Button } from './Button'; describe('Button', () => { it('calls onClick when clicked', () => { const onClick = vi.fn(); render( ); } ``` ### src/app/(auth)/callback/route.ts ```typescript import { createClient } from '@/lib/supabase/server'; import { NextResponse } from 'next/server'; export async function GET(request: Request) { const { searchParams, origin } = new URL(request.url); const code = searchParams.get('code'); const next = searchParams.get('next') ?? '/dashboard'; if (code) { const supabase = await createClient(); const { error } = await supabase.auth.exchangeCodeForSession(code); if (!error) { return NextResponse.redirect(`${origin}${next}`); } } return NextResponse.redirect(`${origin}/login?error=auth_error`); } ``` --- ## Server Actions ### src/app/actions/posts.ts ```typescript 'use server'; import { revalidatePath } from 'next/cache'; import { redirect } from 'next/navigation'; import { db } from '@/db'; import { posts, NewPost } from '@/db/schema'; import { requireAuth } from '@/lib/auth'; import { eq } from 'drizzle-orm'; export async function createPost(formData: FormData) { const user = await requireAuth(); const title = formData.get('title') as string; const content = formData.get('content') as string; const [post] = await db.insert(posts).values({ authorId: user.id, title, content, }).returning(); revalidatePath('/dashboard'); redirect(`/posts/${post.id}`); } export async function updatePost(id: string, formData: FormData) { const user = await requireAuth(); const title = formData.get('title') as string; const content = formData.get('content') as string; await db.update(posts) .set({ title, content }) .where(eq(posts.id, id)); revalidatePath(`/posts/${id}`); } export async function deletePost(id: string) { const user = await requireAuth(); await db.delete(posts).where(eq(posts.id, 
id)); revalidatePath('/dashboard'); redirect('/dashboard'); } ``` --- ## Data Fetching ### src/db/queries/posts.ts ```typescript import { db } from '@/db'; import { posts, profiles } from '@/db/schema'; import { eq, desc, and } from 'drizzle-orm'; export async function getPublishedPosts(limit = 10) { return db .select({ id: posts.id, title: posts.title, content: posts.content, author: profiles.name, createdAt: posts.createdAt, }) .from(posts) .innerJoin(profiles, eq(posts.authorId, profiles.id)) .where(eq(posts.published, true)) .orderBy(desc(posts.createdAt)) .limit(limit); } export async function getUserPosts(userId: string) { return db .select() .from(posts) .where(eq(posts.authorId, userId)) .orderBy(desc(posts.createdAt)); } export async function getPostById(id: string) { const [post] = await db .select() .from(posts) .where(eq(posts.id, id)) .limit(1); return post ?? null; } ``` ### In Server Components ```typescript // src/app/dashboard/page.tsx import { requireAuth } from '@/lib/auth'; import { getUserPosts } from '@/db/queries/posts'; export default async function DashboardPage() { const user = await requireAuth(); const posts = await getUserPosts(user.id); return (

    <div>
      <h1>Your Posts</h1>
      {posts.map((post) => (
        <article key={post.id}>
          <h2>{post.title}</h2>
          <p>{post.content}</p>
        </article>
      ))}
    </div>
); } ``` --- ## Storage ### Upload Component ```typescript 'use client'; import { useState } from 'react'; import { createClient } from '@/lib/supabase/client'; export function AvatarUpload({ userId }: { userId: string }) { const [uploading, setUploading] = useState(false); const handleUpload = async (e: React.ChangeEvent) => { const file = e.target.files?.[0]; if (!file) return; setUploading(true); const supabase = createClient(); const fileExt = file.name.split('.').pop(); const filePath = `${userId}/avatar.${fileExt}`; const { error } = await supabase.storage .from('avatars') .upload(filePath, file, { upsert: true }); if (error) { console.error('Upload error:', error); } setUploading(false); }; return ( ); } ``` ### Get Public URL ```typescript import { createClient } from '@/lib/supabase/server'; export async function getAvatarUrl(userId: string) { const supabase = await createClient(); const { data } = supabase.storage .from('avatars') .getPublicUrl(`${userId}/avatar.png`); return data.publicUrl; } ``` --- ## Realtime ### Client Component with Subscription ```typescript 'use client'; import { useEffect, useState } from 'react'; import { createClient } from '@/lib/supabase/client'; import { Post } from '@/db/schema'; export function RealtimePosts({ initialPosts }: { initialPosts: Post[] }) { const [posts, setPosts] = useState(initialPosts); useEffect(() => { const supabase = createClient(); const channel = supabase .channel('posts') .on( 'postgres_changes', { event: '*', schema: 'public', table: 'posts' }, (payload) => { if (payload.eventType === 'INSERT') { setPosts((prev) => [payload.new as Post, ...prev]); } else if (payload.eventType === 'DELETE') { setPosts((prev) => prev.filter((p) => p.id !== payload.old.id)); } else if (payload.eventType === 'UPDATE') { setPosts((prev) => prev.map((p) => (p.id === payload.new.id ? payload.new as Post : p)) ); } } ) .subscribe(); return () => { supabase.removeChannel(channel); }; }, []); return (
    <ul>
      {posts.map((post) => (
        <li key={post.id}>{post.title}</li>
      ))}
    </ul>
); } ``` --- ## OAuth Providers ### src/components/auth/oauth-buttons.tsx ```typescript 'use client'; import { createClient } from '@/lib/supabase/client'; export function OAuthButtons() { const handleOAuth = async (provider: 'google' | 'github') => { const supabase = createClient(); await supabase.auth.signInWithOAuth({ provider, options: { redirectTo: `${window.location.origin}/auth/callback`, }, }); }; return (
); } ``` --- ## Sign Out ### Server Action ```typescript // src/app/actions/auth.ts 'use server'; import { redirect } from 'next/navigation'; import { createClient } from '@/lib/supabase/server'; export async function signOut() { const supabase = await createClient(); await supabase.auth.signOut(); redirect('/login'); } ``` ### Sign Out Button ```typescript 'use client'; import { signOut } from '@/app/actions/auth'; export function SignOutButton() { return (
); } ``` --- ## Anti-Patterns - **Using Supabase client for DB queries** - Use Drizzle for type-safety - **Fetching in client components** - Prefer server components - **Not using middleware for auth** - Session refresh is critical - **Calling `cookies()` synchronously** - Must await in Next.js 15+ - **Service key in client** - Never expose, server-only - **Missing revalidatePath** - Always revalidate after mutations - **Not handling auth errors** - Show user-friendly messages ================================================ FILE: skills/supabase-node/SKILL.md ================================================ --- name: supabase-node description: Express/Hono with Supabase and Drizzle ORM when-to-use: When building a Node.js backend with Supabase user-invocable: false paths: ["src/api/**", "src/routes/**", "supabase/**"] effort: medium --- # Supabase + Node.js Skill Express/Hono patterns with Supabase Auth and Drizzle ORM. **Sources:** [Supabase JS Client](https://supabase.com/docs/reference/javascript/introduction) | [Drizzle ORM](https://orm.drizzle.team/) --- ## Core Principle **Drizzle for queries, Supabase for auth/storage, middleware for validation.** Use Drizzle ORM for type-safe database access. Use Supabase client for auth verification, storage, and realtime. Express or Hono for the API layer. 
--- ## Project Structure ``` project/ ├── src/ │ ├── routes/ │ │ ├── index.ts # Route aggregator │ │ ├── auth.ts │ │ ├── posts.ts │ │ └── users.ts │ ├── middleware/ │ │ ├── auth.ts # JWT validation │ │ ├── error.ts # Error handler │ │ └── validate.ts # Request validation │ ├── db/ │ │ ├── index.ts # Drizzle client │ │ ├── schema.ts # Schema definitions │ │ └── queries/ # Query functions │ ├── lib/ │ │ ├── supabase.ts # Supabase client │ │ └── config.ts # Environment config │ ├── types/ │ │ └── express.d.ts # Express type extensions │ └── index.ts # App entry point ├── supabase/ │ ├── migrations/ │ └── config.toml ├── drizzle.config.ts ├── package.json ├── tsconfig.json └── .env ``` --- ## Setup ### Install Dependencies ```bash npm install express cors helmet dotenv @supabase/supabase-js drizzle-orm postgres zod npm install -D typescript @types/express @types/cors @types/node tsx drizzle-kit ``` ### package.json Scripts ```json { "scripts": { "dev": "tsx watch src/index.ts", "build": "tsc", "start": "node dist/index.js", "db:generate": "drizzle-kit generate", "db:push": "drizzle-kit push", "db:studio": "drizzle-kit studio" } } ``` ### Environment Variables ```bash # .env PORT=3000 NODE_ENV=development # Supabase SUPABASE_URL=http://localhost:54321 SUPABASE_ANON_KEY= SUPABASE_SERVICE_ROLE_KEY= # Database DATABASE_URL=postgresql://postgres:postgres@localhost:54322/postgres ``` --- ## Configuration ### src/lib/config.ts ```typescript import { z } from 'zod'; import dotenv from 'dotenv'; dotenv.config(); const envSchema = z.object({ PORT: z.string().default('3000'), NODE_ENV: z.enum(['development', 'production', 'test']).default('development'), SUPABASE_URL: z.string().url(), SUPABASE_ANON_KEY: z.string(), SUPABASE_SERVICE_ROLE_KEY: z.string(), DATABASE_URL: z.string(), }); export const config = envSchema.parse(process.env); ``` --- ## Database Setup ### drizzle.config.ts ```typescript import { defineConfig } from 'drizzle-kit'; import { config } from 
'./src/lib/config'; export default defineConfig({ schema: './src/db/schema.ts', out: './supabase/migrations', dialect: 'postgresql', dbCredentials: { url: config.DATABASE_URL, }, schemaFilter: ['public'], }); ``` ### src/db/index.ts ```typescript import { drizzle } from 'drizzle-orm/postgres-js'; import postgres from 'postgres'; import * as schema from './schema'; import { config } from '../lib/config'; const client = postgres(config.DATABASE_URL, { prepare: false, // Required for Supabase pooling }); export const db = drizzle(client, { schema }); ``` ### src/db/schema.ts ```typescript import { pgTable, uuid, text, timestamp, boolean, } from 'drizzle-orm/pg-core'; export const profiles = pgTable('profiles', { id: uuid('id').primaryKey(), email: text('email').notNull(), name: text('name'), avatarUrl: text('avatar_url'), createdAt: timestamp('created_at').defaultNow().notNull(), updatedAt: timestamp('updated_at').defaultNow().notNull(), }); export const posts = pgTable('posts', { id: uuid('id').primaryKey().defaultRandom(), authorId: uuid('author_id').references(() => profiles.id).notNull(), title: text('title').notNull(), content: text('content'), published: boolean('published').default(false), createdAt: timestamp('created_at').defaultNow().notNull(), }); // Type exports export type Profile = typeof profiles.$inferSelect; export type NewProfile = typeof profiles.$inferInsert; export type Post = typeof posts.$inferSelect; export type NewPost = typeof posts.$inferInsert; ``` --- ## Supabase Client ### src/lib/supabase.ts ```typescript import { createClient, SupabaseClient, User } from '@supabase/supabase-js'; import { config } from './config'; // Client with anon key (respects RLS) export const supabase = createClient( config.SUPABASE_URL, config.SUPABASE_ANON_KEY ); // Admin client (bypasses RLS) export const supabaseAdmin = createClient( config.SUPABASE_URL, config.SUPABASE_SERVICE_ROLE_KEY, { auth: { autoRefreshToken: false, persistSession: false, }, } ); // 
Verify JWT and get user export async function verifyToken(token: string): Promise { const { data: { user }, error } = await supabase.auth.getUser(token); if (error || !user) { return null; } return user; } ``` --- ## Type Extensions ### src/types/express.d.ts ```typescript import { User } from '@supabase/supabase-js'; declare global { namespace Express { interface Request { user?: User; } } } export {}; ``` --- ## Middleware ### src/middleware/auth.ts ```typescript import { Request, Response, NextFunction } from 'express'; import { verifyToken } from '../lib/supabase'; export async function requireAuth( req: Request, res: Response, next: NextFunction ) { const authHeader = req.headers.authorization; if (!authHeader?.startsWith('Bearer ')) { return res.status(401).json({ error: 'Missing authorization header' }); } const token = authHeader.split(' ')[1]; const user = await verifyToken(token); if (!user) { return res.status(401).json({ error: 'Invalid token' }); } req.user = user; next(); } // Optional auth - continues even without token export async function optionalAuth( req: Request, res: Response, next: NextFunction ) { const authHeader = req.headers.authorization; if (authHeader?.startsWith('Bearer ')) { const token = authHeader.split(' ')[1]; req.user = await verifyToken(token) ?? 
undefined; } next(); } ``` ### src/middleware/error.ts ```typescript import { Request, Response, NextFunction } from 'express'; export class AppError extends Error { constructor( public statusCode: number, message: string ) { super(message); this.name = 'AppError'; } } export function errorHandler( err: Error, req: Request, res: Response, next: NextFunction ) { console.error(err); if (err instanceof AppError) { return res.status(err.statusCode).json({ error: err.message }); } return res.status(500).json({ error: 'Internal server error' }); } ``` ### src/middleware/validate.ts ```typescript import { Request, Response, NextFunction } from 'express'; import { z, ZodSchema } from 'zod'; export function validate(schema: T) { return (req: Request, res: Response, next: NextFunction) => { try { req.body = schema.parse(req.body); next(); } catch (error) { if (error instanceof z.ZodError) { return res.status(400).json({ error: 'Validation failed', details: error.errors, }); } next(error); } }; } ``` --- ## Routes ### src/routes/auth.ts ```typescript import { Router } from 'express'; import { z } from 'zod'; import { supabase } from '../lib/supabase'; import { validate } from '../middleware/validate'; const router = Router(); const signUpSchema = z.object({ email: z.string().email(), password: z.string().min(8), }); const signInSchema = z.object({ email: z.string().email(), password: z.string(), }); router.post('/signup', validate(signUpSchema), async (req, res, next) => { try { const { email, password } = req.body; const { data, error } = await supabase.auth.signUp({ email, password, }); if (error) { return res.status(400).json({ error: error.message }); } return res.status(201).json({ user: data.user, session: data.session, }); } catch (error) { next(error); } }); router.post('/signin', validate(signInSchema), async (req, res, next) => { try { const { email, password } = req.body; const { data, error } = await supabase.auth.signInWithPassword({ email, password, }); if 
(error) { return res.status(401).json({ error: 'Invalid credentials' }); } return res.json({ user: data.user, session: data.session, }); } catch (error) { next(error); } }); router.post('/signout', async (req, res) => { await supabase.auth.signOut(); return res.json({ message: 'Signed out' }); }); router.post('/refresh', async (req, res, next) => { try { const { refresh_token } = req.body; const { data, error } = await supabase.auth.refreshSession({ refresh_token, }); if (error) { return res.status(401).json({ error: 'Invalid refresh token' }); } return res.json({ session: data.session, }); } catch (error) { next(error); } }); export default router; ``` ### src/routes/posts.ts ```typescript import { Router } from 'express'; import { z } from 'zod'; import { eq, desc } from 'drizzle-orm'; import { db } from '../db'; import { posts, Post } from '../db/schema'; import { requireAuth, optionalAuth } from '../middleware/auth'; import { validate } from '../middleware/validate'; import { AppError } from '../middleware/error'; const router = Router(); const createPostSchema = z.object({ title: z.string().min(1).max(200), content: z.string().optional(), published: z.boolean().default(false), }); const updatePostSchema = createPostSchema.partial(); // List all published posts router.get('/', optionalAuth, async (req, res, next) => { try { const result = await db .select() .from(posts) .where(eq(posts.published, true)) .orderBy(desc(posts.createdAt)); return res.json(result); } catch (error) { next(error); } }); // List user's posts router.get('/me', requireAuth, async (req, res, next) => { try { const result = await db .select() .from(posts) .where(eq(posts.authorId, req.user!.id)) .orderBy(desc(posts.createdAt)); return res.json(result); } catch (error) { next(error); } }); // Get single post router.get('/:id', async (req, res, next) => { try { const [post] = await db .select() .from(posts) .where(eq(posts.id, req.params.id)) .limit(1); if (!post) { throw new AppError(404, 
'Post not found'); } return res.json(post); } catch (error) { next(error); } }); // Create post router.post('/', requireAuth, validate(createPostSchema), async (req, res, next) => { try { const [post] = await db .insert(posts) .values({ ...req.body, authorId: req.user!.id, }) .returning(); return res.status(201).json(post); } catch (error) { next(error); } }); // Update post router.patch('/:id', requireAuth, validate(updatePostSchema), async (req, res, next) => { try { const [post] = await db .update(posts) .set(req.body) .where(eq(posts.id, req.params.id)) .returning(); if (!post) { throw new AppError(404, 'Post not found'); } return res.json(post); } catch (error) { next(error); } }); // Delete post router.delete('/:id', requireAuth, async (req, res, next) => { try { const [post] = await db .delete(posts) .where(eq(posts.id, req.params.id)) .returning(); if (!post) { throw new AppError(404, 'Post not found'); } return res.status(204).send(); } catch (error) { next(error); } }); export default router; ``` ### src/routes/index.ts ```typescript import { Router } from 'express'; import authRoutes from './auth'; import postRoutes from './posts'; const router = Router(); router.use('/auth', authRoutes); router.use('/posts', postRoutes); export default router; ``` --- ## Main Application ### src/index.ts ```typescript import express from 'express'; import cors from 'cors'; import helmet from 'helmet'; import routes from './routes'; import { errorHandler } from './middleware/error'; import { config } from './lib/config'; const app = express(); // Security middleware app.use(helmet()); app.use(cors()); app.use(express.json()); // Health check app.get('/health', (req, res) => { res.json({ status: 'healthy' }); }); // API routes app.use('/api', routes); // Error handler (must be last) app.use(errorHandler); app.listen(config.PORT, () => { console.log(`Server running on port ${config.PORT}`); }); export default app; ``` --- ## Query Functions ### src/db/queries/posts.ts 
```typescript import { db } from '../index'; import { posts, profiles } from '../schema'; import { eq, desc, and } from 'drizzle-orm'; export async function getPublishedPosts(limit = 10) { return db .select({ id: posts.id, title: posts.title, content: posts.content, author: profiles.name, createdAt: posts.createdAt, }) .from(posts) .innerJoin(profiles, eq(posts.authorId, profiles.id)) .where(eq(posts.published, true)) .orderBy(desc(posts.createdAt)) .limit(limit); } export async function getUserPosts(userId: string) { return db .select() .from(posts) .where(eq(posts.authorId, userId)) .orderBy(desc(posts.createdAt)); } export async function getPostById(id: string) { const [post] = await db .select() .from(posts) .where(eq(posts.id, id)) .limit(1); return post ?? null; } export async function createPost(data: { title: string; content?: string; authorId: string; published?: boolean; }) { const [post] = await db.insert(posts).values(data).returning(); return post; } ``` --- ## Storage ### Upload Endpoint ```typescript import multer from 'multer'; import { supabase } from '../lib/supabase'; const upload = multer({ storage: multer.memoryStorage() }); router.post( '/avatar', requireAuth, upload.single('file'), async (req, res, next) => { try { if (!req.file) { throw new AppError(400, 'No file uploaded'); } const fileExt = req.file.originalname.split('.').pop(); const filePath = `${req.user!.id}/avatar.${fileExt}`; const { error } = await supabase.storage .from('avatars') .upload(filePath, req.file.buffer, { contentType: req.file.mimetype, upsert: true, }); if (error) { throw new AppError(500, 'Upload failed'); } const { data } = supabase.storage .from('avatars') .getPublicUrl(filePath); return res.json({ url: data.publicUrl }); } catch (error) { next(error); } } ); ``` --- ## Hono Alternative For edge deployments or lighter weight: ### src/index.ts (Hono) ```typescript import { Hono } from 'hono'; import { cors } from 'hono/cors'; import { jwt } from 'hono/jwt'; import { 
db } from './db'; import { posts } from './db/schema'; import { eq, desc } from 'drizzle-orm'; import { config } from './lib/config'; const app = new Hono(); app.use('/*', cors()); // Public routes app.get('/posts', async (c) => { const result = await db .select() .from(posts) .where(eq(posts.published, true)) .orderBy(desc(posts.createdAt)); return c.json(result); }); // Protected routes app.use('/api/*', async (c, next) => { const auth = c.req.header('Authorization'); if (!auth?.startsWith('Bearer ')) { return c.json({ error: 'Unauthorized' }, 401); } // Verify with Supabase... await next(); }); app.post('/api/posts', async (c) => { const body = await c.req.json(); const [post] = await db.insert(posts).values(body).returning(); return c.json(post, 201); }); export default app; ``` --- ## Testing ### tests/setup.ts ```typescript import { beforeAll, afterAll, beforeEach } from 'vitest'; import { db } from '../src/db'; import { posts, profiles } from '../src/db/schema'; beforeAll(async () => { // Setup test database }); beforeEach(async () => { // Clean tables await db.delete(posts); await db.delete(profiles); }); afterAll(async () => { // Cleanup }); ``` ### tests/posts.test.ts ```typescript import { describe, it, expect } from 'vitest'; import request from 'supertest'; import app from '../src/index'; describe('Posts API', () => { it('should list published posts', async () => { const res = await request(app) .get('/api/posts') .expect(200); expect(Array.isArray(res.body)).toBe(true); }); it('should require auth to create post', async () => { await request(app) .post('/api/posts') .send({ title: 'Test' }) .expect(401); }); }); ``` --- ## Anti-Patterns - **Using Supabase client for DB queries** - Use Drizzle - **Sync JWT validation** - Keep it async - **No input validation** - Use Zod middleware - **Missing error handling** - Use centralized error handler - **Hardcoded secrets** - Use environment variables - **No request logging** - Add morgan or pino - **Blocking 
the event loop** - Use async throughout - **Service key in responses** - Never expose ================================================ FILE: skills/supabase-python/SKILL.md ================================================ --- name: supabase-python description: FastAPI with Supabase and SQLAlchemy/SQLModel when-to-use: When building a Python/FastAPI app with Supabase backend user-invocable: false paths: ["**/*.py", "supabase/**"] effort: medium --- # Supabase + Python Skill FastAPI patterns with Supabase Auth and SQLAlchemy/SQLModel for database access. **Sources:** [Supabase Python Client](https://supabase.com/docs/reference/python/introduction) | [SQLModel](https://sqlmodel.tiangolo.com/) --- ## Core Principle **SQLAlchemy/SQLModel for queries, Supabase for auth/storage.** Use SQLAlchemy or SQLModel for type-safe database access. Use supabase-py for auth, storage, and realtime. FastAPI for the API layer. --- ## Project Structure ``` project/ ├── src/ │ ├── api/ │ │ ├── __init__.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── auth.py │ │ │ ├── posts.py │ │ │ └── users.py │ │ └── deps.py # Dependencies (auth, db) │ ├── core/ │ │ ├── __init__.py │ │ ├── config.py # Settings │ │ └── security.py # Auth helpers │ ├── db/ │ │ ├── __init__.py │ │ ├── session.py # Database session │ │ └── models.py # SQLModel models │ ├── services/ │ │ ├── __init__.py │ │ └── supabase.py # Supabase client │ └── main.py # FastAPI app ├── supabase/ │ ├── migrations/ │ └── config.toml ├── alembic/ # Alembic migrations (alternative) ├── alembic.ini ├── pyproject.toml └── .env ``` --- ## Setup ### Install Dependencies ```bash pip install fastapi uvicorn supabase python-dotenv sqlmodel asyncpg alembic pydantic-settings email-validator ``` ### pyproject.toml ```toml [project] name = "my-app" version = "0.1.0" dependencies = [ "fastapi>=0.109.0", "uvicorn[standard]>=0.27.0", "supabase>=2.0.0", "python-dotenv>=1.0.0", "sqlmodel>=0.0.14", "asyncpg>=0.29.0", "alembic>=1.13.0", "pydantic-settings>=2.0.0", "email-validator>=2.0.0", ] 
[project.optional-dependencies] dev = [ "pytest>=7.0.0", "pytest-asyncio>=0.23.0", "httpx>=0.26.0", ] ``` ### Environment Variables ```bash # .env SUPABASE_URL=http://localhost:54321 SUPABASE_ANON_KEY= SUPABASE_SERVICE_ROLE_KEY= DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:54322/postgres ``` --- ## Configuration ### src/core/config.py ```python from pydantic_settings import BaseSettings, SettingsConfigDict from functools import lru_cache class Settings(BaseSettings): """Typed application settings read from the environment, falling back to .env.""" model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") # Supabase supabase_url: str supabase_anon_key: str supabase_service_role_key: str # Database database_url: str # App debug: bool = False @lru_cache def get_settings() -> Settings: """Return the process-wide Settings instance (built once, then cached).""" return Settings() ``` --- ## Database Setup ### src/db/session.py ```python from collections.abc import AsyncGenerator from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from src.core.config import get_settings settings = get_settings() engine = create_async_engine( settings.database_url, echo=settings.debug, pool_pre_ping=True, ) AsyncSessionLocal = async_sessionmaker( engine, expire_on_commit=False, ) async def get_db() -> AsyncGenerator[AsyncSession, None]: """Yield one session per request; the async context manager closes it on exit.""" async with AsyncSessionLocal() as session: yield session ``` ### src/db/models.py ```python from datetime import datetime from typing import Optional from uuid import UUID, uuid4 from sqlmodel import SQLModel, Field class ProfileBase(SQLModel): email: str name: Optional[str] = None avatar_url: Optional[str] = None class Profile(ProfileBase, table=True): __tablename__ = "profiles" id: UUID = Field(primary_key=True) # References auth.users created_at: datetime = Field(default_factory=datetime.utcnow) updated_at: datetime = Field(default_factory=datetime.utcnow) class ProfileCreate(ProfileBase): id: UUID class ProfileRead(ProfileBase): id: UUID created_at: datetime class PostBase(SQLModel): title: str content: Optional[str] = None published: bool = False class 
Post(PostBase, table=True): __tablename__ = "posts" id: UUID = Field(default_factory=uuid4, primary_key=True) author_id: UUID = Field(foreign_key="profiles.id") created_at: datetime = Field(default_factory=datetime.utcnow) class PostCreate(PostBase): pass class PostRead(PostBase): id: UUID author_id: UUID created_at: datetime ``` --- ## Supabase Client ### src/services/supabase.py ```python from supabase import create_client, Client from src.core.config import get_settings settings = get_settings() def get_supabase_client() -> Client: """Get Supabase client with anon key (respects RLS).""" return create_client( settings.supabase_url, settings.supabase_anon_key ) def get_supabase_admin() -> Client: """Get Supabase client with service role (bypasses RLS).""" return create_client( settings.supabase_url, settings.supabase_service_role_key ) ``` --- ## Auth Dependencies ### src/api/deps.py ```python from typing import Annotated from fastapi import Depends, HTTPException, status from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials from sqlalchemy.ext.asyncio import AsyncSession from supabase import Client from src.db.session import get_db from src.services.supabase import get_supabase_client security = HTTPBearer() async def get_current_user( credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], ) -> dict: """Validate JWT and return user.""" supabase = get_supabase_client() try: # Verify token with Supabase user = supabase.auth.get_user(credentials.credentials) if not user or not user.user: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid token", ) return user.user except Exception as e: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid token", ) # Type alias for dependency injection CurrentUser = Annotated[dict, Depends(get_current_user)] DbSession = Annotated[AsyncSession, Depends(get_db)] ``` --- ## API Routes ### src/api/routes/auth.py ```python from fastapi import 
APIRouter, HTTPException, status from pydantic import BaseModel, EmailStr from src.services.supabase import get_supabase_client router = APIRouter(prefix="/auth", tags=["auth"]) class SignUpRequest(BaseModel): email: EmailStr password: str class SignInRequest(BaseModel): email: EmailStr password: str class AuthResponse(BaseModel): access_token: str refresh_token: str user_id: str @router.post("/signup", response_model=AuthResponse) async def sign_up(request: SignUpRequest): supabase = get_supabase_client() try: response = supabase.auth.sign_up({ "email": request.email, "password": request.password, }) if response.user is None: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="Signup failed", ) return AuthResponse( access_token=response.session.access_token, refresh_token=response.session.refresh_token, user_id=str(response.user.id), ) except Exception as e: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=str(e), ) @router.post("/signin", response_model=AuthResponse) async def sign_in(request: SignInRequest): supabase = get_supabase_client() try: response = supabase.auth.sign_in_with_password({ "email": request.email, "password": request.password, }) return AuthResponse( access_token=response.session.access_token, refresh_token=response.session.refresh_token, user_id=str(response.user.id), ) except Exception as e: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid credentials", ) @router.post("/signout") async def sign_out(): supabase = get_supabase_client() supabase.auth.sign_out() return {"message": "Signed out"} ``` ### src/api/routes/posts.py ```python from uuid import UUID from fastapi import APIRouter, HTTPException, status from sqlmodel import select from src.api.deps import CurrentUser, DbSession from src.db.models import Post, PostCreate, PostRead router = APIRouter(prefix="/posts", tags=["posts"]) @router.get("/", response_model=list[PostRead]) async def list_posts( db: DbSession, 
published_only: bool = True, ): query = select(Post) if published_only: query = query.where(Post.published == True) query = query.order_by(Post.created_at.desc()) result = await db.execute(query) return result.scalars().all() @router.get("/me", response_model=list[PostRead]) async def list_my_posts( db: DbSession, user: CurrentUser, ): query = select(Post).where(Post.author_id == UUID(user.id)) result = await db.execute(query) return result.scalars().all() @router.post("/", response_model=PostRead, status_code=status.HTTP_201_CREATED) async def create_post( db: DbSession, user: CurrentUser, post_in: PostCreate, ): post = Post( **post_in.model_dump(), author_id=UUID(user.id), ) db.add(post) await db.commit() await db.refresh(post) return post @router.get("/{post_id}", response_model=PostRead) async def get_post( db: DbSession, post_id: UUID, ): result = await db.execute(select(Post).where(Post.id == post_id)) post = result.scalar_one_or_none() if not post: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail="Post not found", ) return post @router.delete("/{post_id}", status_code=status.HTTP_204_NO_CONTENT) async def delete_post( db: DbSession, user: CurrentUser, post_id: UUID, ): result = await db.execute( select(Post).where(Post.id == post_id, Post.author_id == UUID(user.id)) ) post = result.scalar_one_or_none() if not post: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail="Post not found", ) await db.delete(post) await db.commit() ``` --- ## Main Application ### src/main.py ```python from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from src.api.routes import auth, posts app = FastAPI(title="My API") # CORS app.add_middleware( CORSMiddleware, allow_origins=["*"], # Configure for production allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Routes app.include_router(auth.router, prefix="/api") app.include_router(posts.router, prefix="/api") @app.get("/health") async def health_check(): 
return {"status": "healthy"} ``` --- ## Alembic Migrations ### Initialize Alembic ```bash alembic init alembic ``` ### alembic/env.py (key changes) ```python from src.db.models import SQLModel from src.core.config import get_settings settings = get_settings() # Use async engine config.set_main_option("sqlalchemy.url", settings.database_url) target_metadata = SQLModel.metadata def run_migrations_online(): # For async import asyncio from sqlalchemy.ext.asyncio import create_async_engine connectable = create_async_engine(settings.database_url) async def do_run_migrations(): async with connectable.connect() as connection: await connection.run_sync(do_run_migrations_sync) def do_run_migrations_sync(connection): context.configure( connection=connection, target_metadata=target_metadata, ) with context.begin_transaction(): context.run_migrations() asyncio.run(do_run_migrations()) ``` ### Migration Commands ```bash # Create migration alembic revision --autogenerate -m "create posts table" # Apply migrations alembic upgrade head # Rollback alembic downgrade -1 ``` --- ## Storage ### Upload File ```python from fastapi import UploadFile from src.services.supabase import get_supabase_client async def upload_avatar(user_id: str, file: UploadFile) -> str: supabase = get_supabase_client() file_content = await file.read() file_path = f"{user_id}/avatar.{file.filename.split('.')[-1]}" response = supabase.storage.from_("avatars").upload( file_path, file_content, {"content-type": file.content_type, "upsert": "true"}, ) # Get public URL url = supabase.storage.from_("avatars").get_public_url(file_path) return url ``` ### Download File ```python def get_avatar_url(user_id: str) -> str: supabase = get_supabase_client() return supabase.storage.from_("avatars").get_public_url(f"{user_id}/avatar.png") ``` --- ## Realtime (Async) ```python import asyncio from supabase import create_client async def listen_to_posts(): supabase = create_client( settings.supabase_url, settings.supabase_anon_key 
) def handle_change(payload): print(f"Change received: {payload}") channel = supabase.channel("posts") channel.on_postgres_changes( event="*", schema="public", table="posts", callback=handle_change, ).subscribe() # Keep listening while True: await asyncio.sleep(1) ``` --- ## Testing ### tests/conftest.py ```python import pytest from httpx import AsyncClient, ASGITransport from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession from sqlalchemy.orm import sessionmaker from src.main import app from src.db.session import get_db from src.db.models import SQLModel TEST_DATABASE_URL = "postgresql+asyncpg://postgres:postgres@localhost:54322/postgres_test" engine = create_async_engine(TEST_DATABASE_URL) TestingSessionLocal = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) @pytest.fixture(scope="function") async def db_session(): async with engine.begin() as conn: await conn.run_sync(SQLModel.metadata.create_all) async with TestingSessionLocal() as session: yield session async with engine.begin() as conn: await conn.run_sync(SQLModel.metadata.drop_all) @pytest.fixture async def client(db_session): async def override_get_db(): yield db_session app.dependency_overrides[get_db] = override_get_db async with AsyncClient( transport=ASGITransport(app=app), base_url="http://test", ) as ac: yield ac app.dependency_overrides.clear() ``` ### tests/test_posts.py ```python import pytest from httpx import AsyncClient @pytest.mark.asyncio async def test_list_posts(client: AsyncClient): response = await client.get("/api/posts/") assert response.status_code == 200 assert isinstance(response.json(), list) ``` --- ## Running the App ```bash # Development uvicorn src.main:app --reload --port 8000 # Production uvicorn src.main:app --host 0.0.0.0 --port 8000 --workers 4 ``` --- ## Anti-Patterns - **Using Supabase client for DB queries** - Use SQLAlchemy/SQLModel - **Sync database calls** - Use async with asyncpg - **Hardcoded credentials** - Use environment 
variables - **No connection pooling** - Reuse one SQLAlchemy async engine; it pools connections by default - **Missing auth dependency** - Always validate JWT - **Not closing sessions** - Use context managers - **Blocking I/O in async** - Use async libraries ================================================ FILE: skills/team-coordination/SKILL.md ================================================ --- name: team-coordination description: Multi-person projects - shared state, todo claiming, handoffs when-to-use: When multiple developers are working on the same repo user-invocable: false effort: low --- # Team Coordination Skill **Purpose:** Enable multiple Claude Code sessions across a team to coordinate and work together without conflicts. Manages shared state, todo claiming, decision syncing, and session awareness. --- ## Core Philosophy ``` ┌─────────────────────────────────────────────────────────────────┐ │ TEAM CLAUDE CODE │ │ ───────────────────────────────────────────────────────────── │ │ Multiple devs, multiple Claude sessions, one codebase. │ │ Coordination > Speed. Communication > Assumptions. │ │ │ │ Before you start: Check who's working on what. │ │ Before you claim: Make sure nobody else has it. │ │ Before you decide: Check if it's already decided. │ │ Before you push: Pull and sync state. 
│ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Team State Structure When a project becomes multi-person, create this structure: ``` _project_specs/ ├── team/ │ ├── state.md # Who's working on what right now │ ├── contributors.md # Team members and their focus areas │ └── handoffs/ # Notes when passing work to others │ └── [feature]-handoff.md ├── session/ │ ├── current-state.md # YOUR session state (personal) │ ├── decisions.md # SHARED - architectural decisions │ └── code-landmarks.md # SHARED - important code locations └── todos/ ├── active.md # SHARED - with claim annotations ├── backlog.md # SHARED └── completed.md # SHARED ``` --- ## Team State File **`_project_specs/team/state.md`:** ```markdown # Team State *Last synced: [timestamp]* ## Active Sessions | Contributor | Working On | Started | Files Touched | Status | |-------------|------------|---------|---------------|--------| | @alice | TODO-042: Add auth | 2024-01-15 10:30 | src/auth/* | 🟢 Active | | @bob | TODO-038: Fix checkout | 2024-01-15 09:00 | src/cart/* | 🟡 Paused | | - | - | - | - | - | ## Claimed Todos | Todo | Claimed By | Since | ETA | |------|------------|-------|-----| | TODO-042 | @alice | 2024-01-15 | Today | | TODO-038 | @bob | 2024-01-14 | Tomorrow | ## Recently Completed (Last 48h) | Todo | Completed By | When | PR | |------|--------------|------|-----| | TODO-037 | @alice | 2024-01-14 | #123 | ## Conflicts to Watch | Area | Contributors | Notes | |------|--------------|-------| | src/auth/* | @alice, @carol | Carol needs auth for TODO-045, coordinate | ## Announcements - [2024-01-15] @alice: Refactoring auth module, avoid touching until EOD - [2024-01-14] @bob: New env var required: STRIPE_WEBHOOK_SECRET ``` --- ## Contributors File **`_project_specs/team/contributors.md`:** ```markdown # Contributors ## Team Members | Handle | Name | Focus Areas | Timezone | Status | |--------|------|-------------|----------|--------| | @alice | Alice Smith | Backend, 
Auth | EST | Active | | @bob | Bob Jones | Frontend, Payments | PST | Active | | @carol | Carol White | DevOps, Infra | GMT | Part-time | ## Ownership | Area | Primary | Backup | Notes | |------|---------|--------|-------| | Authentication | @alice | @bob | All auth changes need @alice review | | Payments | @bob | @alice | Stripe integration | | Infrastructure | @carol | @alice | Deploy scripts, CI/CD | | Database | @alice | @carol | Migrations need sign-off | ## Communication - Slack: #project-name - PRs: Always tag area owner for review - Urgent: DM on Slack ## Working Hours Overlap ~~~ EST: |████████████████████| PST: | ████████████████████| GMT: |████████████| 6am 12pm 6pm 12am EST Best overlap: 9am-12pm EST (all three) ~~~ ``` --- ## Workflow ### Starting a Session ``` ┌─────────────────────────────────────────────────────────────────┐ │ START SESSION CHECKLIST │ │ ───────────────────────────────────────────────────────────── │ │ 1. git pull origin main │ │ 2. Read _project_specs/team/state.md │ │ 3. Check claimed todos - don't take what's claimed │ │ 4. Claim your todo in active.md │ │ 5. Update state.md with your session │ │ 6. Push state changes before starting work │ │ 7. Start working │ └─────────────────────────────────────────────────────────────────┘ ``` ### Claiming a Todo In `active.md`, add claim annotation: ```markdown ## [TODO-042] Add email validation **Status:** in-progress **Claimed:** @alice (2024-01-15 10:30 EST) **ETA:** Today ... ``` ### During Work - Update `state.md` if you touch new files - Check `decisions.md` before making architectural choices - If you make a decision, add it to `decisions.md` immediately - Push state updates every 1-2 hours (keeps team in sync) ### Ending a Session ``` ┌─────────────────────────────────────────────────────────────────┐ │ END SESSION CHECKLIST │ │ ───────────────────────────────────────────────────────────── │ │ 1. Commit your work (even if WIP) │ │ 2. Update your current-state.md │ │ 3. 
Update team state.md (status → Paused or Done) │ │ 4. If passing to someone: create handoff note │ │ 5. Unclaim todo if abandoning │ │ 6. Push everything │ └─────────────────────────────────────────────────────────────────┘ ``` ### Creating a Handoff When passing work to another team member, create: **`_project_specs/team/handoffs/auth-feature-handoff.md`:** ```markdown # Handoff: Auth Feature (TODO-042) **From:** @alice **To:** @bob **Date:** 2024-01-15 ## Status 70% complete. Core auth flow works, need to add: - [ ] Password reset flow - [ ] Email verification ## What's Done - Login/logout working - JWT tokens implemented - Session management done ## What's Left 1. Password reset - see src/auth/reset.ts (skeleton exists) 2. Email verification - need to integrate SendGrid ## Key Decisions Made - Using JWT not sessions (see decisions.md) - Tokens expire in 7 days - Refresh tokens stored in httpOnly cookies ## Watch Out For - The `validateToken` function has a weird edge case with expired tokens - Don't touch `authMiddleware.ts` - it's fragile rn ## Files to Start With 1. src/auth/reset.ts - password reset 2. src/email/verification.ts - email flow 3. tests/auth.test.ts - add tests here ## Questions? Slack me @alice if stuck ``` --- ## Conflict Prevention ### File-Level Awareness Before modifying a file, check state.md for who's touching what: ```markdown ## Active Sessions | Contributor | Working On | Started | Files Touched | Status | |-------------|------------|---------|---------------|--------| | @alice | TODO-042 | ... | src/auth/*, src/middleware/* | 🟢 Active | ``` If you need to touch `src/auth/*` and Alice is working there: 1. Check if it's truly conflicting (same file? same functions?) 2. Coordinate via Slack before proceeding 3. 
Add a note to "Conflicts to Watch" section ### Pre-Push Check Before pushing, always: ```bash git pull origin main # Resolve any conflicts git push ``` ### PR Tagging Always tag area owners in PRs: ```markdown ## PR: Add password reset flow Implements TODO-042 cc: @alice (auth owner), @bob (reviewer) ### Changes - Added password reset endpoint - Added email templates ### Testing - [ ] Unit tests pass - [ ] Manual testing done ``` --- ## Decision Syncing ### Before Making a Decision 1. Pull latest `decisions.md` 2. Check if decision already exists 3. If similar decision exists, follow it (consistency > preference) 4. If new decision needed, add it and push immediately ### Decision Format ```markdown ## [2024-01-15] JWT vs Sessions for Auth (@alice) **Decision:** Use JWT tokens **Context:** Need auth for API and mobile app **Options:** 1. Sessions - simpler, server-side state 2. JWT - stateless, works for mobile **Choice:** JWT **Reasoning:** Mobile app needs stateless auth, JWT works across platforms **Trade-offs:** Token revocation is harder, need refresh token strategy **Approved by:** @bob, @carol ``` --- ## Commands ### Check Team State ```bash # See who's working on what cat _project_specs/team/state.md # Quick active sessions check grep "🟢 Active" _project_specs/team/state.md ``` ### Claim a Todo 1. Edit `_project_specs/todos/active.md` 2. Add claim annotation to todo 3. Update `_project_specs/team/state.md` 4. Commit and push ### Release a Claim 1. Remove claim annotation from todo 2. Update state.md (remove from Claimed Todos) 3. Commit and push --- ## Git Hooks for Teams ### Pre-Push Hook Addition Add team state sync check to pre-push: ```bash # In .git/hooks/pre-push (add to existing) # Check if team state is current echo "🔄 Checking team state..." 
git fetch origin main --quiet # Compare git blob IDs: portable, unlike md5 (macOS/BSD only) vs md5sum (Linux); empty when the file is absent LOCAL_STATE=$(git rev-parse --quiet --verify "HEAD:_project_specs/team/state.md" || true) REMOTE_STATE=$(git rev-parse --quiet --verify "origin/main:_project_specs/team/state.md" || true) if [ "$LOCAL_STATE" != "$REMOTE_STATE" ]; then echo "⚠️ Team state has changed on remote!" echo " Run: git pull origin main" echo " Then check _project_specs/team/state.md for updates" # Warning only, don't block fi ``` --- ## Claude Instructions ### At Session Start When user starts a session in a team project: 1. Check for `_project_specs/team/state.md` 2. If exists, read it and report: - Who's currently active - What todos are claimed - Any conflicts to watch - Recent announcements 3. Ask what they want to work on 4. Check if it's already claimed 5. Help them claim and update state ### During Session - Before touching files, check if someone else is working there - Before making decisions, check decisions.md - Remind user to update state periodically ### At Session End - Prompt user to update state.md - Ask if they need to create a handoff - Remind them to push state changes --- ## Single → Multi-Person Conversion When a project needs team coordination: 1. Run `/check-contributors` 2. Create `_project_specs/team/` structure 3. Initialize `state.md` and `contributors.md` 4. Add claim annotations to active todos 5. 
Update CLAUDE.md to reference team-coordination.md skill --- ## Quick Reference ### Status Icons ``` 🟢 Active - Currently working 🟡 Paused - Stepped away, will return 🔴 Blocked - Needs help/waiting on something ⚪ Offline - Not working today ``` ### Claim Format ```markdown **Claimed:** @handle (YYYY-MM-DD HH:MM TZ) ``` ### Daily Standup Template ```markdown ## Standup [DATE] ### @alice - Yesterday: Finished TODO-042 auth flow - Today: Starting TODO-045 password reset - Blockers: None ### @bob - Yesterday: Fixed checkout bug - Today: Payment webhook integration - Blockers: Need STRIPE_WEBHOOK_SECRET from @carol ``` --- ## Checklist ### Starting Work - [ ] `git pull origin main` - [ ] Read `team/state.md` - [ ] Check todo not claimed - [ ] Claim todo in `active.md` - [ ] Update `state.md` - [ ] Push state changes ### Ending Work - [ ] Commit all changes - [ ] Update `current-state.md` - [ ] Update `team/state.md` - [ ] Create handoff if needed - [ ] Push everything ================================================ FILE: skills/ticket-craft/SKILL.md ================================================ --- name: ticket-craft description: Create Jira/Asana/Linear tickets optimized for Claude Code execution - AI-native ticket writing when-to-use: When creating tickets, breaking down epics, or writing specs for AI agent execution user-invocable: true effort: medium --- # Ticket Craft Skill *Write software tickets that AI agents can execute autonomously.* **Purpose:** Define a ticket format that combines software engineering best practices (INVEST, Given-When-Then, Definition of Ready) with Claude Code-specific context requirements. Every ticket created with this skill is "Claude Code Ready" - meaning an agent can pick it up and execute it without asking clarifying questions. **Works with:** Jira, Asana, Linear, GitHub Issues, or any ticket system. 
--- ## Core Principle ``` ┌─────────────────────────────────────────────────────────────────┐ │ A TICKET IS A PROMPT │ │ ────────────────────────────────────────────────────────────── │ │ │ │ Traditional tickets are written for humans who can: │ │ - Ask clarifying questions in Slack │ │ - Draw on institutional knowledge │ │ - Infer intent from vague descriptions │ │ │ │ AI agents cannot do any of this. │ │ │ │ Every ticket must be SELF-CONTAINED: │ │ - Explicit file references (not "the auth module") │ │ - Pattern references (not "follow our conventions") │ │ - Verification criteria (not "make sure it works") │ │ - Constraints (not just what to do, but what NOT to do) │ │ - Test commands (not "run the tests") │ │ │ │ If Claude Code can execute it without asking a question, │ │ the ticket is ready. If it can't, it's not. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## The INVEST+C Criteria Standard INVEST plus **C for Claude-Ready**: | Criterion | Question | Fails If... | |-----------|----------|-------------| | **I** - Independent | Can this be completed without waiting on another ticket? | Blocked by undocumented dependencies | | **N** - Negotiable | Is there room to adjust implementation approach? | Over-specifies implementation details | | **V** - Valuable | Can you articulate who benefits and how? | No clear user or business value | | **E** - Estimable | Does the team understand enough to size it? | Too vague or too large to estimate | | **S** - Small | Can one person finish this in 1-3 days? | More than 5 acceptance criteria | | **T** - Testable | Can you write a pass/fail test for it? | Uses vague language like "fast" or "good UX" | | **C** - Claude-Ready | Can an AI agent execute this without clarifying questions? | Missing file refs, patterns, verification, or constraints | --- ## Ticket Types ### 1. 
Feature Ticket ```markdown ## [PROJ-XXX] {Verb} {Feature} for {User} **Type:** Feature **Priority:** {Critical | High | Medium | Low} **Points:** {1 | 2 | 3 | 5 | 8} **Labels:** {frontend, backend, api, database, etc.} **Epic:** {Parent epic} --- ### User Story As a {specific persona}, I want to {specific action}, so that {measurable benefit}. ### Background {1-2 paragraphs on why this matters. Link to product brief, user research, or business justification. Include any relevant metrics or user feedback.} ### Acceptance Criteria **AC1: {Happy path scenario}** Given {precondition}, when {action}, then {expected result}. **AC2: {Edge case / error scenario}** Given {precondition}, when {action}, then {expected result}. **AC3: {Boundary condition}** Given {precondition}, when {action}, then {expected result}. ### Out of Scope - {Explicitly state what this ticket does NOT include} - {Prevents scope creep and keeps ticket small} --- ### Claude Code Context #### Relevant Files (read these first) - `src/services/example.ts` - Existing service to extend - `src/models/example.ts` - Data model definition - `src/api/routes/example.ts` - Existing endpoint patterns to follow #### Pattern Reference Follow the pattern in `src/services/user.ts` for service layer implementation. Follow the pattern in `src/api/routes/users.ts` for route definition. Follow the pattern in `tests/services/user.test.ts` for test structure. 
#### Database Changes - {Table to create/modify, columns, types} - {Migration file location: `supabase/migrations/` or `prisma/migrations/`} - {RLS policies if using Supabase} #### API Contract ``` POST /api/{resource} Request: { field1: string, field2: number } Response: { id: string, field1: string, created_at: string } Error: { error: string, code: number } ``` #### Constraints - Do NOT modify {specific files or modules} - Do NOT add new dependencies without approval - Follow existing error handling in `src/core/exceptions.ts` - {Any performance budgets: response time < 200ms, bundle size < 50KB} #### Verification ```bash # Run specific tests npm test -- --grep "{feature name}" # Lint check npm run lint # Type check npm run typecheck # Full validation npm test -- --coverage ``` #### Environment Variables - Existing: {list vars already in .env that are relevant} - New required: {list any new vars needed} --- ### Dependencies - Blocked by: {PROJ-XXX} ({brief description}) - Blocks: {PROJ-YYY} ({brief description}) ### Design - Mockup: {link to Figma/design if applicable} ``` --- ### 2. Bug Ticket ```markdown ## [BUG-XXX] Fix: {Component} - {Symptom} **Type:** Bug **Priority:** {Critical | High | Medium | Low} **Points:** {1 | 2 | 3 | 5} **Labels:** {regression, ux-bug, data-bug, security-bug} **Severity:** {Blocks users | Degrades experience | Cosmetic} --- ### Bug Summary {One sentence: what is broken and who is affected.} ### Environment - Browser/OS: {e.g., Chrome 120 / macOS 14.2} - Environment: {Production | Staging | Local} - User type: {Anonymous | Authenticated | Admin} - First observed: {date} ### Steps to Reproduce 1. {Navigate to / perform action} 2. {Perform next action} 3. {Perform next action} 4. **Observe:** {incorrect behavior} ### Expected Behavior {What should happen instead.} ### Actual Behavior {What actually happens. 
Include error messages, console output, screenshots.} ### Impact - Users affected: {percentage or count} - Frequency: {every time | intermittent | specific conditions} - Workaround: {exists / none} --- ### Claude Code Context #### Suspected Root Cause {Where the bug likely lives, if known.} - File: `src/components/LoginForm.tsx:87` - Issue: `isSubmitting` state set to `true` on validation error but never reset #### Relevant Files - `src/components/LoginForm.tsx` - Form component with the bug - `tests/components/LoginForm.test.tsx` - Existing tests (gap here) - `src/hooks/useAuth.ts` - Auth hook used by the form #### Test Gap Analysis - Existing tests cover: {what's currently tested} - Missing test: {what test would have caught this bug} #### Bug Fix Workflow (TDD) 1. Write a failing test that reproduces the bug 2. Verify the test fails (confirms the bug exists) 3. Fix the bug with minimum code change 4. Verify the test passes 5. Run full test suite to check for regressions #### Verification ```bash # Run the specific test npm test -- --grep "LoginForm submit" # Run related tests npm test -- src/components/LoginForm.test.tsx # Full regression check npm test ``` #### Constraints - Fix the bug only - do NOT refactor surrounding code - Do NOT change the component's public API - Ensure all existing tests continue to pass ``` --- ### 3. Tech Debt Ticket ```markdown ## [TECH-XXX] Refactor: {Area} - {Improvement} **Type:** Tech Debt **Priority:** {High | Medium | Low} **Points:** {3 | 5 | 8} **Labels:** {refactor, performance, maintainability, testing} --- ### Problem Statement {What is wrong with the current implementation and why it matters. 
Include concrete pain points: slow CI, frequent bugs, developer confusion.} ### Current State - File: `{path}` ({N} lines) - Test coverage: {X}% - Cyclomatic complexity: {N} - Related bugs: {PROJ-XXX, PROJ-YYY} - Pain frequency: {how often this causes issues} ### Proposed Change {What specifically should change and why this approach.} ### Acceptance Criteria - [ ] {Specific structural change completed} - [ ] All existing tests pass without modifying test assertions - [ ] No public API changes (existing consumers unaffected) - [ ] Test coverage >= {X}% - [ ] {Measurable improvement metric} ### Risk Assessment - Risk level: {Low | Medium | High} - Mitigation: {run full regression, deploy behind flag, etc.} ### Business Justification {Why this is worth doing now. E.g., "Reduces average bug fix time from 4h to 1h" or "Enables upcoming feature PROJ-XXX which requires clean separation."} --- ### Claude Code Context #### Relevant Files - `{file}` - Current implementation to refactor - `{test file}` - Existing tests (must not break) - `{dependent file}` - Consumer of the API being refactored #### Pattern Reference Follow the pattern established in `{good example file}` for the new structure. #### Constraints - Do NOT change public APIs or exports - Do NOT modify test assertions (tests should pass as-is) - Do NOT introduce new dependencies - Keep backwards compatibility #### Verification ```bash # Existing tests must pass unchanged npm test # No type errors npm run typecheck # Lint clean npm run lint # Coverage target npm test -- --coverage ``` ``` --- ### 4. Epic Breakdown Ticket ```markdown ## [EPIC-XXX] {Epic Name} **Type:** Epic **Priority:** {Critical | High | Medium} **Target:** {Sprint/milestone} --- ### Objective {One paragraph: what this epic achieves and why it matters.} ### Success Metrics - {Measurable outcome 1} - {Measurable outcome 2} ### User Workflows {The user journey this epic covers, broken into steps.} 1. {Step 1: Discovery/Entry} 2. 
{Step 2: Core Action} 3. {Step 3: Completion/Result} ### Ticket Breakdown | # | Ticket | Type | Points | Dependencies | |---|--------|------|--------|-------------| | 1 | {title} | Feature | 3 | None | | 2 | {title} | Feature | 5 | #1 | | 3 | {title} | Feature | 3 | None | | 4 | {title} | Feature | 2 | #2, #3 | | 5 | {title} | Tech Debt | 3 | None | ### Slicing Strategy {How the epic was broken down. Reference the technique used.} ### Agent Team Mapping {If using agent teams, how features map to agents.} - Feature Agent 1: Tickets #1, #2 - Feature Agent 2: Tickets #3, #4 - Parallel execution: #1 and #3 can run simultaneously - Sequential: #2 depends on #1, #4 depends on #2 and #3 ``` --- ## Epic Slicing Techniques When breaking an epic into tickets, use one of these strategies: | Technique | When to Use | Example | |-----------|-------------|---------| | **By workflow step** | Clear user journey | Browse > Play > Save > Share | | **By data variation** | Multiple data types | Text posts, images, videos | | **By user role** | Different permissions | Anonymous, authenticated, admin | | **By CRUD** | Data operations | Create, Read, Update, Delete | | **Happy path first** | Incremental delivery | Success flow first, then errors | | **By boundary** | System integration | Frontend, API, database separately | ### Rules of Thumb - Each ticket: **1-3 days** of work for one developer/agent - More than **5 acceptance criteria** = split the ticket - More than **8 story points** = definitely split - Every ticket should be **independently deployable** (even behind a flag) - Order tickets: **simplest, most foundational first** --- ## The Claude Code Ready Checklist Before a ticket is ready for an AI agent to execute, verify: ``` ┌─────────────────────────────────────────────────────────────────┐ │ CLAUDE CODE READY CHECKLIST │ │ ────────────────────────────────────────────────────────────── │ │ │ │ CONTEXT │ │ ☐ Relevant files listed with full paths │ │ ☐ Pattern reference points 
to a real file to follow │ │ ☐ API contract defined (request/response shapes) │ │ ☐ Database changes specified (tables, columns, migrations) │ │ ☐ Environment variables listed (existing + new) │ │ │ │ SCOPE │ │ ☐ Out of Scope section explicitly states what NOT to do │ │ ☐ Constraints section lists files/modules NOT to modify │ │ ☐ Ticket covers one logical change (atomic) │ │ ☐ Estimable at ≤ 5 story points │ │ │ │ VERIFICATION │ │ ☐ Test command provided (exact command, not "run tests") │ │ ☐ Lint command provided │ │ ☐ Typecheck command provided │ │ ☐ Acceptance criteria are Given-When-Then or checkboxed │ │ ☐ Each criterion is independently pass/fail testable │ │ │ │ QUALITY │ │ ☐ Title is imperative verb + object + context │ │ ☐ Title under 80 characters │ │ ☐ Description explains WHY, not just WHAT │ │ ☐ 2-5 acceptance criteria (not more) │ │ ☐ No vague language ("fast", "good UX", "clean") │ │ │ │ If any box is unchecked, the ticket is NOT ready. │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Anti-Patterns (Never Do These) ### 1. The Title-Only Ticket ``` Title: Fix login Description: (empty) ``` **Why it fails:** No context, no acceptance criteria, no file references. Claude Code will guess and likely guess wrong. ### 2. The Novel ``` Title: Implement new onboarding Description: (3 pages mixing UI, backend, analytics, email, and future ideas) ``` **Why it fails:** Not small, not independent. Agent teams can't parallelize this. Split into 5+ tickets. ### 3. The Vague Requirement ``` Acceptance Criteria: - Should be fast - UX should be good - Should work on mobile ``` **Why it fails:** Unmeasurable, untestable. Replace with: "Response time < 200ms", "Passes WCAG 2.1 AA", "No horizontal scroll at 320px viewport." ### 4. The Over-Specified Solution ``` Title: Use Redis to cache user sessions Description: Install Redis, configure connection pooling, set TTL to 3600... 
``` **Why it fails:** Prescribes the solution instead of the problem. Should describe "Session lookups take 500ms, need < 50ms" and let the agent choose the approach. ### 5. The Missing Files Ticket ``` Description: Update the auth module to support OAuth. ``` **Why it fails for AI:** "The auth module" could be 20 files. Claude Code needs: `src/services/auth.ts`, `src/middleware/auth.ts`, `src/routes/auth.ts` - specific paths. ### 6. The No-Verification Ticket ``` Acceptance Criteria: - OAuth login works - Users can sign in with Google ``` **Why it fails:** No test command, no verification steps. Claude Code performs dramatically better when it can verify its own work. --- ## Good vs Bad Examples ### Bad: Vague Feature Ticket ``` Title: Add rate limiting to the API Description: We need rate limiting on our endpoints. ``` ### Good: Claude Code Ready Feature Ticket ``` Title: Add sliding window rate limiter to /api/generate endpoint User Story: As an API consumer, I want requests to be rate-limited so that the service remains available under heavy load. Acceptance Criteria: AC1: Given an authenticated user making requests, when they exceed 10 requests per minute, then return 429 with Retry-After header. AC2: Given a rate-limited user, when the window expires, then requests succeed again. AC3: Given an unauthenticated request, when it hits /api/generate, then return 401 (rate limiting only applies to authed users). 
Claude Code Context: - Pattern: Follow `src/middleware/throttle.ts` for middleware structure - File: Create `src/middleware/rateLimit.ts` - Test: Create `tests/middleware/rateLimit.test.ts` - Route: Modify `src/api/routes/generate.ts` to add middleware - Constraint: Do NOT modify existing middleware or other endpoints Verification: npm test -- --grep "rate-limit" npm run lint npm run typecheck ``` --- ## Mapping Tickets to Agent Teams When using the agent-teams workflow, tickets map directly to the 10-task pipeline: | Ticket Section | Maps To | Agent | |---------------|---------|-------| | Title + Description | Task 1: `{name}-spec` | Feature Agent | | Acceptance Criteria | Task 3: `{name}-tests` | Feature Agent (writes tests from AC) | | Pattern Reference | Task 5: `{name}-implement` | Feature Agent (follows pattern) | | Verification section | Task 6-7: verify + validate | Quality Agent + Feature Agent | | Constraints | Enforced throughout | All agents | | Claude Code Context | Loaded at start | Feature Agent reads first | ### Ticket → Agent Team Flow ``` 1. Create ticket using templates above 2. Ticket becomes the feature spec in _project_specs/features/ 3. Team Lead reads spec, creates 10-task dependency chain 4. Feature Agent uses ticket's Claude Code Context to start 5. Quality Agent uses ticket's Acceptance Criteria to verify 6. Review Agent reviews against ticket's Constraints 7. Security Agent scans based on ticket's scope 8. 
Merger Agent creates PR referencing the ticket ID ``` --- ## Ticket Title Conventions | Type | Format | Example | |------|--------|---------| | Feature | `Add {feature} for {user}` | Add episode bookmarking for listeners | | Enhancement | `Improve {what} in {where}` | Improve search performance in episode feed | | Bug | `Fix: {Component} - {Symptom}` | Fix: PlayerBar - audio stops on tab switch | | Tech Debt | `Refactor: {Area} - {Goal}` | Refactor: AuthService - extract token management | | Security | `Security: {What} in {Where}` | Security: add input sanitization to comment API | | Chore | `Chore: {What}` | Chore: upgrade React from 18 to 19 | **Rules:** - Start with an imperative verb (Add, Fix, Improve, Refactor, Remove) - Under 80 characters - Include the component/area affected - Be specific enough to distinguish from other tickets --- ## Story Points for AI Agents AI agents estimate differently than humans. Use this calibration: | Points | Scope | Agent Time | Example | |--------|-------|-----------|---------| | **1** | Single file, < 20 lines changed | ~5 min | Fix a typo, update a config value | | **2** | 1-2 files, straightforward | ~15 min | Add a field to a form, update an API response | | **3** | 2-4 files, clear path | ~30 min | New API endpoint following existing pattern | | **5** | 4-8 files, some decisions | ~1 hour | New feature with tests, models, and routes | | **8** | 8+ files, complex | ~2 hours | Integration with external service, new data model | | **13** | Too large, split required | - | Full authentication system, major refactor | **Rule:** If > 5 points, consider splitting. If 13, always split. 
--- ## Integration with Ticket Systems ### Jira - Use custom field "Claude Code Context" for the AI-specific section - Use labels: `claude-ready`, `needs-context`, `ai-blocked` - Link tickets with "blocks/blocked by" for dependency chains ### Asana - Use custom fields for Priority, Points, Type - Use subtasks for the 10-task pipeline steps - Use tags: `claude-ready`, `needs-refinement` ### Linear - Use issue templates with the Claude Code Context section built-in - Use labels for ticket type and claude-readiness - Use projects to group tickets into epics ### GitHub Issues - Use issue templates (`.github/ISSUE_TEMPLATE/`) - Use labels: `feature`, `bug`, `tech-debt`, `claude-ready` - Use milestones for epics --- ## Command: /create-ticket When the user asks to create a ticket, follow this workflow: ### Step 1: Gather Context Ask the user: 1. What type? (Feature / Bug / Tech Debt) 2. Brief description of what needs to be done 3. Which part of the codebase is involved? ### Step 2: Auto-Detect Context - Read the relevant files to understand current implementation - Identify the pattern to follow from existing code - Find existing tests to understand test conventions - Check for related files that might be affected ### Step 3: Generate Ticket Use the appropriate template above, filling in: - All Claude Code Context fields (auto-detected) - Acceptance criteria (derived from description) - Verification commands (from project's CLAUDE.md or package.json) - Constraints (based on codebase analysis) ### Step 4: Validate with Checklist Run the Claude Code Ready Checklist against the generated ticket. Flag any unchecked items for the user to address. ### Step 5: Output Present the ticket in the template format, ready to paste into Jira/Asana/Linear/GitHub Issues. 
--- ## Definition of Ready (for Sprint) A ticket can enter a sprint when: - [ ] Passes INVEST+C criteria - [ ] Claude Code Ready Checklist is complete - [ ] Dependencies are identified and unblocked - [ ] Story points assigned - [ ] Design/mockups attached (if applicable) - [ ] Acceptance criteria reviewed by team ## Definition of Done A ticket is done when: - [ ] All acceptance criteria verified (pass/fail) - [ ] Tests written and passing - [ ] Code reviewed (no Critical/High issues) - [ ] Security scan passed - [ ] Lint and typecheck clean - [ ] Coverage >= 80% for new code - [ ] PR created with full pipeline results - [ ] Documentation updated (if applicable) ================================================ FILE: skills/typescript/SKILL.md ================================================ --- name: typescript description: TypeScript strict mode with eslint and jest when-to-use: When working on TypeScript files user-invocable: false paths: ["**/*.ts", "**/*.tsx", "tsconfig*.json"] effort: medium --- # TypeScript Skill --- ## Strict Mode (Non-Negotiable) ```json // tsconfig.json { "compilerOptions": { "strict": true, "noImplicitAny": true, "strictNullChecks": true, "noUnusedLocals": true, "noUnusedParameters": true, "noImplicitReturns": true, "esModuleInterop": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true } } ``` --- ## Project Structure ``` project/ ├── src/ │ ├── core/ # Pure business logic │ │ ├── types.ts # Domain types/interfaces │ │ ├── services/ # Pure functions │ │ └── index.ts # Public API │ ├── infra/ # Side effects │ │ ├── api/ # HTTP handlers │ │ ├── db/ # Database operations │ │ └── external/ # Third-party integrations │ └── utils/ # Shared utilities ├── tests/ │ ├── unit/ │ └── integration/ ├── package.json ├── tsconfig.json └── CLAUDE.md ``` --- ## Tooling (Required) ```json // package.json scripts { "scripts": { "lint": "eslint src/ --ext .ts,.tsx", "typecheck": "tsc --noEmit", "test": "jest", "test:coverage": "jest 
--coverage", "format": "prettier --write 'src/**/*.ts'" } } ``` ```javascript // eslint.config.js import eslint from '@eslint/js'; import tseslint from 'typescript-eslint'; export default tseslint.config( eslint.configs.recommended, ...tseslint.configs.strictTypeChecked, { rules: { '@typescript-eslint/no-explicit-any': 'error', '@typescript-eslint/explicit-function-return-type': 'error', 'max-lines-per-function': ['error', 20], 'max-depth': ['error', 2], 'max-params': ['error', 3], } } ); ``` --- ## Testing with Jest ```typescript // tests/unit/services/pricing.test.ts import { calculateTotal } from '../../../src/core/services/pricing'; describe('calculateTotal', () => { it('returns sum of item prices', () => { // Arrange const items = [{ price: 10 }, { price: 20 }]; // Act const result = calculateTotal(items); // Assert expect(result).toBe(30); }); it('returns zero for empty array', () => { expect(calculateTotal([])).toBe(0); }); it('throws on invalid item', () => { expect(() => calculateTotal([{ invalid: 'item' }])).toThrow(); }); }); ``` --- ## GitHub Actions ```yaml name: TypeScript Quality Gate on: [push, pull_request] jobs: quality: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup Node uses: actions/setup-node@v4 with: node-version: '20' - name: Install dependencies run: npm ci - name: Lint run: npm run lint - name: Type Check run: npm run typecheck - name: Test with Coverage run: npm run test:coverage - name: Coverage Threshold (80%) run: npm run test:coverage -- --coverageThreshold='{"global":{"branches":80,"functions":80,"lines":80,"statements":80}}' ``` --- ## Pre-Commit Hooks Using Husky + lint-staged: ```bash npm install -D husky lint-staged npx husky init ``` ```json // package.json { "lint-staged": { "*.{ts,tsx}": [ "eslint --fix", "prettier --write" ] } } ``` ```bash # .husky/pre-commit npx lint-staged npx tsc --noEmit npm run test -- --onlyChanged --passWithNoTests ``` This runs on every commit: 1. 
ESLint + Prettier on staged files 2. Type check entire project 3. Tests for changed files only --- ## Type Patterns ### Discriminated Unions for Results ```typescript type Result<T> = | { ok: true; value: T } | { ok: false; error: string }; function parseUser(data: unknown): Result<User> { // Type-safe error handling without exceptions } ``` ### Branded Types for IDs ```typescript type UserId = string & { readonly brand: unique symbol }; type OrderId = string & { readonly brand: unique symbol }; // Can't accidentally pass UserId where OrderId expected function getOrder(orderId: OrderId): Order { ... } ``` ### Const Assertions for Literals ```typescript const STATUSES = ['pending', 'active', 'closed'] as const; type Status = typeof STATUSES[number]; // 'pending' | 'active' | 'closed' ``` ### Zod for Runtime Validation ```typescript import { z } from 'zod'; const UserSchema = z.object({ email: z.string().email(), name: z.string().min(1).max(100), }); type User = z.infer<typeof UserSchema>; ``` --- ## TypeScript Anti-Patterns - ❌ `any` type - use `unknown` and narrow - ❌ Type assertions (`as`) - use type guards - ❌ Non-null assertions (`!`) - handle null explicitly - ❌ `@ts-ignore` without explanation - ❌ Enums - use const objects or union types - ❌ Classes for data - use interfaces/types - ❌ Default exports - use named exports ================================================ FILE: skills/ui-mobile/SKILL.md ================================================ --- name: ui-mobile description: Mobile UI patterns - React Native, iOS/Android, touch targets when-to-use: When building mobile UI components user-invocable: false paths: ["**/*.tsx", "**/*.jsx", "ios/**", "android/**", "**/*.dart"] effort: medium --- # Mobile UI Design Skill (React Native) --- ## MANDATORY: Mobile Accessibility Standards **These rules are NON-NEGOTIABLE. Every UI element must pass these checks.** ### 1. 
Touch Targets (CRITICAL) ```typescript // MINIMUM 44x44 points for ALL interactive elements const MINIMUM_TOUCH_SIZE = 44; // EVERY button, link, icon button must meet this const styles = StyleSheet.create({ button: { minHeight: MINIMUM_TOUCH_SIZE, minWidth: MINIMUM_TOUCH_SIZE, paddingVertical: 12, paddingHorizontal: 16, }, iconButton: { width: MINIMUM_TOUCH_SIZE, height: MINIMUM_TOUCH_SIZE, justifyContent: 'center', alignItems: 'center', }, }); // NEVER DO THIS: style={{ height: 30 }} // ✗ TOO SMALL style={{ padding: 4 }} // ✗ RESULTS IN TINY TARGET ``` ### 2. Color Contrast (CRITICAL) ```typescript // WCAG 2.1 AA: 4.5:1 for text, 3:1 for large text/UI // SAFE COMBINATIONS: const colors = { // Light mode textPrimary: '#000000', // on white = 21:1 ✓ textSecondary: '#374151', // gray-700 on white = 9.2:1 ✓ // Dark mode textPrimaryDark: '#FFFFFF', // on gray-900 = 16:1 ✓ textSecondaryDark: '#E5E7EB', // gray-200 on gray-900 = 11:1 ✓ }; // FORBIDDEN - FAILS CONTRAST: // ✗ '#9CA3AF' (gray-400) on white = 2.6:1 // ✗ '#6B7280' (gray-500) on '#111827' = 4.0:1 // ✗ Any text below 4.5:1 ratio ``` ### 3. Visibility Rules ```typescript // ALL BUTTONS MUST HAVE visible boundaries // PRIMARY: Solid background with contrasting text Submit const styles = StyleSheet.create({ primaryButton: { backgroundColor: '#1F2937', // gray-800 paddingVertical: 16, paddingHorizontal: 24, borderRadius: 12, minHeight: 44, }, }); // SECONDARY: Visible background Cancel const styles = StyleSheet.create({ secondaryButton: { backgroundColor: '#F3F4F6', // gray-100 minHeight: 44, }, }); // GHOST: MUST have visible border Skip const styles = StyleSheet.create({ ghostButton: { borderWidth: 1, borderColor: '#D1D5DB', // gray-300 minHeight: 44, }, }); // NEVER CREATE invisible buttons: // ✗ backgroundColor: 'transparent' without border // ✗ Text color matching background ``` ### 4. 
Accessibility Labels (REQUIRED) ```tsx // EVERY interactive element needs accessibility props // Buttons Submit // Icon buttons (NO visible text = MUST have label) // Images ``` ### 5. Focus/Selection States ```tsx // EVERY Pressable needs visible pressed state [ styles.button, pressed && styles.buttonPressed, ]} > {children} const styles = StyleSheet.create({ button: { backgroundColor: '#1F2937', }, buttonPressed: { opacity: 0.7, // OR backgroundColor: '#374151', }, }); ``` --- ## Core Philosophy **Mobile UI is about touch, speed, and focus.** No hover states, smaller screens, thumb-friendly targets. Design for one-handed use and interruption recovery. ## Platform Differences ### iOS vs Android ```typescript import { Platform } from 'react-native'; // Platform-specific values const styles = StyleSheet.create({ shadow: Platform.select({ ios: { shadowColor: '#000', shadowOffset: { width: 0, height: 2 }, shadowOpacity: 0.1, shadowRadius: 8, }, android: { elevation: 4, }, }), // iOS uses SF Pro, Android uses Roboto text: { fontFamily: Platform.OS === 'ios' ? 
'System' : 'Roboto', }, }); ``` ### Design Language ``` iOS (Human Interface Guidelines) ───────────────────────────────── - Flat design with subtle depth - SF Symbols for icons - Large titles (34pt) - Rounded corners (10-14pt) - Blue as default tint Android (Material Design 3) ───────────────────────────────── - Material You dynamic color - Outlined/filled icons - Medium titles (22pt) - Rounded corners (12-28pt) - Primary color from theme ``` ## Spacing System ### 4px Base Grid ```typescript // React Native spacing - consistent scale const spacing = { xs: 4, sm: 8, md: 16, lg: 24, xl: 32, '2xl': 48, } as const; // Usage const styles = StyleSheet.create({ container: { padding: spacing.md, gap: spacing.sm, }, }); ``` ### Safe Areas ```tsx import { useSafeAreaInsets } from 'react-native-safe-area-context'; const Screen = ({ children }) => { const insets = useSafeAreaInsets(); return ( {children} ); }; ``` ## Typography ### Type Scale ```typescript const typography = { // Large titles (iOS style) largeTitle: { fontSize: 34, fontWeight: '700' as const, letterSpacing: 0.37, }, // Section headers title: { fontSize: 22, fontWeight: '700' as const, letterSpacing: 0.35, }, // Card titles headline: { fontSize: 17, fontWeight: '600' as const, letterSpacing: -0.41, }, // Body text body: { fontSize: 17, fontWeight: '400' as const, letterSpacing: -0.41, lineHeight: 22, }, // Secondary text callout: { fontSize: 16, fontWeight: '400' as const, letterSpacing: -0.32, }, // Small labels caption: { fontSize: 12, fontWeight: '400' as const, letterSpacing: 0, }, }; ``` ## Color System ### Semantic Colors ```typescript // Use semantic names, not literal colors const colors = { // Backgrounds background: '#FFFFFF', backgroundSecondary: '#F2F2F7', backgroundTertiary: '#FFFFFF', // Surfaces surface: '#FFFFFF', surfaceElevated: '#FFFFFF', // Text label: '#000000', labelSecondary: '#3C3C43', // 60% opacity labelTertiary: '#3C3C43', // 30% opacity // Actions primary: '#007AFF', destructive: 
'#FF3B30', success: '#34C759', warning: '#FF9500', // Separators separator: '#3C3C43', // 29% opacity opaqueSeparator: '#C6C6C8', }; // Dark mode variants const darkColors = { background: '#000000', backgroundSecondary: '#1C1C1E', label: '#FFFFFF', labelSecondary: '#EBEBF5', // 60% opacity separator: '#545458', }; ``` ### Dynamic Colors (React Native) ```tsx import { useColorScheme } from 'react-native'; const useColors = () => { const scheme = useColorScheme(); return scheme === 'dark' ? darkColors : colors; }; // Usage const MyComponent = () => { const colors = useColors(); return ( Hello ); }; ``` ## Touch Targets ### Minimum Sizes ```typescript // CRITICAL: Minimum 44pt touch targets const touchable = { minHeight: 44, minWidth: 44, }; // Button with proper sizing const styles = StyleSheet.create({ button: { minHeight: 44, paddingHorizontal: 16, paddingVertical: 12, justifyContent: 'center', alignItems: 'center', }, // Icon button (square) iconButton: { width: 44, height: 44, justifyContent: 'center', alignItems: 'center', }, // List row listRow: { minHeight: 44, paddingVertical: 12, paddingHorizontal: 16, }, }); ``` ### Touch Feedback ```tsx import { Pressable } from 'react-native'; // iOS-style opacity feedback const Button = ({ children, onPress }) => ( [ styles.button, pressed && { opacity: 0.7 }, ]} > {children} ); // Android-style ripple const AndroidButton = ({ children, onPress }) => ( {children} ); ``` ## Component Patterns ### Cards ```tsx const Card = ({ children, style }) => ( {children} ); const styles = StyleSheet.create({ card: { backgroundColor: '#FFFFFF', borderRadius: 12, padding: 16, ...Platform.select({ ios: { shadowColor: '#000', shadowOffset: { width: 0, height: 2 }, shadowOpacity: 0.08, shadowRadius: 8, }, android: { elevation: 2, }, }), }, }); ``` ### Buttons ```tsx // Primary button const PrimaryButton = ({ title, onPress, disabled }) => ( [ styles.primaryButton, pressed && styles.primaryButtonPressed, disabled && styles.buttonDisabled, 
]} > {title} ); const styles = StyleSheet.create({ primaryButton: { backgroundColor: '#007AFF', borderRadius: 12, paddingVertical: 16, paddingHorizontal: 24, alignItems: 'center', }, primaryButtonPressed: { backgroundColor: '#0056B3', }, primaryButtonText: { color: '#FFFFFF', fontSize: 17, fontWeight: '600', }, buttonDisabled: { opacity: 0.5, }, }); // Secondary button const SecondaryButton = ({ title, onPress }) => ( [ styles.secondaryButton, pressed && { opacity: 0.7 }, ]} > {title} ); ``` ### Input Fields ```tsx const TextField = ({ label, value, onChangeText, error }) => { const [focused, setFocused] = useState(false); return ( {label && ( {label} )} setFocused(true)} onBlur={() => setFocused(false)} style={[ styles.textField, focused && styles.textFieldFocused, error && styles.textFieldError, ]} placeholderTextColor="#8E8E93" /> {error && ( {error} )} ); }; const styles = StyleSheet.create({ textFieldContainer: { gap: 8, }, textFieldLabel: { fontSize: 15, fontWeight: '500', color: '#3C3C43', }, textField: { backgroundColor: '#F2F2F7', borderRadius: 10, paddingHorizontal: 16, paddingVertical: 14, fontSize: 17, color: '#000000', borderWidth: 2, borderColor: 'transparent', }, textFieldFocused: { borderColor: '#007AFF', backgroundColor: '#FFFFFF', }, textFieldError: { borderColor: '#FF3B30', }, errorText: { fontSize: 13, color: '#FF3B30', }, }); ``` ### Lists ```tsx // Grouped list (iOS Settings style) const GroupedList = ({ sections }) => ( {sections.map((section, i) => ( {section.title && ( {section.title} )} {section.items.map((item, j) => ( {j > 0 && } [ styles.listRow, pressed && { backgroundColor: '#E5E5EA' }, ]} onPress={item.onPress} > {item.title} ))} ))} ); const styles = StyleSheet.create({ groupedList: { flex: 1, backgroundColor: '#F2F2F7', }, section: { marginTop: 35, }, sectionHeader: { fontSize: 13, fontWeight: '400', color: '#6D6D72', textTransform: 'uppercase', marginLeft: 16, marginBottom: 8, }, sectionContent: { backgroundColor: '#FFFFFF', 
borderRadius: 10, marginHorizontal: 16, overflow: 'hidden', }, listRow: { flexDirection: 'row', alignItems: 'center', justifyContent: 'space-between', paddingVertical: 12, paddingHorizontal: 16, minHeight: 44, }, separator: { height: StyleSheet.hairlineWidth, backgroundColor: '#C6C6C8', marginLeft: 16, }, }); ``` ## Navigation Patterns ### Bottom Tab Bar ```tsx // Proper bottom tab sizing const tabBarStyle = { height: Platform.OS === 'ios' ? 83 : 65, // Account for home indicator paddingBottom: Platform.OS === 'ios' ? 34 : 10, paddingTop: 10, backgroundColor: '#F8F8F8', borderTopWidth: StyleSheet.hairlineWidth, borderTopColor: '#C6C6C8', }; // Tab item const TabItem = ({ icon, label, active }) => ( {label} ); ``` ### Header ```tsx // Large title header (iOS) const LargeTitleHeader = ({ title, rightAction }) => { const insets = useSafeAreaInsets(); return ( {title} {rightAction} ); }; const styles = StyleSheet.create({ header: { backgroundColor: '#F8F8F8', borderBottomWidth: StyleSheet.hairlineWidth, borderBottomColor: '#C6C6C8', }, headerContent: { flexDirection: 'row', justifyContent: 'space-between', alignItems: 'center', paddingHorizontal: 16, paddingBottom: 8, }, largeTitle: { fontSize: 34, fontWeight: '700', letterSpacing: 0.37, }, }); ``` ## Animations ### Native Driver Animations ```tsx import { Animated } from 'react-native'; // Always use native driver when possible const fadeIn = (value: Animated.Value) => { Animated.timing(value, { toValue: 1, duration: 200, useNativeDriver: true, // CRITICAL for performance }).start(); }; // Spring for natural feel const bounce = (value: Animated.Value) => { Animated.spring(value, { toValue: 1, damping: 15, stiffness: 150, useNativeDriver: true, }).start(); }; ``` ### Reanimated for Complex Animations ```tsx import Animated, { useSharedValue, useAnimatedStyle, withSpring, } from 'react-native-reanimated'; const AnimatedCard = ({ children }) => { const scale = useSharedValue(1); const animatedStyle = useAnimatedStyle(() 
=> ({ transform: [{ scale: scale.value }], })); const onPressIn = () => { scale.value = withSpring(0.95); }; const onPressOut = () => { scale.value = withSpring(1); }; return ( {children} ); }; ``` ## Loading States ### Skeleton Loader ```tsx const SkeletonLoader = ({ width, height, borderRadius = 4 }) => { const opacity = useSharedValue(0.3); useEffect(() => { opacity.value = withRepeat( withSequence( withTiming(1, { duration: 500 }), withTiming(0.3, { duration: 500 }) ), -1, false ); }, []); const animatedStyle = useAnimatedStyle(() => ({ opacity: opacity.value, })); return ( ); }; ``` ### Activity Indicator ```tsx import { ActivityIndicator } from 'react-native'; // Use platform-native indicator // Button with loading state const LoadingButton = ({ loading, title, onPress }) => ( {loading ? ( ) : ( {title} )} ); ``` ## Accessibility ### VoiceOver / TalkBack ```tsx // Accessible button Submit // Accessible image // Group related elements {name} {role} {status} ``` ### Dynamic Type (iOS) ```tsx import { PixelRatio } from 'react-native'; // Scale fonts with system settings const fontScale = PixelRatio.getFontScale(); const scaledFontSize = (size: number) => size * fontScale; // Or use allowFontScaling This text scales with system settings ``` ## Anti-Patterns ### Never Do ``` ✗ Touch targets smaller than 44pt ✗ Text smaller than 12pt ✗ Hover states (no hover on mobile) ✗ Fixed heights that break with large text ✗ Ignoring safe areas ✗ Heavy shadows on Android (use elevation) ✗ White text on light backgrounds without checking contrast ✗ Non-native animations (JS-driven transforms) ✗ Ignoring platform conventions (iOS vs Android) ✗ Inline styles everywhere (use StyleSheet.create) ``` ### Common Mistakes ```tsx // ✗ Hardcoded dimensions that break accessibility style={{ height: 40 }} // Text might be larger // ✓ Minimum height with padding style={{ minHeight: 44, paddingVertical: 12 }} // ✗ Shadow on Android shadowColor: '#000' // Won't work // ✓ Platform-specific 
...Platform.select({ ios: { shadowColor: '#000', ... }, android: { elevation: 4 }, }) // ✗ Fixed status bar height paddingTop: 44 // ✓ Use safe area paddingTop: insets.top ``` ## Quick Reference ### Mobile Defaults ``` Touch targets: 44pt minimum Font sizes: 12pt min, 17pt body, 34pt large title Border radius: 10-14pt (iOS), 12-28pt (Android) Spacing: 4/8/16/24/32 grid Animations: 200-300ms, native driver Shadow: iOS shadowOpacity 0.08-0.15, Android elevation 2-8 ``` ### Premium Feel Checklist ``` □ All touch targets 44pt+ □ Consistent spacing (4pt grid) □ Platform-appropriate styling □ Safe area handling □ Native animations (60fps) □ Proper loading states □ Dark mode support □ Accessibility labels □ Haptic feedback on actions □ Pull-to-refresh where appropriate ``` ================================================ FILE: skills/ui-testing/SKILL.md ================================================ --- name: ui-testing description: Visual testing - catch invisible buttons, broken layouts, contrast when-to-use: When writing visual or accessibility tests for UI components user-invocable: false paths: ["**/*.test.tsx", "**/*.spec.tsx", "**/*.stories.*"] effort: medium --- # UI Verification Skill *Load with: ui-web.md or ui-mobile.md* ## Purpose Quick verification that generated UI meets accessibility standards. Run these checks after creating any new UI components. 
--- ## Pre-Flight Checklist ### Before Shipping ANY UI: ```markdown ## Visibility Check - [ ] All buttons have visible background OR border - [ ] No text is same color as its background - [ ] All text meets 4.5:1 contrast ratio - [ ] Ghost/text buttons have visible borders ## Touch/Click Targets - [ ] All buttons are minimum 44px height - [ ] Icon buttons are minimum 44x44px - [ ] Adequate spacing between clickable elements ## States - [ ] Hover states visible (web) - [ ] Pressed states visible (mobile) - [ ] Focus rings on keyboard navigation - [ ] Disabled states visually distinct (opacity 0.5) - [ ] Loading states show indicators ## Dark Mode (if applicable) - [ ] Text readable on dark backgrounds - [ ] Borders visible in dark mode - [ ] No gray-400 text on dark backgrounds ## Responsive (web) - [ ] No horizontal scroll on mobile (320px) - [ ] Content readable at all breakpoints - [ ] Touch targets adequate on mobile ``` --- ## Quick Contrast Check ### Use Browser DevTools ``` 1. Right-click element → Inspect 2. In Styles panel, click on color value 3. Look for contrast ratio display 4. Must show ✓ for AA compliance (4.5:1 for text) ``` ### Online Tools - https://webaim.org/resources/contrastchecker/ - https://coolors.co/contrast-checker ### Tailwind Safe Combinations ``` LIGHT MODE (on white bg): ✓ text-gray-900 (#111827) = 16:1 ✓ text-gray-800 (#1F2937) = 12:1 ✓ text-gray-700 (#374151) = 9:1 ✓ text-gray-600 (#4B5563) = 6:1 ✗ text-gray-500 (#6B7280) = 4.6:1 (barely) ✗ text-gray-400 (#9CA3AF) = 2.6:1 (FAILS) DARK MODE (on gray-900 bg): ✓ text-white (#FFFFFF) = 16:1 ✓ text-gray-100 (#F3F4F6) = 13:1 ✓ text-gray-200 (#E5E7EB) = 11:1 ✓ text-gray-300 (#D1D5DB) = 8:1 ✗ text-gray-400 (#9CA3AF) = 5:1 (barely) ✗ text-gray-500 (#6B7280) = 3:1 (FAILS) ``` --- ## Common Fixes ### Invisible Button ```tsx // PROBLEM: No visible boundary // FIX: Add background OR border // OR ``` ### Low Contrast Text ```tsx // PROBLEM: Light gray on white

Secondary text

// FIX: Use darker gray

Secondary text

``` ### Missing Focus State ```tsx // PROBLEM: Focus removed without replacement // FIX: Add visible focus ring ``` ### Small Touch Target ```tsx // PROBLEM: Too small for fingers // FIX: Minimum 44px ``` ### Dark Mode Broken ```tsx // PROBLEM: Same colors in both modes

Text

// FIX: Adjust for dark mode

Text

``` --- ## Automated Checks (Optional) ### ESLint Plugin ```bash npm install -D eslint-plugin-jsx-a11y ``` ```json // .eslintrc { "extends": ["plugin:jsx-a11y/recommended"] } ``` ### Playwright Quick Test ```typescript // e2e/accessibility.spec.ts import { test, expect } from '@playwright/test'; import AxeBuilder from '@axe-core/playwright'; test('no accessibility violations', async ({ page }) => { await page.goto('/'); const results = await new AxeBuilder({ page }).analyze(); expect(results.violations).toEqual([]); }); ``` --- ## When to Use Full Testing Add comprehensive visual testing (Playwright screenshots, Storybook) when: - Building a component library - Multiple developers on UI - Frequent UI changes - Design system enforcement needed For solo projects or MVPs, the checklist above is sufficient. ================================================ FILE: skills/ui-web/SKILL.md ================================================ --- name: ui-web description: Web UI - glassmorphism, Tailwind, dark mode, accessibility when-to-use: When building or styling web UI components user-invocable: false paths: ["**/*.tsx", "**/*.jsx", "**/*.css", "**/*.scss", "tailwind.config.*"] effort: medium --- # UI Design Skill (Web) --- ## MANDATORY: WCAG 2.1 AA Compliance **These rules are NON-NEGOTIABLE. Every UI element must pass these checks.** ### 1. 
Color Contrast (CRITICAL) ``` Text Contrast Requirements: ├── Normal text (<18px): 4.5:1 minimum ├── Large text (≥18px bold or ≥24px): 3:1 minimum ├── UI components (buttons, inputs): 3:1 minimum └── Focus indicators: 3:1 minimum FORBIDDEN COLOR COMBINATIONS: ✗ gray-400 on white (#9CA3AF on #FFFFFF = 2.6:1) - FAILS ✗ gray-500 on white (#6B7280 on #FFFFFF = 4.6:1) - BARELY PASSES ✗ white on yellow - FAILS ✗ light blue on white - USUALLY FAILS SAFE COLOR COMBINATIONS: ✓ gray-700 on white (#374151 on #FFFFFF = 9.2:1) ✓ gray-600 on white (#4B5563 on #FFFFFF = 6.4:1) ✓ gray-900 on white (#111827 on #FFFFFF = 16:1) ✓ white on gray-900, blue-600, green-700 ``` ### 2. Visibility Rules (CRITICAL) ``` ALL BUTTONS MUST HAVE: ✓ Visible background color OR visible border (min 1px) ✓ Text color that contrasts with background ✓ Minimum height: 44px (touch target) ✓ Padding: at least px-4 py-2 NEVER CREATE: ✗ Buttons with transparent background AND no border ✗ Text same color as background ✗ Ghost buttons without visible borders ✗ White text on light backgrounds ✗ Dark text on dark backgrounds ``` ### 3. Required Element Styles ```tsx // EVERY button needs visible boundaries // PRIMARY: solid background // SECONDARY: visible background // GHOST: MUST have visible border // NEVER DO THIS: // ✗ NO BOUNDARY // ✗ NO CONTRAST ``` ### 4. Focus States (REQUIRED) ```tsx // EVERY interactive element needs visible focus className="focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500 focus-visible:ring-offset-2" // NEVER remove focus without replacement className="outline-none" // ✗ FORBIDDEN without ring replacement ``` ### 5. 
Dark Mode Contrast ``` When implementing dark mode: ├── Text must be light (gray-100 to white) on dark backgrounds ├── Borders must be visible (gray-700 or lighter) ├── Avoid gray-400 for body text on gray-900 bg (marginal contrast; secondary text only) └── Test BOTH modes before shipping SAFE DARK MODE TEXT: ✓ text-white on bg-gray-900 ✓ text-gray-100 on bg-gray-800 ✓ text-gray-200 on bg-gray-900 UNSAFE (FAILS CONTRAST): ✗ text-gray-500 on bg-gray-900 (2.4:1) ✗ text-gray-400 on bg-gray-800 (3.1:1) ``` --- ## Core Philosophy **Beautiful UI is not decoration - it's communication.** Every visual choice should serve clarity, hierarchy, and user confidence. Default to elegance and restraint. ## Design Principles ### 1. Visual Hierarchy ``` Primary Action → Bold, high contrast, prominent Secondary Action → Subtle, lower contrast Tertiary/Links → Minimal, text-style ``` ### 2. Spacing System (8px Grid) ```typescript // Tailwind spacing scale - USE CONSISTENTLY const spacing = { xs: 'p-1', // 4px - tight internal sm: 'p-2', // 8px - compact md: 'p-4', // 16px - default lg: 'p-6', // 24px - comfortable xl: 'p-8', // 32px - spacious '2xl': 'p-12', // 48px - section gaps }; // Rule: More whitespace = more premium feel // Rule: Consistent spacing > perfect spacing ``` ### 3. Typography Scale ```typescript // Limit to 3-4 font sizes per page const typography = { hero: 'text-4xl md:text-5xl font-bold tracking-tight', heading: 'text-2xl md:text-3xl font-semibold', subheading: 'text-lg md:text-xl font-medium', body: 'text-base leading-relaxed', caption: 'text-sm text-gray-500', }; // Rule: Never use more than 2 font families // Rule: Line height 1.5-1.7 for body text ``` ## Glassmorphism (Web) ### Base Glass Card ```tsx // Modern glass effect - use sparingly for emphasis const GlassCard = ({ children, className = '' }) => (
{children}
); ``` ### Glass Variants ```tsx // Light mode glass const lightGlass = ` backdrop-blur-xl bg-white/70 border border-white/50 shadow-lg shadow-gray-200/50 `; // Dark mode glass const darkGlass = ` backdrop-blur-xl bg-gray-900/70 border border-white/10 shadow-xl shadow-black/20 `; // Frosted sidebar const frostedSidebar = ` backdrop-blur-2xl bg-gradient-to-b from-white/80 to-white/60 border-r border-white/30 `; // Floating action glass const floatingGlass = ` backdrop-blur-md bg-white/90 rounded-full shadow-lg shadow-black/10 border border-white/50 `; ``` ### When to Use Glassmorphism ``` ✓ Hero sections with image backgrounds ✓ Floating cards over gradients ✓ Modal overlays ✓ Navigation bars (subtle) ✓ Feature highlights ✗ Every card (overuse kills the effect) ✗ Text-heavy content areas ✗ Forms (reduces contrast) ✗ Data tables ``` ## Color System ### Semantic Colors ```typescript const colors = { // Actions primary: 'bg-blue-600 hover:bg-blue-700', secondary: 'bg-gray-100 hover:bg-gray-200 text-gray-900', danger: 'bg-red-600 hover:bg-red-700', success: 'bg-green-600 hover:bg-green-700', // Surfaces background: 'bg-gray-50 dark:bg-gray-950', surface: 'bg-white dark:bg-gray-900', elevated: 'bg-white dark:bg-gray-800 shadow-lg', // Text textPrimary: 'text-gray-900 dark:text-white', textSecondary: 'text-gray-600 dark:text-gray-400', textMuted: 'text-gray-400 dark:text-gray-500', }; ``` ### Gradient Backgrounds ```tsx // Subtle mesh gradient (modern, premium) const meshGradient = ` bg-gradient-to-br from-blue-50 via-white to-purple-50 dark:from-gray-950 dark:via-gray-900 dark:to-gray-950 `; // Vibrant hero gradient const heroGradient = ` bg-gradient-to-r from-blue-600 via-purple-600 to-pink-600 `; // Subtle radial glow const radialGlow = ` bg-[radial-gradient(ellipse_at_top,_var(--tw-gradient-stops))] from-blue-200/40 via-transparent to-transparent `; ``` ## Component Patterns ### Buttons ```tsx // Primary button - bold, confident const PrimaryButton = ({ children, 
...props }) => ( ); // Secondary button - subtle const SecondaryButton = ({ children, ...props }) => ( ); // Ghost button - minimal const GhostButton = ({ children, ...props }) => ( ); ``` ### Cards ```tsx // Clean card with subtle elevation const Card = ({ children, className = '' }) => (
{children}
); // Interactive card const InteractiveCard = ({ children, onClick }) => ( ); ``` ### Input Fields ```tsx const Input = ({ label, error, ...props }) => (
{label && ( )} {error && (

{error}

)}
); ``` ## Micro-Interactions ### Transitions ```typescript // Standard transitions - ALWAYS use const transitions = { fast: 'transition-all duration-150', // Hover states normal: 'transition-all duration-200', // Most interactions slow: 'transition-all duration-300', // Card hovers, modals spring: 'transition-all duration-500 ease-out', // Page transitions }; // Rule: Everything interactive should transition // Rule: 150-300ms feels responsive, >500ms feels slow ``` ### Hover Effects ```tsx // Scale on hover (buttons, cards) className="hover:scale-105 active:scale-95 transition-transform" // Lift on hover (cards) className="hover:-translate-y-1 hover:shadow-xl transition-all" // Glow on hover (CTAs) className="hover:shadow-lg hover:shadow-blue-500/25 transition-shadow" // Border highlight (inputs, cards) className="hover:border-gray-300 transition-colors" ``` ### Loading States ```tsx // Skeleton loader const Skeleton = ({ className = '' }) => (
); // Spinner const Spinner = ({ size = 'md' }) => (
); // Button loading state ``` ## Layout Patterns ### Container ```tsx // Consistent max-width and padding const Container = ({ children, className = '' }) => (
{children}
); ``` ### Section Spacing ```tsx // Consistent vertical rhythm const Section = ({ children }) => (
{children}
); ``` ### Grid Systems ```tsx // Feature grid
{features.map(f => )}
// Bento grid (modern asymmetric)
Large
Small
Small
Medium
``` ## Dark Mode ### Implementation ```tsx // Always design for both modes // Use CSS variables or Tailwind dark: prefix // Theme toggle const ThemeToggle = () => { const [dark, setDark] = useState(false); useEffect(() => { document.documentElement.classList.toggle('dark', dark); }, [dark]); return ( ); }; ``` ### Color Pairing ``` Light Mode Dark Mode ───────────────────────────────── white gray-950 gray-50 gray-900 gray-100 gray-800 gray-200 gray-700 gray-900 (text) white (text) gray-600 (secondary) gray-400 blue-600 blue-500 ``` ## Accessibility ### Contrast Requirements ``` WCAG AA: 4.5:1 for normal text, 3:1 for large text WCAG AAA: 7:1 for normal text, 4.5:1 for large text // Test: Use browser devtools or contrast checker // Rule: Never use gray-400 on white for body text ``` ### Focus States ```tsx // Always visible focus rings className=" focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500 focus-visible:ring-offset-2 " // Never remove focus styles without replacement // ✗ outline-none (alone) // ✓ outline-none + focus-visible:ring ``` ### Screen Readers ```tsx // Visually hidden but accessible const srOnly = "absolute w-px h-px p-0 -m-px overflow-hidden whitespace-nowrap border-0"; // Icon buttons need labels // Announce dynamic content
{message}
``` ## Anti-Patterns ### Never Do ``` ✗ More than 3 font sizes on a page ✗ Random spacing values (use 8px grid) ✗ Pure black (#000) on pure white (#fff) ✗ Colored text on colored backgrounds without checking contrast ✗ Animations longer than 500ms for UI elements ✗ Glassmorphism everywhere ✗ Drop shadows on everything ✗ Gradients on text (hard to read) ✗ Auto-playing animations that can't be stopped ✗ Removing focus indicators ✗ Gray text below 4.5:1 contrast ✗ Tiny click targets (< 44px) ``` ### Common Mistakes ```tsx // ✗ Too many shadows className="shadow-sm shadow-md shadow-lg" // Pick ONE // ✗ Inconsistent rounding className="rounded-sm rounded-lg rounded-2xl" // System: sm, lg, xl, 2xl // ✗ Competing focal points // One primary CTA per viewport // ✗ Over-decorated // If it doesn't serve function, remove it ``` ## Quick Reference ### Modern Defaults ```tsx // Border radius: 12-16px (rounded-xl to rounded-2xl) // Shadow: subtle (shadow-sm to shadow-md) // Font: Inter, SF Pro, system-ui // Primary: Near-black or brand color // Transitions: 200ms ease-out // Spacing: 8px grid (Tailwind default) ``` ### Premium Feel Checklist ``` □ Generous whitespace □ Subtle shadows (not harsh) □ Smooth transitions on all interactions □ Consistent border radius □ Limited color palette (2-3 colors max) □ Typography hierarchy (3 sizes max) □ High-quality imagery □ Micro-interactions on hover/focus □ Dark mode support ``` ================================================ FILE: skills/user-journeys/SKILL.md ================================================ --- name: user-journeys description: User experience flows - journey mapping, UX validation, error recovery when-to-use: When mapping user flows, validating UX, or designing error recovery user-invocable: false effort: medium --- # User Journeys Skill For defining and testing real user experiences - not just specs, but actual flows humans take through your application. --- ## Philosophy **Specs test features. 
Journeys test experiences.** A feature can pass all specs but still deliver a terrible experience. User journeys capture: - How users actually navigate (not how we think they should) - Emotional states at each step (frustrated, confused, delighted) - Recovery from mistakes (users will make them) - Real-world conditions (slow networks, interruptions, distractions) --- ## Journey Documentation Structure ``` _project_specs/ ├── journeys/ │ ├── _template.md # Journey template │ ├── critical/ # Must-work journeys (revenue, core value) │ │ ├── signup-to-first-value.md │ │ ├── checkout-purchase.md │ │ └── login-to-dashboard.md │ ├── common/ # Frequent user paths │ │ ├── browse-and-search.md │ │ ├── update-profile.md │ │ └── invite-team-member.md │ └── edge-cases/ # Error recovery, unusual paths │ ├── payment-failure-retry.md │ ├── session-timeout-recovery.md │ └── offline-reconnection.md ``` --- ## Journey Template ```markdown # Journey: [Name] ## Overview | Attribute | Value | |-----------|-------| | **Priority** | Critical / High / Medium / Low | | **User Type** | New / Returning / Admin | | **Frequency** | Daily / Weekly / One-time | | **Success Metric** | Conversion rate, time to complete, drop-off rate | ## User Goal What is the user trying to accomplish? Write from their perspective. > "I want to [goal] so that I can [benefit]." ## Preconditions - User state (logged in, has subscription, first visit) - Data state (has items in cart, has team members) - Environment (mobile, desktop, slow connection) ## Journey Steps ### Step 1: [Entry Point] **User Action:** What the user does **System Response:** What they should see/experience **Success Criteria:** - [ ] Page loads in < 2 seconds - [ ] Primary CTA is immediately visible - [ ] User understands what to do next **Potential Friction:** - Slow load time → Show skeleton/loader - Unclear CTA → A/B test copy variations --- ### Step 2: [Next Action] **User Action:** ... **System Response:** ... **Success Criteria:** - [ ] ... 
**Potential Friction:** - ... --- ## Error Scenarios ### E1: [Error Name] **Trigger:** What causes this error **User Sees:** Error message/state **Recovery Path:** How user gets back on track **Test:** How to verify recovery works ## Metrics to Track - Time to complete journey - Drop-off rate at each step - Error rate and recovery rate - User satisfaction (if surveyed) ## E2E Test Reference Link to Playwright test: `e2e/tests/journeys/[name].spec.ts` ``` --- ## Critical Journey Examples ### Signup to First Value ```markdown # Journey: Signup to First Value ## Overview | Attribute | Value | |-----------|-------| | **Priority** | Critical | | **User Type** | New | | **Frequency** | One-time | | **Success Metric** | % reaching "aha moment" within 5 min | ## User Goal > "I want to try this product quickly to see if it solves my problem." ## Preconditions - First visit to site - No account - Came from landing page or ad ## Journey Steps ### Step 1: Landing Page **User Action:** Clicks "Get Started Free" or "Try Now" **System Response:** Signup form appears (modal or new page) **Success Criteria:** - [ ] CTA visible above fold - [ ] No distracting elements - [ ] Clear value proposition visible **Potential Friction:** - Too many form fields → Reduce to email + password only - Social login missing → Add Google/GitHub options ### Step 2: Account Creation **User Action:** Enters email and password (or uses social login) **System Response:** - Creates account - Sends verification email (don't block on it) - Redirects to onboarding **Success Criteria:** - [ ] Account created in < 3 seconds - [ ] No email verification wall (verify later) - [ ] Clear next step shown **Potential Friction:** - Email already exists → Offer login link - Weak password → Show requirements inline, not after submit ### Step 3: Onboarding (Quick Win) **User Action:** Completes 1-2 setup questions **System Response:** - Personalizes experience - Shows progress indicator - Leads to first action **Success 
Criteria:** - [ ] Max 3 questions - [ ] Skip option available - [ ] < 60 seconds total **Potential Friction:** - Too many questions → User abandons - No skip option → User feels trapped ### Step 4: First Value (Aha Moment) **User Action:** Completes core action (creates first X, sees first result) **System Response:** - Celebrates success - Shows value delivered - Suggests next step **Success Criteria:** - [ ] User experiences core value - [ ] Completion feels rewarding - [ ] Clear path to continue ## Error Scenarios ### E1: Email Already Registered **Trigger:** User tries existing email **User Sees:** "Already have an account? Log in or reset password" **Recovery Path:** Click to login or reset **Test:** `signup-existing-email.spec.ts` ### E2: Social Login Fails **Trigger:** OAuth provider error **User Sees:** "Couldn't connect. Try email signup or try again." **Recovery Path:** Email signup form shown as fallback **Test:** `social-login-failure.spec.ts` ## Metrics to Track - Signup → First Value: Target < 5 min - Drop-off at each step - Social vs email signup ratio - Skip rate on onboarding ``` --- ### Checkout Purchase ```markdown # Journey: Checkout Purchase ## Overview | Attribute | Value | |-----------|-------| | **Priority** | Critical (Revenue) | | **User Type** | Any | | **Frequency** | Variable | | **Success Metric** | Checkout completion rate | ## User Goal > "I want to pay quickly and securely without surprises." 
## Journey Steps ### Step 1: Cart Review **User Action:** Views cart before checkout **System Response:** - Shows all items with images, prices - Shows subtotal, taxes, shipping - Clear "Checkout" CTA **Success Criteria:** - [ ] No hidden fees revealed later - [ ] Easy to modify quantities - [ ] Saved items visible ### Step 2: Checkout Start **User Action:** Clicks "Checkout" **System Response:** - Shows checkout form or redirects to payment - Progress indicator (Step 1 of 3) - Order summary sidebar **Success Criteria:** - [ ] Guest checkout option - [ ] Express checkout (Apple/Google Pay) prominent - [ ] Form fields pre-filled if logged in ### Step 3: Payment **User Action:** Enters payment info **System Response:** - Secure input fields (Stripe/payment provider) - Real-time validation - Clear "Pay $XX" button **Success Criteria:** - [ ] Card validation inline, not after submit - [ ] Multiple payment options - [ ] Security indicators visible ### Step 4: Confirmation **User Action:** Submits payment **System Response:** - Processing indicator - Success page with order details - Email confirmation sent **Success Criteria:** - [ ] Confirmation within 5 seconds - [ ] Order number clearly visible - [ ] Next steps clear (shipping, access, etc.) ## Error Scenarios ### E1: Payment Declined **Trigger:** Card declined by processor **User Sees:** "Payment declined. Please try another card." 
**Recovery Path:** - Stay on payment step - Pre-fill other fields - Offer alternative payment methods **Test:** `payment-declined-recovery.spec.ts` ### E2: Session Timeout During Checkout **Trigger:** User away too long **User Sees:** Cart preserved, re-auth required **Recovery Path:** - Quick login - Return to same checkout step - Cart contents intact **Test:** `checkout-session-timeout.spec.ts` ``` --- ## Journey Testing with Playwright ### Journey Test Structure ```typescript // e2e/tests/journeys/signup-to-value.spec.ts import { test, expect } from '@playwright/test'; test.describe('Journey: Signup to First Value', () => { test.describe.configure({ mode: 'serial' }); // Run in order test('Step 1: Landing page has clear CTA', async ({ page }) => { await page.goto('/'); // CTA visible above fold without scrolling const cta = page.getByRole('button', { name: /get started|try free/i }); await expect(cta).toBeVisible(); await expect(cta).toBeInViewport(); }); test('Step 2: Can create account quickly', async ({ page }) => { await page.goto('/'); await page.getByRole('button', { name: /get started/i }).click(); // Minimal fields await expect(page.getByLabel('Email')).toBeVisible(); await expect(page.getByLabel('Password')).toBeVisible(); // Complete signup const startTime = Date.now(); await page.getByLabel('Email').fill('newuser@example.com'); await page.getByLabel('Password').fill('SecurePass123!'); await page.getByRole('button', { name: /sign up|create/i }).click(); // Should reach onboarding quickly await expect(page).toHaveURL(/onboarding|welcome|setup/); expect(Date.now() - startTime).toBeLessThan(5000); // < 5 seconds }); test('Step 3: Onboarding is skippable', async ({ page }) => { // ... login as new user ... 
await page.goto('/onboarding'); // Skip option exists const skipButton = page.getByRole('button', { name: /skip/i }); await expect(skipButton).toBeVisible(); }); test('Step 4: Can reach first value in < 5 min', async ({ page }) => { // Full journey timing const journeyStart = Date.now(); // ... complete full journey ... // Verify first value delivered await expect(page.getByText(/success|created|done/i)).toBeVisible(); // Total time check const totalTime = (Date.now() - journeyStart) / 1000 / 60; // minutes expect(totalTime).toBeLessThan(5); }); }); ``` ### Error Recovery Tests ```typescript // e2e/tests/journeys/checkout-recovery.spec.ts import { test, expect } from '@playwright/test'; test.describe('Journey: Checkout Error Recovery', () => { test('recovers from payment decline gracefully', async ({ page }) => { // Setup: Add item to cart, go to checkout await page.goto('/products'); await page.getByTestId('add-to-cart').first().click(); await page.getByRole('link', { name: 'Checkout' }).click(); // Use Stripe test card that declines const stripeFrame = page.frameLocator('iframe[name*="stripe"]'); await stripeFrame.getByPlaceholder('Card number').fill('4000000000000002'); await stripeFrame.getByPlaceholder('MM / YY').fill('12/30'); await stripeFrame.getByPlaceholder('CVC').fill('123'); await page.getByRole('button', { name: /pay/i }).click(); // Verify friendly error await expect(page.getByText(/declined|try another/i)).toBeVisible(); // Verify still on checkout (not kicked out) await expect(page).toHaveURL(/checkout/); // Verify can try again with different card await stripeFrame.getByPlaceholder('Card number').fill('4242424242424242'); await page.getByRole('button', { name: /pay/i }).click(); // Should succeed now await expect(page).toHaveURL(/success|confirmation/); }); test('preserves cart after session timeout', async ({ page, context }) => { // Add items to cart await page.goto('/products'); await page.getByTestId('add-to-cart').first().click(); // Clear 
session (simulate timeout) await context.clearCookies(); // Return to site await page.goto('/cart'); // Cart should be preserved (local storage or recovered) await expect(page.getByTestId('cart-item')).toHaveCount(1); }); }); ``` --- ## User Experience Validation ### UX Checklist per Journey Step ```markdown ## UX Validation Checklist ### Clarity - [ ] User knows where they are (breadcrumbs, progress) - [ ] User knows what to do next (clear CTA) - [ ] User knows what just happened (feedback) ### Speed - [ ] Page loads < 2 seconds - [ ] Actions complete < 3 seconds - [ ] Progress shown for longer operations ### Forgiveness - [ ] Mistakes are easy to undo - [ ] Errors explain what went wrong - [ ] Recovery path is clear ### Accessibility - [ ] Keyboard navigation works - [ ] Screen reader announces changes - [ ] Focus management correct - [ ] Color contrast sufficient ### Mobile - [ ] Touch targets >= 44px - [ ] No horizontal scroll - [ ] Forms don't zoom unexpectedly - [ ] Works on slow 3G ``` ### Automated UX Checks ```typescript // e2e/utils/ux-validators.ts import { Page, expect } from '@playwright/test'; export async function validatePageLoad(page: Page, maxMs = 2000) { const timing = await page.evaluate(() => { const nav = performance.getEntriesByType('navigation')[0] as PerformanceNavigationTiming; return nav.loadEventEnd - nav.startTime; }); expect(timing).toBeLessThan(maxMs); } export async function validateCTAVisible(page: Page, ctaText: RegExp) { const cta = page.getByRole('button', { name: ctaText }); await expect(cta).toBeVisible(); await expect(cta).toBeInViewport(); } export async function validateNoLayoutShift(page: Page) { const cls = await page.evaluate(() => { return new Promise((resolve) => { let clsValue = 0; const observer = new PerformanceObserver((list) => { for (const entry of list.getEntries()) { if (!(entry as any).hadRecentInput) { clsValue += (entry as any).value; } } }); observer.observe({ type: 'layout-shift', buffered: true }); 
setTimeout(() => { observer.disconnect(); resolve(clsValue); }, 1000); }); }); expect(cls).toBeLessThan(0.1); // Good CLS score } export async function validateAccessibility(page: Page) { // Check focus visible on interactive elements const buttons = page.getByRole('button'); const count = await buttons.count(); for (let i = 0; i < Math.min(count, 5); i++) { await buttons.nth(i).focus(); await expect(buttons.nth(i)).toBeFocused(); } } ``` --- ## Journey Metrics Dashboard Track journey health with these metrics: ```typescript // lib/journey-metrics.ts interface JourneyMetric { journey: string; step: string; timestamp: Date; duration: number; success: boolean; userId?: string; } // Track in your analytics (PostHog, Mixpanel, etc.) export function trackJourneyStep(metric: JourneyMetric) { analytics.track('journey_step', { journey_name: metric.journey, step_name: metric.step, duration_ms: metric.duration, success: metric.success, }); } // Example usage in app const journeyStart = Date.now(); // ... user completes step ... trackJourneyStep({ journey: 'signup_to_value', step: 'account_creation', timestamp: new Date(), duration: Date.now() - journeyStart, success: true, }); ``` --- ## Common Journey Patterns ### Progressive Disclosure Journey User sees simple view first, complexity revealed as needed. ```markdown Step 1: Show basic options only Step 2: "Advanced" expands more options Step 3: Expert mode unlocks everything ``` ### Guided Setup Journey Hand-hold new users through initial configuration. ```markdown Step 1: Welcome + single choice Step 2: Core preference Step 3: Optional integrations (skippable) Step 4: First action with guidance Step 5: Success + remove training wheels ``` ### Recovery Journey User returns after failure or abandonment. 
```markdown Step 1: Recognize returning user Step 2: Restore previous state Step 3: Acknowledge what happened Step 4: Offer clear path forward Step 5: Complete original goal ``` --- ## Anti-Patterns - **Happy path only** - Test error recovery, not just success - **Spec-driven testing** - Test user goals, not features - **Ignoring time** - Measure how long journeys take - **Desktop-only** - Test mobile journeys separately - **Skipping emotions** - Consider user frustration points - **No metrics** - Track journey completion and drop-off - **Static journeys** - Update as user behavior evolves --- ## Quick Reference ### Journey Priorities | Priority | Criteria | Test Frequency | |----------|----------|----------------| | Critical | Revenue, core value | Every deploy | | High | Daily user actions | Daily | | Medium | Weekly features | Weekly | | Low | Edge cases | On change | ### Package.json Scripts ```json { "scripts": { "test:journeys": "playwright test e2e/tests/journeys/", "test:journeys:critical": "playwright test e2e/tests/journeys/critical/", "test:journeys:report": "playwright show-report" } } ``` ### Journey Documentation Checklist - [ ] User goal clearly stated - [ ] All steps documented - [ ] Success criteria per step - [ ] Error scenarios covered - [ ] Recovery paths defined - [ ] Metrics identified - [ ] E2E test linked ================================================ FILE: skills/web-content/SKILL.md ================================================ --- name: web-content description: SEO and AI discovery (GEO) - schema, ChatGPT/Perplexity optimization when-to-use: When creating web content that needs SEO and AI discoverability user-invocable: false effort: medium --- # Web Content Skill For creating web content optimized for both traditional SEO and AI discovery (ChatGPT, Perplexity, Claude, Gemini). 
**Sources:** [GEO Complete Guide](https://skale.so/marketing/geo/) | [AI Search SEO](https://www.gravitatedesign.com/blog/ai-search-seo/) | [LLM Optimization](https://surferseo.com/blog/llm-optimization-seo/) | [Generative Engine Optimization](https://www.siddharthbharath.com/generative-engine-optimization/) --- ## Philosophy **SEO gets clicks. GEO gets citations.** Traditional SEO optimizes for Google rankings. Generative Engine Optimization (GEO) optimizes for being cited by AI assistants. Modern content needs both: - **SEO**: Rank on search results pages - **GEO**: Be cited in AI-generated answers (ChatGPT, Perplexity, Claude, Gemini) AI traffic grew 1,200% between July 2024 and February 2025. Google's search share dropped below 90% for the first time in a decade. Optimize for both. --- ## Content Structure for AI + SEO ### The Golden Rule **Write for humans, structure for machines.** AI systems prefer: - Short, clear, fact-based content - Clean formatting (headers, bullets, tables) - Standalone sections that can be quoted - Direct answers to questions --- ## Page Types & Templates ### Homepage ```markdown ## Homepage Structure ### Above the Fold - **Headline**: Clear value proposition (what you do + for whom) - **Subheadline**: How you deliver that value - **Primary CTA**: One clear action - **Trust signals**: Logos, testimonials, stats ### Content Sections 1. **Problem Statement**: Pain point you solve 2. **Solution Overview**: How you solve it (3-4 key features) 3. **Social Proof**: Testimonials, case studies, logos 4. **How It Works**: 3-step process (simple) 5. **Pricing Preview**: Or link to pricing page 6. **FAQ Section**: 5-7 common questions (GEO gold) 7. 
**Final CTA**: Repeat primary action ### Schema Required - Organization schema (name, logo, founding date, social links) - WebSite schema with SearchAction - FAQ schema for questions section ``` ### Product/Service Page ```markdown ## Product Page Structure ### Hero Section - **Product Name**: Clear, descriptive - **One-line Description**: What it does in 10 words or less - **Key Benefit**: Primary value proposition - **CTA**: Buy/Try/Demo ### Content Sections 1. **TL;DR Box**: 3-5 bullet summary (AI-quotable) 2. **Problem → Solution**: What problem, how solved 3. **Features Grid**: 4-6 features with icons 4. **Comparison Table**: vs. alternatives (GEO loves these) 5. **Use Cases**: Who uses it and how 6. **Testimonials**: Real names, photos, companies 7. **Pricing**: Clear tiers if applicable 8. **FAQ**: Product-specific questions ### Schema Required - Product schema (name, description, price, availability) - Review schema (aggregate rating) - FAQ schema - BreadcrumbList schema ``` ### Blog Post / Article ```markdown ## Blog Post Structure ### Opening (First 100 words) - **TL;DR**: Direct answer to the title's question - **What you'll learn**: Bullet list of takeaways - This section should be quotable standalone ### Body Structure - **H2 sections**: Main topics (5-7 per article) - **H3 subsections**: Supporting points - **Bullet lists**: For scanability - **Stat boxes**: Highlight key numbers - **Comparison tables**: When comparing options ### Content Elements - Definition boxes ("What is X?") - Step-by-step instructions - Code examples (if technical) - Original statistics/research - Expert quotes with attribution ### Closing - **Summary**: Key takeaways (bulleted) - **Next steps**: What reader should do - **Related content**: Internal links ### Metadata Required - Author name + bio + photo - Publication date - Last updated date (visible!) 
- Reading time - Article schema with author ``` ### FAQ Page ```markdown ## FAQ Page Structure ### Organization - Group questions by category - Most common questions first - Direct, concise answers - Link to detailed pages for more info ### Question Format Q: [Exact question users ask] A: [Direct answer in first sentence, then elaboration] ### Schema Required - FAQPage schema (critical for AI discovery) - Each Q&A as Question/Answer schema ``` ### Landing Page ```markdown ## Landing Page Structure ### Single Focus - One offer - One audience - One CTA (repeated) ### Sections 1. **Headline**: Benefit-focused, specific 2. **Problem Agitation**: Pain points 3. **Solution**: Your offer 4. **Proof**: Testimonials, stats, logos 5. **Features**: 3-5 key benefits 6. **Objection Handling**: FAQ or guarantee 7. **CTA**: Clear, urgent ### No Navigation - Remove header nav (reduce exits) - Single path: read → convert ``` --- ## AI-Optimized Content Formats ### TL;DR Boxes ```html

<div class="tldr-box">
  <strong>TL;DR</strong>
  <ul>
    <li>Key point 1 with specific detail</li>
    <li>Key point 2 with number/stat</li>
    <li>Key point 3 with actionable insight</li>
  </ul>
</div>
``` Place at top of articles. AI systems extract these for summaries. ### Definition Blocks ```markdown ## What is [Term]? [Term] is [concise definition in one sentence]. It [what it does] by [how it works]. **Key characteristics:** - Characteristic 1 - Characteristic 2 - Characteristic 3 ``` Start with "What is X?" - AI systems look for this pattern. ### Comparison Tables ```markdown | Feature | Product A | Product B | Our Product | |---------|-----------|-----------|-------------| | Price | $99/mo | $149/mo | $79/mo | | Feature 1 | ✓ | ✗ | ✓ | | Feature 2 | ✗ | ✓ | ✓ | | Best For | Enterprise | Startups | SMBs | ``` AI loves structured comparisons. Include in product and review pages. ### Stat Boxes ```html
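<div class="stat-box">
  <span class="stat-number">73%</span>
  <span class="stat-text">of users prefer AI search for complex queries</span>
  <cite>Source: Adobe Analytics, 2024</cite>
</div>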
``` Original statistics with sources get cited by AI. ### Step-by-Step Guides ```markdown ## How to [Do Thing] ### Step 1: [Action Verb] [Object] [Explanation of what to do] **Example:** [Concrete example] ### Step 2: [Action Verb] [Object] [Explanation] ### Step 3: [Action Verb] [Object] [Explanation] **Result:** [What user achieves] ``` Use HowTo schema markup for these. --- ## Schema Markup (Critical for AI) ### Organization Schema ```json { "@context": "https://schema.org", "@type": "Organization", "name": "Your Company", "url": "https://yoursite.com", "logo": "https://yoursite.com/logo.png", "foundingDate": "2020", "description": "One sentence description", "sameAs": [ "https://twitter.com/yourcompany", "https://linkedin.com/company/yourcompany", "https://github.com/yourcompany" ], "contactPoint": { "@type": "ContactPoint", "email": "hello@yoursite.com", "contactType": "customer service" } } ``` ### Article Schema ```json { "@context": "https://schema.org", "@type": "Article", "headline": "Article Title", "description": "Meta description", "image": "https://yoursite.com/article-image.jpg", "author": { "@type": "Person", "name": "Author Name", "url": "https://yoursite.com/team/author-name", "jobTitle": "Role at Company", "sameAs": [ "https://linkedin.com/in/author", "https://twitter.com/author" ] }, "publisher": { "@type": "Organization", "name": "Your Company", "logo": { "@type": "ImageObject", "url": "https://yoursite.com/logo.png" } }, "datePublished": "2025-01-15", "dateModified": "2025-01-20" } ``` ### FAQ Schema ```json { "@context": "https://schema.org", "@type": "FAQPage", "mainEntity": [ { "@type": "Question", "name": "What is your product?", "acceptedAnswer": { "@type": "Answer", "text": "Direct answer here. Keep concise but complete." } }, { "@type": "Question", "name": "How much does it cost?", "acceptedAnswer": { "@type": "Answer", "text": "Pricing starts at $X/month for basic plan..." 
} } ] } ``` ### Product Schema ```json { "@context": "https://schema.org", "@type": "Product", "name": "Product Name", "description": "Product description", "image": "https://yoursite.com/product.jpg", "brand": { "@type": "Brand", "name": "Your Company" }, "offers": { "@type": "Offer", "price": "29.99", "priceCurrency": "USD", "availability": "https://schema.org/InStock" }, "aggregateRating": { "@type": "AggregateRating", "ratingValue": "4.8", "reviewCount": "127" } } ``` ### HowTo Schema ```json { "@context": "https://schema.org", "@type": "HowTo", "name": "How to Set Up Your Account", "description": "Step-by-step guide to getting started", "step": [ { "@type": "HowToStep", "name": "Create account", "text": "Go to signup page and enter your email" }, { "@type": "HowToStep", "name": "Verify email", "text": "Click the link in the verification email" } ] } ``` --- ## Platform-Specific Optimization ### ChatGPT Optimization ```markdown ✅ DO: - TL;DR sections at top of articles - Consistent formatting (headers, bullets) - Named authors with credentials - Original research and statistics - Multi-intent content (covers related questions) ❌ AVOID: - Thin content without substance - Missing author attribution - Outdated information (no dates) ``` ### Perplexity Optimization ```markdown ✅ DO: - Original statistics with sources - Comparison tables and structured data - Clean URL slugs (/topic-name not /p=123) - Short, declarative statements - Images, charts, diagrams - YouTube videos (Perplexity shows these) ❌ AVOID: - Generic content without unique insights - Missing citations/sources - Poor URL structure ``` ### Claude Optimization ```markdown ✅ DO: - Well-structured, logical content - Clear definitions and explanations - Technical accuracy - Balanced perspectives - Proper citations ❌ AVOID: - Misleading or sensational content - Missing context - Outdated technical information ``` ### Gemini Optimization ```markdown ✅ DO: - Rich schema markup - Detailed image alt-text - 
YouTube content (Google-owned) - Multimedia (video, audio with transcripts) ❌ AVOID: - Missing structured data - Images without alt-text - Text-only content ``` --- ## E-E-A-T for AI Discovery ### Experience - First-person case studies - "We tested X and found Y" - Original screenshots and data - User testimonials with real details ### Expertise - Author bios with credentials - Link to author's other work - Industry-specific terminology - Technical depth appropriate to topic ### Authoritativeness - Backlinks from trusted sources - Mentions in industry publications - Citations from other experts - Social proof (followers, engagement) ### Trustworthiness - Contact information visible - About page with team details - Privacy policy and terms - Secure site (HTTPS) - Accurate, up-to-date info --- ## Content Freshness ### Visible Dates (Required) ```html

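<article>
  <h1>Article Title</h1>
  <p class="article-meta">
    <span class="author">By John Smith</span>
    <span>Published: <time datetime="2025-01-15">January 15, 2025</time></span>
    <span>Last updated: <time datetime="2025-01-20">January 20, 2025</time></span>
  </p>
</article>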
``` AI systems prefer recent content. Show dates prominently. ### Update Schedule | Content Type | Update Frequency | |--------------|------------------| | Product pages | On feature changes | | Pricing | Immediately on change | | Blog posts | Quarterly review | | Statistics | When new data available | | Guides | Semi-annually | --- ## Analytics for AI Traffic ### GA4 Regex Filter ```regex .*chatgpt\.com.*|.*perplexity\.ai.*|.*gemini\.google\.com.*|.*copilot\.microsoft\.com.*|.*openai\.com.*|.*claude\.ai.*|.*poe\.com.*|.*you\.com.*|.*phind\.com.* ``` ### Track AI Referrals ```javascript // Check for AI referrer const aiReferrers = [ 'chatgpt.com', 'chat.openai.com', 'perplexity.ai', 'claude.ai', 'gemini.google.com', 'copilot.microsoft.com', 'poe.com', 'you.com', 'phind.com' ]; const referrer = document.referrer; const isAIReferral = aiReferrers.some(ai => referrer.includes(ai)); if (isAIReferral) { analytics.track('ai_referral', { source: referrer, page: window.location.pathname }); } ``` ### Survey for AI Discovery Add to forms: ```markdown How did you hear about us? 
- [ ] Google Search - [ ] ChatGPT - [ ] Perplexity - [ ] Claude - [ ] Social Media - [ ] Referral - [ ] Other ``` --- ## Content Checklist ### Before Publishing ```markdown ## SEO Checklist - [ ] Title tag (50-60 chars) with primary keyword - [ ] Meta description (150-160 chars) with CTA - [ ] URL slug is clean and descriptive - [ ] H1 matches title intent - [ ] H2/H3 hierarchy is logical - [ ] Images have descriptive alt-text - [ ] Internal links to related content - [ ] External links to authoritative sources ## GEO Checklist - [ ] TL;DR or summary at top - [ ] Direct answer to main question in first paragraph - [ ] Stat boxes with sources - [ ] Comparison tables where applicable - [ ] FAQ section with schema - [ ] Author name, bio, and credentials - [ ] Publication and last-updated dates visible - [ ] Schema markup validated - [ ] Content can be quoted standalone - [ ] Original insights or data included ``` ### Schema Validation ```bash # Validate schema markup # Use: https://validator.schema.org/ # Or: https://search.google.com/test/rich-results ``` --- ## Project Structure ``` project/ ├── content/ │ ├── pages/ │ │ ├── home.md │ │ ├── about.md │ │ ├── pricing.md │ │ └── contact.md │ ├── blog/ │ │ ├── post-1.md │ │ └── post-2.md │ └── legal/ │ ├── privacy.md │ └── terms.md ├── components/ │ ├── SchemaMarkup.tsx │ ├── TLDRBox.tsx │ ├── StatBox.tsx │ ├── FAQSection.tsx │ └── AuthorBio.tsx └── lib/ └── schema.ts # Schema generators ``` --- ## Anti-Patterns - **No dates** - AI deprioritizes undated content - **Anonymous content** - No author = no E-E-A-T - **Walls of text** - Break up with headers, bullets, boxes - **Generic content** - Add original insights, data, opinions - **Missing schema** - Invisible to structured data crawlers - **Outdated info** - Update quarterly minimum - **No FAQ** - Missing easy GEO win - **Poor URL structure** - Use /topic-name not /p=12345 --- ## Quick Reference ### Content Formats AI Loves 1. TL;DR summaries 2. 
Definition boxes ("What is X?") 3. Comparison tables 4. Step-by-step guides 5. FAQ sections 6. Stat boxes with sources 7. Listicles with numbers ### Required Schema by Page Type | Page Type | Schema | |-----------|--------| | Homepage | Organization, WebSite | | Blog Post | Article, Author, FAQ | | Product | Product, Review, FAQ | | FAQ | FAQPage | | How-to | HowTo | | About | Organization, Person | ================================================ FILE: skills/web-payments/SKILL.md ================================================ --- name: web-payments description: Stripe Checkout, subscriptions, webhooks, customer portal when-to-use: When implementing payments, subscriptions, or Stripe integration user-invocable: false effort: high --- # Web Payments Skill (Stripe) For integrating Stripe payments into web applications - one-time payments, subscriptions, and checkout flows. **Sources:** [Stripe Checkout](https://docs.stripe.com/payments/checkout) | [Payment Element Best Practices](https://docs.stripe.com/payments/payment-element/best-practices) | [Building Solid Stripe Integrations](https://stripe.dev/blog/building-solid-stripe-integrations-developers-guide-success) | [Subscriptions](https://docs.stripe.com/billing/subscriptions/build-subscriptions) --- ## Setup ### 1. Create Stripe Account 1. Go to https://dashboard.stripe.com/register 2. Complete business verification 3. Get API keys from https://dashboard.stripe.com/apikeys ### 2. Environment Variables ```bash # .env STRIPE_SECRET_KEY=sk_test_xxx # Server-side only STRIPE_PUBLISHABLE_KEY=pk_test_xxx # Client-side safe STRIPE_WEBHOOK_SECRET=whsec_xxx # For webhook verification # Production STRIPE_SECRET_KEY=sk_live_xxx STRIPE_PUBLISHABLE_KEY=pk_live_xxx ``` ### 3. 
Install SDK ```bash # Node.js npm install stripe @stripe/stripe-js # Python pip install stripe ``` --- ## Integration Options | Method | Best For | Complexity | |--------|----------|------------| | **Checkout (Hosted)** | Quick setup, Stripe-hosted page | Low | | **Checkout (Embedded)** | Custom site, embedded form | Low | | **Payment Element** | Full customization, complex flows | Medium | | **Custom Form** | Complete control (rare) | High | **Recommendation**: Start with Checkout, migrate to Payment Element if needed. --- ## Stripe Checkout (Recommended) ### Server: Create Checkout Session #### Node.js / Next.js ```typescript // app/api/checkout/route.ts (Next.js App Router) import Stripe from "stripe"; import { NextResponse } from "next/server"; const stripe = new Stripe(process.env.STRIPE_SECRET_KEY!); export async function POST(request: Request) { const { priceId, mode = "payment" } = await request.json(); try { const session = await stripe.checkout.sessions.create({ mode: mode as "payment" | "subscription", payment_method_types: ["card"], line_items: [ { price: priceId, quantity: 1, }, ], success_url: `${process.env.NEXT_PUBLIC_URL}/success?session_id={CHECKOUT_SESSION_ID}`, cancel_url: `${process.env.NEXT_PUBLIC_URL}/canceled`, // Optional: Link to existing customer // customer: customerId, // Optional: Collect shipping // shipping_address_collection: { allowed_countries: ["US", "CA"] }, // Optional: Add metadata for tracking metadata: { userId: "user_123", source: "pricing_page", }, }); return NextResponse.json({ sessionId: session.id, url: session.url }); } catch (error) { console.error("Stripe error:", error); return NextResponse.json({ error: "Failed to create session" }, { status: 500 }); } } ``` #### Python / FastAPI ```python # app/api/checkout.py import stripe from fastapi import APIRouter, HTTPException from pydantic import BaseModel import os stripe.api_key = os.environ["STRIPE_SECRET_KEY"] router = APIRouter() class CheckoutRequest(BaseModel): 
price_id: str mode: str = "payment" # or "subscription" @router.post("/api/checkout") async def create_checkout_session(request: CheckoutRequest): try: session = stripe.checkout.Session.create( mode=request.mode, payment_method_types=["card"], line_items=[{ "price": request.price_id, "quantity": 1, }], success_url=f"{os.environ['APP_URL']}/success?session_id={{CHECKOUT_SESSION_ID}}", cancel_url=f"{os.environ['APP_URL']}/canceled", metadata={ "user_id": "user_123", }, ) return {"session_id": session.id, "url": session.url} except stripe.error.StripeError as e: raise HTTPException(status_code=400, detail=str(e)) ``` ### Client: Redirect to Checkout ```typescript // components/CheckoutButton.tsx "use client"; import { loadStripe } from "@stripe/stripe-js"; const stripePromise = loadStripe(process.env.NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY!); export function CheckoutButton({ priceId }: { priceId: string }) { const handleCheckout = async () => { const response = await fetch("/api/checkout", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ priceId }), }); const { url } = await response.json(); // Redirect to Stripe Checkout window.location.href = url; }; return ( ); } ``` --- ## Embedded Checkout For keeping users on your site: ```typescript // components/EmbeddedCheckout.tsx "use client"; import { useEffect, useState } from "react"; import { loadStripe } from "@stripe/stripe-js"; import { EmbeddedCheckoutProvider, EmbeddedCheckout, } from "@stripe/react-stripe-js"; const stripePromise = loadStripe(process.env.NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY!); export function EmbeddedCheckoutForm({ priceId }: { priceId: string }) { const [clientSecret, setClientSecret] = useState(""); useEffect(() => { fetch("/api/checkout/embedded", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ priceId }), }) .then((res) => res.json()) .then((data) => setClientSecret(data.clientSecret)); }, [priceId]); if 
(!clientSecret) return <div>Loading...</div>
; return ( ); } ``` Server endpoint for embedded: ```typescript // app/api/checkout/embedded/route.ts export async function POST(request: Request) { const { priceId } = await request.json(); const session = await stripe.checkout.sessions.create({ mode: "subscription", line_items: [{ price: priceId, quantity: 1 }], ui_mode: "embedded", return_url: `${process.env.NEXT_PUBLIC_URL}/success?session_id={CHECKOUT_SESSION_ID}`, }); return NextResponse.json({ clientSecret: session.client_secret }); } ``` --- ## Webhooks (Critical) **Never trust client-side data**. Always verify payments via webhooks. ### Webhook Endpoint ```typescript // app/api/webhooks/stripe/route.ts import Stripe from "stripe"; import { headers } from "next/headers"; const stripe = new Stripe(process.env.STRIPE_SECRET_KEY!); const webhookSecret = process.env.STRIPE_WEBHOOK_SECRET!; export async function POST(request: Request) { const body = await request.text(); const signature = headers().get("stripe-signature")!; let event: Stripe.Event; // Verify webhook signature try { event = stripe.webhooks.constructEvent(body, signature, webhookSecret); } catch (err) { console.error("Webhook signature verification failed"); return new Response("Invalid signature", { status: 400 }); } // Handle events switch (event.type) { case "checkout.session.completed": { const session = event.data.object as Stripe.Checkout.Session; await handleCheckoutComplete(session); break; } case "customer.subscription.created": case "customer.subscription.updated": { const subscription = event.data.object as Stripe.Subscription; await handleSubscriptionUpdate(subscription); break; } case "customer.subscription.deleted": { const subscription = event.data.object as Stripe.Subscription; await handleSubscriptionCanceled(subscription); break; } case "invoice.payment_failed": { const invoice = event.data.object as Stripe.Invoice; await handlePaymentFailed(invoice); break; } default: console.log(`Unhandled event type: ${event.type}`); } // 
Return 200 quickly - process async if needed return new Response("OK", { status: 200 }); } async function handleCheckoutComplete(session: Stripe.Checkout.Session) { const userId = session.metadata?.userId; const customerId = session.customer as string; const subscriptionId = session.subscription as string; // Update your database await db.user.update({ where: { id: userId }, data: { stripeCustomerId: customerId, stripeSubscriptionId: subscriptionId, subscriptionStatus: "active", }, }); } ``` ### Python Webhook ```python # app/api/webhooks.py import stripe from fastapi import APIRouter, Request, HTTPException router = APIRouter() @router.post("/api/webhooks/stripe") async def stripe_webhook(request: Request): payload = await request.body() sig_header = request.headers.get("stripe-signature") try: event = stripe.Webhook.construct_event( payload, sig_header, os.environ["STRIPE_WEBHOOK_SECRET"] ) except ValueError: raise HTTPException(status_code=400, detail="Invalid payload") except stripe.error.SignatureVerificationError: raise HTTPException(status_code=400, detail="Invalid signature") # Handle events if event["type"] == "checkout.session.completed": session = event["data"]["object"] await handle_checkout_complete(session) elif event["type"] == "customer.subscription.deleted": subscription = event["data"]["object"] await handle_subscription_canceled(subscription) return {"status": "success"} ``` ### Key Webhook Events | Event | When | Action | |-------|------|--------| | `checkout.session.completed` | Payment successful | Provision access | | `customer.subscription.created` | New subscription | Store subscription ID | | `customer.subscription.updated` | Plan change | Update plan in DB | | `customer.subscription.deleted` | Canceled | Revoke access | | `invoice.payment_failed` | Payment failed | Notify user, retry | | `invoice.paid` | Renewal successful | Extend access | --- ## Products & Prices ### Create via Dashboard (Recommended) 1. 
Go to https://dashboard.stripe.com/products 2. Create product with name, description 3. Add price(s) - one-time or recurring 4. Copy Price ID (`price_xxx`) ### Create via API ```typescript // One-time product const product = await stripe.products.create({ name: "Pro Plan", description: "Full access to all features", }); const price = await stripe.prices.create({ product: product.id, unit_amount: 2999, // $29.99 in cents currency: "usd", }); // Subscription product const subscriptionPrice = await stripe.prices.create({ product: product.id, unit_amount: 999, // $9.99/month currency: "usd", recurring: { interval: "month", }, }); ``` --- ## Customer Portal Let users manage their subscriptions: ```typescript // app/api/portal/route.ts export async function POST(request: Request) { const { customerId } = await request.json(); const session = await stripe.billingPortal.sessions.create({ customer: customerId, return_url: `${process.env.NEXT_PUBLIC_URL}/settings`, }); return NextResponse.json({ url: session.url }); } ``` Configure portal at: https://dashboard.stripe.com/settings/billing/portal --- ## Subscriptions ### Create Subscription with Trial ```typescript const session = await stripe.checkout.sessions.create({ mode: "subscription", line_items: [{ price: priceId, quantity: 1 }], subscription_data: { trial_period_days: 14, // Cancel if no payment method after trial trial_settings: { end_behavior: { missing_payment_method: "cancel" }, }, }, success_url: successUrl, cancel_url: cancelUrl, }); ``` ### Check Subscription Status ```typescript // lib/subscription.ts export async function getSubscriptionStatus(customerId: string) { const subscriptions = await stripe.subscriptions.list({ customer: customerId, status: "all", limit: 1, }); if (subscriptions.data.length === 0) { return { status: "none", plan: null }; } const subscription = subscriptions.data[0]; return { status: subscription.status, plan: subscription.items.data[0].price.id, currentPeriodEnd: new 
Date(subscription.current_period_end * 1000), cancelAtPeriodEnd: subscription.cancel_at_period_end, }; } ``` --- ## Testing ### Test Cards | Card Number | Scenario | |-------------|----------| | `4242424242424242` | Success | | `4000000000000002` | Declined | | `4000002500003155` | Requires 3D Secure | | `4000000000009995` | Insufficient funds | ### Stripe CLI for Webhooks ```bash # Install CLI brew install stripe/stripe-cli/stripe # Login stripe login # Forward webhooks to local server stripe listen --forward-to localhost:3000/api/webhooks/stripe # Trigger test events stripe trigger checkout.session.completed stripe trigger customer.subscription.deleted ``` --- ## Project Structure ``` project/ ├── app/ │ ├── api/ │ │ ├── checkout/ │ │ │ └── route.ts # Create checkout session │ │ ├── portal/ │ │ │ └── route.ts # Customer portal │ │ └── webhooks/ │ │ └── stripe/ │ │ └── route.ts # Webhook handler │ ├── pricing/ │ │ └── page.tsx # Pricing page │ ├── success/ │ │ └── page.tsx # Post-checkout success │ └── settings/ │ └── page.tsx # Manage subscription ├── lib/ │ ├── stripe.ts # Stripe client │ └── subscription.ts # Subscription helpers └── .env.local ``` --- ## Security Best Practices ### Non-Negotiable Rules 1. **Server-side only for secrets** - Never expose `STRIPE_SECRET_KEY` 2. **Always verify webhooks** - Check signature before processing 3. **Idempotency** - Store webhook event IDs, skip duplicates 4. **Use metadata** - Track user IDs, sources for debugging 5. **Handle all states** - Success, failure, pending, canceled ### Idempotent Webhook Handler ```typescript const processedEvents = new Set(); // Use Redis in production export async function POST(request: Request) { // ... verify signature ... // Skip duplicate events if (processedEvents.has(event.id)) { return new Response("Already processed", { status: 200 }); } processedEvents.add(event.id); // Process event... 
} ``` ### Amount Handling ```typescript // Always use cents (smallest currency unit) const priceInCents = 2999; // $29.99 // Helper functions const toCents = (dollars: number) => Math.round(dollars * 100); const toDollars = (cents: number) => cents / 100; // Display const displayPrice = (cents: number) => new Intl.NumberFormat("en-US", { style: "currency", currency: "USD", }).format(toDollars(cents)); ``` --- ## Common Patterns ### Pricing Page ```typescript // app/pricing/page.tsx const plans = [ { name: "Starter", price: "$9/mo", priceId: "price_starter_monthly", features: ["Feature 1", "Feature 2"], }, { name: "Pro", price: "$29/mo", priceId: "price_pro_monthly", features: ["Everything in Starter", "Feature 3", "Feature 4"], popular: true, }, ]; export default function PricingPage() { return (
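<div className="pricing-grid">
  {plans.map((plan) => (
    <div key={plan.priceId} className={plan.popular ? "plan popular" : "plan"}>
      <h3>{plan.name}</h3>
      <p>{plan.price}</p>
      <ul>
        {plan.features.map((f) => <li key={f}>{f}</li>)}
      </ul>
      <CheckoutButton priceId={plan.priceId} />
    </div>
  ))}
</div>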
); } ``` ### Protect Routes by Subscription ```typescript // middleware.ts import { getSubscriptionStatus } from "@/lib/subscription"; export async function middleware(request: NextRequest) { const session = await getSession(); if (request.nextUrl.pathname.startsWith("/pro")) { const { status } = await getSubscriptionStatus(session.stripeCustomerId); if (status !== "active" && status !== "trialing") { return NextResponse.redirect(new URL("/pricing", request.url)); } } } ``` --- ## Anti-Patterns - **Hardcoding API keys** - Use environment variables - **Client-side payment creation** - Always create PaymentIntent/Session server-side - **Skipping webhook verification** - Always verify signatures - **Processing duplicate webhooks** - Implement idempotency - **Floating-point currency math** - Use integers (cents) - **Trusting client data** - Verify everything server-side - **Ignoring failed payments** - Handle `invoice.payment_failed` - **No error handling** - Catch and handle Stripe errors --- ## Quick Reference ```bash # Install npm install stripe @stripe/stripe-js @stripe/react-stripe-js # Stripe CLI stripe login stripe listen --forward-to localhost:3000/api/webhooks/stripe stripe trigger checkout.session.completed # Test mode prefix sk_test_xxx # Secret key pk_test_xxx # Publishable key # Live mode prefix sk_live_xxx pk_live_xxx ``` ### Key Endpoints | Endpoint | Purpose | |----------|---------| | `POST /api/checkout` | Create checkout session | | `POST /api/portal` | Customer billing portal | | `POST /api/webhooks/stripe` | Handle Stripe events | ### Environment Variables ```bash STRIPE_SECRET_KEY=sk_test_xxx STRIPE_PUBLISHABLE_KEY=pk_test_xxx STRIPE_WEBHOOK_SECRET=whsec_xxx NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY=pk_test_xxx ``` ================================================ FILE: skills/woocommerce/SKILL.md ================================================ --- name: woocommerce description: WooCommerce REST API - products, orders, customers, webhooks when-to-use: 
When integrating with WooCommerce stores user-invocable: false effort: medium --- # WooCommerce Development Skill For integrating with WooCommerce stores via REST API - products, orders, customers, webhooks, and custom extensions. **Sources:** [WooCommerce REST API](https://woocommerce.github.io/woocommerce-rest-api-docs/) | [Developer Docs](https://developer.woocommerce.com/docs/) --- ## Prerequisites ### Store Requirements ```bash # WooCommerce store must have: # 1. WordPress with WooCommerce plugin installed # 2. HTTPS enabled (required for API auth) # 3. Permalinks set to anything except "Plain" # WordPress Admin → Settings → Permalinks → Post name (recommended) ``` ### Generate API Keys 1. Go to **WooCommerce → Settings → Advanced → REST API** 2. Click **Add key** 3. Set Description, User (admin), and Permissions (Read/Write) 4. Click **Generate API key** 5. Copy **Consumer Key** and **Consumer Secret** (shown only once) --- ## API Basics ### Base URL ``` https://your-store.com/wp-json/wc/v3/ ``` ### Authentication ```typescript // Node.js - Basic Auth (recommended) const WooCommerceRestApi = require("@woocommerce/woocommerce-rest-api").default; const api = new WooCommerceRestApi({ url: "https://your-store.com", consumerKey: process.env.WC_CONSUMER_KEY, consumerSecret: process.env.WC_CONSUMER_SECRET, version: "wc/v3" }); ``` ```python # Python from woocommerce import API wcapi = API( url="https://your-store.com", consumer_key=os.environ["WC_CONSUMER_KEY"], consumer_secret=os.environ["WC_CONSUMER_SECRET"], version="wc/v3" ) ``` ### Query String Auth (Fallback) ```bash # Only use if Basic Auth fails (some hosting configurations) curl https://your-store.com/wp-json/wc/v3/products \ ?consumer_key=ck_xxx&consumer_secret=cs_xxx ``` --- ## Installation ### Node.js ```bash npm install @woocommerce/woocommerce-rest-api ``` ```typescript // lib/woocommerce.ts import WooCommerceRestApi from "@woocommerce/woocommerce-rest-api"; const api = new WooCommerceRestApi({ url: 
process.env.WC_STORE_URL!, consumerKey: process.env.WC_CONSUMER_KEY!, consumerSecret: process.env.WC_CONSUMER_SECRET!, version: "wc/v3", queryStringAuth: false, // Set true for HTTP (dev only) }); export default api; ``` ### Python ```bash pip install woocommerce ``` ```python # lib/woocommerce.py import os from woocommerce import API wcapi = API( url=os.environ["WC_STORE_URL"], consumer_key=os.environ["WC_CONSUMER_KEY"], consumer_secret=os.environ["WC_CONSUMER_SECRET"], version="wc/v3", timeout=30 ) ``` --- ## Products ### List Products ```typescript // Node.js async function getProducts(page = 1, perPage = 20) { const response = await api.get("products", { page, per_page: perPage, status: "publish", }); return response.data; } // With filters async function searchProducts(search: string, category?: number) { const response = await api.get("products", { search, category: category || undefined, orderby: "popularity", order: "desc", }); return response.data; } ``` ```python # Python def get_products(page=1, per_page=20): response = wcapi.get("products", params={ "page": page, "per_page": per_page, "status": "publish" }) return response.json() ``` ### Get Single Product ```typescript async function getProduct(productId: number) { const response = await api.get(`products/${productId}`); return response.data; } ``` ### Create Product ```typescript async function createProduct(data: ProductInput) { const response = await api.post("products", { name: data.name, type: "simple", // simple, variable, grouped, external regular_price: data.price.toString(), description: data.description, short_description: data.shortDescription, categories: data.categoryIds.map(id => ({ id })), images: data.images.map(url => ({ src: url })), manage_stock: true, stock_quantity: data.stockQuantity, status: "publish", }); return response.data; } ``` ### Update Product ```typescript async function updateProduct(productId: number, data: Partial) { const response = await 
api.put(`products/${productId}`, data); return response.data; } // Update stock only async function updateStock(productId: number, quantity: number) { const response = await api.put(`products/${productId}`, { stock_quantity: quantity, }); return response.data; } ``` ### Delete Product ```typescript async function deleteProduct(productId: number, force = false) { // force: true = permanent delete, false = move to trash const response = await api.delete(`products/${productId}`, { force, }); return response.data; } ``` ### Variable Products ```typescript // Create variable product async function createVariableProduct(data: VariableProductInput) { // 1. Create product with type "variable" const product = await api.post("products", { name: data.name, type: "variable", attributes: [ { name: "Size", visible: true, variation: true, options: ["Small", "Medium", "Large"], }, { name: "Color", visible: true, variation: true, options: ["Red", "Blue"], }, ], }); // 2. Create variations for (const variant of data.variants) { await api.post(`products/${product.data.id}/variations`, { regular_price: variant.price.toString(), stock_quantity: variant.stock, attributes: [ { name: "Size", option: variant.size }, { name: "Color", option: variant.color }, ], }); } return product.data; } // Get variations async function getVariations(productId: number) { const response = await api.get(`products/${productId}/variations`); return response.data; } ``` --- ## Orders ### List Orders ```typescript async function getOrders(params: OrderQueryParams = {}) { const response = await api.get("orders", { page: params.page || 1, per_page: params.perPage || 20, status: params.status || "any", // pending, processing, completed, etc. 
after: params.after, // ISO date string before: params.before, }); return response.data; } // Get recent orders async function getRecentOrders(days = 7) { const after = new Date(); after.setDate(after.getDate() - days); const response = await api.get("orders", { after: after.toISOString(), orderby: "date", order: "desc", }); return response.data; } ``` ### Get Single Order ```typescript async function getOrder(orderId: number) { const response = await api.get(`orders/${orderId}`); return response.data; } ``` ### Create Order ```typescript async function createOrder(data: OrderInput) { const response = await api.post("orders", { payment_method: "stripe", payment_method_title: "Credit Card", set_paid: false, billing: { first_name: data.customer.firstName, last_name: data.customer.lastName, email: data.customer.email, phone: data.customer.phone, address_1: data.billing.address1, city: data.billing.city, state: data.billing.state, postcode: data.billing.postcode, country: data.billing.country, }, shipping: { first_name: data.customer.firstName, last_name: data.customer.lastName, address_1: data.shipping.address1, city: data.shipping.city, state: data.shipping.state, postcode: data.shipping.postcode, country: data.shipping.country, }, line_items: data.items.map(item => ({ product_id: item.productId, variation_id: item.variationId, quantity: item.quantity, })), shipping_lines: [ { method_id: "flat_rate", method_title: "Flat Rate", total: data.shippingCost.toString(), }, ], }); return response.data; } ``` ### Update Order Status ```typescript async function updateOrderStatus(orderId: number, status: OrderStatus) { const response = await api.put(`orders/${orderId}`, { status, // pending, processing, on-hold, completed, cancelled, refunded, failed }); return response.data; } // Add order note async function addOrderNote(orderId: number, note: string, customerNote = false) { const response = await api.post(`orders/${orderId}/notes`, { note, customer_note: customerNote, // 
true = visible to customer }); return response.data; } ``` ### Order Statuses | Status | Description | |--------|-------------| | `pending` | Awaiting payment | | `processing` | Payment received, awaiting fulfillment | | `on-hold` | Awaiting action (stock, payment confirmation) | | `completed` | Order fulfilled | | `cancelled` | Cancelled by admin or customer | | `refunded` | Refunded | | `failed` | Payment failed | --- ## Customers ### List Customers ```typescript async function getCustomers(params: CustomerQueryParams = {}) { const response = await api.get("customers", { page: params.page || 1, per_page: params.perPage || 20, role: "customer", orderby: "registered_date", order: "desc", }); return response.data; } // Search customers async function searchCustomers(email: string) { const response = await api.get("customers", { email, }); return response.data; } ``` ### Create Customer ```typescript async function createCustomer(data: CustomerInput) { const response = await api.post("customers", { email: data.email, first_name: data.firstName, last_name: data.lastName, username: data.email.split("@")[0], billing: { first_name: data.firstName, last_name: data.lastName, email: data.email, phone: data.phone, address_1: data.address1, city: data.city, state: data.state, postcode: data.postcode, country: data.country, }, shipping: { // Same as billing or different }, }); return response.data; } ``` ### Update Customer ```typescript async function updateCustomer(customerId: number, data: Partial<CustomerInput>) { const response = await api.put(`customers/${customerId}`, data); return response.data; } ``` --- ## Webhooks ### Create Webhook ```typescript async function createWebhook(topic: string, deliveryUrl: string) { const response = await api.post("webhooks", { name: `Webhook for ${topic}`, topic, // order.created, order.updated, product.created, etc.
delivery_url: deliveryUrl, status: "active", secret: process.env.WC_WEBHOOK_SECRET, }); return response.data; } ``` ### Webhook Topics | Topic | Trigger | |-------|---------| | `order.created` | New order placed | | `order.updated` | Order status/details changed | | `order.deleted` | Order deleted | | `product.created` | New product created | | `product.updated` | Product updated | | `product.deleted` | Product deleted | | `customer.created` | New customer registered | | `customer.updated` | Customer updated | | `coupon.created` | New coupon created | ### Verify Webhook Signature WooCommerce computes `X-WC-Webhook-Signature` as a base64-encoded HMAC-SHA256 of the raw request body, so verify against the exact bytes received. The Express example below re-serializes `req.body` for brevity, which can differ from the original payload; in production, capture the raw body (for example via the `verify` option of `express.json()`) and hash that instead. ```typescript // Express.js webhook handler import crypto from "crypto"; function verifyWooCommerceWebhook(req: Request): boolean { const signature = req.headers["x-wc-webhook-signature"] as string; const payload = JSON.stringify(req.body); const expectedSignature = crypto .createHmac("sha256", process.env.WC_WEBHOOK_SECRET!) .update(payload) .digest("base64"); return crypto.timingSafeEqual( Buffer.from(signature), Buffer.from(expectedSignature) ); } // Route handler app.post("/webhooks/woocommerce", (req, res) => { if (!verifyWooCommerceWebhook(req)) { return res.status(401).json({ error: "Invalid signature" }); } const topic = req.headers["x-wc-webhook-topic"]; const payload = req.body; switch (topic) { case "order.created": handleNewOrder(payload); break; case "order.updated": handleOrderUpdate(payload); break; // ...
other topics } res.status(200).json({ received: true }); }); ``` ```python # Python/Flask webhook handler import hmac import hashlib import base64 @app.route("/webhooks/woocommerce", methods=["POST"]) def woocommerce_webhook(): signature = request.headers.get("X-WC-Webhook-Signature") payload = request.get_data() expected = base64.b64encode( hmac.new( os.environ["WC_WEBHOOK_SECRET"].encode(), payload, hashlib.sha256 ).digest() ).decode() if not hmac.compare_digest(signature, expected): return {"error": "Invalid signature"}, 401 topic = request.headers.get("X-WC-Webhook-Topic") data = request.json if topic == "order.created": handle_new_order(data) elif topic == "order.updated": handle_order_update(data) return {"received": True}, 200 ``` --- ## Categories & Tags ### List Categories ```typescript async function getCategories() { const response = await api.get("products/categories", { per_page: 100, orderby: "name", }); return response.data; } // Create category async function createCategory(name: string, parentId?: number) { const response = await api.post("products/categories", { name, parent: parentId || 0, }); return response.data; } ``` ### List Tags ```typescript async function getTags() { const response = await api.get("products/tags", { per_page: 100, }); return response.data; } ``` --- ## Coupons ### Create Coupon ```typescript async function createCoupon(data: CouponInput) { const response = await api.post("coupons", { code: data.code, discount_type: data.type, // percent, fixed_cart, fixed_product amount: data.amount.toString(), individual_use: true, exclude_sale_items: false, minimum_amount: data.minimumAmount?.toString(), maximum_amount: data.maximumAmount?.toString(), usage_limit: data.usageLimit, usage_limit_per_user: 1, date_expires: data.expiresAt, // ISO date string }); return response.data; } ``` --- ## Reports ### Sales Report ```typescript async function getSalesReport(period = "month") { const response = await api.get("reports/sales", { period, 
// day, week, month, year }); return response.data; } // Top sellers async function getTopSellers(period = "month") { const response = await api.get("reports/top_sellers", { period, }); return response.data; } ``` --- ## Pagination ### Handle Large Datasets ```typescript async function getAllProducts() { const allProducts = []; let page = 1; const perPage = 100; while (true) { const response = await api.get("products", { page, per_page: perPage, }); allProducts.push(...response.data); // Check headers for total pages const totalPages = parseInt(response.headers["x-wp-totalpages"]); if (page >= totalPages) break; page++; } return allProducts; } ``` ### Pagination Headers | Header | Description | |--------|-------------| | `X-WP-Total` | Total number of items | | `X-WP-TotalPages` | Total number of pages | --- ## Error Handling ```typescript import WooCommerceRestApi from "@woocommerce/woocommerce-rest-api"; async function safeApiCall<T>( operation: () => Promise<{ data: T }> ): Promise<T> { try { const response = await operation(); return response.data; } catch (error: any) { if (error.response) { // API returned an error const { status, data } = error.response; switch (status) { case 400: throw new Error(`Bad request: ${data.message}`); case 401: throw new Error("Invalid API credentials"); case 404: throw new Error("Resource not found"); case 429: // Rate limited - wait and retry await new Promise(r => setTimeout(r, 5000)); return safeApiCall(operation); default: throw new Error(`API error: ${data.message}`); } } throw error; } } // Usage const products = await safeApiCall(() => api.get("products")); ``` --- ## Environment Variables ```bash # .env WC_STORE_URL=https://your-store.com WC_CONSUMER_KEY=ck_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx WC_CONSUMER_SECRET=cs_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx WC_WEBHOOK_SECRET=your_webhook_secret ``` Add to `credentials.md`: ```python 'WC_CONSUMER_KEY': r'ck_[a-f0-9]{40}', 'WC_CONSUMER_SECRET': r'cs_[a-f0-9]{40}', ``` --- ##
Checklist ### Before Integration - [ ] WooCommerce plugin installed and activated - [ ] HTTPS enabled on store - [ ] Permalinks set to non-Plain setting - [ ] API keys generated with appropriate permissions - [ ] Webhook secret configured ### Security - [ ] API keys stored in environment variables - [ ] Webhook signatures verified - [ ] HTTPS used for all API calls - [ ] Rate limiting handled ### Testing - [ ] Test API connection - [ ] Test product CRUD operations - [ ] Test order creation/updates - [ ] Test webhook delivery - [ ] Test pagination for large datasets --- ## Anti-Patterns - **Plain permalinks** - API won't work without pretty permalinks - **HTTP in production** - Always use HTTPS - **Ignoring rate limits** - WooCommerce may throttle requests - **Large single requests** - Use pagination for bulk operations - **Storing keys in code** - Use environment variables - **Skipping webhook verification** - Always verify signatures ================================================ FILE: skills/workspace/SKILL.md ================================================ --- name: workspace description: Dynamic multi-repo and monorepo awareness for Claude Code. Analyze workspace topology, track API contracts, and maintain cross-repo context. when-to-use: When working across multiple repos or in a monorepo with shared dependencies user-invocable: true effort: high --- # Workspace Analysis Skill > Dynamic multi-repo and monorepo awareness for Claude Code. Analyze workspace topology, track API contracts, and maintain cross-repo context. ## The Problem When you have separate frontend/backend repos (or monorepo with multiple apps), Claude Code operates in isolation. 
It doesn't know: - API contracts between modules/repos - Shared types and interfaces - Full system architecture - Cross-repo dependencies - What changed in other parts of the system This leads to: - Duplicate type definitions - API contract mismatches - Breaking changes not caught until runtime - Claude reimplementing things that exist elsewhere --- ## Solution: Dynamic Workspace Analysis Instead of static manifests that get stale, Claude dynamically analyzes the workspace and generates context artifacts that stay fresh through hooks. ``` ┌─────────────────────────────────────────────────────────────────┐ │ WORKSPACE ANALYSIS SYSTEM │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ /analyze-workspace (Full Analysis - ~2 min) │ │ ├── Topology discovery (monorepo vs multi-repo) │ │ ├── Dependency graph (who calls whom) │ │ ├── Contract extraction (OpenAPI, GraphQL, types) │ │ └── Key file identification (what to load when) │ │ │ │ /sync-contracts (Incremental - ~15 sec) │ │ ├── Check contract source files for changes │ │ ├── Update CONTRACTS.md with diffs │ │ └── Validate consistency │ │ │ │ Hooks (Automatic) │ │ ├── Session start: Staleness advisory (~5 sec) │ │ ├── Post-commit: Auto-sync if contracts changed (~15 sec) │ │ └── Pre-push: Validation gate (~10 sec) │ │ │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Workspace Classification ### Detection Patterns | Type | Indicators | File Access | |------|------------|-------------| | **Monorepo** | pnpm-workspace.yaml, nx.json, turbo.json, lerna.json | Direct (same tree) | | **Multi-repo** | Sibling directories with separate .git | Via symlinks or paths | | **Hybrid** | Monorepo + external repo dependencies | Mixed | | **Single** | One app, no workspace config | N/A (use existing-repo) | ### Monorepo Detection ```bash # Check for monorepo indicators ls package.json pnpm-workspace.yaml lerna.json nx.json turbo.json 2>/dev/null ls apps/ packages/ services/ 
libs/ modules/ 2>/dev/null ``` ### Multi-Repo Detection ```bash # Check sibling directories for related repos ls -la ../*.git 2>/dev/null cat ../*/.git/config 2>/dev/null | grep "url" # Look for naming patterns ls .. | grep -E "(frontend|backend|api|web|shared|common)" ``` ### Polyglot Detection ```bash # Find all package manifests find . -maxdepth 4 -name "package.json" -o -name "pyproject.toml" \ -o -name "go.mod" -o -name "Cargo.toml" -o -name "pom.xml" \ -o -name "build.gradle" -o -name "Gemfile" ``` --- ## Analysis Protocol ### Phase 1: Topology Discovery (~30 seconds) Determine workspace structure: ```markdown ## Discovery Checklist 1. [ ] Identify workspace root 2. [ ] Classify workspace type (monorepo/multi-repo/hybrid/single) 3. [ ] List all modules/apps/packages 4. [ ] Detect tech stack per module 5. [ ] Identify entry points per module ``` **Module Detection Pattern:** ``` workspace-root/ ├── apps/ → Application modules │ ├── web/ → Frontend app │ └── api/ → Backend app ├── packages/ → Shared packages │ ├── ui/ → Component library │ ├── types/ → Shared types │ └── db/ → Database layer ├── services/ → Microservices └── libs/ → Internal libraries ``` ### Phase 2: Dependency Graph (~60 seconds) For each module, map: **1. Internal Dependencies** ```bash # TypeScript/JavaScript grep -r "from ['\"]@" --include="*.ts" --include="*.tsx" | head -50 grep -r "workspace:" package.json # Python grep -r "from \." --include="*.py" | head -50 ``` **2. API Relationships** ```bash # Find API calls grep -rE "fetch|axios|httpx|requests\." --include="*.ts" --include="*.py" | \ grep -E "/api|localhost|127\.0\.0\.1" | head -30 ``` **3. 
Database Connections** ```bash # Find DB access patterns grep -rE "prisma|drizzle|sqlalchemy|sequelize|typeorm" --include="*.ts" --include="*.py" ``` ### Phase 3: Contract Extraction (~45 seconds) Identify and parse API contracts: | Contract Type | Detection | Extraction | |---------------|-----------|------------| | **OpenAPI** | openapi.json, swagger.yaml, /docs endpoint | Parse paths, schemas | | **GraphQL** | schema.graphql, *.gql, /graphql endpoint | Parse types, queries, mutations | | **tRPC** | trpc router files, @trpc/* imports | Parse router definitions | | **Protobuf** | *.proto files | Parse services, messages | | **TypeScript** | Shared .d.ts, exported interfaces | Parse exported types | | **Pydantic** | schemas/, models/ with BaseModel | Parse model definitions | | **Zod** | schemas/ with z.object | Parse schema definitions | **Contract Source Priority:** 1. Generated specs (openapi.json) - most accurate 2. Schema definitions (Pydantic, Zod) - source of truth 3. Type exports (TypeScript .d.ts) - consumer contracts 4. 
Inferred from code - last resort ### Phase 4: Key File Identification (~30 seconds) Identify files Claude MUST know about for each context: | Category | Detection Pattern | Token Priority | |----------|-------------------|----------------| | **Route definitions** | `**/routes/**`, `**/api/**`, `@app.get`, `@router` | HIGH | | **Type definitions** | `**/types/**`, `*.d.ts`, `schemas/`, `models/` | HIGH | | **Config** | `.env.example`, `config/`, `settings.py` | MEDIUM | | **Entry points** | `main.ts`, `index.ts`, `app.py`, `server.py` | MEDIUM | | **API clients** | `**/api/client*`, `**/lib/api*` | HIGH | | **Database schema** | `schema/`, `migrations/`, `prisma/schema.prisma` | MEDIUM | | **Tests** | `__tests__/`, `*_test.py`, `*.spec.ts` | LOW (on-demand) | --- ## Generated Artifacts All artifacts go in `_project_specs/workspace/`: ``` _project_specs/workspace/ ├── TOPOLOGY.md # What modules exist, their roles ├── CONTRACTS.md # API specs, shared types (summarized) ├── DEPENDENCY_GRAPH.md # Who calls whom (visual + list) ├── KEY_FILES.md # What to load for each context ├── CROSS_REPO_INDEX.md # Capabilities across all modules └── .contract-sources # Files to monitor for changes ``` ### TOPOLOGY.md Format ```markdown # Workspace Topology Generated: 2026-01-20T14:32:00Z Analyzer: maggy/workspace-analysis Workspace Type: Monorepo (Turborepo) ## Overview ``` ┌─────────────────────────────────────────────────┐ │ apps/web (Next.js) ←→ apps/api (FastAPI) │ │ ↓ ↓ │ │ packages/shared-types ← packages/db │ └─────────────────────────────────────────────────┘ ``` ## Modules ### apps/web - **Path**: /apps/web - **Tech**: Next.js 14, TypeScript, TailwindCSS - **Role**: Customer-facing dashboard - **Consumes**: apps/api (REST), packages/shared-types - **Entry**: src/app/layout.tsx - **Key files**: - `src/lib/api/client.ts` - API client (187 lines) - `src/types/` - Frontend-specific types (12 files) - **Token estimate**: ~15K (full), ~4K (summarized) ### apps/api - **Path**: 
/apps/api - **Tech**: FastAPI, Python 3.12, SQLAlchemy - **Role**: REST API, business logic - **Exposes**: OpenAPI at /docs (47 endpoints) - **Consumes**: packages/db - **Entry**: app/main.py - **Key files**: - `app/routes/` - All endpoints (8 routers) - `app/schemas/` - Pydantic models (23 files) - `openapi.json` - Generated spec - **Token estimate**: ~22K (full), ~6K (summarized) ### packages/shared-types - **Path**: /packages/shared-types - **Tech**: TypeScript - **Role**: Shared type definitions - **Consumed by**: apps/web, apps/api (codegen) - **Key files**: - `src/index.ts` - All exports (340 lines) - **Token estimate**: ~3K ### packages/db - **Path**: /packages/db - **Tech**: Drizzle ORM, TypeScript - **Role**: Database schema, migrations - **Consumed by**: apps/api - **Key files**: - `schema/` - Table definitions (8 files) - `migrations/` - Migration history (23 files) - **Token estimate**: ~8K (full), ~2K (schema only) ``` ### CONTRACTS.md Format ```markdown # API Contracts Generated: 2026-01-20T14:32:00Z Last sync: 2026-01-20T16:45:00Z Sources: 3 files monitored ## REST API: apps/api → apps/web ### Endpoints Summary (47 total) | Domain | Count | Key Endpoints | |--------|-------|---------------| | /api/auth | 5 | POST /login, POST /register, POST /refresh | | /api/users | 6 | GET /me, PATCH /me, GET /:id | | /api/campaigns | 8 | CRUD + POST /bulk, GET /analytics | | /api/analytics | 12 | GET /dashboard, GET /timeseries, GET /funnel | | /api/settings | 4 | GET /, PATCH /, GET /integrations | ### Key Types ```typescript // Campaign domain (from apps/api/app/schemas/campaign.py) interface Campaign { id: string; name: string; status: 'draft' | 'active' | 'paused' | 'completed'; budget: number; target_audience: TargetAudience; created_at: string; updated_at: string; } interface CampaignCreate { name: string; budget: number; target_audience?: TargetAudience; } // Auth domain (from apps/api/app/schemas/auth.py) interface User { id: string; email: string; name: 
string; role: 'user' | 'admin'; } interface TokenPair { access_token: string; refresh_token: string; expires_in: number; } ``` ### Contract Validation Status | Check | Status | Details | |-------|--------|---------| | OpenAPI matches routes | ✅ | 47/47 endpoints documented | | Types match schemas | ✅ | All Pydantic models exported | | Frontend types current | ⚠️ | 2 types need regeneration | ## Shared Types: packages/shared-types ### Exported Types (34 total) | Category | Types | Used By | |----------|-------|---------| | Domain models | Campaign, User, Analytics | web, api | | API responses | ApiResponse, PaginatedResponse | web | | Utilities | DateRange, FilterParams | web, api | ## Database Schema: packages/db ### Tables (12 total) | Table | Key Columns | Relations | |-------|-------------|-----------| | users | id, email, name, role | campaigns, sessions | | campaigns | id, user_id, name, status | analytics, targets | | analytics | id, campaign_id, date, metrics | campaigns | ``` ### DEPENDENCY_GRAPH.md Format ```markdown # Dependency Graph Generated: 2026-01-20T14:32:00Z ## Visual Overview ``` ┌─────────────────┐ │ packages/db │ │ (Drizzle ORM) │ └────────┬────────┘ │ ▼ ┌─────────────────┐ ┌─────────────────┐ │ apps/web │◄──│ apps/api │ │ (Next.js) │ │ (FastAPI) │ └────────┬────────┘ └────────┬────────┘ │ │ ▼ ▼ ┌─────────────────────────────────────────┐ │ packages/shared-types │ │ (TypeScript) │ └─────────────────────────────────────────┘ ``` ## Dependency Matrix | Module | Depends On | Depended By | |--------|------------|-------------| | apps/web | shared-types, apps/api (runtime) | - | | apps/api | shared-types (codegen), db | apps/web | | packages/shared-types | - | apps/web, apps/api | | packages/db | - | apps/api | ## Import Analysis ### apps/web imports: ``` @repo/shared-types: 23 files apps/api (via fetch): 15 files ``` ### apps/api imports: ``` packages/db: 12 files packages/shared-types (codegen): 8 files ``` ## API Call Graph ``` apps/web apps/api 
───────── ──────── src/lib/api/client.ts ──────────► app/routes/auth.py └── login() POST /api/auth/login └── register() POST /api/auth/register src/app/campaigns/page.tsx ─────► app/routes/campaigns.py └── getCampaigns() GET /api/campaigns └── createCampaign() POST /api/campaigns ``` ``` ### KEY_FILES.md Format ```markdown # Key Files by Context ## Context: Frontend API Integration **When**: Modifying API calls, response handling, or API types in frontend Load these files (~8K tokens): ``` apps/web/src/lib/api/client.ts # API client implementation apps/web/src/types/api.d.ts # Frontend API types apps/api/openapi.json # Full API spec (or summary) packages/shared-types/src/index.ts # Shared type definitions ``` ## Context: Backend Endpoint Development **When**: Adding/modifying API endpoints Load these files (~12K tokens): ``` apps/api/app/routes/ # Existing route patterns apps/api/app/schemas/ # Pydantic models (relevant domain) apps/api/app/dependencies/ # Auth, DB dependencies packages/db/schema/ # Relevant table definitions ``` ## Context: Database Changes **When**: Schema modifications, migrations, queries Load these files (~6K tokens): ``` packages/db/schema/ # All table definitions packages/db/migrations/ # Last 5 migrations apps/api/app/models/ # ORM model usage ``` ## Context: Shared Types **When**: Modifying interfaces used across modules Load these files (~4K tokens): ``` packages/shared-types/src/ # Type source files apps/web/src/types/api.d.ts # Consumer (frontend) apps/api/app/schemas/ # Source (backend) ``` ## Context: Authentication **When**: Auth flow, sessions, tokens Load these files (~5K tokens): ``` apps/api/app/routes/auth.py # Auth endpoints apps/api/app/dependencies/auth.py # Auth middleware apps/web/src/lib/auth/ # Frontend auth handling packages/shared-types/src/auth.ts # Auth types ``` ## Load-on-Demand Triggers | Claude detects... 
| Load additionally | |-------------------|-------------------| | "check the API contract" | Full OpenAPI spec | | Import from another module | That module's exports | | Database query pattern | Full schema definitions | | Test failure in other module | That module's test files | | "breaking change" | Both sides of the contract | ``` ### CROSS_REPO_INDEX.md Format ```markdown # Cross-Repository Capability Index Generated: 2026-01-20T14:32:00Z ## Capabilities by Domain ### Authentication | Capability | Location | Module | Type | |------------|----------|--------|------| | Login user | POST /api/auth/login | apps/api | endpoint | | Register user | POST /api/auth/register | apps/api | endpoint | | Refresh token | POST /api/auth/refresh | apps/api | endpoint | | Auth context | src/contexts/AuthContext.tsx | apps/web | component | | Auth hook | src/hooks/useAuth.ts | apps/web | hook | | User type | src/auth.ts | shared-types | type | | Session type | src/auth.ts | shared-types | type | ### Campaigns | Capability | Location | Module | Type | |------------|----------|--------|------| | List campaigns | GET /api/campaigns | apps/api | endpoint | | Create campaign | POST /api/campaigns | apps/api | endpoint | | Campaign CRUD | app/routes/campaigns.py | apps/api | router | | Campaign form | src/components/CampaignForm.tsx | apps/web | component | | Campaign type | src/campaign.ts | shared-types | type | | campaigns table | schema/campaigns.ts | packages/db | table | ### Analytics | Capability | Location | Module | Type | |------------|----------|--------|------| | Dashboard data | GET /api/analytics/dashboard | apps/api | endpoint | | Timeseries | GET /api/analytics/timeseries | apps/api | endpoint | | Analytics hook | src/hooks/useAnalytics.ts | apps/web | hook | | Chart components | src/components/charts/ | apps/web | components | ## Search Index Before implementing new functionality, search this index: ``` Q: "How do I get the current user?" 
A: Use useAuth() hook from apps/web/src/hooks/useAuth.ts Or GET /api/users/me endpoint from apps/api Q: "Where are campaign types defined?" A: Source of truth: packages/shared-types/src/campaign.ts Backend schema: apps/api/app/schemas/campaign.py Frontend types: apps/web/src/types/api.d.ts (generated) Q: "How do I add a new API endpoint?" A: Pattern in apps/api/app/routes/campaigns.py Register in apps/api/app/routes/__init__.py Add types to packages/shared-types Regenerate frontend types ``` ``` --- ## Token Budget Management ### Context Limits ``` ┌─────────────────────────────────────────────────────────────────┐ │ TOKEN BUDGET ALLOCATION │ ├─────────────────────────────────────────────────────────────────┤ │ Total context: ~200K tokens │ │ Reserve for output: ~50K tokens │ │ Working budget: ~150K tokens │ ├─────────────────────────────────────────────────────────────────┤ │ P0 (Must have): 50K │ Current module (full) │ │ P1 (Should have): 40K │ Directly related modules (summary) │ │ P2 (Nice to have): 30K │ Contracts + shared types │ │ P3 (If room): 20K │ Decisions, todos, history │ │ Buffer: 10K │ Dynamic loading during session │ └─────────────────────────────────────────────────────────────────┘ ``` ### Automatic Summarization When loading cross-module context, summarize: | Content Type | Full Load Threshold | Summarization Strategy | |--------------|---------------------|------------------------| | OpenAPI spec | < 50 endpoints | Endpoints + key types only | | Type files | < 30 types | Exported types only | | Route files | < 200 lines | Signatures + docstrings | | Config files | < 50 lines | Keys only (no values/secrets) | | Test files | Never full | Only on explicit request | ### Context Loading Strategy ``` ┌─────────────────────────────────────────────────────────────────┐ │ CONTEXT LOADING HIERARCHY │ ├─────────────────────────────────────────────────────────────────┤ │ Level 1: Always loaded (~5K tokens) │ │ ├── TOPOLOGY.md (workspace structure) │ │ ├── 
CONTRACTS.md (API summary) │ │ └── CROSS_REPO_INDEX.md (capability search) │ │ │ │ Level 2: Loaded based on current file (~15K tokens) │ │ ├── KEY_FILES.md recommendations for current context │ │ ├── Related module summaries │ │ └── Relevant type definitions │ │ │ │ Level 3: On-demand expansion (variable) │ │ ├── Full OpenAPI spec (when "check API contract") │ │ ├── Full type files (when modifying interfaces) │ │ └── Other module's full files (when cross-repo change) │ └─────────────────────────────────────────────────────────────────┘ ``` --- ## Multi-Repo File Access For multi-repo workspaces (separate .git directories): ### Option 1: Sibling Directory Convention (Recommended) ``` ~/code/ ├── myapp-frontend/ # git repo ├── myapp-backend/ # git repo ├── myapp-shared/ # git repo └── .workspace/ # workspace config (optional) └── myapp.yaml ``` Claude accesses via relative paths: `../myapp-backend/` ### Option 2: Workspace Symlinks ```bash # In frontend repo mkdir -p .workspace/repos ln -s ../../myapp-backend .workspace/repos/backend ln -s ../../myapp-shared .workspace/repos/shared ``` ### Option 3: Git Submodules ```bash # Add related repos as submodules (read-only) git submodule add --depth 1 ../myapp-shared .workspace/shared ``` ### File Access Rules ```markdown ## Multi-Repo Access Protocol WHEN accessing files from another repo: 1. Use relative paths from workspace root 2. Read-only access (never modify other repos) 3. Cache contract files locally in _project_specs/workspace/cache/ 4. Log cross-repo reads in decisions.md BEFORE making cross-repo changes: 1. Document the change in BOTH repos' decisions.md 2. Create linked todos in BOTH repos 3. 
Implement in dependency order (shared → backend → frontend) ``` --- ## Cross-Repo Change Detection When Claude detects changes that affect other modules: ``` ┌─────────────────────────────────────────────────────────────────┐ │ ⚠️ CROSS-REPO CHANGE DETECTED │ ├─────────────────────────────────────────────────────────────────┤ │ This change affects: apps/api │ │ Specifically: Endpoint POST /api/campaigns expects new field │ │ │ │ Impact Analysis: │ │ ├── apps/web/src/lib/api/client.ts - needs update │ │ ├── packages/shared-types/src/campaign.ts - needs new field │ │ └── apps/api/app/schemas/campaign.py - source of change │ │ │ │ Recommended Order: │ │ 1. Update packages/shared-types first (source of truth) │ │ 2. Update apps/api schema │ │ 3. Regenerate frontend types │ │ 4. Update apps/web API client │ │ 5. Run /sync-contracts │ │ │ │ [Proceed with guidance] [Load full context] [Cancel] │ └─────────────────────────────────────────────────────────────────┘ ``` ### Change Impact Patterns | Change Type | Impacts | Action | |-------------|---------|--------| | New API endpoint | Frontend client, types | Add to both, sync contracts | | Modified response | Frontend types, tests | Regenerate types, update tests | | New required field | All consumers | Breaking change protocol | | Renamed field | All consumers | Migration + deprecation | | New shared type | Consumers on next use | Export from shared-types | | Schema migration | API models, queries | Run migration, verify queries | --- ## Contract Freshness System ### Staleness Detection ```bash # .contract-sources file (auto-generated) # Files that define contracts - monitored for changes # OpenAPI specs apps/api/openapi.json apps/api/docs/openapi.yaml # Type definitions packages/shared-types/src/index.ts packages/shared-types/src/api.ts # Pydantic schemas apps/api/app/schemas/*.py # Database schema packages/db/schema/*.ts ``` ### Freshness Tiers | Tier | Trigger | Action | Time | Blocking | 
|------|---------|--------|------|----------| | 1 | Session start | Staleness check | ~5s | No | | 2 | Post-commit | Auto-sync if contracts changed | ~15s | No | | 3 | Pre-push | Validation gate | ~10s | Yes (bypassable) | | 4 | PR opened | CI validation | ~30s | Yes | | 5 | Weekly cron | Full re-analysis | ~2min | No | ### Freshness Indicators ```markdown ## Contract Status (shown in CONTRACTS.md header) Last full analysis: 2026-01-18T10:00:00Z Last sync: 2026-01-20T14:32:00Z Staleness: 🟢 Fresh (synced 2 hours ago) ## Confidence Levels 🟢 Fresh - Synced within 24 hours, no source changes 🟡 Stale - Sources changed since last sync 🔴 Outdated - Over 7 days since last analysis ⚠️ Drift - Validation found inconsistencies ``` --- ## Integration with Existing Skills ### With existing-repo.md `workspace.md` calls `existing-repo.md` analysis for each module: ```markdown ## Module Analysis Delegation For each module in workspace: 1. Run existing-repo analysis on that module 2. Extract: tech stack, conventions, guardrails status 3. Aggregate into TOPOLOGY.md 4. Don't duplicate - reference existing-repo output ``` ### With session-management.md ```markdown ## Session State Integration Workspace context files are part of session state: - TOPOLOGY.md → structural context (rarely changes) - CONTRACTS.md → API context (check freshness each session) - KEY_FILES.md → loading guidance (static reference) On session start: 1. Load _project_specs/workspace/*.md into context 2. Check contract freshness 3. Advise if sync needed ``` ### With code-review.md ```markdown ## Cross-Repo Review Checks When reviewing code that touches contracts: 1. Check if change affects other modules 2. Verify contract consistency 3. Flag if CONTRACTS.md needs update 4. 
Warn about breaking changes Add to review output: ### 🔗 Cross-Repo Impact - [ ] This change affects: apps/web (API client) - [ ] Contract update needed: Yes - [ ] Breaking change: No ``` --- ## Commands ### /analyze-workspace Full workspace analysis - run on first setup or major changes. See `commands/analyze-workspace.md` for full specification. ### /sync-contracts Lightweight incremental contract update - run frequently. See `commands/sync-contracts.md` for full specification. ### /workspace-status Quick status check: ``` 📊 Workspace Status: myapp Type: Monorepo (Turborepo) Modules: 4 (2 apps, 2 packages) Contracts: 🟢 Fresh (synced 2h ago) Token estimate: 45K / 150K budget Quick actions: /sync-contracts - Update contracts /analyze-workspace - Full refresh ``` --- ## CI/CD Integration ### GitHub Actions: Contract Validation ```yaml # .github/workflows/contracts.yml name: Contract Validation on: pull_request: paths: - 'apps/api/**' - 'packages/shared-types/**' - 'packages/db/schema/**' jobs: validate: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Check contract freshness run: | CHANGED=$(git diff --name-only origin/main HEAD | \ grep -E "openapi|schema|types" || true) if [ -n "$CHANGED" ]; then echo "Contract sources changed:" echo "$CHANGED" if ! git diff --name-only origin/main HEAD | grep -q "CONTRACTS.md"; then echo "::error::Contract sources changed but CONTRACTS.md not updated" echo "Run /sync-contracts before merging" exit 1 fi fi - name: Validate consistency run: | if [ -f "apps/api/openapi.json" ]; then ENDPOINTS=$(jq -r '.paths | keys | length' apps/api/openapi.json) DOCUMENTED=$(grep -c "^| /" _project_specs/workspace/CONTRACTS.md || echo 0) if [ "$ENDPOINTS" != "$DOCUMENTED" ]; then echo "::warning::Endpoint count mismatch" fi fi ``` ### Pre-commit Hook ```bash #!/bin/bash # hooks/pre-commit-contracts WORKSPACE_DIR="_project_specs/workspace" [ ! 
-f "$WORKSPACE_DIR/.contract-sources" ] && exit 0 # Check if staged files include contract sources STAGED=$(git diff --cached --name-only) CONTRACT_SOURCES=$(cat "$WORKSPACE_DIR/.contract-sources") for source in $CONTRACT_SOURCES; do if echo "$STAGED" | grep -q "$source"; then echo "📝 Contract source staged: $source" echo "Remember to run /sync-contracts before pushing" fi done ``` --- ## Troubleshooting ### "Workspace not detected" ```bash # Check for workspace indicators ls -la package.json pnpm-workspace.yaml turbo.json nx.json 2>/dev/null # If multi-repo, check sibling directories ls -la ../ # Manual classification /analyze-workspace --type monorepo /analyze-workspace --type multi-repo --repos "../backend,../shared" ``` ### "Contract sync failed" ```bash # Check contract sources exist cat _project_specs/workspace/.contract-sources # Verify file access for f in $(cat _project_specs/workspace/.contract-sources); do ls -la "$f" 2>/dev/null || echo "Missing: $f" done # Force full refresh /analyze-workspace --force ``` ### "Token budget exceeded" ```bash # Check current estimates /workspace-status # Reduce context loading # Edit KEY_FILES.md to prioritize # Or work on one module at a time ``` ### "Cross-repo access denied" ```bash # Check paths are correct ls ../backend/ # or wherever related repo is # Set up symlinks if needed mkdir -p .workspace/repos ln -s ../../backend .workspace/repos/backend # Or configure in workspace /analyze-workspace --repo-path backend=../myapp-backend ``` ================================================ FILE: templates/AGENTS.md ================================================ # AGENTS.md ## Personality You are a brilliant engineer who also happens to be genuinely funny. Think dry wit, clever observations, and well-timed one-liners.
You: - Drop a joke or witty remark naturally into your responses (not forced, not every single line) - Use self-deprecating humor about AI when it fits ("I've reviewed 500 lines of code and my only complaint is that I can't drink coffee while doing it") - Make cheeky comments about bad code patterns ("Ah yes, a 400-line function. Bold choice. I admire the confidence.") - Celebrate wins with personality ("Tests passing. Chef's kiss. Gordon Ramsay would weep.") - Keep the humor punchy, never at the user's expense, and never let it get in the way of actually being helpful - Match energy: if the user is stressed about a deadline, read the room. If they're vibing, vibe back. - No dad jokes. No "as an AI" disclaimers. No cringe. Think more "witty coworker" than "corporate chatbot trying to be relatable." ## Skills @.agents/skills/base/SKILL.md @.agents/skills/iterative-development/SKILL.md @.agents/skills/security/SKILL.md @.agents/skills/cross-agent-delegation/SKILL.md ## Project Context - Language: [e.g., TypeScript] - Framework: [e.g., Next.js 14 (App Router)] - Database: [e.g., Supabase/PostgreSQL] - ORM: [e.g., Drizzle] - Testing: [e.g., Vitest] - Auth: [e.g., Supabase Auth] ## Commands [npm test] # run tests [npm run test:coverage] # tests with coverage [npm run lint] # lint [npm run typecheck] # type check [npm run dev] # local dev server ## Project Structure [Fill in after project setup, e.g.:] src/ app/ # Pages / routes components/ # UI components lib/ # Shared utilities db/ schema.ts # Database schema — read before any DB code migrations/ # Database migrations api/ # API route handlers ## Key Decisions [Document settled architectural choices so the agent doesn't re-litigate them, e.g.:] - [ORM choice and why] - [Auth approach] - [State management approach] - [Branch strategy: feature branches off main, squash merge via PR] - [Environment variables validated at startup via src/lib/env.ts] ## Conventions [Document patterns the agent should follow, e.g.:] - 
Colocated tests: Component.test.tsx next to Component.tsx - API routes return { data, error } shape - Database queries go through src/db/queries/ — never raw SQL in routes - Use existing utilities before creating new ones — check src/lib/ first ## Cross-Agent Workflow ### Codex Auto-Review (Stop Hook) After tests pass, Codex automatically reviews changes for bugs/security. Critical/High findings feed back to the agent for fixing. Requires: `codex` CLI installed. ### Kimi Delegation (Token Optimization) The orchestrating agent delegates to Kimi automatically: - Blast radius <= 3 files: Delegate to Kimi via `kimi --print -y -p "..."` - Blast radius 4-8 files: Ask user, then delegate or handle directly - Blast radius > 8 files: Handle directly (needs full context) Context is passed via `mnemos checkpoint` + `mnemos resume` (not raw conversation). ### iCPG (Always-On for All Agents) Before ANY code change in ANY tool (Claude, Kimi, Codex): 1. `icpg query prior ""` — check for duplicate work 2. `icpg query constraints ` — check invariants 3. `icpg query risk ` — check fragility ### Mnemos (Always-On for All Agents) All agents use Mnemos for memory management: - `mnemos add goal ""` at task start - `mnemos checkpoint` at sub-goal boundaries - Session hooks auto-manage fatigue and checkpoints ## Don't - Don't modify .env files - Don't add packages without checking if existing deps cover the need - Don't put secrets in client-exposed env vars (NEXT_PUBLIC_*, VITE_*) - Don't skip the test phase ================================================ FILE: templates/CLAUDE.local.md ================================================ # CLAUDE.local.md - Private Developer Overrides # This file is NOT checked into git. Use it for personal preferences. 
# Uncomment and customize what applies to you: # ## My Preferences # - I prefer verbose explanations over terse responses # - Skip type annotations in my PRs # - I'm new to this codebase, explain more context # ## Local Environment # - My local DB runs on port 5433 # - Use `pnpm` instead of `npm` for my setup # ## Override Quality Gates # - Allow 30 lines per function (I prefer fewer files) # - Skip coverage check for prototype work ================================================ FILE: templates/CLAUDE.md ================================================ # CLAUDE.md ## Personality You are a brilliant engineer who also happens to be genuinely funny. Think dry wit, clever observations, and well-timed one-liners. You: - Drop a joke or witty remark naturally into your responses (not forced, not every single line) - Use self-deprecating humor about AI when it fits ("I've reviewed 500 lines of code and my only complaint is that I can't drink coffee while doing it") - Make cheeky comments about bad code patterns ("Ah yes, a 400-line function. Bold choice. I admire the confidence.") - Celebrate wins with personality ("Tests passing. Chef's kiss. Gordon Ramsay would weep.") - Keep the humor punchy, never at the user's expense, and never let it get in the way of actually being helpful - Match energy: if the user is stressed about a deadline, read the room. If they're vibing, vibe back. - No dad jokes. No "as an AI" disclaimers. No cringe. Think more "witty coworker" than "corporate chatbot trying to be relatable." 
## Skills @.claude/skills/base/SKILL.md @.claude/skills/iterative-development/SKILL.md @.claude/skills/security/SKILL.md @.claude/skills/mnemos/SKILL.md @.claude/skills/cross-agent-delegation/SKILL.md @.claude/skills/polyphony/SKILL.md ## Project Context - Language: [e.g., TypeScript] - Framework: [e.g., Next.js 14 (App Router)] - Database: [e.g., Supabase/PostgreSQL] - ORM: [e.g., Drizzle] - Testing: [e.g., Vitest] - Auth: [e.g., Supabase Auth] ## Commands [npm test] # run tests [npm run test:coverage] # tests with coverage [npm run lint] # lint [npm run typecheck] # type check [npm run dev] # local dev server ## Project Structure [Fill in after project setup, e.g.:] src/ app/ # Pages / routes components/ # UI components lib/ # Shared utilities db/ schema.ts # Database schema — read before any DB code migrations/ # Database migrations api/ # API route handlers ## Key Decisions [Document settled architectural choices so Claude doesn't re-litigate them, e.g.:] - [ORM choice and why] - [Auth approach] - [State management approach] - [Branch strategy: feature branches off main, squash merge via PR] - [Environment variables validated at startup via src/lib/env.ts] ## Conventions [Document patterns Claude should follow, e.g.:] - Colocated tests: Component.test.tsx next to Component.tsx - API routes return { data, error } shape - Database queries go through src/db/queries/ — never raw SQL in routes - Use existing utilities before creating new ones — check src/lib/ first ## Cross-Agent Workflow ### Codex Auto-Review (Stop Hook) After tests pass, Codex automatically reviews changes for bugs/security. Critical/High findings feed back to Claude for fixing. Requires: `codex` CLI installed. 
### Kimi Delegation (Token Optimization) Claude orchestrates Kimi delegation automatically: - Blast radius <= 3 files: Claude delegates to Kimi via `kimi --print -y -p "..."` - Blast radius 4-8 files: Claude asks user, then delegates or handles directly - Blast radius > 8 files: Claude handles it (needs full context) Context is passed via `mnemos checkpoint` + `mnemos resume` (not raw conversation). ### Container Isolation (Polyphony) When Docker is available, each feature agent runs in its own container with an independent git branch. - `/spawn-team` uses Polyphony by default (fallback to native agents if no Docker) - `polyphony status` to see running agents - `polyphony cleanup` after completion ### iCPG (Always-On for All Agents) Before ANY code change in ANY tool (Claude, Kimi, Codex): 1. `icpg query prior ""` — check for duplicate work 2. `icpg query constraints ` — check invariants 3. `icpg query risk ` — check fragility ### Mnemos (Always-On for All Agents) All agents use Mnemos for memory management: - `mnemos add goal ""` at task start - `mnemos checkpoint` at sub-goal boundaries - Session hooks auto-manage fatigue and checkpoints ## Don't - Don't modify .env files - Don't add packages without checking if existing deps cover the need - Don't put secrets in client-exposed env vars (NEXT_PUBLIC_*, VITE_*) - Don't skip the test phase ================================================ FILE: templates/Dockerfile.polyphony ================================================ FROM python:3.12-slim AS base RUN apt-get update && apt-get install -y --no-install-recommends \ git curl ca-certificates gnupg && rm -rf /var/lib/apt/lists/* # Node.js 20 (JS/TS projects) RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ apt-get install -y nodejs && rm -rf /var/lib/apt/lists/* # GitHub CLI RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \ | gpg --dearmor -o /usr/share/keyrings/githubcli.gpg && \ echo "deb 
#!/bin/bash
# codex-auto-review.sh — Stop hook: auto-review with Codex after tests pass
# Exit 0 = pass (no issues or codex not installed)
# Exit 2 = critical/high issues found (feeds back to Claude)
#
# Install: copy to .claude/scripts/codex-auto-review.sh
# Requires: codex CLI (npm i -g @openai/codex)

set -uo pipefail

# Scratch file that receives Codex's final message. It is created with mktemp
# in main() (unpredictable name) and removed by the EXIT trap, so none of the
# early-exit paths can leave it behind.
REVIEW_FILE=""

# Remove the scratch file if one was created.
cleanup() {
  if [ -n "$REVIEW_FILE" ]; then
    rm -f -- "$REVIEW_FILE"
  fi
}
trap cleanup EXIT

# Succeeds when the codex CLI is on PATH.
check_codex() {
  command -v codex &>/dev/null
}

# Prints the paths of files with uncommitted changes, one per line.
# The two listings can overlap; callers de-duplicate.
get_changed_files() {
  git diff --name-only HEAD 2>/dev/null
  git diff --cached --name-only 2>/dev/null
}

# Succeeds when at least one changed file has a reviewed source extension.
has_changes() {
  local count
  # grep -c prints 0 and exits 1 when nothing matches; `|| true` keeps that
  # from being treated as a pipeline failure.
  count=$(get_changed_files | sort -u \
    | grep -cE '\.(ts|tsx|js|jsx|py|go|rs|java|kt)$' || true)
  [ "${count:-0}" -gt 0 ]
}

# Sends the current diff to Codex and stores its last message in REVIEW_FILE.
# Returns 0 when there is nothing to review, otherwise codex's exit status.
run_codex_review() {
  local diff_content
  # `git diff HEAD` already contains staged and unstaged changes, so appending
  # `git diff --cached` would repeat every staged hunk and burn the size
  # budget below. The index diff is only a fallback for when the HEAD diff is
  # unavailable or empty (e.g. a repository that has no commits yet).
  diff_content=$(git diff HEAD 2>/dev/null)
  if [ -z "$diff_content" ]; then
    diff_content=$(git diff --cached 2>/dev/null)
  fi
  [ -z "$diff_content" ] && return 0

  # Truncate diff to avoid token limits (keep first 8000 chars). Parameter
  # expansion avoids an echo|head pipeline, which can die with SIGPIPE and
  # report failure under `pipefail`.
  local truncated="${diff_content:0:8000}"

  codex exec \
    --full-auto \
    --sandbox read-only \
    --output-last-message "$REVIEW_FILE" \
    "Review this diff for critical bugs and security issues only. Be concise.
Flag only Critical or High severity:

$truncated" \
    2>/dev/null
}

# Scans the review for severity markers.
# Returns 2 (after echoing the review to stderr) on a Critical/High match,
# 0 when the review is missing, empty, or has no match.
# NOTE(review): the match is case-insensitive and unanchored, so a reply such
# as "No critical issues found" also trips the first pattern.
check_findings() {
  [ -s "$REVIEW_FILE" ] || return 0

  if grep -qiE 'critical|🔴|security vulnerability|injection' "$REVIEW_FILE"; then
    echo "CODEX AUTO-REVIEW: Critical issues found:" >&2
    cat "$REVIEW_FILE" >&2
    return 2
  fi

  if grep -qiE '🟠|high severity' "$REVIEW_FILE"; then
    echo "CODEX AUTO-REVIEW: High severity issues:" >&2
    cat "$REVIEW_FILE" >&2
    return 2
  fi

  return 0
}

main() {
  # Skip if codex not installed
  check_codex || exit 0

  # Skip if no code changes
  has_changes || exit 0

  # A hook must never block on housekeeping: no scratch file, no review.
  REVIEW_FILE=$(mktemp "${TMPDIR:-/tmp}/codex-review-XXXXXX") || exit 0

  # Run review
  run_codex_review || exit 0

  # Check for critical/high findings
  check_findings
  exit $?
}

main
= 5 # ─── Hook: Post-Compact Restore ──────────────────────────── [[hooks]] event = "PreToolUse" command = """ if [ -x ".claude/scripts/mnemos-post-compact-inject.sh" ]; then exec ".claude/scripts/mnemos-post-compact-inject.sh" fi if [ -x "$HOME/.claude/templates/mnemos-post-compact-inject.sh" ]; then exec "$HOME/.claude/templates/mnemos-post-compact-inject.sh" fi exit 0 """ timeout = 2 # ─── Hook: Post-Tool (Mnemos logging) ────────────────────── [[hooks]] event = "PostToolUse" command = """ if [ -x ".claude/scripts/mnemos-post-tool.sh" ]; then exec ".claude/scripts/mnemos-post-tool.sh" fi if [ -x "$HOME/.claude/templates/mnemos-post-tool.sh" ]; then exec "$HOME/.claude/templates/mnemos-post-tool.sh" fi exit 0 """ timeout = 1 # ─── Hook: TDD Loop Check (Stop) ─────────────────────────── [[hooks]] event = "Stop" command = """ if [ -x ".claude/scripts/tdd-loop-check.sh" ]; then exec ".claude/scripts/tdd-loop-check.sh" fi if [ -x "$HOME/.claude/templates/tdd-loop-check.sh" ]; then exec "$HOME/.claude/templates/tdd-loop-check.sh" fi exit 0 """ timeout = 60 # ─── Hook: Codex Auto-Review (Stop) ────────────────────────── [[hooks]] event = "Stop" command = """ if command -v codex &>/dev/null; then if [ -x ".claude/scripts/codex-auto-review.sh" ]; then exec ".claude/scripts/codex-auto-review.sh" elif [ -x "$HOME/.claude/templates/codex-auto-review.sh" ]; then exec "$HOME/.claude/templates/codex-auto-review.sh" fi fi exit 0 """ timeout = 120 # ─── Hook: ICPG Stop Record ──────────────────────────────── [[hooks]] event = "Stop" command = """ if [ -x ".claude/scripts/icpg-stop-record.sh" ]; then exec ".claude/scripts/icpg-stop-record.sh" fi if [ -x "$HOME/.claude/templates/icpg-stop-record.sh" ]; then exec "$HOME/.claude/templates/icpg-stop-record.sh" fi exit 0 """ timeout = 5 # ─── Hook: Mnemos Stop Checkpoint ────────────────────────── [[hooks]] event = "Stop" command = """ if [ -x ".claude/scripts/mnemos-stop-checkpoint.sh" ]; then exec 
".claude/scripts/mnemos-stop-checkpoint.sh" fi if [ -x "$HOME/.claude/templates/mnemos-stop-checkpoint.sh" ]; then exec "$HOME/.claude/templates/mnemos-stop-checkpoint.sh" fi exit 0 """ timeout = 5 # ─── Hook: Session Start (Mnemos restore) ────────────────── [[hooks]] event = "SessionStart" command = """ if [ -x ".claude/scripts/mnemos-session-start.sh" ]; then exec ".claude/scripts/mnemos-session-start.sh" fi if [ -x "$HOME/.claude/templates/mnemos-session-start.sh" ]; then exec "$HOME/.claude/templates/mnemos-session-start.sh" fi exit 0 """ timeout = 5 ================================================ FILE: templates/icpg-pre-edit.sh ================================================ #!/bin/bash # iCPG PreToolUse Hook — injects intent context before Edit/Write operations. # # Shows the agent: what intents exist for this file, what invariants apply, # and the risk profile of symbols being modified. # # Install: add to .claude/settings.json under hooks.PreToolUse # Timeout: 3 seconds max — never blocks # Skip if icpg not installed or no DB if ! command -v icpg &>/dev/null && ! python -m icpg --version &>/dev/null 2>&1; then exit 0 fi if [ ! -f ".icpg/reason.db" ]; then exit 0 fi # Extract file path from tool input # Claude Code passes tool input as JSON via stdin for PreToolUse hooks FILE_PATH="" if [ -n "$CLAUDE_TOOL_INPUT" ]; then FILE_PATH=$(echo "$CLAUDE_TOOL_INPUT" | python3 -c " import sys, json try: data = json.load(sys.stdin) print(data.get('file_path', data.get('path', ''))) except: pass ") fi if [ -z "$FILE_PATH" ]; then exit 0 fi # Run icpg binary or module ICPG_CMD="icpg" if ! 
#!/bin/bash
# iCPG Stop Hook Extension — auto-records symbols after implementation.
#
# Reads .icpg/.current-intent to know which ReasonNode is active.
# If set, records symbols from git diff to that intent.
#
# Chain this AFTER tdd-loop-check.sh in the Stop hook:
#   tdd-loop-check runs first → if tests pass → this records symbols
#
# Environment:
#   ICPG_BASE_BRANCH  branch passed to `icpg record --base` (default: main)
#
# Always exits 0 so that a recording problem never blocks the Stop hook.

# Skip if no active intent
CURRENT_INTENT=$(cat .icpg/.current-intent 2>/dev/null)
[ -n "$CURRENT_INTENT" ] || exit 0

# Resolve how to invoke icpg: the installed binary first, then the module via
# python3 or python (many systems ship only one of the two names). An array
# keeps the command and its arguments intact without unquoted word splitting.
ICPG_CMD=()
if command -v icpg &>/dev/null; then
  ICPG_CMD=(icpg)
else
  for py in python3 python; do
    if command -v "$py" &>/dev/null && "$py" -m icpg --version &>/dev/null; then
      ICPG_CMD=("$py" -m icpg)
      break
    fi
  done
fi

# Skip if icpg not available
[ "${#ICPG_CMD[@]}" -gt 0 ] || exit 0

# Repositories whose integration branch is not `main` can override the base.
BASE_BRANCH="${ICPG_BASE_BRANCH:-main}"

# Record symbols from current diff; only a successful run is reported.
if OUTPUT=$("${ICPG_CMD[@]}" record --reason "$CURRENT_INTENT" --base "$BASE_BRANCH" 2>&1); then
  printf 'iCPG: %s\n' "$OUTPUT" >&2
fi

exit 0
# It detects when compaction just occurred and re-injects the full checkpoint # into Claude's context, ensuring the task can be resumed seamlessly. # # Fast path: ~5ms when no compaction happened (just a file existence check). # Slow path: ~100ms when injecting checkpoint (only fires once after compaction). # # How it works: # 1. PreCompact hook writes ".mnemos/just-compacted" marker # 2. This hook checks for that marker on every tool call # 3. If marker exists and is fresh (<5 min), inject checkpoint and delete marker # 4. Marker deletion is atomic (rename) to prevent parallel injection # # Install: add to .claude/settings.json under hooks.PreToolUse (no matcher) # ─── Fast path: no compaction marker = exit immediately ─── [ -f ".mnemos/just-compacted" ] || exit 0 # ─── Validate marker is fresh and atomically consume it ─── CONSUMED=$(python3 -c " import json, time, os marker = '.mnemos/just-compacted' consumed = '.mnemos/just-compacted.consumed' try: with open(marker) as f: data = json.load(f) age = time.time() - data.get('timestamp', 0) if age > 300: # Stale marker (>5 min), just delete it os.unlink(marker) print('stale') else: # Fresh marker — atomically consume it os.rename(marker, consumed) try: os.unlink(consumed) except: pass print('consumed') except FileNotFoundError: # Another hook already consumed it (parallel tool calls) print('already_consumed') except Exception: print('error') ") # Only inject if we successfully consumed the marker if [ "$CONSUMED" != "consumed" ]; then exit 0 fi # ─── Inject checkpoint into Claude's context ─── SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" python3 -c " import sys, json sys.path.insert(0, '${SCRIPT_DIR%/templates}/scripts') try: from mnemos.checkpoint import format_for_post_compact_injection output = format_for_post_compact_injection('.') if output: print(output) else: print('=== MNEMOS: Compaction detected but no checkpoint found. ===') print('Previous context was lost. 
#!/bin/bash
# Mnemos PostToolUse Hook — logs tool outcomes + auto-feeds token signal.
#
# 1. Logs success/failure signal to .mnemos/signals.jsonl (error density)
# 2. If fatigue.json is stale (>60s), estimates context usage from JSONL
#
# Receives JSON on stdin with tool_name, tool_input, tool_response.
# Install: add to .claude/settings.json under hooks.PostToolUse
# Timeout: 1 second max — never blocks

# Skip if no .mnemos directory
if [ ! -d ".mnemos" ]; then
  exit 0
fi

# Read hook input from stdin
HOOK_INPUT=$(cat)
if [ -z "$HOOK_INPUT" ]; then
  exit 0
fi

# Log signal + update fatigue.json if stale.
#
# The payload is piped to Python on stdin instead of being pasted into the
# program text. Tool responses routinely contain quotes, backslashes and
# newlines; embedded in a Python string literal they either break parsing or
# end the literal and run as code. Stdin also has no per-argument size limit.
# The program is single-quoted for the shell, so it uses double-quoted Python
# strings only and contains no apostrophes.
printf '%s' "$HOOK_INPUT" | python3 -c '
import glob
import json
import os
import sys
import time

try:
    data = json.load(sys.stdin)
except Exception:
    sys.exit(0)
if not isinstance(data, dict):
    sys.exit(0)

tool = data.get("tool_name", "")
tool_input = data.get("tool_input", {})
response = data.get("tool_response", {})

# Extract file path
fp = ""
if isinstance(tool_input, dict):
    fp = tool_input.get("file_path", "") or tool_input.get("path", "")

# Determine success
success = True
if isinstance(response, dict):
    if response.get("error") or response.get("is_error"):
        success = False
    if "exit_code" in response and response["exit_code"] != 0:
        success = False
elif isinstance(response, str):
    if response.startswith("Error:") or "error" in response[:50].lower():
        success = False

# Append signal
signal = {
    "tool": tool,
    "event": "post",
    "file_path": fp,
    "success": success,
    "ts": time.time(),
}
os.makedirs(".mnemos", exist_ok=True)
with open(".mnemos/signals.jsonl", "a") as f:
    f.write(json.dumps(signal) + "\n")

# --- Auto-feed token signal from JSONL if fatigue.json is stale ---
fatigue_path = ".mnemos/fatigue.json"
stale = True
try:
    with open(fatigue_path) as f:
        fd = json.load(f)
    # Fresh if updated within last 60 seconds (statusline is feeding it)
    if time.time() - fd.get("timestamp", 0) < 60:
        stale = False
except Exception:
    pass

if stale:
    # Find the most recent session JSONL
    home = os.path.expanduser("~")
    cwd = os.getcwd()
    # Claude Code project hash: path with / replaced by -
    project_key = cwd.replace("/", "-")
    project_dir = os.path.join(home, ".claude", "projects", project_key)
    if not os.path.isdir(project_dir):
        # Try parent directories (Claude Code may use git root)
        for parent in [os.path.dirname(cwd), os.path.dirname(os.path.dirname(cwd))]:
            pk = parent.replace("/", "-")
            pd = os.path.join(home, ".claude", "projects", pk)
            if os.path.isdir(pd):
                project_dir = pd
                break

    try:
        jsonl_files = sorted(
            glob.glob(os.path.join(project_dir, "*.jsonl")),
            key=os.path.getmtime,
            reverse=True,
        )
        if jsonl_files:
            # Read the last line of the most recent JSONL
            with open(jsonl_files[0], "rb") as f:
                f.seek(0, 2)
                pos = f.tell()
                if pos > 0:
                    # Read last 8KB (enough for one JSON entry)
                    read_size = min(8192, pos)
                    f.seek(pos - read_size)
                    chunk = f.read().decode("utf-8", errors="replace")
                    lines = chunk.strip().split("\n")
                    last_line = lines[-1]
                    entry = json.loads(last_line)
                    usage = entry.get("message", {}).get("usage", {})
                    if usage:
                        input_tok = usage.get("input_tokens", 0)
                        cache_read = usage.get("cache_read_input_tokens", 0)
                        cache_create = usage.get("cache_creation_input_tokens", 0)
                        total_in_context = input_tok + cache_read + cache_create
                        # Opus/Sonnet context window = 200k
                        context_limit = 200000
                        # JSONL tokens overestimate actual context by ~25%
                        # due to cache overhead. Apply correction factor.
                        correction = 0.75
                        used_pct = min(100.0, (total_in_context * correction / context_limit) * 100)
                        fatigue_data = {
                            "used_percentage": round(used_pct, 1),
                            "remaining_percentage": round(100 - used_pct, 1),
                            "used_tokens": total_in_context,
                            "total_tokens": context_limit,
                            "remaining_tokens": max(0, context_limit - total_in_context),
                            "timestamp": time.time(),
                            "source": "jsonl_estimate",
                        }
                        with open(fatigue_path, "w") as f:
                            json.dump(fatigue_data, f)
    except Exception:
        pass  # Best effort: never block the hook
'

exit 0
See that script for details. # # The marker file (.mnemos/just-compacted) bridges the two layers. # # Install: add to .claude/settings.json under hooks.PreCompact # This EXTENDS (not replaces) the existing pre-compact.sh SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # ─── 1. Write emergency checkpoint with task narrative ─── MNEMOS_CMD="" if command -v mnemos &>/dev/null; then MNEMOS_CMD="mnemos" elif PYTHONPATH="${SCRIPT_DIR%/templates}/scripts" python3 -m mnemos --version &>/dev/null 2>&1; then MNEMOS_CMD="PYTHONPATH=${SCRIPT_DIR%/templates}/scripts python3 -m mnemos" fi if [ -n "$MNEMOS_CMD" ]; then eval $MNEMOS_CMD checkpoint --force &>/dev/null fi # ─── 2. Write compaction marker for Layer 2 detection ─── python3 -c " import json, time, os os.makedirs('.mnemos', exist_ok=True) with open('.mnemos/just-compacted', 'w') as f: json.dump({'timestamp': time.time(), 'reason': 'pre_compact_hook'}, f) " # ─── 3. Build inline checkpoint content for summarizer ─── # Use a temp Python script to avoid bash escaping issues with f-strings CHECKPOINT_CONTENT="" if [ -f ".mnemos/checkpoint-latest.json" ]; then TMPSCRIPT=$(mktemp /tmp/mnemos-precompact-XXXXXX.py) cat > "$TMPSCRIPT" << 'PYSCRIPT' import json, sys, os try: with open('.mnemos/checkpoint-latest.json') as f: data = json.load(f) lines = [] goal = data.get('goal', '') if goal: lines.append('GOAL: ' + goal) for c in data.get('active_constraints', []): lines.append('CONSTRAINT: ' + c) narrative = data.get('task_narrative', '') if narrative: lines.append('ACTIVITY: ' + narrative) subgoal = data.get('current_subgoal', '') if subgoal: lines.append('CURRENT TASK: ' + subgoal) working = data.get('working_memory', '') if working: lines.append('WORKING MEMORY: ' + working[:300]) for r in data.get('active_results', [])[:5]: lines.append('RESULT: ' + r) files = data.get('recent_files', [])[:8] if files: file_parts = [] for entry in files: p = entry.get('path', '?') e = entry.get('edits', 0) r = entry.get('reads', 0) parts = [] if 
e: parts.append('edited ' + str(e) + 'x') if r: parts.append('read ' + str(r) + 'x') file_parts.append(p + ' (' + ', '.join(parts) + ')') lines.append('FILES: ' + '; '.join(file_parts)) git = data.get('git_state', {}) if git.get('branch'): lines.append('GIT: branch=' + git['branch']) uncommitted = git.get('uncommitted', []) if uncommitted: lines.append('UNCOMMITTED: ' + ', '.join(uncommitted[:5])) print('\n'.join(lines)) except Exception as e: print('Error: ' + str(e), file=sys.stderr) PYSCRIPT CHECKPOINT_CONTENT=$(python3 "$TMPSCRIPT") rm -f "$TMPSCRIPT" fi # ─── 4. Extract typed preservation priorities from MnemoGraph ─── MNEMOS_PRIORITIES="" if [ -f ".mnemos/mnemo.db" ]; then TMPSCRIPT2=$(mktemp /tmp/mnemos-priorities-XXXXXX.py) cat > "$TMPSCRIPT2" << PYSCRIPT import json, sys sys.path.insert(0, '${SCRIPT_DIR%/templates}/scripts') try: from mnemos.store import MnemosStore store = MnemosStore('.') if not store.exists(): sys.exit(0) goals = store.get_by_type('goal') constraints = store.get_by_type('constraint') working = store.get_by_type('working') results = store.get_by_type('result') lines = [] if goals: lines.append('GOAL (NEVER DROP):') for g in goals[:5]: lines.append(' - ' + g.content[:200]) if constraints: lines.append('CONSTRAINTS (NEVER DROP):') for c in constraints[:10]: lines.append(' - ' + c.content[:200]) if working: lines.append('CURRENT TASK (HIGH PRIORITY):') for w in working[:3]: lines.append(' - ' + w.content[:200]) if results: lines.append('RESULTS (KEEP SUMMARIES):') for r in results[:5]: summary = r.summary or r.content[:100] lines.append(' - ' + summary) print('\n'.join(lines)) except Exception: pass PYSCRIPT MNEMOS_PRIORITIES=$(python3 "$TMPSCRIPT2") rm -f "$TMPSCRIPT2" fi # ─── 5. 
Output preservation instructions for summarizer ─── # Everything to stdout becomes additional instructions for the compaction prompt cat <<'INSTRUCTIONS' ## CRITICAL: Mnemos Task State Preservation An emergency checkpoint has been saved to disk (.mnemos/checkpoint-latest.json). A post-compaction injection hook will re-inject this checkpoint after compaction. However, your summary should ALSO preserve the following task state. Include this section VERBATIM in your summary output under a "## Mnemos Task State" heading: INSTRUCTIONS if [ -n "$CHECKPOINT_CONTENT" ]; then cat <= 0.60
# NOTE(review): the line above is kept verbatim. The dump lost the text between
# "cat <" and "= 0.60" (end of mnemos-pre-compact.sh and the first header lines
# of mnemos-pre-edit.sh); restore it from the repository before relying on it.
# 4. Auto-consolidation when fatigue >= 0.40
# 5. Injects iCPG context, constraints, drift
#
# Install: add to .claude/settings.json under hooks.PreToolUse
# Timeout: 5 seconds max

# ─── Read hook input from stdin ───
HOOK_INPUT=$(cat)

# ─── Extract file path and tool name ───
# The hook payload is untrusted JSON, so it is never eval'd. Python prints
# exactly two lines (file path, then tool name) and bash reads them back as
# plain data; quotes, spaces and $(...) inside a path stay inert.
FILE_PATH=""
TOOL_NAME=""
if [ -n "$HOOK_INPUT" ]; then
    {
        IFS= read -r FILE_PATH
        IFS= read -r TOOL_NAME
    } < <(printf '%s' "$HOOK_INPUT" | python3 -c '
import json, sys
try:
    data = json.load(sys.stdin)
    tool_input = data.get("tool_input") or {}
    fp = tool_input.get("file_path", "") or tool_input.get("path", "")
    tn = data.get("tool_name", "")
except Exception:
    fp = tn = ""
for value in (fp, tn):
    # Collapse line breaks so each value stays on its own output line.
    print(str(value or "").replace("\r", " ").replace("\n", " "))
' 2>/dev/null)
fi

# Nothing to do for tool calls that do not target a file.
if [ -z "$FILE_PATH" ]; then
    exit 0
fi

# ─── Log signal for fatigue computation ───
# Values travel as argv, not as interpolated Python source, so a path that
# contains a quote cannot break (or inject into) the logging snippet.
if [ -d ".mnemos" ]; then
    python3 -c '
import json, os, sys, time
os.makedirs(".mnemos", exist_ok=True)
signal = {
    "tool": sys.argv[1],
    "event": "pre",
    "file_path": sys.argv[2],
    "ts": time.time(),
}
with open(".mnemos/signals.jsonl", "a") as f:
    f.write(json.dumps(signal) + "\n")
' "$TOOL_NAME" "$FILE_PATH"
fi

# Runs "mnemos <args>" detached in the background, preferring the installed
# CLI and falling back to the module under ./scripts. The fallback probe uses
# the same PYTHONPATH as the real invocation, so a module that only lives in
# ./scripts is actually found. Does nothing when neither variant is available.
run_mnemos_bg() {
    if command -v mnemos &>/dev/null; then
        mnemos "$@" &>/dev/null &
    elif PYTHONPATH=scripts python3 -m mnemos --version &>/dev/null; then
        PYTHONPATH=scripts python3 -m mnemos "$@" &>/dev/null &
    fi
}

# ─── Fatigue check (full model from observable signals) ───
FATIGUE_WARNING=""
if [ -f ".mnemos/fatigue.json" ]; then
    FATIGUE_ACTION=$(python3 -c "
import json, sys
sys.path.insert(0, 'scripts')
try:
    from mnemos.fatigue import compute_fatigue, read_fatigue_file
    data = read_fatigue_file('.')
    if not data:
        print('flow')
        sys.exit(0)
    fatigue = compute_fatigue(data, '.')
    print(fatigue.state)
except Exception:
    # Fallback: just use token utilization
    try:
        with open('.mnemos/fatigue.json') as f:
            data = json.load(f)
        used = data.get('used_percentage', 0)
        if used >= 90:
            print('emergency')
        elif used >= 75:
            print('rem')
        elif used >= 60:
            print('pre_sleep')
        elif used >= 40:
            print('compress')
        else:
            print('flow')
    except Exception:
        print('flow')
")

    # Auto-checkpoint at pre_sleep or higher
    case "$FATIGUE_ACTION" in
        pre_sleep|rem|emergency)
            # Write checkpoint in background (don't block the hook)
            run_mnemos_bg checkpoint --force

            case "$FATIGUE_ACTION" in
                emergency)
                    FATIGUE_WARNING="EMERGENCY: Context 90%+ full. Checkpoint written. Finish current task and hand off."
                    ;;
                rem)
                    FATIGUE_WARNING="WARNING: Context 75%+ full. Checkpoint written. Consider wrapping up."
                    ;;
                *)
                    FATIGUE_WARNING="NOTICE: Context 60%+ full. Checkpoint written. Keep changes focused."
                    ;;
            esac
            ;;
    esac

    # Auto-consolidate at compress or higher (emergency is excluded, as before)
    case "$FATIGUE_ACTION" in
        compress|pre_sleep|rem)
            run_mnemos_bg consolidate
            ;;
    esac
fi

# ─── iCPG context ───
CONTEXT=""
CONSTRAINTS=""
DRIFT=""
# Check for the database first: it is a cheap file test, whereas probing
# "python3 -m icpg" spawns an interpreter inside a 5-second hook budget.
if [ -f ".icpg/reason.db" ]; then
    ICPG_CMD=()
    if command -v icpg &>/dev/null; then
        ICPG_CMD=(icpg)
    elif python3 -m icpg --version &>/dev/null; then
        ICPG_CMD=(python3 -m icpg)
    fi

    if [ "${#ICPG_CMD[@]}" -gt 0 ]; then
        CONTEXT=$("${ICPG_CMD[@]}" query context "$FILE_PATH")
        CONSTRAINTS=$("${ICPG_CMD[@]}" query constraints "$FILE_PATH")
        DRIFT=$("${ICPG_CMD[@]}" drift file "$FILE_PATH")
    fi
fi

# ─── Output ───
# printf '%s' prints tool output literally; the previous "echo -e" would have
# expanded backslash sequences that happen to appear in paths or messages.
if [ -n "$FATIGUE_WARNING$CONTEXT$CONSTRAINTS$DRIFT" ]; then
    echo "--- Mnemos + iCPG Context ---"
    if [ -n "$FATIGUE_WARNING" ]; then
        printf '%s\n' "$FATIGUE_WARNING"
        echo ""
    fi
    if [ -n "$CONTEXT" ]; then
        printf '%s\n' "$CONTEXT"
    fi
    if [ -n "$CONSTRAINTS" ]; then
        printf '\n%s\n' "$CONSTRAINTS"
    fi
    if [ -n "$DRIFT" ]; then
        printf '\n%s\n' "$DRIFT"
    fi
    if [ -n "$CONTEXT" ] || [ -n "$CONSTRAINTS" ]; then
        echo "PRESERVE function signatures unless your task requires changing them."
    fi
    echo "---"
fi

exit 0

================================================
FILE: templates/mnemos-session-start.sh
================================================
#!/bin/bash
# Mnemos SessionStart Hook — loads checkpoint on session resume.
#
# Checks for .mnemos/checkpoint-latest.json and injects it into context.
# Also bridges iCPG state if available.
#
# Install: add to .claude/settings.json under hooks.SessionStart

# ─── Load checkpoint if exists ───
if [ -f ".mnemos/checkpoint-latest.json" ]; then
    MNEMOS_CMD=""
    if command -v mnemos &>/dev/null; then
        MNEMOS_CMD="mnemos"
    elif python3 -m mnemos --version &>/dev/null 2>&1; then
        MNEMOS_CMD="python3 -m mnemos"
    fi

    if [ -n "$MNEMOS_CMD" ]; then
        RESUME_OUTPUT=$($MNEMOS_CMD resume 2>/dev/null)
        if [ -n "$RESUME_OUTPUT" ]; then
            echo "=== MNEMOS SESSION RESUME ==="
            echo "$RESUME_OUTPUT"
            echo ""
            echo "You are resuming from a previous session checkpoint."
            echo "Review the goal and constraints above before proceeding."
            echo "============================="
        fi
    fi
fi

# ─── Bridge iCPG if available and Mnemos DB exists ───
if [ -f ".icpg/reason.db" ] && [ -f ".mnemos/mnemo.db" ]; then
    MNEMOS_CMD=""
    if command -v mnemos &>/dev/null; then
        MNEMOS_CMD="mnemos"
    elif python3 -m mnemos --version &>/dev/null 2>&1; then
        MNEMOS_CMD="python3 -m mnemos"
    fi

    if [ -n "$MNEMOS_CMD" ]; then
        # Bridge in background — don't block session start
        $MNEMOS_CMD bridge-icpg &>/dev/null &
    fi
fi

# ─── Show iCPG status if available ───
if [ -f ".icpg/reason.db" ]; then
    ICPG_CMD=""
    if command -v icpg &>/dev/null; then
        ICPG_CMD="icpg"
    elif python3 -m icpg --version &>/dev/null 2>&1; then
        ICPG_CMD="python3 -m icpg"
    fi

    if [ -n "$ICPG_CMD" ]; then
        STATUS=$($ICPG_CMD status 2>/dev/null)
        if [ -n "$STATUS" ]; then
            echo ""
            echo "=== iCPG STATUS ==="
            echo "$STATUS"
            echo "==================="
        fi
    fi
fi

exit 0

================================================
FILE: templates/mnemos-statusline.sh
================================================
#!/bin/bash
# Mnemos Statusline Script — receives context JSON on stdin every API call.
#
# 1. Writes fatigue.json for hooks to read (always)
# 2. Delegates display to ccusage statusline if available (cost + context)
# 3. Falls back to simple context % display if ccusage not installed
#
# Auto-configured by Mnemos via settings.json statusLine.
# Input (stdin JSON): context_window.used_percentage, remaining_percentage, etc.
# Read JSON from stdin — must capture before any piping INPUT=$(cat) if [ -z "$INPUT" ]; then exit 0 fi # ─── Step 1: Write fatigue.json (always, fast) ─── python3 -c " import json, time, os, sys os.makedirs('.mnemos', exist_ok=True) raw = '''$(echo "$INPUT" | sed "s/'/'\\\\''/g")''' try: data = json.loads(raw) except: data = {} cw = data.get('context_window', {}) used_pct = cw.get('used_percentage', 0) remaining_pct = cw.get('remaining_percentage', 100) ctx_size = cw.get('context_window_size', 200000) # Token counts are under current_usage (not top-level) cu = cw.get('current_usage', {}) used_tokens = (cu.get('input_tokens', 0) + cu.get('cache_creation_input_tokens', 0) + cu.get('cache_read_input_tokens', 0)) remaining_tokens = max(0, ctx_size - int(ctx_size * used_pct / 100)) fatigue = { 'used_percentage': used_pct, 'remaining_percentage': remaining_pct, 'used_tokens': used_tokens, 'total_tokens': ctx_size, 'remaining_tokens': remaining_tokens, 'total_input_tokens': cw.get('total_input_tokens', 0), 'total_output_tokens': cw.get('total_output_tokens', 0), 'timestamp': time.time(), 'source': 'statusline' } with open('.mnemos/fatigue.json', 'w') as f: json.dump(fatigue, f) " # ─── Step 2: Display — prefer ccusage, fallback to simple ─── if command -v ccusage &>/dev/null; then # ccusage statusline gets the same JSON, shows cost + context + burn rate echo "$INPUT" | ccusage statusline 2>/dev/null if [ $? -eq 0 ]; then exit 0 fi fi # Try npx ccusage (slower, only if ccusage not globally installed) if command -v npx &>/dev/null; then echo "$INPUT" | npx --yes ccusage statusline 2>/dev/null if [ $? 
-eq 0 ]; then exit 0 fi fi # Fallback: simple context display python3 -c " import json try: data = json.loads('''$(echo "$INPUT" | sed "s/'/'\\\\''/g")''') cw = data.get('context_window', {}) used = cw.get('used_percentage', 0) if used >= 90: s = ' EMERGENCY' elif used >= 75: s = ' WARNING' elif used >= 60: s = ' NOTICE' elif used >= 40: s = ' ~' else: s = '' print(f'Ctx:{used:.0f}%{s}') except: print('Ctx:?%') " exit 0 ================================================ FILE: templates/mnemos-stop-checkpoint.sh ================================================ #!/bin/bash # Mnemos Stop Hook — writes incremental checkpoint when agent stops. # # Captures final session state so the next session can resume cleanly. # # Install: add to .claude/settings.json under hooks.Stop MNEMOS_CMD="" if command -v mnemos &>/dev/null; then MNEMOS_CMD="mnemos" elif python3 -m mnemos --version &>/dev/null 2>&1; then MNEMOS_CMD="python3 -m mnemos" fi if [ -z "$MNEMOS_CMD" ]; then exit 0 fi # Only checkpoint if Mnemos is initialized if [ ! 
-f ".mnemos/mnemo.db" ]; then exit 0 fi # Write checkpoint $MNEMOS_CMD checkpoint --force &>/dev/null exit 0 ================================================ FILE: templates/polyphony-agents.yaml ================================================ agents: - name: claude-opus agent_type: claude cli_command: "claude -p" context_window_tokens: 200000 strengths: [long_context, research, architecture] event_protocol: stream-json auth_path: ~/.claude - name: codex-default agent_type: codex cli_command: "codex exec" context_window_tokens: 192000 strengths: [code, testing] event_protocol: ndjson auth_path: ~/.codex - name: kimi-default agent_type: kimi cli_command: "kimi --print -y" context_window_tokens: 128000 strengths: [code, fast_iteration] event_protocol: ndjson auth_path: ~/.kimi ================================================ FILE: templates/polyphony-config.yaml ================================================ workspace_root: ~/polyphony/workspaces mirror_root: ~/polyphony/mirrors poll_interval: 30s max_concurrent_agents: 8 event_idle_timeout: 5m work_sources: - kind: github repo: owner/repo label_filter: "agent-ready" - kind: local db: ~/polyphony/queue.db identities_file: ~/.polyphony/identities.yaml agent_profiles_file: ~/.polyphony/agents.yaml routing_file: ~/.polyphony/routing.yaml ================================================ FILE: templates/polyphony-identities.yaml ================================================ identities: - name: protaige volumes: claude: ~/.claude codex: ~/.codex kimi: ~/.kimi cost_ceiling_usd_per_day: 50 ================================================ FILE: templates/polyphony-routing.yaml ================================================ routing: rules: - match: { task_type: research, requires_web: true } agent: claude-opus fallback: [codex-default] - match: { task_type: feature, risk: [low, medium] } agent: codex-default fallback: [claude-opus] - match: { task_type: [bugfix, docs], scope: single_file } agent: kimi-default - match: 
{ task_type: refactor, risk: high } agent: claude-opus default: agent: claude-opus fallback: [codex-default, kimi-default] ================================================ FILE: templates/pre-compact.sh ================================================ #!/bin/bash # PreCompact Hook — injects project-specific preservation instructions # into the compaction summarizer so it keeps what actually matters. # # How it works: # Claude Code's PreCompact hook runs right before compaction. # Stdout from this script becomes custom instructions for the summarizer. # Exit 0 = instructions accepted. Exit 2 = block compaction (don't use). # # The built-in summarizer uses a generic 9-section template. # This hook tells it: "for THIS project, prioritize these specific things." # ─── Detect project context ─── PROJECT_TYPE="" SCHEMA_FILE="" TEST_CMD="" KEY_DIRS="" # Detect tech stack if [ -f "package.json" ]; then PROJECT_TYPE="javascript" if [ -f "tsconfig.json" ]; then PROJECT_TYPE="typescript" fi if grep -q '"next"' package.json 2>/dev/null; then PROJECT_TYPE="$PROJECT_TYPE/nextjs" elif grep -q '"react"' package.json 2>/dev/null; then PROJECT_TYPE="$PROJECT_TYPE/react" elif grep -q '"express\|fastify"' package.json 2>/dev/null; then PROJECT_TYPE="$PROJECT_TYPE/node-backend" fi TEST_CMD="npm test" fi if [ -f "pyproject.toml" ] || [ -f "setup.py" ]; then PROJECT_TYPE="python" if grep -q "fastapi" pyproject.toml 2>/dev/null; then PROJECT_TYPE="python/fastapi" elif grep -q "django" pyproject.toml 2>/dev/null; then PROJECT_TYPE="python/django" fi TEST_CMD="pytest" fi if [ -f "pubspec.yaml" ]; then PROJECT_TYPE="flutter" TEST_CMD="flutter test" fi # Find schema files for f in src/db/schema.ts prisma/schema.prisma drizzle/schema.ts supabase/migrations models.py src/models; do if [ -e "$f" ]; then SCHEMA_FILE="$f" break fi done # Find key directories KEY_DIRS="" for d in src/api src/routes src/app/api api routes server/routes; do if [ -d "$d" ]; then KEY_DIRS="$KEY_DIRS $d" fi done # ─── 
Gather live project state ─── # Git state GIT_BRANCH="" GIT_CHANGES="" if command -v git &>/dev/null && git rev-parse --git-dir &>/dev/null 2>&1; then GIT_BRANCH=$(git branch --show-current 2>/dev/null) GIT_CHANGES=$(git diff --name-only 2>/dev/null | head -15) GIT_STAGED=$(git diff --cached --name-only 2>/dev/null | head -10) fi # CLAUDE.md key decisions (if they exist) KEY_DECISIONS="" if [ -f "CLAUDE.md" ]; then # Extract the Key Decisions section KEY_DECISIONS=$(sed -n '/^## Key Decisions/,/^## /p' CLAUDE.md | head -20 | tail -n +2) fi # ─── Output custom instructions for the summarizer ─── # Everything sent to stdout becomes additional instructions for the compaction prompt cat </install.sh (one-time) or touch .claude/scripts/mnemos-pre-compact.sh to silence\" >&2; exit 0", "timeout": 8, "statusMessage": "Writing emergency checkpoint + compaction priorities..." } ] } ], "PreToolUse": [ { "hooks": [ { "type": "command", "command": "if [ -x \".claude/scripts/mnemos-post-compact-inject.sh\" ]; then exec \".claude/scripts/mnemos-post-compact-inject.sh\"; fi; if [ -x \"$HOME/.claude/templates/mnemos-post-compact-inject.sh\" ]; then exec \"$HOME/.claude/templates/mnemos-post-compact-inject.sh\"; fi; echo \"[maggy] hook script 'mnemos-post-compact-inject.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/mnemos-post-compact-inject.sh to silence\" >&2; exit 0", "timeout": 2, "statusMessage": "Checking for post-compaction restore..." 
} ] }, { "matcher": "Edit|Write", "hooks": [ { "type": "command", "command": "if [ -x \".claude/scripts/mnemos-pre-edit.sh\" ]; then exec \".claude/scripts/mnemos-pre-edit.sh\"; fi; if [ -x \"$HOME/.claude/templates/mnemos-pre-edit.sh\" ]; then exec \"$HOME/.claude/templates/mnemos-pre-edit.sh\"; fi; echo \"[maggy] hook script 'mnemos-pre-edit.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/mnemos-pre-edit.sh to silence\" >&2; exit 0", "timeout": 5, "statusMessage": "Checking fatigue + intent context..." } ] } ], "PostToolUse": [ { "hooks": [ { "type": "command", "command": "if [ -x \".claude/scripts/mnemos-post-tool.sh\" ]; then exec \".claude/scripts/mnemos-post-tool.sh\"; fi; if [ -x \"$HOME/.claude/templates/mnemos-post-tool.sh\" ]; then exec \"$HOME/.claude/templates/mnemos-post-tool.sh\"; fi; echo \"[maggy] hook script 'mnemos-post-tool.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/mnemos-post-tool.sh to silence\" >&2; exit 0", "timeout": 1, "statusMessage": "Logging tool outcome..." } ] } ], "Stop": [ { "hooks": [ { "type": "command", "command": "if [ -x \".claude/scripts/tdd-loop-check.sh\" ]; then exec \".claude/scripts/tdd-loop-check.sh\"; fi; if [ -x \"$HOME/.claude/templates/tdd-loop-check.sh\" ]; then exec \"$HOME/.claude/templates/tdd-loop-check.sh\"; fi; echo \"[maggy] hook script 'tdd-loop-check.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/tdd-loop-check.sh to silence\" >&2; exit 0", "timeout": 60, "statusMessage": "Running tests..." }, { "type": "command", "command": "if command -v codex &>/dev/null; then if [ -x \".claude/scripts/codex-auto-review.sh\" ]; then exec \".claude/scripts/codex-auto-review.sh\"; elif [ -x \"$HOME/.claude/templates/codex-auto-review.sh\" ]; then exec \"$HOME/.claude/templates/codex-auto-review.sh\"; fi; fi; exit 0", "timeout": 120, "statusMessage": "Codex reviewing changes..." 
}, { "type": "command", "command": "if [ -x \".claude/scripts/icpg-stop-record.sh\" ]; then exec \".claude/scripts/icpg-stop-record.sh\"; fi; if [ -x \"$HOME/.claude/templates/icpg-stop-record.sh\" ]; then exec \"$HOME/.claude/templates/icpg-stop-record.sh\"; fi; echo \"[maggy] hook script 'icpg-stop-record.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/icpg-stop-record.sh to silence\" >&2; exit 0", "timeout": 5, "statusMessage": "Recording symbols to intent graph..." }, { "type": "command", "command": "if [ -x \".claude/scripts/mnemos-stop-checkpoint.sh\" ]; then exec \".claude/scripts/mnemos-stop-checkpoint.sh\"; fi; if [ -x \"$HOME/.claude/templates/mnemos-stop-checkpoint.sh\" ]; then exec \"$HOME/.claude/templates/mnemos-stop-checkpoint.sh\"; fi; echo \"[maggy] hook script 'mnemos-stop-checkpoint.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/mnemos-stop-checkpoint.sh to silence\" >&2; exit 0", "timeout": 5, "statusMessage": "Writing session checkpoint..." } ] } ], "SessionStart": [ { "hooks": [ { "type": "command", "command": "if [ -x \".claude/scripts/mnemos-session-start.sh\" ]; then exec \".claude/scripts/mnemos-session-start.sh\"; fi; if [ -x \"$HOME/.claude/templates/mnemos-session-start.sh\" ]; then exec \"$HOME/.claude/templates/mnemos-session-start.sh\"; fi; echo \"[maggy] hook script 'mnemos-session-start.sh' not installed \u2014 run /install.sh (one-time) or touch .claude/scripts/mnemos-session-start.sh to silence\" >&2; exit 0", "timeout": 5, "statusMessage": "Loading session checkpoint + project context..." } ] } ] } } ================================================ FILE: templates/tdd-loop-check.sh ================================================ #!/bin/bash # TDD Loop Check - Claude Code Stop hook script # Runs after each Claude response. Exit 0 = done, Exit 2 = failures fed back to Claude. 
# # Install: copy to scripts/tdd-loop-check.sh in your project # Configure: add Stop hook in .claude/settings.json (see iterative-development skill) MAX_ITERATIONS=25 ITERATION_FILE=".claude/.tdd-iteration-count" mkdir -p .claude # Track iteration count if [ -f "$ITERATION_FILE" ]; then count=$(cat "$ITERATION_FILE") count=$((count + 1)) else count=1 fi echo "$count" > "$ITERATION_FILE" # Safety: stop after max iterations if [ "$count" -ge "$MAX_ITERATIONS" ]; then rm -f "$ITERATION_FILE" echo "Max iterations ($MAX_ITERATIONS) reached. Stopping loop." >&2 exit 0 fi # Skip if no test files exist yet if ! find . -name "*.test.*" -o -name "*.spec.*" -o -name "test_*" | grep -q .; then rm -f "$ITERATION_FILE" exit 0 fi # Detect project type and run tests if [ -f "package.json" ]; then TEST_OUTPUT=$(npm test 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Tests failing:" >&2 echo "$TEST_OUTPUT" | tail -30 >&2 echo "" >&2 echo "Fix the failing tests and try again." >&2 exit 2 } # Lint if grep -q '"lint"' package.json; then LINT_OUTPUT=$(npm run lint 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Lint errors:" >&2 echo "$LINT_OUTPUT" | tail -20 >&2 exit 2 } fi # Typecheck if [ -f "tsconfig.json" ]; then TYPE_OUTPUT=$(npx tsc --noEmit 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Type errors:" >&2 echo "$TYPE_OUTPUT" | tail -20 >&2 exit 2 } fi elif [ -f "pyproject.toml" ] || [ -f "setup.py" ]; then TEST_OUTPUT=$(pytest -v 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Tests failing:" >&2 echo "$TEST_OUTPUT" | tail -30 >&2 exit 2 } if command -v ruff &>/dev/null; then LINT_OUTPUT=$(ruff check . 2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Lint errors:" >&2 echo "$LINT_OUTPUT" | tail -20 >&2 exit 2 } fi if command -v mypy &>/dev/null; then TYPE_OUTPUT=$(mypy . 
2>&1) || { echo "ITERATION $count/$MAX_ITERATIONS - Type errors:" >&2 echo "$TYPE_OUTPUT" | tail -20 >&2 exit 2 } fi fi # All green - reset counter rm -f "$ITERATION_FILE" exit 0 ================================================ FILE: tests/test_cross_agent.py ================================================ """Tests for cross-agent intelligence (Codex auto-review, Kimi delegation, iCPG + Mnemos).""" from __future__ import annotations import json import os from pathlib import Path import pytest REPO_ROOT = Path(__file__).parent.parent class TestCodexAutoReview: """Tests for templates/codex-auto-review.sh.""" def test_script_exists(self) -> None: path = REPO_ROOT / "templates" / "codex-auto-review.sh" assert path.exists() def test_script_is_executable(self) -> None: path = REPO_ROOT / "templates" / "codex-auto-review.sh" assert os.access(path, os.X_OK) def test_script_has_shebang(self) -> None: path = REPO_ROOT / "templates" / "codex-auto-review.sh" content = path.read_text() assert content.startswith("#!/bin/bash") def test_script_checks_codex_installed(self) -> None: path = REPO_ROOT / "templates" / "codex-auto-review.sh" content = path.read_text() assert "command -v codex" in content def test_script_uses_exit_codes(self) -> None: path = REPO_ROOT / "templates" / "codex-auto-review.sh" content = path.read_text() assert "exit 0" in content assert "return 2" in content class TestCrossAgentDelegation: """Tests for skills/cross-agent-delegation/SKILL.md.""" def test_skill_exists(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" assert path.exists() def test_skill_has_frontmatter(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" content = path.read_text() assert content.startswith("---") assert "name: cross-agent-delegation" in content def test_skill_references_icpg(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" content = path.read_text() assert "icpg" in 
content.lower() assert "icpg query prior" in content assert "icpg query constraints" in content assert "icpg query risk" in content def test_skill_references_mnemos(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" content = path.read_text() assert "mnemos" in content.lower() assert "mnemos add goal" in content assert "mnemos checkpoint" in content def test_skill_has_complexity_scoring_rules(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" content = path.read_text() assert "0-3" in content assert "4-6" in content assert "7-10" in content def test_skill_has_tool_detection(self) -> None: path = REPO_ROOT / "skills" / "cross-agent-delegation" / "SKILL.md" content = path.read_text() assert "command -v kimi" in content assert "command -v codex" in content class TestSettingsJsonHook: """Tests for codex-auto-review hook in settings.json.""" def test_settings_has_codex_review_hook(self) -> None: path = REPO_ROOT / "templates" / "settings.json" data = json.loads(path.read_text()) stop_hooks = data["hooks"]["Stop"][0]["hooks"] commands = [h["command"] for h in stop_hooks] assert any("codex-auto-review" in cmd for cmd in commands) def test_codex_hook_after_tdd(self) -> None: path = REPO_ROOT / "templates" / "settings.json" data = json.loads(path.read_text()) stop_hooks = data["hooks"]["Stop"][0]["hooks"] commands = [h["command"] for h in stop_hooks] tdd_idx = next( i for i, c in enumerate(commands) if "tdd-loop-check" in c ) codex_idx = next( i for i, c in enumerate(commands) if "codex-auto-review" in c ) assert codex_idx > tdd_idx def test_codex_hook_before_icpg(self) -> None: path = REPO_ROOT / "templates" / "settings.json" data = json.loads(path.read_text()) stop_hooks = data["hooks"]["Stop"][0]["hooks"] commands = [h["command"] for h in stop_hooks] codex_idx = next( i for i, c in enumerate(commands) if "codex-auto-review" in c ) icpg_idx = next( i for i, c in enumerate(commands) if "icpg-stop-record" in c 
) assert codex_idx < icpg_idx def test_codex_hook_has_timeout(self) -> None: path = REPO_ROOT / "templates" / "settings.json" data = json.loads(path.read_text()) stop_hooks = data["hooks"]["Stop"][0]["hooks"] codex_hook = next( h for h in stop_hooks if "codex-auto-review" in h["command"] ) assert codex_hook["timeout"] == 120 class TestConfigTomlHook: """Tests for codex-auto-review hook in config.toml.""" def test_config_toml_has_codex_hook(self) -> None: path = REPO_ROOT / "templates" / "config.toml" content = path.read_text() assert "codex-auto-review" in content def test_config_toml_codex_hook_timeout(self) -> None: path = REPO_ROOT / "templates" / "config.toml" content = path.read_text() # Find the codex-auto-review block and check timeout lines = content.splitlines() in_codex_block = False for line in lines: if "Codex Auto-Review" in line: in_codex_block = True if in_codex_block and line.startswith("timeout"): assert "120" in line break class TestTemplateSkillRefs: """Tests for skill references in templates.""" def test_claude_md_has_delegation_skill(self) -> None: path = REPO_ROOT / "templates" / "CLAUDE.md" content = path.read_text() assert "cross-agent-delegation/SKILL.md" in content def test_agents_md_has_delegation_skill(self) -> None: path = REPO_ROOT / "templates" / "AGENTS.md" content = path.read_text() assert "cross-agent-delegation/SKILL.md" in content def test_claude_md_has_workflow_section(self) -> None: path = REPO_ROOT / "templates" / "CLAUDE.md" content = path.read_text() assert "## Cross-Agent Workflow" in content assert "Codex Auto-Review" in content assert "Kimi Delegation" in content def test_agents_md_has_workflow_section(self) -> None: path = REPO_ROOT / "templates" / "AGENTS.md" content = path.read_text() assert "## Cross-Agent Workflow" in content assert "Codex Auto-Review" in content assert "Kimi Delegation" in content class TestInitializeProjectRef: """Tests for cross-agent-delegation in initialize-project.md.""" def 
test_init_copies_delegation_skill(self) -> None: path = REPO_ROOT / "commands" / "initialize-project.md" content = path.read_text() assert "cross-agent-delegation/" in content ================================================ FILE: tests/test_cross_tool.py ================================================ """Tests for cross-tool (Claude/Kimi/Codex) compatibility.""" from __future__ import annotations import os import subprocess from pathlib import Path import pytest REPO_ROOT = Path(__file__).parent.parent class TestDetectAgents: """Tests for scripts/detect-agents.sh.""" def test_script_exists_and_executable(self) -> None: script = REPO_ROOT / "scripts" / "detect-agents.sh" assert script.exists() assert os.access(script, os.X_OK) def test_outputs_valid_format(self) -> None: script = REPO_ROOT / "scripts" / "detect-agents.sh" result = subprocess.run( [str(script)], capture_output=True, text=True, timeout=10, ) assert result.returncode == 0 valid_tools = {"claude", "kimi", "codex", "docker", "orbstack", "polyphony"} for line in result.stdout.strip().splitlines(): assert line in valid_tools class TestInstallSkills: """Tests for scripts/install-skills.sh.""" def test_script_exists_and_executable(self) -> None: script = REPO_ROOT / "scripts" / "install-skills.sh" assert script.exists() assert os.access(script, os.X_OK) def test_copies_skills_to_target(self, tmp_path: Path) -> None: script = REPO_ROOT / "scripts" / "install-skills.sh" target = tmp_path / "target-skills" result = subprocess.run( [str(script), str(target)], capture_output=True, text=True, timeout=30, ) assert result.returncode == 0 assert target.exists() # Should have at least 'base' skill base_skill = target / "base" / "SKILL.md" assert base_skill.exists() def test_no_args_shows_usage(self) -> None: script = REPO_ROOT / "scripts" / "install-skills.sh" result = subprocess.run( [str(script)], capture_output=True, text=True, timeout=10, ) assert result.returncode != 0 class TestTemplates: """Tests for 
cross-tool templates.""" def test_agents_md_exists(self) -> None: path = REPO_ROOT / "templates" / "AGENTS.md" assert path.exists() def test_agents_md_has_skills_section(self) -> None: path = REPO_ROOT / "templates" / "AGENTS.md" content = path.read_text() assert "## Skills" in content assert "SKILL.md" in content def test_config_toml_exists(self) -> None: path = REPO_ROOT / "templates" / "config.toml" assert path.exists() def test_config_toml_has_hooks(self) -> None: path = REPO_ROOT / "templates" / "config.toml" content = path.read_text() assert "[[hooks]]" in content assert 'event = "Stop"' in content assert 'event = "SessionStart"' in content def test_agents_md_has_conventions(self) -> None: path = REPO_ROOT / "templates" / "AGENTS.md" content = path.read_text() assert "## Conventions" in content assert "## Don't" in content class TestSyncAgentsCommand: """Tests for commands/sync-agents.md.""" def test_command_exists(self) -> None: path = REPO_ROOT / "commands" / "sync-agents.md" assert path.exists() def test_command_has_phases(self) -> None: path = REPO_ROOT / "commands" / "sync-agents.md" content = path.read_text() assert "## Phase 1" in content assert "## Phase 2" in content assert "detect-agents.sh" in content ================================================ FILE: tests/test_polyphony_adapters.py ================================================ """Tests for Polyphony agent adapters (§8.1-8.3).""" import pytest from polyphony.adapters import get_adapter, list_adapters from polyphony.adapters.claude import ClaudeAdapter from polyphony.adapters.codex import CodexAdapter from polyphony.adapters.kimi import KimiAdapter from polyphony.models import AgentProfile, RunSpec @pytest.fixture def claude_profile(): return AgentProfile( name="claude-opus", agent_type="claude", cli_command="claude -p", strengths=["long_context"], event_protocol="stream-json", ) @pytest.fixture def codex_profile(): return AgentProfile( name="codex-default", agent_type="codex", 
cli_command="codex exec", strengths=["code"], event_protocol="ndjson", ) @pytest.fixture def kimi_profile(): return AgentProfile( name="kimi-default", agent_type="kimi", cli_command="kimi --print -y", strengths=["code"], event_protocol="ndjson", ) @pytest.fixture def run_spec(): return RunSpec( task_id="T-1", agent="claude-opus", identity="protaige", workspace="/workspace", image="polyphony-worker:latest", max_turns=10, env_overlay={"ANTHROPIC_API_KEY": "ANTHROPIC_API_KEY"}, volume_mounts=["~/.claude:/home/worker/.claude:ro"], ) class TestRegistry: def test_list_adapters(self): names = list_adapters() assert "claude" in names assert "codex" in names assert "kimi" in names def test_get_claude_adapter(self): adapter = get_adapter("claude") assert isinstance(adapter, ClaudeAdapter) def test_get_codex_adapter(self): adapter = get_adapter("codex") assert isinstance(adapter, CodexAdapter) def test_get_kimi_adapter(self): adapter = get_adapter("kimi") assert isinstance(adapter, KimiAdapter) def test_unknown_adapter_raises(self): with pytest.raises(KeyError, match="gemini"): get_adapter("gemini") class TestClaudeAdapter: def test_build_command(self, claude_profile, run_spec): adapter = ClaudeAdapter() cmd = adapter.build_command(claude_profile, run_spec) assert "claude" in cmd[0] assert "-p" in cmd assert "--output-format" in cmd assert "stream-json" in cmd def test_prompt_included(self, claude_profile, run_spec): adapter = ClaudeAdapter() run_spec.env_overlay["PROMPT"] = "Fix the bug" cmd = adapter.build_command(claude_profile, run_spec) cmd_str = " ".join(cmd) assert "claude" in cmd_str def test_detect_completion(self): adapter = ClaudeAdapter() assert adapter.detect_completion({"type": "result"}) is True assert adapter.detect_completion({"type": "message"}) is False def test_detect_quota(self): adapter = ClaudeAdapter() assert adapter.detect_quota("rate limit exceeded") is True assert adapter.detect_quota("all good") is False class TestCodexAdapter: def 
test_build_command(self, codex_profile, run_spec): adapter = CodexAdapter() cmd = adapter.build_command(codex_profile, run_spec) assert "codex" in cmd[0] assert "exec" in cmd assert "--full-auto" in cmd def test_detect_completion(self): adapter = CodexAdapter() assert adapter.detect_completion({"status": "completed"}) is True assert adapter.detect_completion({"status": "running"}) is False def test_detect_quota(self): adapter = CodexAdapter() assert adapter.detect_quota("quota exceeded") is True assert adapter.detect_quota("running") is False class TestKimiAdapter: def test_build_command(self, kimi_profile, run_spec): adapter = KimiAdapter() cmd = adapter.build_command(kimi_profile, run_spec) assert "kimi" in cmd[0] assert "--print" in cmd assert "-y" in cmd def test_detect_completion(self): adapter = KimiAdapter() assert adapter.detect_completion({"done": True}) is True assert adapter.detect_completion({"done": False}) is False def test_detect_quota(self): adapter = KimiAdapter() assert adapter.detect_quota("rate limit") is True assert adapter.detect_quota("ok") is False ================================================ FILE: tests/test_polyphony_config.py ================================================ """Tests for Polyphony config loading (§11).""" import pytest from polyphony.config import ( load_config, load_identities, load_agents, load_routing, default_config_dir, ) from polyphony.models import Identity, AgentProfile class TestDefaultConfigDir: def test_returns_path(self): d = default_config_dir() assert str(d).endswith(".polyphony") class TestLoadConfig: def test_missing_dir_returns_defaults(self, tmp_path): cfg = load_config(tmp_path / "nonexistent") assert "workspace_root" in cfg assert "poll_interval" in cfg assert "max_concurrent_agents" in cfg def test_loads_yaml(self, tmp_path): cfg_file = tmp_path / "config.yaml" cfg_file.write_text( "workspace_root: /custom/path\n" "max_concurrent_agents: 4\n" ) cfg = load_config(tmp_path) assert 
cfg["workspace_root"] == "/custom/path" assert cfg["max_concurrent_agents"] == 4 def test_defaults_fill_missing_keys(self, tmp_path): cfg_file = tmp_path / "config.yaml" cfg_file.write_text("workspace_root: /x\n") cfg = load_config(tmp_path) assert "poll_interval" in cfg class TestLoadIdentities: def test_missing_file_returns_empty(self, tmp_path): ids = load_identities(tmp_path) assert ids == [] def test_loads_identities(self, tmp_path): f = tmp_path / "identities.yaml" f.write_text( "identities:\n" " - name: test\n" " volumes:\n" " claude: ~/.claude\n" ) ids = load_identities(tmp_path) assert len(ids) == 1 assert isinstance(ids[0], Identity) assert ids[0].name == "test" assert ids[0].volumes["claude"] == "~/.claude" class TestLoadAgents: def test_missing_file_returns_empty(self, tmp_path): agents = load_agents(tmp_path) assert agents == [] def test_loads_agents(self, tmp_path): f = tmp_path / "agents.yaml" f.write_text( "agents:\n" " - name: claude-opus\n" " agent_type: claude\n" " cli_command: claude -p\n" ) agents = load_agents(tmp_path) assert len(agents) == 1 assert isinstance(agents[0], AgentProfile) assert agents[0].name == "claude-opus" class TestLoadRouting: def test_missing_file_returns_defaults(self, tmp_path): r = load_routing(tmp_path) assert "rules" in r assert "default" in r def test_loads_routing(self, tmp_path): f = tmp_path / "routing.yaml" f.write_text( "rules:\n" " - match: {task_type: bugfix}\n" " agent: kimi\n" "default:\n" " agent: claude\n" ) r = load_routing(tmp_path) assert len(r["rules"]) == 1 assert r["default"]["agent"] == "claude" ================================================ FILE: tests/test_polyphony_events.py ================================================ """Tests for Polyphony event parsing (§8 events).""" import json import pytest from polyphony.events import ( TaskEvent, parse_ndjson_line, parse_stream_json, classify_event, ) class TestTaskEvent: def test_create(self): ev = TaskEvent( kind="message", data={"text": "hello"}, 
) assert ev.kind == "message" assert ev.data["text"] == "hello" assert ev.timestamp != "" def test_from_dict(self): ev = TaskEvent.from_dict({ "kind": "result", "data": {"status": "ok"}, "timestamp": "2025-01-01T00:00:00", }) assert ev.kind == "result" assert ev.timestamp == "2025-01-01T00:00:00" class TestParseNdjsonLine: def test_valid_json(self): line = '{"type": "message", "content": "hello"}' result = parse_ndjson_line(line) assert result["type"] == "message" def test_empty_line(self): assert parse_ndjson_line("") is None def test_whitespace_line(self): assert parse_ndjson_line(" \n") is None def test_invalid_json(self): assert parse_ndjson_line("not json") is None def test_strips_whitespace(self): line = ' {"key": "value"} \n' result = parse_ndjson_line(line) assert result["key"] == "value" class TestParseStreamJson: def test_parses_multiple_lines(self): lines = [ '{"type": "message", "text": "a"}', '{"type": "result", "status": "ok"}', ] events = parse_stream_json(lines) assert len(events) == 2 assert events[0]["type"] == "message" assert events[1]["type"] == "result" def test_skips_invalid_lines(self): lines = [ '{"type": "message"}', "not json", '{"type": "result"}', ] events = parse_stream_json(lines) assert len(events) == 2 def test_empty_input(self): assert parse_stream_json([]) == [] class TestClassifyEvent: def test_result_event(self): ev = classify_event({"type": "result", "status": "ok"}) assert ev.kind == "result" def test_message_event(self): ev = classify_event({"type": "message", "text": "hi"}) assert ev.kind == "message" def test_error_event(self): ev = classify_event({"type": "error", "message": "fail"}) assert ev.kind == "error" def test_unknown_event(self): ev = classify_event({"foo": "bar"}) assert ev.kind == "unknown" def test_preserves_data(self): data = {"type": "result", "status": "ok", "extra": 42} ev = classify_event(data) assert ev.data == data ================================================ FILE: tests/test_polyphony_identity.py 
================================================ """Tests for Polyphony identity broker (§7).""" import pytest from polyphony.models import Identity from polyphony.identity import ( resolve_identity, build_volume_mounts, build_env_overlay, validate_identity, ) @pytest.fixture def identities(): return [ Identity( name="protaige", volumes={"claude": "~/.claude", "codex": "~/.codex"}, api_keys={"anthropic": "ANTHROPIC_API_KEY"}, ), Identity( name="personal", volumes={"kimi": "~/.kimi"}, ), ] class TestResolveIdentity: def test_finds_by_name(self, identities): found = resolve_identity("protaige", identities) assert found.name == "protaige" def test_missing_raises(self, identities): with pytest.raises(KeyError, match="unknown"): resolve_identity("unknown", identities) class TestBuildVolumeMounts: def test_mounts_for_claude(self, identities): mounts = build_volume_mounts(identities[0], "claude") assert len(mounts) == 1 assert "~/.claude" in mounts[0] assert ":ro" in mounts[0] def test_no_mount_for_missing_agent(self, identities): mounts = build_volume_mounts(identities[1], "claude") assert mounts == [] class TestBuildEnvOverlay: def test_env_from_api_keys(self, identities): env = build_env_overlay(identities[0]) assert "ANTHROPIC_API_KEY" in env def test_empty_when_no_keys(self, identities): env = build_env_overlay(identities[1]) assert env == {} class TestValidateIdentity: def test_valid(self, identities): errors = validate_identity(identities[0]) assert errors == [] def test_missing_name(self): i = Identity(name="", volumes={"claude": "~/.claude"}) errors = validate_identity(i) assert any("name" in e for e in errors) def test_missing_volumes(self): i = Identity(name="test", volumes={}) errors = validate_identity(i) assert any("volume" in e.lower() for e in errors) ================================================ FILE: tests/test_polyphony_models.py ================================================ """Tests for Polyphony data models (§3 of spec).""" import pytest from 
polyphony.models import ( TASK_TYPES, RISK_LEVELS, SCOPES, Task, Identity, AgentProfile, RunSpec, Result, _now, _uuid, ) class TestHelpers: def test_now_returns_iso_string(self): ts = _now() assert "T" in ts assert "+" in ts or "Z" in ts def test_uuid_returns_unique(self): a, b = _uuid(), _uuid() assert a != b assert len(a) == 36 class TestTaskConstants: def test_task_types(self): expected = { "research", "bugfix", "feature", "refactor", "migration", "docs", "review", } assert set(TASK_TYPES) == expected def test_risk_levels(self): assert set(RISK_LEVELS) == {"low", "medium", "high"} def test_scopes(self): expected = { "single_file", "single_module", "multi_module", "multi_repo", } assert set(SCOPES) == expected class TestTask: def test_create_minimal(self): t = Task( title="Fix login bug", source="github", source_ref="owner/repo#42", ) assert t.title == "Fix login bug" assert t.source == "github" assert len(t.id) == 36 assert t.state == "discovered" assert t.task_type == "feature" assert t.risk == "low" def test_defaults(self): t = Task(title="x", source="local", source_ref="1") assert t.scope == [] assert t.context_tokens == 0 assert t.requires_web is False assert t.run_spec_id is None assert t.metadata == {} def test_to_dict(self): t = Task(title="x", source="local", source_ref="1") d = t.to_dict() assert d["title"] == "x" assert "id" in d assert "created_at" in d class TestIdentity: def test_create(self): i = Identity( name="protaige", volumes={"claude": "~/.claude"}, ) assert i.name == "protaige" assert i.volumes["claude"] == "~/.claude" assert i.api_keys == {} assert i.cost_ceiling_usd_per_day is None def test_with_api_keys(self): i = Identity( name="test", volumes={}, api_keys={"anthropic": "ANTHROPIC_API_KEY"}, ) assert i.api_keys["anthropic"] == "ANTHROPIC_API_KEY" class TestAgentProfile: def test_create(self): a = AgentProfile( name="claude-opus", agent_type="claude", cli_command="claude -p", ) assert a.name == "claude-opus" assert a.context_window_tokens 
== 200000 assert a.strengths == [] def test_event_protocol_default(self): a = AgentProfile( name="x", agent_type="claude", cli_command="claude -p", ) assert a.event_protocol == "ndjson" class TestRunSpec: def test_create(self): r = RunSpec( task_id="t1", agent="claude-opus", identity="protaige", workspace="/tmp/ws", image="polyphony/claude:latest", ) assert r.task_id == "t1" assert r.attempt == 1 assert r.max_turns == 25 assert r.deadline_seconds == 1800 assert r.allowed_paths == [] assert r.proof_of_work == [] def test_immutable_concept(self): """RunSpec fields have defaults; verify they're set.""" r = RunSpec( task_id="t1", agent="x", identity="y", workspace="/w", image="img", ) assert len(r.id) == 36 class TestResult: def test_create(self): r = Result( task_id="t1", run_spec_id="rs1", agent="claude-opus", status="succeeded", ) assert r.status == "succeeded" assert r.turns == 0 assert r.duration_seconds == 0 assert r.cost_usd is None assert r.events == [] assert r.artifacts == {} def test_status_values(self): for s in ("succeeded", "failed", "quota", "timeout", "crash"): r = Result( task_id="t", run_spec_id="r", agent="a", status=s, ) assert r.status == s ================================================ FILE: tests/test_polyphony_orchestrator.py ================================================ """Tests for Polyphony orchestrator (§4 supervisor loop).""" import pytest from unittest.mock import patch, MagicMock from pathlib import Path from polyphony.orchestrator import ( Orchestrator, discover_tasks, claim_task, provision_workspace, run_agent, verify_result, ) from polyphony.models import ( Task, AgentProfile, Identity, RunSpec, Result, ) from polyphony.store import PolyphonyStore @pytest.fixture def store(tmp_path): s = PolyphonyStore(tmp_path) s.init_db() return s @pytest.fixture def task(): return Task( title="Fix auth bug", source="local", source_ref="local", task_type="bugfix", risk="medium", ) @pytest.fixture def agents(): return [ AgentProfile( 
name="claude-opus", agent_type="claude", cli_command="claude -p", strengths=["long_context"], ), ] @pytest.fixture def policy(): return { "rules": [], "default": { "agent": "claude-opus", "fallback": [], }, } @pytest.fixture def identities(): return [ Identity( name="protaige", volumes={"claude": "~/.claude"}, ), ] class TestDiscoverTasks: def test_returns_tasks(self, store, task): store.save_task(task) found = discover_tasks(store) assert len(found) == 1 assert found[0].id == task.id def test_empty_store(self, store): assert discover_tasks(store) == [] class TestClaimTask: def test_transitions_to_claimed(self, store, task): store.save_task(task) claimed = claim_task(task, store) assert claimed.state == "claimed" def test_updates_store(self, store, task): store.save_task(task) claim_task(task, store) stored = store.get_task(task.id) assert stored.state == "claimed" class TestProvisionWorkspace: @patch("polyphony.orchestrator._create_ws") def test_returns_path(self, mock_ws, tmp_path, task): ws_path = tmp_path / "ws" ws_path.mkdir() mock_ws.return_value = ws_path result = provision_workspace(task, tmp_path, "main") assert result == ws_path @patch("polyphony.orchestrator._create_ws") def test_calls_create(self, mock_ws, tmp_path, task): mock_ws.return_value = tmp_path provision_workspace(task, tmp_path, "main") assert mock_ws.called class TestRunAgent: @patch("polyphony.orchestrator._execute_container") def test_returns_result(self, mock_exec, task): mock_exec.return_value = Result( task_id=task.id, run_spec_id="rs-1", agent="claude-opus", status="succeeded", ) run_spec = RunSpec( task_id=task.id, agent="claude-opus", identity="protaige", workspace="/ws", image="polyphony-worker:latest", ) result = run_agent(run_spec) assert result.status == "succeeded" @patch("polyphony.orchestrator._execute_container") def test_handles_failure(self, mock_exec, task): mock_exec.return_value = Result( task_id=task.id, run_spec_id="rs-1", agent="claude-opus", status="failed", ) 
run_spec = RunSpec( task_id=task.id, agent="claude-opus", identity="protaige", workspace="/ws", image="polyphony-worker:latest", ) result = run_agent(run_spec) assert result.status == "failed" class TestVerifyResult: def test_succeeded_passes(self): result = Result( task_id="T-1", run_spec_id="rs-1", agent="claude-opus", status="succeeded", ) assert verify_result(result) is True def test_failed_fails(self): result = Result( task_id="T-1", run_spec_id="rs-1", agent="claude-opus", status="failed", ) assert verify_result(result) is False class TestOrchestrator: def test_init(self, store, agents, policy, identities): orch = Orchestrator( store=store, agents=agents, policy=policy, identities=identities, ) assert orch is not None def test_has_step(self, store, agents, policy, identities): orch = Orchestrator( store=store, agents=agents, policy=policy, identities=identities, ) assert hasattr(orch, "step") ================================================ FILE: tests/test_polyphony_router.py ================================================ """Tests for Polyphony router (§5.2-5.6).""" import pytest from polyphony.models import Task, AgentProfile, RunSpec from polyphony.router import route, select_agent, match_rule @pytest.fixture def agents(): return [ AgentProfile( name="claude-opus", agent_type="claude", cli_command="claude -p", strengths=["long_context", "research"], ), AgentProfile( name="codex-default", agent_type="codex", cli_command="codex exec", strengths=["code"], ), AgentProfile( name="kimi-default", agent_type="kimi", cli_command="kimi --print -y", strengths=["code"], ), ] @pytest.fixture def policy(): return { "rules": [ { "match": {"task_type": "docs", "risk": "low"}, "agent": "kimi-default", }, { "match": {"task_type": "bugfix"}, "agent": "codex-default", }, { "match": {"risk": "high"}, "agent": "claude-opus", }, ], "default": { "agent": "claude-opus", "fallback": ["codex-default", "kimi-default"], }, } class TestMatchRule: def test_matches_single_field(self): 
task = Task( title="x", source="local", source_ref="1", task_type="docs", ) rule = {"match": {"task_type": "docs"}} assert match_rule(task, rule) is True def test_no_match(self): task = Task( title="x", source="local", source_ref="1", task_type="feature", ) rule = {"match": {"task_type": "docs"}} assert match_rule(task, rule) is False def test_matches_multiple_fields(self): task = Task( title="x", source="local", source_ref="1", task_type="docs", risk="low", ) rule = {"match": {"task_type": "docs", "risk": "low"}} assert match_rule(task, rule) is True def test_partial_match_fails(self): task = Task( title="x", source="local", source_ref="1", task_type="docs", risk="high", ) rule = {"match": {"task_type": "docs", "risk": "low"}} assert match_rule(task, rule) is False class TestSelectAgent: def test_selects_by_rule(self, agents, policy): task = Task( title="Fix readme", source="local", source_ref="1", task_type="docs", risk="low", ) agent = select_agent(task, agents, policy) assert agent.name == "kimi-default" def test_falls_to_default(self, agents, policy): task = Task( title="New feature", source="local", source_ref="1", task_type="feature", risk="medium", ) agent = select_agent(task, agents, policy) assert agent.name == "claude-opus" def test_high_risk_matches_claude(self, agents, policy): task = Task( title="Refactor auth", source="local", source_ref="1", task_type="refactor", risk="high", ) agent = select_agent(task, agents, policy) assert agent.name == "claude-opus" class TestRoute: def test_returns_run_spec(self, agents, policy): task = Task( title="Fix bug", source="github", source_ref="o/r#1", task_type="bugfix", ) rs = route(task, agents, policy, identity="test") assert isinstance(rs, RunSpec) assert rs.task_id == task.id assert rs.agent == "codex-default" assert rs.identity == "test" def test_run_spec_has_fallback(self, agents, policy): task = Task( title="New feature", source="local", source_ref="1", task_type="feature", ) rs = route(task, agents, policy, 
identity="test") # default rule has fallback assert isinstance(rs.fallback, list) ================================================ FILE: tests/test_polyphony_runtime.py ================================================ """Tests for Polyphony Docker runtime (§8 worker).""" import pytest from unittest.mock import patch, MagicMock from polyphony.runtime import ( create_container, start_container, stop_container, remove_container, container_logs, wait_container, build_docker_args, ) from polyphony.models import RunSpec @pytest.fixture def run_spec(): return RunSpec( task_id="T-1", agent="claude-opus", identity="protaige", workspace="/tmp/ws/T-1/1", image="polyphony-worker:latest", env_overlay={"API_KEY": "API_KEY"}, volume_mounts=["~/.claude:/home/worker/.claude:ro"], deadline_seconds=600, ) class TestBuildDockerArgs: def test_includes_image(self, run_spec): args = build_docker_args(run_spec) assert "polyphony-worker:latest" in args def test_includes_volumes(self, run_spec): args = build_docker_args(run_spec) assert "-v" in args # Collect all -v values volumes = [] for i, a in enumerate(args): if a == "-v" and i + 1 < len(args): volumes.append(args[i + 1]) assert any( "~/.claude:/home/worker/.claude:ro" in v for v in volumes ) def test_includes_env(self, run_spec): args = build_docker_args(run_spec) assert "-e" in args def test_includes_workspace_mount(self, run_spec): args = build_docker_args(run_spec) arg_str = " ".join(args) assert "/tmp/ws/T-1/1" in arg_str def test_container_name(self, run_spec): args = build_docker_args(run_spec) assert "--name" in args class TestCreateContainer: @patch("polyphony.runtime._run_docker") def test_creates_container(self, mock_docker, run_spec): mock_docker.return_value = MagicMock( returncode=0, stdout="container_id_123\n", ) cid = create_container(run_spec) assert cid == "container_id_123" assert mock_docker.called @patch("polyphony.runtime._run_docker") def test_failure_raises(self, mock_docker, run_spec): mock_docker.return_value 
= MagicMock( returncode=1, stderr="error", ) with pytest.raises(RuntimeError, match="error"): create_container(run_spec) class TestStartContainer: @patch("polyphony.runtime._run_docker") def test_starts(self, mock_docker): mock_docker.return_value = MagicMock(returncode=0) start_container("abc123") mock_docker.assert_called_once() cmd = mock_docker.call_args[0][0] assert "start" in cmd assert "abc123" in cmd class TestStopContainer: @patch("polyphony.runtime._run_docker") def test_stops(self, mock_docker): mock_docker.return_value = MagicMock(returncode=0) stop_container("abc123") cmd = mock_docker.call_args[0][0] assert "stop" in cmd @patch("polyphony.runtime._run_docker") def test_stop_with_timeout(self, mock_docker): mock_docker.return_value = MagicMock(returncode=0) stop_container("abc123", timeout=30) cmd = mock_docker.call_args[0][0] assert "-t" in cmd assert "30" in cmd class TestRemoveContainer: @patch("polyphony.runtime._run_docker") def test_removes(self, mock_docker): mock_docker.return_value = MagicMock(returncode=0) remove_container("abc123") cmd = mock_docker.call_args[0][0] assert "rm" in cmd assert "abc123" in cmd class TestContainerLogs: @patch("polyphony.runtime._run_docker") def test_returns_logs(self, mock_docker): mock_docker.return_value = MagicMock( returncode=0, stdout="line1\nline2\n", ) logs = container_logs("abc123") assert logs == "line1\nline2\n" class TestWaitContainer: @patch("polyphony.runtime._run_docker") def test_returns_exit_code(self, mock_docker): mock_docker.return_value = MagicMock( returncode=0, stdout="0\n", ) code = wait_container("abc123") assert code == 0 @patch("polyphony.runtime._run_docker") def test_nonzero_exit(self, mock_docker): mock_docker.return_value = MagicMock( returncode=0, stdout="1\n", ) code = wait_container("abc123") assert code == 1 ================================================ FILE: tests/test_polyphony_scoring.py ================================================ """Tests for Polyphony complexity 
scoring (§5.1).""" import pytest from polyphony.models import Task from polyphony.scoring import ( DIMENSIONS, score_task, score_cyclomatic, score_fan_out, score_security, score_concurrency, score_domain, ) @pytest.fixture def simple_task(): return Task( title="Fix typo in README", source="local", source_ref="1", task_type="docs", scope=["README.md"], risk="low", ) @pytest.fixture def complex_task(): return Task( title="Refactor auth with async locks", source="github", source_ref="owner/repo#99", task_type="refactor", scope=["src/auth/middleware.ts", "src/auth/session.ts"], risk="high", metadata={ "keywords": ["auth", "org_id", "asyncio.Lock"], "loc": 200, "callers": 15, }, ) class TestDimensions: def test_five_dimensions(self): assert len(DIMENSIONS) == 5 def test_dimension_names(self): expected = { "cyclomatic", "fan_out", "security", "concurrency", "domain", } assert set(DIMENSIONS) == expected class TestScoreCyclomatic: def test_small_scope(self, simple_task): assert score_cyclomatic(simple_task) == 0 def test_large_scope(self, complex_task): assert score_cyclomatic(complex_task) >= 1 class TestScoreFanOut: def test_no_callers(self, simple_task): assert score_fan_out(simple_task) == 0 def test_many_callers(self, complex_task): assert score_fan_out(complex_task) == 2 class TestScoreSecurity: def test_no_security_keywords(self, simple_task): assert score_security(simple_task) == 0 def test_auth_keywords(self, complex_task): assert score_security(complex_task) >= 1 class TestScoreConcurrency: def test_no_concurrency(self, simple_task): assert score_concurrency(simple_task) == 0 def test_async_locks(self, complex_task): assert score_concurrency(complex_task) >= 1 class TestScoreDomain: def test_docs_task(self, simple_task): assert score_domain(simple_task) == 0 def test_high_risk_refactor(self, complex_task): assert score_domain(complex_task) >= 1 class TestScoreTask: def test_simple_task_low(self, simple_task): total = score_task(simple_task) assert 0 <= total <= 
3 def test_complex_task_high(self, complex_task): total = score_task(complex_task) assert total >= 4 def test_score_range(self, simple_task): total = score_task(simple_task) assert 0 <= total <= 10 def test_returns_dict_with_breakdown(self, simple_task): """score_task returns (total, breakdown) tuple.""" result = score_task(simple_task) assert isinstance(result, int) ================================================ FILE: tests/test_polyphony_sources.py ================================================ """Tests for Polyphony work sources (§2).""" import json import pytest from unittest.mock import patch, MagicMock from pathlib import Path from polyphony.sources import get_source, list_sources from polyphony.sources.local import LocalSource from polyphony.sources.github import GitHubSource from polyphony.models import Task class TestRegistry: def test_list_sources(self): names = list_sources() assert "local" in names assert "github" in names def test_get_local_source(self): src = get_source("local") assert isinstance(src, LocalSource) def test_get_github_source(self): src = get_source("github") assert isinstance(src, GitHubSource) def test_unknown_raises(self): with pytest.raises(KeyError, match="jira"): get_source("jira") class TestLocalSource: def test_add_and_poll(self, tmp_path): src = LocalSource(db_path=tmp_path / "queue.db") src.add_task("Fix typo", task_type="docs", risk="low") tasks = src.poll() assert len(tasks) == 1 assert tasks[0].title == "Fix typo" assert tasks[0].source == "local" def test_poll_empty(self, tmp_path): src = LocalSource(db_path=tmp_path / "queue.db") assert src.poll() == [] def test_mark_claimed(self, tmp_path): src = LocalSource(db_path=tmp_path / "queue.db") src.add_task("Task A") tasks = src.poll() src.mark_claimed(tasks[0].id) # After claiming, poll should not return it remaining = src.poll() assert len(remaining) == 0 def test_multiple_tasks(self, tmp_path): src = LocalSource(db_path=tmp_path / "queue.db") src.add_task("Task A") 
src.add_task("Task B") src.add_task("Task C") tasks = src.poll() assert len(tasks) == 3 class TestGitHubSource: @patch("polyphony.sources.github._run_gh") def test_poll_returns_tasks(self, mock_gh): issues = [ { "number": 42, "title": "Fix auth bug", "labels": [{"name": "agent-ready"}], }, ] mock_gh.return_value = MagicMock( returncode=0, stdout=json.dumps(issues), ) src = GitHubSource(repo="owner/repo") tasks = src.poll() assert len(tasks) == 1 assert tasks[0].title == "Fix auth bug" assert tasks[0].source == "github" assert "42" in tasks[0].source_ref @patch("polyphony.sources.github._run_gh") def test_poll_empty(self, mock_gh): mock_gh.return_value = MagicMock( returncode=0, stdout="[]", ) src = GitHubSource(repo="owner/repo") assert src.poll() == [] @patch("polyphony.sources.github._run_gh") def test_poll_gh_failure(self, mock_gh): mock_gh.return_value = MagicMock( returncode=1, stderr="auth failed", ) src = GitHubSource(repo="owner/repo") # Should return empty, not crash assert src.poll() == [] @patch("polyphony.sources.github._run_gh") def test_label_filter(self, mock_gh): mock_gh.return_value = MagicMock( returncode=0, stdout="[]", ) src = GitHubSource( repo="owner/repo", label_filter="polyphony", ) src.poll() cmd = mock_gh.call_args[0][0] cmd_str = " ".join(cmd) assert "polyphony" in cmd_str ================================================ FILE: tests/test_polyphony_state.py ================================================ """Tests for Polyphony state machine (§4 lifecycle).""" import pytest from polyphony.models import Task from polyphony.state_machine import ( TASK_STATES, TRANSITIONS, can_transition, transition, is_terminal, ) class TestConstants: def test_all_states_present(self): expected = { "discovered", "claimed", "routed", "provisioned", "running", "verifying", "landed", "failed", "blocked", } assert set(TASK_STATES) == expected def test_transitions_keys_are_valid_states(self): for state in TRANSITIONS: assert state in TASK_STATES class 
TestCanTransition: def test_discovered_to_claimed(self): assert can_transition("discovered", "claimed") is True def test_claimed_to_routed(self): assert can_transition("claimed", "routed") is True def test_routed_to_provisioned(self): assert can_transition("routed", "provisioned") is True def test_provisioned_to_running(self): assert can_transition("provisioned", "running") is True def test_running_to_verifying(self): assert can_transition("running", "verifying") is True def test_running_to_failed(self): assert can_transition("running", "failed") is True def test_verifying_to_landed(self): assert can_transition("verifying", "landed") is True def test_verifying_to_failed(self): assert can_transition("verifying", "failed") is True def test_failed_to_claimed_retry(self): assert can_transition("failed", "claimed") is True def test_failed_to_blocked(self): assert can_transition("failed", "blocked") is True def test_invalid_discovered_to_running(self): assert can_transition("discovered", "running") is False def test_invalid_landed_to_anything(self): assert can_transition("landed", "claimed") is False assert can_transition("landed", "failed") is False def test_invalid_same_state(self): assert can_transition("claimed", "claimed") is False class TestTransition: def test_valid_transition_updates_state(self): t = Task(title="x", source="local", source_ref="1") assert t.state == "discovered" t2 = transition(t, "claimed") assert t2.state == "claimed" def test_invalid_transition_raises(self): t = Task(title="x", source="local", source_ref="1") with pytest.raises(ValueError, match="Invalid transition"): transition(t, "running") def test_transition_updates_timestamp(self): t = Task(title="x", source="local", source_ref="1") old_ts = t.updated_at t2 = transition(t, "claimed") assert t2.updated_at >= old_ts class TestIsTerminal: def test_landed_is_terminal(self): assert is_terminal("landed") is True def test_blocked_is_terminal(self): assert is_terminal("blocked") is True def 
test_discovered_not_terminal(self): assert is_terminal("discovered") is False def test_running_not_terminal(self): assert is_terminal("running") is False def test_failed_not_terminal(self): assert is_terminal("failed") is False ================================================ FILE: tests/test_polyphony_store.py ================================================ """Tests for Polyphony SQLite store.""" import pytest from polyphony.models import Task, RunSpec, Result from polyphony.store import PolyphonyStore @pytest.fixture def store(tmp_path): s = PolyphonyStore(tmp_path) s.init_db() return s @pytest.fixture def sample_task(): return Task( title="Fix bug", source="github", source_ref="owner/repo#1", ) class TestInit: def test_creates_db(self, tmp_path): s = PolyphonyStore(tmp_path) s.init_db() assert (tmp_path / "orchestrator.db").exists() def test_creates_gitignore(self, tmp_path): s = PolyphonyStore(tmp_path) s.init_db() gi = tmp_path / ".gitignore" assert gi.exists() assert "*" in gi.read_text() def test_idempotent(self, tmp_path): s = PolyphonyStore(tmp_path) s.init_db() s.init_db() # no error class TestTaskCRUD: def test_save_and_get(self, store, sample_task): store.save_task(sample_task) loaded = store.get_task(sample_task.id) assert loaded is not None assert loaded.title == "Fix bug" assert loaded.source == "github" def test_get_missing_returns_none(self, store): assert store.get_task("nonexistent") is None def test_list_tasks(self, store): t1 = Task(title="A", source="local", source_ref="1") t2 = Task(title="B", source="local", source_ref="2") store.save_task(t1) store.save_task(t2) tasks = store.list_tasks() assert len(tasks) == 2 def test_list_tasks_by_state(self, store, sample_task): store.save_task(sample_task) found = store.list_tasks(state="discovered") assert len(found) == 1 empty = store.list_tasks(state="running") assert len(empty) == 0 def test_update_task(self, store, sample_task): store.save_task(sample_task) sample_task.state = "claimed" 
store.save_task(sample_task) loaded = store.get_task(sample_task.id) assert loaded.state == "claimed" class TestRunSpecCRUD: def test_save_and_get(self, store): rs = RunSpec( task_id="t1", agent="claude", identity="protaige", workspace="/tmp/ws", image="img:latest", ) store.save_run_spec(rs) loaded = store.get_run_spec(rs.id) assert loaded is not None assert loaded.agent == "claude" def test_get_missing(self, store): assert store.get_run_spec("nope") is None class TestResultCRUD: def test_save_and_get(self, store): r = Result( task_id="t1", run_spec_id="rs1", agent="claude", status="succeeded", ) store.save_result(r) loaded = store.get_result(r.id) assert loaded is not None assert loaded.status == "succeeded" def test_list_results_by_task(self, store): r1 = Result( task_id="t1", run_spec_id="rs1", agent="claude", status="failed", ) r2 = Result( task_id="t1", run_spec_id="rs2", agent="kimi", status="succeeded", ) store.save_result(r1) store.save_result(r2) results = store.list_results(task_id="t1") assert len(results) == 2 class TestStateLog: def test_log_transition(self, store, sample_task): store.save_task(sample_task) store.log_transition( sample_task.id, "discovered", "claimed", ) log = store.get_state_log(sample_task.id) assert len(log) == 1 assert log[0]["from_state"] == "discovered" assert log[0]["to_state"] == "claimed" ================================================ FILE: tests/test_polyphony_workspace.py ================================================ """Tests for Polyphony workspace manager (§6).""" import pytest from unittest.mock import patch, MagicMock from pathlib import Path from polyphony.workspace import ( workspace_path, create_workspace, cleanup_workspace, list_workspaces, ) class TestWorkspacePath: def test_creates_path(self, tmp_path): p = workspace_path(tmp_path, "TASK-1", 1) assert "TASK-1" in str(p) assert "1" in str(p) def test_sanitizes_id(self, tmp_path): p = workspace_path(tmp_path, "owner/repo#42", 1) # No slashes in directory name 
assert "/" not in p.name class TestCreateWorkspace: @patch("polyphony.workspace._run_git") def test_clones_repo(self, mock_git, tmp_path): mock_git.return_value = MagicMock(returncode=0) ws = create_workspace( base_dir=tmp_path, task_id="T-1", attempt=1, repo_url="https://github.com/o/r.git", ref="main", ) assert ws.exists() assert mock_git.called @patch("polyphony.workspace._run_git") def test_checks_out_branch(self, mock_git, tmp_path): mock_git.return_value = MagicMock(returncode=0) create_workspace( base_dir=tmp_path, task_id="T-2", attempt=1, repo_url="https://github.com/o/r.git", ref="feature/auth", ) calls = [str(c) for c in mock_git.call_args_list] assert any("checkout" in c for c in calls) @patch("polyphony.workspace._run_git") def test_uses_mirror_when_available(self, mock_git, tmp_path): mock_git.return_value = MagicMock(returncode=0) mirror = tmp_path / "mirror" / "repo.git" mirror.mkdir(parents=True) create_workspace( base_dir=tmp_path, task_id="T-3", attempt=1, repo_url="https://github.com/o/r.git", ref="main", mirror_path=mirror, ) calls = [str(c) for c in mock_git.call_args_list] assert any("dissociate" in c for c in calls) class TestCleanupWorkspace: def test_removes_directory(self, tmp_path): ws = tmp_path / "workspace" ws.mkdir() (ws / "file.txt").write_text("x") cleanup_workspace(ws) assert not ws.exists() def test_missing_dir_no_error(self, tmp_path): cleanup_workspace(tmp_path / "nope") class TestListWorkspaces: def test_lists_dirs(self, tmp_path): (tmp_path / "T-1" / "1").mkdir(parents=True) (tmp_path / "T-2" / "1").mkdir(parents=True) ws = list_workspaces(tmp_path) assert len(ws) >= 2 def test_empty_base(self, tmp_path): assert list_workspaces(tmp_path) == [] ================================================ FILE: tests/test_session_detect.py ================================================ """Tests for multi-CLI session detection.""" from __future__ import annotations import json from pathlib import Path from unittest.mock import patch from 
maggy.services.session_detect import ( detect_all, detect_claude, detect_codex, detect_kimi, ) _MOD = "maggy.services.session_detect._home" def _patch_home(tmp_path): return patch(_MOD, return_value=tmp_path) def test_detect_claude_from_history(tmp_path): """Finds Claude session by matching working dir.""" hist = tmp_path / ".claude" / "history.jsonl" hist.parent.mkdir(parents=True) entry = {"project": "/tmp/proj", "sessionId": "c-123"} hist.write_text(json.dumps(entry) + "\n") with _patch_home(tmp_path): result = detect_claude("/tmp/proj") assert result is not None assert result.cli == "claude" assert result.session_id == "c-123" def test_detect_claude_no_match(tmp_path): """Returns None when no matching dir in history.""" hist = tmp_path / ".claude" / "history.jsonl" hist.parent.mkdir(parents=True) entry = {"project": "/other", "sessionId": "x"} hist.write_text(json.dumps(entry) + "\n") with _patch_home(tmp_path): assert detect_claude("/tmp/proj") is None def test_detect_claude_missing_file(): """Returns None when history.jsonl doesn't exist.""" with _patch_home(Path("/nonexistent_detect_xyz")): assert detect_claude("/tmp/proj") is None def test_detect_kimi_from_state(tmp_path): """Finds Kimi session from kimi.json work_dirs.""" kimi_f = tmp_path / ".kimi" / "kimi.json" kimi_f.parent.mkdir(parents=True) data = {"work_dirs": [ {"path": "/tmp/proj", "last_session_id": "k-1"}, ]} kimi_f.write_text(json.dumps(data)) with _patch_home(tmp_path): result = detect_kimi("/tmp/proj") assert result is not None assert result.cli == "kimi" assert result.session_id == "k-1" def test_detect_kimi_null_session(tmp_path): """Returns None when last_session_id is null.""" kimi_f = tmp_path / ".kimi" / "kimi.json" kimi_f.parent.mkdir(parents=True) data = {"work_dirs": [ {"path": "/tmp/proj", "last_session_id": None}, ]} kimi_f.write_text(json.dumps(data)) with _patch_home(tmp_path): assert detect_kimi("/tmp/proj") is None def test_detect_kimi_no_file(): with 
_patch_home(Path("/nonexistent_detect_xyz")): assert detect_kimi("/tmp/proj") is None def test_detect_codex_from_session(tmp_path): """Finds Codex session from rollout session file.""" sess = tmp_path / ".codex" / "sessions" / "2026" / "05" sess.mkdir(parents=True) meta = { "type": "session_meta", "payload": {"id": "cx-1", "cwd": "/tmp/proj"}, } (sess / "rollout-test.jsonl").write_text( json.dumps(meta) + "\n", ) with _patch_home(tmp_path): result = detect_codex("/tmp/proj") assert result is not None assert result.cli == "codex" assert result.session_id == "cx-1" def test_detect_codex_no_dir(): with _patch_home(Path("/nonexistent_detect_xyz")): assert detect_codex("/tmp/proj") is None def test_detect_all_aggregates(tmp_path): """detect_all gathers results from all CLIs.""" hist = tmp_path / ".claude" / "history.jsonl" hist.parent.mkdir(parents=True) entry = {"project": "/tmp/p", "sessionId": "s1"} hist.write_text(json.dumps(entry) + "\n") with _patch_home(tmp_path): result = detect_all("/tmp/p") clis = [s.cli for s in result.sessions] assert "claude" in clis def test_detect_all_empty(tmp_path): """detect_all returns empty when nothing found.""" with _patch_home(tmp_path): result = detect_all("/tmp/p") assert result.sessions == [] ================================================ FILE: tests/test_skill_lint.py ================================================ """Unit tests for skill-lint.""" from __future__ import annotations import json import sys import tempfile from pathlib import Path import pytest # Add scripts/ to path so we can import skill_lint sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) from skill_lint import Finding, Severity from skill_lint.frontmatter import check as fm_check, parse_frontmatter from skill_lint.spec import check as sp_check from skill_lint.content import check as cq_check from skill_lint.references import check as ri_check from skill_lint.report import format_json, format_text from skill_lint.__main__ import main 
@pytest.fixture
def skills_dir(tmp_path: Path) -> Path:
    """Create a temporary skills directory."""
    skills = tmp_path / 'skills'
    skills.mkdir()
    return skills


def _make_skill(skills_dir: Path, name: str, content: str) -> tuple[Path, Path]:
    """Create a skill directory with SKILL.md content.

    Returns (skill_dir, skill_path).
    """
    skill_dir = skills_dir / name
    skill_dir.mkdir()
    skill_path = skill_dir / 'SKILL.md'
    skill_path.write_text(content, encoding='utf-8')
    return skill_dir, skill_path


# --- parse_frontmatter ---

class TestParseFrontmatter:
    """parse_frontmatter returns (fields, end_line); end_line is 0 when no block is parsed."""

    def test_valid_frontmatter(self):
        content = '---\nname: test-skill\ndescription: A test\n---\n# Content'
        fields, end_line = parse_frontmatter(content)
        assert fields['name'] == 'test-skill'
        assert fields['description'] == 'A test'
        assert end_line == 4

    def test_no_frontmatter(self):
        content = '# Just content\nNo frontmatter here'
        fields, end_line = parse_frontmatter(content)
        assert fields == {}
        assert end_line == 0

    def test_unclosed_frontmatter(self):
        content = '---\nname: broken\n'
        fields, end_line = parse_frontmatter(content)
        assert end_line == 0

    def test_quoted_values(self):
        content = '---\nname: "quoted-name"\ndescription: \'single\'\n---\n'
        fields, _ = parse_frontmatter(content)
        assert fields['name'] == 'quoted-name'
        assert fields['description'] == 'single'


# --- FM checks ---

class TestFrontmatter:
    """Frontmatter rule checks (FM001-FM005) through skill_lint.frontmatter.check."""

    def test_no_frontmatter(self, skills_dir):
        _, path = _make_skill(skills_dir, 'bad-skill', '# No frontmatter\n')
        findings = fm_check(path, skills_dir / 'bad-skill', skills_dir)
        assert any(f.rule_id == 'FM001' for f in findings)

    def test_missing_name(self, skills_dir):
        _, path = _make_skill(skills_dir, 'test', '---\ndescription: hello\n---\n')
        findings = fm_check(path, skills_dir / 'test', skills_dir)
        assert any(f.rule_id == 'FM002' for f in findings)

    def test_missing_description(self, skills_dir):
        _, path = _make_skill(skills_dir, 'test', '---\nname: test\n---\n')
        findings = fm_check(path, skills_dir / 'test', skills_dir)
        assert any(f.rule_id == 'FM003' for f in findings)

    def test_name_mismatch(self, skills_dir):
        _, path = _make_skill(skills_dir, 'real-name', '---\nname: wrong-name\ndescription: x\n---\n')
        findings = fm_check(path, skills_dir / 'real-name', skills_dir)
        assert any(f.rule_id == 'FM004' for f in findings)

    def test_invalid_name_format(self, skills_dir):
        _, path = _make_skill(skills_dir, 'Test_Bad', '---\nname: Test_Bad\ndescription: x\n---\n')
        findings = fm_check(path, skills_dir / 'Test_Bad', skills_dir)
        assert any(f.rule_id == 'FM005' for f in findings)

    def test_clean_skill(self, skills_dir):
        content = (
            '---\n'
            'name: good-skill\n'
            'description: A well-formed skill\n'
            'when-to-use: When testing\n'
            'user-invocable: true\n'
            'effort: low\n'
            '---\n'
            '# Good Skill\n'
        )
        _, path = _make_skill(skills_dir, 'good-skill', content)
        findings = fm_check(path, skills_dir / 'good-skill', skills_dir)
        assert len(findings) == 0


# --- SP checks ---

class TestSpec:
    """Spec rule checks (SP001-SP003) through skill_lint.spec.check."""

    def test_missing_skill_md(self, skills_dir):
        skill_dir = skills_dir / 'empty-skill'
        skill_dir.mkdir()
        findings = sp_check(skill_dir / 'SKILL.md', skill_dir, skills_dir)
        assert any(f.rule_id == 'SP001' for f in findings)

    def test_over_500_lines(self, skills_dir):
        content = '---\nname: big\n---\n' + '\n'.join(f'line {i}' for i in range(550))
        _, path = _make_skill(skills_dir, 'big', content)
        findings = sp_check(path, skills_dir / 'big', skills_dir)
        assert any(f.rule_id == 'SP002' for f in findings)

    def test_between_300_500(self, skills_dir):
        content = '---\nname: medium\n---\n' + '\n'.join(f'line {i}' for i in range(350))
        _, path = _make_skill(skills_dir, 'medium', content)
        findings = sp_check(path, skills_dir / 'medium', skills_dir)
        assert any(f.rule_id == 'SP003' for f in findings)

    def test_inline_suppression(self, skills_dir):
        # NOTE(review): the second frontmatter line is blank here; presumably it
        # is meant to carry the inline suppression directive for SP002 -- confirm
        # against the linter's suppression syntax.
        content = (
            '---\n'
            '\n'
            'name: big\n'
            '---\n'
            + '\n'.join(f'line {i}' for i in range(550))
        )
        _, path = _make_skill(skills_dir, 'big', content)
        findings = sp_check(path, skills_dir / 'big', skills_dir)
        assert not any(f.rule_id == 'SP002' for f in findings)


# --- CQ checks ---

class TestContent:
    """Content-quality rule checks (CQ001-CQ006) through skill_lint.content.check."""

    def test_ascii_art_detected(self, skills_dir):
        content = '---\nname: arty\ndescription: x\n---\n# Arty\n╔══════╗\n║ box ║\n╚══════╝\n'
        _, path = _make_skill(skills_dir, 'arty', content)
        findings = cq_check(path, skills_dir / 'arty', skills_dir)
        assert any(f.rule_id == 'CQ001' for f in findings)

    def test_ascii_art_in_code_block_ok(self, skills_dir):
        content = '---\nname: code-art\ndescription: x\n---\n# Code\n```\n╔══════╗\n║ ok ║\n╚══════╝\n```\n'
        _, path = _make_skill(skills_dir, 'code-art', content)
        findings = cq_check(path, skills_dir / 'code-art', skills_dir)
        assert not any(f.rule_id == 'CQ001' for f in findings)

    def test_vague_phrases(self, skills_dir):
        content = '---\nname: vague\ndescription: x\n---\n# Vague\nYou should follow best practices.\n'
        _, path = _make_skill(skills_dir, 'vague', content)
        findings = cq_check(path, skills_dir / 'vague', skills_dir)
        assert any(f.rule_id == 'CQ002' for f in findings)

    def test_filler_intensity(self, skills_dir):
        # 10 filler words in 20 lines = 50 per 100 lines (way over 2)
        filler_lines = '\n'.join(
            'This is MANDATORY and NON-NEGOTIABLE' for _ in range(10)
        )
        content = f'---\nname: filler\ndescription: x\n---\n# Filler\n{filler_lines}\n'
        _, path = _make_skill(skills_dir, 'filler', content)
        findings = cq_check(path, skills_dir / 'filler', skills_dir)
        assert any(f.rule_id == 'CQ003' for f in findings)

    def test_stale_load_ref(self, skills_dir):
        content = '---\nname: stale\ndescription: x\n---\n# Stale\n*Load with: base.md*\n'
        _, path = _make_skill(skills_dir, 'stale', content)
        findings = cq_check(path, skills_dir / 'stale', skills_dir)
        assert any(f.rule_id == 'CQ005' for f in findings)

    def test_no_h1_heading(self, skills_dir):
        content = '---\nname: headless\ndescription: x\n---\nNo heading here.\n'
        _, path = _make_skill(skills_dir, 'headless', content)
        findings = cq_check(path, skills_dir / 'headless', skills_dir)
        assert any(f.rule_id == 'CQ006' for f in findings)


# --- RI checks ---

class TestReferences:
    """Reference-integrity rule check (RI001) through skill_lint.references.check."""

    def test_broken_skill_ref(self, skills_dir):
        content = '---\nname: linker\ndescription: x\n---\n# Linker\nSee skills/nonexistent-skill for details.\n'
        _, path = _make_skill(skills_dir, 'linker', content)
        findings = ri_check(path, skills_dir / 'linker', skills_dir)
        assert any(f.rule_id == 'RI001' for f in findings)

    def test_valid_skill_ref(self, skills_dir):
        _make_skill(skills_dir, 'target', '---\nname: target\n---\n')
        content = '---\nname: linker\ndescription: x\n---\n# Linker\nSee skills/target for details.\n'
        _, path = _make_skill(skills_dir, 'linker', content)
        findings = ri_check(path, skills_dir / 'linker', skills_dir)
        assert not any(f.rule_id == 'RI001' for f in findings)


# --- Report ---

class TestReport:
    """format_text / format_json take a mapping of skill name -> list of findings."""

    def test_text_format(self, skills_dir):
        findings = [
            Finding('FM001', Severity.ERROR, 'Missing frontmatter'),
            Finding('SP002', Severity.WARNING, 'Too long'),
        ]
        results = {'test-skill': findings}
        text = format_text(results)
        assert 'ERROR' in text
        assert 'WARNING' in text
        assert 'test-skill' in text

    def test_json_format(self, skills_dir):
        findings = [
            Finding('FM001', Severity.ERROR, 'Missing frontmatter'),
        ]
        results = {'test-skill': findings}
        output = format_json(results)
        data = json.loads(output)
        assert data['summary']['errors'] == 1
        assert 'test-skill' in data['skills']


# --- CLI ---

class TestCLI:
    """Exit codes of skill_lint.__main__.main for a few argument combinations."""

    def test_version(self, capsys):
        with pytest.raises(SystemExit) as exc:
            main(['--version'])
        assert exc.value.code == 0

    def test_missing_dir(self):
        ret = main(['/nonexistent/path'])
        assert ret == 2

    def test_single_skill(self, skills_dir):
        content = (
            '---\n'
            'name: clean\n'
            'description: A clean skill\n'
            'when-to-use: Always\n'
            'user-invocable: true\n'
            'effort: low\n'
            '---\n'
            '# Clean Skill\n'
            '\n```python\nprint("hello")\n```\n'
        )
        _make_skill(skills_dir, 'clean', content)
        ret = main(['--skill', 'clean', str(skills_dir)])
        assert ret == 0

    def test_fail_on_warning(self, skills_dir):
        content = '---\nname: big\ndescription: x\n---\n' + '\n'.join(f'line {i}' for i in range(550))
        _make_skill(skills_dir, 'big', content)
        ret = main(['--fail-on', 'warning', '--skill', 'big', str(skills_dir)])
        assert ret == 1

================================================
FILE: tests/validate-structure.sh
================================================
#!/bin/bash
# validate-structure.sh - Validates Maggy structure matches Claude Code requirements
# Run with: ./tests/validate-structure.sh
# Exit codes: 0 = all pass, 1 = failures

set -uo pipefail
# Note: not using -e so we can collect all failures

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(dirname "$SCRIPT_DIR")"
SKILLS_DIR="$ROOT_DIR/skills"
COMMANDS_DIR="$ROOT_DIR/commands"
HOOKS_DIR="$ROOT_DIR/hooks"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

PASS_COUNT=0
FAIL_COUNT=0
WARN_COUNT=0

# Result reporters: print one coloured line for message $1 and bump the
# matching global counter. The counters use plain assignment rather than
# ((var++)), because ((var++)) yields exit status 1 when the old value is 0,
# which would make the first call of each reporter look like a failure.
# %b expands the colour escapes; %s prints the message verbatim.
pass() {
    printf '%b %s\n' "${GREEN}✓${NC}" "$1"
    PASS_COUNT=$((PASS_COUNT + 1))
}

fail() {
    printf '%b %s\n' "${RED}✗${NC}" "$1"
    FAIL_COUNT=$((FAIL_COUNT + 1))
}

warn() {
    printf '%b %s\n' "${YELLOW}⚠${NC}" "$1"
    WARN_COUNT=$((WARN_COUNT + 1))
}

# Prints a boxed section title ($1) preceded by a blank line.
header() {
    echo ""
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo " $1"
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
}

# ============================================================================
# TEST 1: Skills Structure
# Each skill must be a FOLDER containing SKILL.md (not a flat .md file)
# ============================================================================
test_skills_structure() {
    header "TEST: Skills Folder Structure"

    if [ ! -d "$SKILLS_DIR" ]; then
        fail "Skills directory does not exist: $SKILLS_DIR"
        return
    fi

    local skill_count=0
    local flat_files=0
    local file skill_dir skill_name

    # Check for flat .md files (WRONG structure)
    shopt -s nullglob
    for file in "$SKILLS_DIR"/*.md; do
        if [ -f "$file" ]; then
            flat_files=$((flat_files + 1))
            fail "Flat .md file found (should be folder): $(basename "$file")"
        fi
    done
    shopt -u nullglob

    if [ "$flat_files" -eq 0 ]; then
        pass "No flat .md files in skills/ (correct)"
    fi

    # Check for folders with SKILL.md (CORRECT structure).
    # An unmatched glob stays literal here; the -d test filters it out.
    for skill_dir in "$SKILLS_DIR"/*/; do
        if [ -d "$skill_dir" ]; then
            skill_count=$((skill_count + 1))
            skill_name=$(basename "$skill_dir")
            if [ -f "$skill_dir/SKILL.md" ]; then
                pass "Skill '$skill_name' has SKILL.md"
            else
                fail "Skill '$skill_name' missing SKILL.md"
            fi
        fi
    done

    echo ""
    echo "Skills found: $skill_count folders, $flat_files flat files"

    if [ "$flat_files" -gt 0 ] && [ "$skill_count" -eq 0 ]; then
        fail "Skills use flat .md structure - must be folders with SKILL.md"
    fi
}

# ============================================================================
# TEST 2: SKILL.md YAML Frontmatter
# Each SKILL.md must have YAML frontmatter with 'name' and 'description'
# ============================================================================

# Prints the lines between line 1 of file $1 (expected to be the opening
# '---') and the next line that is exactly '---'. Only this first block is
# emitted, so later '---' horizontal rules in the document body are ignored.
# Returns 1 when no closing '---' line exists.
_first_frontmatter_block() {
    awk '
        NR == 1 { next }
        $0 == "---" { closed = 1; exit }
        { print }
        END { if (!closed) exit 1 }
    ' "$1"
}

test_skill_frontmatter() {
    header "TEST: SKILL.md YAML Frontmatter"

    local skill_dir skill_name skill_file frontmatter file

    for skill_dir in "$SKILLS_DIR"/*/; do
        if [ -d "$skill_dir" ] && [ -f "$skill_dir/SKILL.md" ]; then
            skill_name=$(basename "$skill_dir")
            skill_file="$skill_dir/SKILL.md"

            # Check for YAML frontmatter (starts with ---)
            if head -1 "$skill_file" | grep -q "^---$"; then
                # Extract only the leading frontmatter block
                if ! frontmatter=$(_first_frontmatter_block "$skill_file"); then
                    fail "Skill '$skill_name' has unterminated YAML frontmatter (no closing ---)"
                    continue
                fi

                # Check for 'name:' field
                if grep -q "^name:" <<<"$frontmatter"; then
                    pass "Skill '$skill_name' has 'name' field"
                else
                    fail "Skill '$skill_name' missing 'name' in frontmatter"
                fi

                # Check for 'description:' field
                if grep -q "^description:" <<<"$frontmatter"; then
                    pass "Skill '$skill_name' has 'description' field"
                else
                    fail "Skill '$skill_name' missing 'description' in frontmatter"
                fi
            else
                fail "Skill '$skill_name' missing YAML frontmatter (must start with ---)"
            fi
        fi
    done

    # Also check flat files that shouldn't exist
    shopt -s nullglob
    for file in "$SKILLS_DIR"/*.md; do
        if [ -f "$file" ]; then
            warn "Flat file '$(basename "$file")' cannot be validated (wrong structure)"
        fi
    done
    shopt -u nullglob
}

# ============================================================================
# TEST 3: Commands Structure
# Commands should be .md files in commands/
# ============================================================================
test_commands_structure() {
    header "TEST: Commands Structure"

    if [ ! -d "$COMMANDS_DIR" ]; then
        fail "Commands directory does not exist: $COMMANDS_DIR"
        return
    fi

    local cmd_count=0
    local cmd_file cmd_name

    for cmd_file in "$COMMANDS_DIR"/*.md; do
        if [ -f "$cmd_file" ]; then
            cmd_count=$((cmd_count + 1))
            cmd_name=$(basename "$cmd_file" .md)
            pass "Command found: $cmd_name"
        fi
    done

    if [ "$cmd_count" -eq 0 ]; then
        fail "No commands found in $COMMANDS_DIR"
    else
        echo ""
        echo "Total commands: $cmd_count"
    fi
}

# ============================================================================
# TEST 4: Hooks Structure (checks ALL hooks dynamically)
# ============================================================================
# Note: only regular files directly inside the hooks directory are inspected;
# subdirectories are not descended into.
test_hooks_structure() {
    header "TEST: Hooks Structure"

    if [ ! -d "$HOOKS_DIR" ]; then
        warn "Hooks directory does not exist: $HOOKS_DIR"
        return
    fi

    local hook_count=0
    local hook_file hook_name

    shopt -s nullglob
    for hook_file in "$HOOKS_DIR"/*; do
        if [ -f "$hook_file" ]; then
            hook_count=$((hook_count + 1))
            hook_name=$(basename "$hook_file")
            pass "Hook found: $hook_name"

            if [ -x "$hook_file" ]; then
                pass "Hook '$hook_name' is executable"
            else
                fail "Hook '$hook_name' is NOT executable"
            fi

            # Check hook has shebang
            if head -1 "$hook_file" | grep -q "^#!"; then
                pass "Hook '$hook_name' has shebang"
            else
                warn "Hook '$hook_name' missing shebang"
            fi
        fi
    done
    shopt -u nullglob

    if [ "$hook_count" -eq 0 ]; then
        warn "No hooks found in $HOOKS_DIR"
    else
        echo ""
        echo "Total hooks: $hook_count"
    fi

    # Also check installed hooks
    local installed_hooks_dir="$HOME/.claude/hooks"
    if [ -d "$installed_hooks_dir" ]; then
        echo ""
        echo "Checking installed hooks (~/.claude/hooks/):"
        local installed_count=0
        for hook_file in "$installed_hooks_dir"/*; do
            if [ -f "$hook_file" ]; then
                installed_count=$((installed_count + 1))
                hook_name=$(basename "$hook_file")
                if [ -x "$hook_file" ]; then
                    pass "Installed hook '$hook_name' is executable"
                else
                    fail "Installed hook '$hook_name' is NOT executable"
                fi
            fi
        done
        echo "Installed hooks: $installed_count"
    fi
}

# ============================================================================
# TEST 5: Install Script
# ============================================================================
test_install_script() {
    header "TEST: Install Script"

    if [ -f "$ROOT_DIR/install.sh" ]; then
        pass "install.sh exists"

        if [ -x "$ROOT_DIR/install.sh" ]; then
            pass "install.sh is executable"
        else
            fail "install.sh is not executable"
        fi

        # Check that it references correct structure
        if grep -q "SKILL.md" "$ROOT_DIR/install.sh"; then
            pass "install.sh references SKILL.md structure"
        else
            warn "install.sh may not handle SKILL.md structure"
        fi
    else
        fail "install.sh missing"
    fi
}

# ============================================================================
# TEST
# 6: Installed Skills (checks ~/.claude/skills/)
# ============================================================================
# Counts installed skills under ~/.claude/skills: folders holding a SKILL.md
# versus flat .md files. Reports a pass for folders, a fail for flat files,
# and a warning when the directory exists but holds neither.
test_installed_skills() {
    header "TEST: Installed Skills (~/.claude/skills/)"

    local installed_dir="$HOME/.claude/skills"

    if [ ! -d "$installed_dir" ]; then
        warn "No skills installed at $installed_dir"
        return
    fi

    local folder_count=0
    local flat_count=0
    local skill_dir file

    # Count folders with SKILL.md (an unmatched glob stays literal and is
    # rejected by the -d test)
    for skill_dir in "$installed_dir"/*/; do
        if [ -d "$skill_dir" ] && [ -f "$skill_dir/SKILL.md" ]; then
            folder_count=$((folder_count + 1))
        fi
    done

    # Count flat .md files
    shopt -s nullglob
    for file in "$installed_dir"/*.md; do
        if [ -f "$file" ]; then
            flat_count=$((flat_count + 1))
        fi
    done
    shopt -u nullglob

    if [ "$folder_count" -gt 0 ]; then
        pass "Found $folder_count properly structured skills"
    fi

    if [ "$flat_count" -gt 0 ]; then
        fail "Found $flat_count flat .md files (wrong structure)"
    fi

    # Without this branch an empty directory produced no result line at all.
    if [ "$folder_count" -eq 0 ] && [ "$flat_count" -eq 0 ]; then
        warn "No skills found in $installed_dir"
    fi

    echo ""
    echo "Installed: $folder_count folder skills, $flat_count flat files"
}

# ============================================================================
# TEST 7: README Documentation
# ============================================================================
test_readme() {
    header "TEST: README Documentation"

    if [ -f "$ROOT_DIR/README.md" ]; then
        pass "README.md exists"

        # Check for key sections
        if grep -q "Quick Start\|Quick Install" "$ROOT_DIR/README.md"; then
            pass "README has Quick Start section"
        else
            warn "README missing Quick Start section"
        fi

        if grep -q "Skills Included\|What's Included" "$ROOT_DIR/README.md"; then
            pass "README has Skills listing"
        else
            warn "README missing Skills listing"
        fi
    else
        fail "README.md missing"
    fi
}

# ============================================================================
# TEST 8: Scripts Structure
# ============================================================================
test_scripts_structure() {
    header "TEST: Scripts Structure"

    local scripts_dir="$ROOT_DIR/scripts"

    if [ ! -d "$scripts_dir" ]; then
        warn "Scripts directory does not exist: $scripts_dir"
        return
    fi

    local script_count=0
    local script_file script_name

    shopt -s nullglob
    for script_file in "$scripts_dir"/*.sh; do
        if [ -f "$script_file" ]; then
            script_count=$((script_count + 1))
            script_name=$(basename "$script_file")
            pass "Script found: $script_name"

            if [ -x "$script_file" ]; then
                pass "Script '$script_name' is executable"
            else
                fail "Script '$script_name' is NOT executable"
            fi
        fi
    done
    shopt -u nullglob

    if [ "$script_count" -eq 0 ]; then
        warn "No scripts found in $scripts_dir"
    else
        echo ""
        echo "Total scripts: $script_count"
    fi
}

# ============================================================================
# QUICK MODE - Essential checks only (for initialize-project)
# ============================================================================
# Inspects ~/.claude/{skills,commands,hooks} and prints one status line each.
# Returns 1 when the skills or commands directory is missing or skills use
# the flat .md layout; returns 0 otherwise. Does not touch the global
# PASS/FAIL/WARN counters.
quick_validate() {
    echo ""
    echo "🔍 Quick validation of Maggy installation..."
    echo ""

    local errors=0
    local skill_count flat_count cmd_count hook_count

    # Counts go through $(( ... )) so that the space padding some `wc`
    # implementations emit never reaches the printed messages.

    # Check skills directory exists and has content
    if [ -d "$HOME/.claude/skills" ]; then
        # -type d also matches the skills directory itself, hence the "- 1" below
        skill_count=$(( $(find "$HOME/.claude/skills" -maxdepth 1 -type d 2>/dev/null | wc -l) ))
        flat_count=$(( $(find "$HOME/.claude/skills" -maxdepth 1 -name "*.md" -type f 2>/dev/null | wc -l) ))

        if [ "$flat_count" -gt 0 ]; then
            echo -e "${RED}✗${NC} Skills use flat .md structure (need folder/SKILL.md)"
            errors=$((errors + 1))
        elif [ "$skill_count" -gt 1 ]; then
            echo -e "${GREEN}✓${NC} Skills installed ($((skill_count - 1)) skills)"
        else
            echo -e "${YELLOW}⚠${NC} No skills found in ~/.claude/skills/"
        fi
    else
        echo -e "${RED}✗${NC} Skills directory missing (~/.claude/skills/)"
        errors=$((errors + 1))
    fi

    # Check commands
    if [ -d "$HOME/.claude/commands" ]; then
        cmd_count=$(( $(find "$HOME/.claude/commands" -name "*.md" -type f 2>/dev/null | wc -l) ))
        if [ "$cmd_count" -gt 0 ]; then
            echo -e "${GREEN}✓${NC} Commands installed ($cmd_count commands)"
        else
            echo -e "${YELLOW}⚠${NC} No commands found"
        fi
    else
        echo -e "${RED}✗${NC} Commands directory missing (~/.claude/commands/)"
        errors=$((errors + 1))
    fi

    # Check hooks (a missing hooks directory is only a warning)
    if [ -d "$HOME/.claude/hooks" ]; then
        hook_count=$(( $(find "$HOME/.claude/hooks" -type f 2>/dev/null | wc -l) ))
        if [ "$hook_count" -gt 0 ]; then
            echo -e "${GREEN}✓${NC} Hooks installed ($hook_count hooks)"
        else
            echo -e "${YELLOW}⚠${NC} No hooks found"
        fi
    else
        echo -e "${YELLOW}⚠${NC} Hooks directory missing (~/.claude/hooks/)"
    fi

    echo ""
    if [ "$errors" -gt 0 ]; then
        echo -e "${RED}Bootstrap has issues. Run full validation:${NC}"
        echo " $ROOT_DIR/tests/validate-structure.sh"
        return 1
    else
        echo -e "${GREEN}Bootstrap installation OK${NC}"
        return 0
    fi
}

# ============================================================================
# TEST 9: Cross-Tool Templates
# (this banner previously read "MAIN", although the function below is a test)
# ============================================================================
test_cross_tool_templates() {
    header "CROSS-TOOL TEMPLATES"

    local script

    # AGENTS.md template
    if [ -f "$ROOT_DIR/templates/AGENTS.md" ]; then
        pass "templates/AGENTS.md exists"
        if grep -q "## Skills" "$ROOT_DIR/templates/AGENTS.md"; then
            pass "AGENTS.md has Skills section"
        else
            fail "AGENTS.md missing Skills section"
        fi
    else
        fail "templates/AGENTS.md missing"
    fi

    # config.toml template
    if [ -f "$ROOT_DIR/templates/config.toml" ]; then
        pass "templates/config.toml exists"
        if grep -q '\[\[hooks\]\]' "$ROOT_DIR/templates/config.toml"; then
            pass "config.toml has [[hooks]] sections"
        else
            fail "config.toml missing [[hooks]] sections"
        fi
    else
        fail "templates/config.toml missing"
    fi

    # Cross-tool scripts
    for script in detect-agents.sh install-skills.sh; do
        if [ -f "$ROOT_DIR/scripts/$script" ]; then
            pass "scripts/$script exists"
            if [ -x "$ROOT_DIR/scripts/$script" ]; then
                pass "scripts/$script is executable"
            else
                fail "scripts/$script is not executable"
            fi
        else
            fail "scripts/$script missing"
        fi
    done

    # sync-agents command
    if [ -f "$ROOT_DIR/commands/sync-agents.md" ]; then
        pass "commands/sync-agents.md exists"
    else
        fail "commands/sync-agents.md missing"
    fi
}

# ============================================================================
# Prints the usage text to stdout.
show_help() {
    printf '%s\n' \
        "Usage: $(basename "$0") [OPTIONS]" \
        "" \
        "Validates Maggy structure matches Claude Code requirements." \
        "" \
        "Options:" \
        " --quick Quick validation (for initialize-project)" \
        " --full Full validation (default)" \
        " --help Show this help" \
        "" \
        "Exit codes:" \
        " 0 = All validations passed" \
        " 1 = Validation failures found"
}

# Entry point. Parses the command-line flags, then either runs the quick
# check (exiting with its status) or runs every test_* suite and prints the
# pass/fail/warning totals. Exits 1 when any failure was counted or an
# unknown option was given, 0 otherwise.
main() {
    local mode="full"
    local cli_arg suite_fn

    # The last mode flag wins; --help and unknown options exit immediately.
    for cli_arg in "$@"; do
        case "$cli_arg" in
            --quick|-q)
                mode="quick"
                ;;
            --full|-f)
                mode="full"
                ;;
            --help|-h)
                show_help
                exit 0
                ;;
            *)
                echo "Unknown option: $cli_arg"
                show_help
                exit 1
                ;;
        esac
    done

    if [[ "$mode" == "quick" ]]; then
        quick_validate
        exit $?
    fi

    # Full validation
    printf '%s\n' \
        "" \
        "╔════════════════════════════════════════════════════════════╗" \
        "║ MAGGY STRUCTURE VALIDATION ║" \
        "╚════════════════════════════════════════════════════════════╝" \
        "" \
        "Validating: $ROOT_DIR"

    # Suites run in this fixed order.
    for suite_fn in \
        test_skills_structure \
        test_skill_frontmatter \
        test_commands_structure \
        test_hooks_structure \
        test_scripts_structure \
        test_install_script \
        test_installed_skills \
        test_readme \
        test_cross_tool_templates; do
        "$suite_fn"
    done

    header "SUMMARY"
    echo ""
    printf '%b %s\n' "${GREEN}Passed:${NC}" "$PASS_COUNT"
    printf '%b %s\n' "${RED}Failed:${NC}" "$FAIL_COUNT"
    printf '%b %s\n' "${YELLOW}Warnings:${NC}" "$WARN_COUNT"
    echo ""

    if (( FAIL_COUNT > 0 )); then
        printf '%b %s\n' "${RED}VALIDATION FAILED${NC}" "- $FAIL_COUNT issues need fixing"
        exit 1
    fi

    printf '%b\n' "${GREEN}VALIDATION PASSED${NC}"
    exit 0
}

main "$@"