Showing preview only (2,223K chars total). Download the full file or copy to clipboard to get everything.
Repository: muratcankoylan/Agent-Skills-for-Context-Engineering
Branch: main
Commit: a60bf8699829
Files: 234
Total size: 2.1 MB
Directory structure:
gitextract_kaaor9qo/
├── .claude-plugin/
│ └── marketplace.json
├── .cursorindexingignore
├── .gitignore
├── .plugin/
│ └── plugin.json
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SKILL.md
├── docs/
│ ├── agentskills.md
│ ├── blogs.md
│ ├── claude_research.md
│ ├── compression.md
│ ├── gemini_research.md
│ ├── hncapsule.md
│ ├── netflix_context.md
│ ├── skills-improvement-analysis.md
│ └── vercel_tool.md
├── examples/
│ ├── book-sft-pipeline/
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── examples/
│ │ │ └── gertrude-stein/
│ │ │ ├── README.md
│ │ │ ├── dataset_sample.jsonl
│ │ │ ├── sample_outputs.md
│ │ │ └── training_config.json
│ │ ├── references/
│ │ │ ├── segmentation-strategies.md
│ │ │ ├── tinker-format.md
│ │ │ └── tinker.txt
│ │ └── scripts/
│ │ └── pipeline_example.py
│ ├── digital-brain-skill/
│ │ ├── .gitignore
│ │ ├── AGENT.md
│ │ ├── HOW-SKILLS-BUILT-THIS.md
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── SKILLS-MAPPING.md
│ │ ├── agents/
│ │ │ ├── AGENTS.md
│ │ │ └── scripts/
│ │ │ ├── content_ideas.py
│ │ │ ├── idea_to_draft.py
│ │ │ ├── stale_contacts.py
│ │ │ └── weekly_review.py
│ │ ├── content/
│ │ │ ├── CONTENT.md
│ │ │ ├── calendar.md
│ │ │ ├── engagement.jsonl
│ │ │ ├── ideas.jsonl
│ │ │ ├── posts.jsonl
│ │ │ └── templates/
│ │ │ ├── linkedin-post.md
│ │ │ ├── newsletter.md
│ │ │ └── thread.md
│ │ ├── examples/
│ │ │ ├── content-workflow.md
│ │ │ └── meeting-prep.md
│ │ ├── identity/
│ │ │ ├── IDENTITY.md
│ │ │ ├── bio-variants.md
│ │ │ ├── brand.md
│ │ │ ├── prompts/
│ │ │ │ ├── content-generation.xml
│ │ │ │ └── reply-generator.xml
│ │ │ ├── values.yaml
│ │ │ └── voice.md
│ │ ├── knowledge/
│ │ │ ├── KNOWLEDGE.md
│ │ │ ├── bookmarks.jsonl
│ │ │ ├── competitors.md
│ │ │ ├── learning.yaml
│ │ │ └── research/
│ │ │ └── _template.md
│ │ ├── network/
│ │ │ ├── NETWORK.md
│ │ │ ├── circles.yaml
│ │ │ ├── contacts.jsonl
│ │ │ ├── interactions.jsonl
│ │ │ └── intros.md
│ │ ├── operations/
│ │ │ ├── OPERATIONS.md
│ │ │ ├── goals.yaml
│ │ │ ├── meetings.jsonl
│ │ │ ├── metrics.jsonl
│ │ │ ├── reviews/
│ │ │ │ └── _weekly_template.md
│ │ │ └── todos.md
│ │ ├── package.json
│ │ ├── references/
│ │ │ └── file-formats.md
│ │ └── scripts/
│ │ └── install.sh
│ ├── interleaved-thinking/
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── docs/
│ │ │ ├── agentthinking.md
│ │ │ ├── interleavedthinking.md
│ │ │ └── m2-1.md
│ │ ├── examples/
│ │ │ ├── 01_basic_capture.py
│ │ │ ├── 02_tool_usage.py
│ │ │ └── 03_full_optimization.py
│ │ ├── generated_skills/
│ │ │ └── comprehensive-research-agent/
│ │ │ ├── SKILL.md
│ │ │ └── references/
│ │ │ ├── optimization_summary.json
│ │ │ ├── optimized_prompt.txt
│ │ │ └── patterns_found.json
│ │ ├── optimization_artifacts/
│ │ │ ├── final_prompt.txt
│ │ │ ├── iteration_1/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_10/
│ │ │ │ ├── analysis.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_2/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_3/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_4/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_5/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_6/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_7/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_8/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_9/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ └── summary.json
│ │ ├── pyproject.toml
│ │ ├── reasoning_trace_optimizer/
│ │ │ ├── __init__.py
│ │ │ ├── analyzer.py
│ │ │ ├── capture.py
│ │ │ ├── cli.py
│ │ │ ├── loop.py
│ │ │ ├── models.py
│ │ │ ├── optimizer.py
│ │ │ └── skill_generator.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ └── test_models.py
│ ├── llm-as-judge-skills/
│ │ ├── .gitignore
│ │ ├── .prettierrc
│ │ ├── CONTRIBUTING.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── agents/
│ │ │ ├── evaluator-agent/
│ │ │ │ └── evaluator-agent.md
│ │ │ ├── index.md
│ │ │ ├── orchestrator-agent/
│ │ │ │ └── orchestrator-agent.md
│ │ │ └── research-agent/
│ │ │ └── research-agent.md
│ │ ├── env.example
│ │ ├── eslint.config.js
│ │ ├── examples/
│ │ │ ├── basic-evaluation.ts
│ │ │ ├── full-evaluation-workflow.ts
│ │ │ ├── generate-rubric.ts
│ │ │ └── pairwise-comparison.ts
│ │ ├── package.json
│ │ ├── prompts/
│ │ │ ├── agent-system/
│ │ │ │ └── orchestrator-prompt.md
│ │ │ ├── evaluation/
│ │ │ │ ├── direct-scoring-prompt.md
│ │ │ │ └── pairwise-comparison-prompt.md
│ │ │ ├── index.md
│ │ │ └── research/
│ │ │ └── research-synthesis-prompt.md
│ │ ├── skills/
│ │ │ ├── context-fundamentals/
│ │ │ │ └── context-fundamentals.md
│ │ │ ├── index.md
│ │ │ ├── llm-evaluator/
│ │ │ │ └── llm-evaluator.md
│ │ │ └── tool-design/
│ │ │ └── tool-design.md
│ │ ├── src/
│ │ │ ├── agents/
│ │ │ │ ├── evaluator.ts
│ │ │ │ └── index.ts
│ │ │ ├── config/
│ │ │ │ └── index.ts
│ │ │ ├── index.ts
│ │ │ └── tools/
│ │ │ └── evaluation/
│ │ │ ├── direct-score.ts
│ │ │ ├── generate-rubric.ts
│ │ │ ├── index.ts
│ │ │ └── pairwise-compare.ts
│ │ ├── tests/
│ │ │ ├── evaluation.test.ts
│ │ │ ├── setup.ts
│ │ │ └── skills.test.ts
│ │ ├── tools/
│ │ │ ├── evaluation/
│ │ │ │ ├── direct-score.md
│ │ │ │ ├── generate-rubric.md
│ │ │ │ └── pairwise-compare.md
│ │ │ ├── index.md
│ │ │ ├── orchestration/
│ │ │ │ └── delegate-to-agent.md
│ │ │ └── research/
│ │ │ ├── read-url.md
│ │ │ └── web-search.md
│ │ ├── tsconfig.json
│ │ └── vitest.config.ts
│ └── x-to-book-system/
│ ├── PRD.md
│ ├── README.md
│ └── SKILLS-MAPPING.md
├── researcher/
│ ├── example_output.md
│ └── llm-as-a-judge.md
├── skills/
│ ├── advanced-evaluation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ ├── bias-mitigation.md
│ │ │ ├── evaluation-pipeline.md
│ │ │ ├── implementation-patterns.md
│ │ │ └── metrics-guide.md
│ │ └── scripts/
│ │ └── evaluation_example.py
│ ├── bdi-mental-states/
│ │ ├── SKILL.md
│ │ └── references/
│ │ ├── bdi-ontology-core.md
│ │ ├── framework-integration.md
│ │ ├── rdf-examples.md
│ │ └── sparql-competency.md
│ ├── context-compression/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── evaluation-framework.md
│ │ └── scripts/
│ │ └── compression_evaluator.py
│ ├── context-degradation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── patterns.md
│ │ └── scripts/
│ │ └── degradation_detector.py
│ ├── context-fundamentals/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── context-components.md
│ │ └── scripts/
│ │ └── context_manager.py
│ ├── context-optimization/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── optimization_techniques.md
│ │ └── scripts/
│ │ └── compaction.py
│ ├── evaluation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── metrics.md
│ │ └── scripts/
│ │ └── evaluator.py
│ ├── filesystem-context/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── implementation-patterns.md
│ │ └── scripts/
│ │ └── filesystem_context.py
│ ├── hosted-agents/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── infrastructure-patterns.md
│ │ └── scripts/
│ │ └── sandbox_manager.py
│ ├── memory-systems/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── implementation.md
│ │ └── scripts/
│ │ └── memory_store.py
│ ├── multi-agent-patterns/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── frameworks.md
│ │ └── scripts/
│ │ └── coordination.py
│ ├── project-development/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ ├── case-studies.md
│ │ │ └── pipeline-patterns.md
│ │ └── scripts/
│ │ └── pipeline_template.py
│ └── tool-design/
│ ├── SKILL.md
│ ├── references/
│ │ ├── architectural_reduction.md
│ │ └── best_practices.md
│ └── scripts/
│ └── description_generator.py
└── template/
└── SKILL.md
================================================
FILE CONTENTS
================================================
================================================
FILE: .claude-plugin/marketplace.json
================================================
{
"name": "context-engineering-marketplace",
"owner": {
"name": "Muratcan Koylan",
"email": "muratcan.koylan@outlook.com"
},
"metadata": {
"description": "Context Engineering skills for building production-grade AI agent systems",
"version": "1.0.0"
},
"plugins": [
{
"name": "context-engineering-fundamentals",
"description": "Core context engineering skills covering fundamentals, degradation patterns, compression strategies, and optimization techniques for AI agent systems",
"source": "./",
"strict": false,
"skills": [
"./skills/context-fundamentals",
"./skills/context-degradation",
"./skills/context-compression",
"./skills/context-optimization"
]
},
{
"name": "agent-architecture",
"description": "Multi-agent patterns, memory systems, tool design, filesystem-based context, and hosted agent infrastructure for building production AI agent architectures",
"source": "./",
"strict": false,
"skills": [
"./skills/multi-agent-patterns",
"./skills/memory-systems",
"./skills/tool-design",
"./skills/filesystem-context",
"./skills/hosted-agents"
]
},
{
"name": "agent-evaluation",
"description": "Evaluation frameworks and LLM-as-judge techniques for testing and validating AI agent systems",
"source": "./",
"strict": false,
"skills": [
"./skills/evaluation",
"./skills/advanced-evaluation"
]
},
{
"name": "agent-development",
"description": "Project development methodology for LLM-powered applications including pipeline architecture and batch processing",
"source": "./",
"strict": false,
"skills": [
"./skills/project-development"
]
},
{
"name": "cognitive-architecture",
"description": "BDI mental state modeling and cognitive architecture patterns for building rational agents with formal belief-desire-intention representations",
"source": "./",
"strict": false,
"skills": [
"./skills/bdi-mental-states"
]
}
]
}
================================================
FILE: .cursorindexingignore
================================================
# Don't index SpecStory auto-save files, but allow explicit context inclusion via @ references
.specstory/**
================================================
FILE: .gitignore
================================================
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual environments
venv/
ENV/
env/
.venv
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Testing
.pytest_cache/
.coverage
htmlcov/
# Logs
*.log
# Temporary files
*.tmp
*.bak
# Dashboard (separate private repo)
dashboard/
# Private folder - never push to public repo
Private/
# Cursor IDE
.cursor/
# Local history
.specstory/
================================================
FILE: .plugin/plugin.json
================================================
{
"name": "context-engineering",
"description": "Context engineering skills for building production-grade AI agent systems — covering fundamentals, degradation patterns, compression, optimization, multi-agent coordination, memory systems, tool design, evaluation, and more.",
"version": "2.0.0",
"author": {
"name": "Muratcan Koylan"
}
}
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Agent Skills for Context Engineering
Thank you for your interest in contributing to this collection of Agent Skills for Context Engineering. This document provides guidelines and instructions for contributing.
## How to Contribute
### Reporting Issues
If you find errors, unclear explanations, or missing topics, please open an issue with:
- A clear description of the problem
- The skill and section where the issue was found
- Suggested improvements if you have them
### Submitting Changes
For substantive changes, please:
1. Fork the repository
2. Create a feature branch for your changes
3. Make changes following the skill template structure
4. Ensure SKILL.md files remain under 500 lines
5. Add references or scripts as appropriate
6. Submit a pull request with a clear description of changes
### Adding New Skills
When adding new skills:
1. Use the template in `template/SKILL.md`
2. Follow naming conventions (lowercase with hyphens)
3. Include both SKILL.md and appropriate references/scripts
4. Update the root README.md to include the new skill
5. Ensure content is platform-agnostic (works across Cursor, Claude Code, etc.)
## Skill Structure Requirements
Each skill must include:
- YAML frontmatter with `name` and `description` fields
- Clear sections with logical organization
- Practical examples where appropriate
- Integration notes linking to related skills
Optional additions:
- `references/` directory with additional documentation
- `scripts/` directory with executable examples
- Multiple markdown files for complex skills
## Content Guidelines
### Writing Style
- Be direct and precise
- Use technical terminology appropriately
- Include specific guidance, not vague recommendations
- Provide concrete examples
- Point out complexity and trade-offs
### Avoiding Platform Specificity
Skills should work across agent platforms. Avoid:
- Platform-specific tool names without abstraction
- Vendor-locked examples
- Features specific to one agent product
### Keeping Skills Focused
Each skill should have a single focus. If a topic grows too large, consider splitting into multiple skills with clear dependencies.
## Code of Conduct
This project follows a professional, technical collaboration model. Be respectful of different perspectives and focus on improving the collective knowledge base.
## Questions
For questions about contributing, please open an issue for discussion.
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2025 Context Engineering Agent Skills Contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Agent Skills for Context Engineering
A comprehensive, open collection of Agent Skills focused on context engineering principles for building production-grade AI agent systems. These skills teach the art and science of curating context to maximize agent effectiveness across any agent platform.
## What is Context Engineering?
Context engineering is the discipline of managing the language model's context window. Unlike prompt engineering, which focuses on crafting effective instructions, context engineering addresses the holistic curation of all information that enters the model's limited attention budget: system prompts, tool definitions, retrieved documents, message history, and tool outputs.
The fundamental challenge is that context windows are constrained not by raw token capacity but by attention mechanics. As context length increases, models exhibit predictable degradation patterns: the "lost-in-the-middle" phenomenon, U-shaped attention curves, and attention scarcity. Effective context engineering means finding the smallest possible set of high-signal tokens that maximize the likelihood of desired outcomes.
## Recognition
This repository is cited in academic research as foundational work on static skill architecture:
> "While static skills are well-recognized [Anthropic, 2025b; Muratcan Koylan, 2025], MCE is among the first to dynamically evolve them, bridging manual skill engineering and autonomous self-improvement."
— [Meta Context Engineering via Agentic Skill Evolution](https://arxiv.org/pdf/2601.21557), Peking University State Key Laboratory of General Artificial Intelligence (2026)
## Skills Overview
### Foundational Skills
These skills establish the foundational understanding required for all subsequent context engineering work.
| Skill | Description |
|-------|-------------|
| [context-fundamentals](skills/context-fundamentals/) | Understand what context is, why it matters, and the anatomy of context in agent systems |
| [context-degradation](skills/context-degradation/) | Recognize patterns of context failure: lost-in-the-middle, poisoning, distraction, and clash |
| [context-compression](skills/context-compression/) | Design and evaluate compression strategies for long-running sessions |
### Architectural Skills
These skills cover the patterns and structures for building effective agent systems.
| Skill | Description |
|-------|-------------|
| [multi-agent-patterns](skills/multi-agent-patterns/) | Master orchestrator, peer-to-peer, and hierarchical multi-agent architectures |
| [memory-systems](skills/memory-systems/) | Design short-term, long-term, and graph-based memory architectures |
| [tool-design](skills/tool-design/) | Build tools that agents can use effectively |
| [filesystem-context](skills/filesystem-context/) | Use filesystems for dynamic context discovery, tool output offloading, and plan persistence |
| [hosted-agents](skills/hosted-agents/) | **NEW** Build background coding agents with sandboxed VMs, pre-built images, multiplayer support, and multi-client interfaces |
### Operational Skills
These skills address the ongoing operation and optimization of agent systems.
| Skill | Description |
|-------|-------------|
| [context-optimization](skills/context-optimization/) | Apply compaction, masking, and caching strategies |
| [evaluation](skills/evaluation/) | Build evaluation frameworks for agent systems |
| [advanced-evaluation](skills/advanced-evaluation/) | Master LLM-as-a-Judge techniques: direct scoring, pairwise comparison, rubric generation, and bias mitigation |
### Development Methodology
These skills cover the meta-level practices for building LLM-powered projects.
| Skill | Description |
|-------|-------------|
| [project-development](skills/project-development/) | Design and build LLM projects from ideation through deployment, including task-model fit analysis, pipeline architecture, and structured output design |
### Cognitive Architecture Skills
These skills cover formal cognitive modeling for rational agent systems.
| Skill | Description |
|-------|-------------|
| [bdi-mental-states](skills/bdi-mental-states/) | **NEW** Transform external RDF context into agent mental states (beliefs, desires, intentions) using formal BDI ontology patterns for deliberative reasoning and explainability |
## Design Philosophy
### Progressive Disclosure
Each skill is structured for efficient context use. At startup, agents load only skill names and descriptions. Full content loads only when a skill is activated for relevant tasks.
### Platform Agnosticism
These skills focus on transferable principles rather than vendor-specific implementations. The patterns work across Claude Code, Cursor, and any agent platform that supports skills or allows custom instructions.
### Conceptual Foundation with Practical Examples
Scripts and examples demonstrate concepts using Python pseudocode that works across environments without requiring specific dependency installations.
## Usage
### Usage with Claude Code
This repository is a **Claude Code Plugin Marketplace** containing context engineering skills that Claude automatically discovers and activates based on your task context.
### Installation
**Step 1: Add the Marketplace**
Run this command in Claude Code to register this repository as a plugin source:
```
/plugin marketplace add muratcankoylan/Agent-Skills-for-Context-Engineering
```
**Step 2: Browse and Install**
Option A - Browse available plugins:
1. Select `Browse and install plugins`
2. Select `context-engineering-marketplace`
3. Choose a plugin (e.g., `context-engineering-fundamentals`, `agent-architecture`)
4. Select `Install now`
Option B - Direct install via command:
```
/plugin install context-engineering-fundamentals@context-engineering-marketplace
/plugin install agent-architecture@context-engineering-marketplace
/plugin install agent-evaluation@context-engineering-marketplace
/plugin install agent-development@context-engineering-marketplace
/plugin install cognitive-architecture@context-engineering-marketplace
```
### Available Plugins
| Plugin | Skills Included |
|--------|-----------------|
| `context-engineering-fundamentals` | context-fundamentals, context-degradation, context-compression, context-optimization |
| `agent-architecture` | multi-agent-patterns, memory-systems, tool-design, filesystem-context, hosted-agents |
| `agent-evaluation` | evaluation, advanced-evaluation |
| `agent-development` | project-development |
| `cognitive-architecture` | bdi-mental-states |
### Skill Triggers
| Skill | Triggers On |
|-------|-------------|
| `context-fundamentals` | "understand context", "explain context windows", "design agent architecture" |
| `context-degradation` | "diagnose context problems", "fix lost-in-middle", "debug agent failures" |
| `context-compression` | "compress context", "summarize conversation", "reduce token usage" |
| `context-optimization` | "optimize context", "reduce token costs", "implement KV-cache" |
| `multi-agent-patterns` | "design multi-agent system", "implement supervisor pattern" |
| `memory-systems` | "implement agent memory", "build knowledge graph", "track entities" |
| `tool-design` | "design agent tools", "reduce tool complexity", "implement MCP tools" |
| `filesystem-context` | "offload context to files", "dynamic context discovery", "agent scratch pad", "file-based context" |
| `hosted-agents` | "build background agent", "create hosted coding agent", "sandboxed execution", "multiplayer agent", "Modal sandboxes" |
| `evaluation` | "evaluate agent performance", "build test framework", "measure quality" |
| `advanced-evaluation` | "implement LLM-as-judge", "compare model outputs", "mitigate bias" |
| `project-development` | "start LLM project", "design batch pipeline", "evaluate task-model fit" |
| `bdi-mental-states` | "model agent mental states", "implement BDI architecture", "transform RDF to beliefs", "build cognitive agent" |
<img width="1014" height="894" alt="Screenshot 2025-12-26 at 12 34 47 PM" src="https://github.com/user-attachments/assets/f79aaf03-fd2d-4c71-a630-7027adeb9bfe" />
### For Cursor (Open Plugins)
This repository is listed on the [Cursor Plugin Directory](https://cursor.directory/plugins/context-engineering).
The `.plugin/plugin.json` manifest follows the [Open Plugins](https://open-plugins.com) standard, so the repo also works with any conformant agent tool (Codex, GitHub Copilot, etc.).
### For Custom Implementations
Extract the principles and patterns from any skill and implement them in your agent framework. The skills are deliberately platform-agnostic.
## Examples
The [examples](examples/) folder contains complete system designs that demonstrate how multiple skills work together in practice.
| Example | Description | Skills Applied |
|---------|-------------|----------------|
| [digital-brain-skill](examples/digital-brain-skill/) | **NEW** Personal operating system for founders and creators. Complete Claude Code skill with 6 modules, 4 automation scripts | context-fundamentals, context-optimization, memory-systems, tool-design, multi-agent-patterns, evaluation, project-development |
| [x-to-book-system](examples/x-to-book-system/) | Multi-agent system that monitors X accounts and generates daily synthesized books | multi-agent-patterns, memory-systems, context-optimization, tool-design, evaluation |
| [llm-as-judge-skills](examples/llm-as-judge-skills/) | Production-ready LLM evaluation tools with TypeScript implementation, 19 passing tests | advanced-evaluation, tool-design, context-fundamentals, evaluation |
| [book-sft-pipeline](examples/book-sft-pipeline/) | Train models to write in any author's style. Includes Gertrude Stein case study with 70% human score on Pangram, $2 total cost | project-development, context-compression, multi-agent-patterns, evaluation |
Depending on its scope, an example includes some or all of the following:
- Complete PRD with architecture decisions
- Skills mapping showing which concepts informed each decision
- Implementation guidance
### Digital Brain Skill Example
The [digital-brain-skill](examples/digital-brain-skill/) example is a complete personal operating system demonstrating comprehensive skills application:
- **Progressive Disclosure**: 3-level loading (SKILL.md → MODULE.md → data files)
- **Module Isolation**: 6 independent modules (identity, content, knowledge, network, operations, agents)
- **Append-Only Memory**: JSONL files with schema-first lines for agent-friendly parsing
- **Automation Scripts**: 4 consolidated tools (weekly_review, content_ideas, stale_contacts, idea_to_draft)
Includes detailed traceability in [HOW-SKILLS-BUILT-THIS.md](examples/digital-brain-skill/HOW-SKILLS-BUILT-THIS.md) mapping every architectural decision to specific skill principles.
### LLM-as-Judge Skills Example
The [llm-as-judge-skills](examples/llm-as-judge-skills/) example is a complete TypeScript implementation demonstrating:
- **Direct Scoring**: Evaluate responses against weighted criteria with rubric support
- **Pairwise Comparison**: Compare responses with position bias mitigation
- **Rubric Generation**: Create domain-specific evaluation standards
- **EvaluatorAgent**: High-level agent combining all evaluation capabilities
### Book SFT Pipeline Example
The [book-sft-pipeline](examples/book-sft-pipeline/) example demonstrates training small models (8B) to write in any author's style:
- **Intelligent Segmentation**: Two-tier chunking with overlap for maximum training examples
- **Prompt Diversity**: 15+ templates to prevent memorization and force style learning
- **Tinker Integration**: Complete LoRA training workflow with $2 total cost
- **Validation Methodology**: Testing on modern scenarios verifies that the model learned the author's style rather than memorizing content
Integrates with context engineering skills: project-development, context-compression, multi-agent-patterns, evaluation.
## Star History
<img width="3664" height="2648" alt="star-history-2026317" src="https://github.com/user-attachments/assets/0fe53d8d-7fdd-45be-9c28-057881b23b44" />
## Structure
Each skill follows the Agent Skills specification:
```
skill-name/
├── SKILL.md # Required: instructions + metadata
├── scripts/ # Optional: executable code demonstrating concepts
└── references/ # Optional: additional documentation and resources
```
See the [template](template/) folder for the canonical skill structure.
## Contributing
This repository follows the Agent Skills open development model. Contributions are welcome from the broader ecosystem. When contributing:
1. Follow the skill template structure
2. Provide clear, actionable instructions
3. Include working examples where appropriate
4. Document trade-offs and potential issues
5. Keep SKILL.md under 500 lines for optimal performance
Feel free to contact [Muratcan Koylan](https://x.com/koylanai) for collaboration opportunities or any inquiries.
## License
MIT License - see LICENSE file for details.
## References
The principles in these skills are derived from research and production experience at leading AI labs and framework developers. Each skill includes references to the underlying research and case studies that inform its recommendations.
================================================
FILE: SKILL.md
================================================
---
name: context-engineering-collection
description: A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems. Use when building, optimizing, or debugging agent systems that require effective context management.
---
# Agent Skills for Context Engineering
This collection provides structured guidance for building production-grade AI agent systems through effective context engineering.
## When to Activate
Activate these skills when:
- Building new agent systems from scratch
- Optimizing existing agent performance
- Debugging context-related failures
- Designing multi-agent architectures
- Creating or evaluating tools for agents
- Implementing memory and persistence layers
## Skill Map
### Foundational Context Engineering
**Understanding Context Fundamentals**
Context is not just prompt text—it is the complete state available to the language model at inference time, including system instructions, tool definitions, retrieved documents, message history, and tool outputs. Effective context engineering means understanding what information truly matters for the task at hand and curating that information for maximum signal-to-noise ratio.
**Recognizing Context Degradation**
Language models exhibit predictable degradation patterns as context grows: the "lost-in-the-middle" phenomenon, where information in the center of the context receives less attention; U-shaped attention curves that prioritize the beginning and end; context poisoning, when errors compound; and context distraction, when irrelevant information overwhelms relevant content.
### Architectural Patterns
**Multi-Agent Coordination**
Production multi-agent systems converge on three dominant patterns: supervisor/orchestrator architectures with centralized control, peer-to-peer swarm architectures for flexible handoffs, and hierarchical structures for complex task decomposition. The critical insight is that sub-agents exist primarily to isolate context rather than to simulate organizational roles.
**Memory System Design**
Memory architectures range from simple scratchpads to sophisticated temporal knowledge graphs. Vector RAG provides semantic retrieval but loses relationship information. Knowledge graphs preserve structure but require more engineering investment. The file-system-as-memory pattern enables just-in-time context loading without stuffing context windows.
**Filesystem-Based Context**
The filesystem provides a single interface for storing, retrieving, and updating effectively unlimited context. Key patterns include scratch pads for tool output offloading, plan persistence for long-horizon tasks, sub-agent communication via shared files, and dynamic skill loading. Agents use `ls`, `glob`, `grep`, and `read_file` for targeted context discovery, often outperforming semantic search for structural queries.
**Hosted Agent Infrastructure**
Background coding agents run in remote sandboxed environments rather than on local machines. Key patterns include pre-built environment images refreshed on a regular cadence, warm sandbox pools for instant session starts, filesystem snapshots for session persistence, and multiplayer support for collaborative agent sessions. Critical optimizations include allowing file reads before git sync completes (blocking only writes), predictive sandbox warming when users start typing, and self-spawning agents for parallel task execution.
**Tool Design Principles**
Tools are contracts between deterministic systems and non-deterministic agents. Effective tool design follows the consolidation principle (prefer single comprehensive tools over multiple narrow ones), returns contextual information in errors, supports response format options for token efficiency, and uses clear namespacing.
### Operational Excellence
**Context Compression**
When agent sessions exhaust memory, compression becomes mandatory. The correct optimization target is tokens-per-task, not tokens-per-request. Structured summarization with explicit sections for files, decisions, and next steps preserves more useful information than aggressive compression. Artifact trail integrity remains the weakest dimension across all compression methods.
**Context Optimization**
Techniques include compaction (summarizing context near limits), observation masking (replacing verbose tool outputs with references), prefix caching (reusing KV blocks across requests), and strategic context partitioning (splitting work across sub-agents with isolated contexts).
**Evaluation Frameworks**
Production agent evaluation requires multi-dimensional rubrics covering factual accuracy, completeness, tool efficiency, and process quality. Effective patterns include LLM-as-judge for scalability, human evaluation for edge cases, and end-state evaluation for agents that mutate persistent state.
### Development Methodology
**Project Development**
Effective LLM project development begins with task-model fit analysis: validating through manual prototyping that a task is well-suited for LLM processing before building automation. Production pipelines follow staged, idempotent architectures (acquire, prepare, process, parse, render) with file system state management for debugging and caching. Structured output design with explicit format specifications enables reliable parsing. Start with minimal architecture and add complexity only when proven necessary.
## Core Concepts
The collection is organized around four core themes. First, context fundamentals establish what context is, how attention mechanisms work, and why context quality matters more than quantity. Second, architectural patterns cover the structures and coordination mechanisms that enable effective agent systems. Third, operational excellence addresses the ongoing work of optimizing and evaluating production systems. Fourth, development methodology covers validating task-model fit and structuring LLM projects as staged, idempotent pipelines.
## Practical Guidance
Each skill can be used independently or in combination. Start with fundamentals to establish context management mental models. Branch into architectural patterns based on your system requirements. Reference operational skills when optimizing production systems.
The skills are platform-agnostic and work with Claude Code, Cursor, or any agent framework that supports custom instructions or skill-like constructs.
## Integration
This collection integrates with itself—skills reference each other and build on shared concepts. The fundamentals skill provides context for all other skills. Architectural skills (multi-agent, memory, tools) can be combined for complex systems. Operational skills (optimization, evaluation) apply to any system built using the foundational and architectural skills.
## References
Internal skills in this collection:
- [context-fundamentals](skills/context-fundamentals/SKILL.md)
- [context-degradation](skills/context-degradation/SKILL.md)
- [context-compression](skills/context-compression/SKILL.md)
- [multi-agent-patterns](skills/multi-agent-patterns/SKILL.md)
- [memory-systems](skills/memory-systems/SKILL.md)
- [tool-design](skills/tool-design/SKILL.md)
- [filesystem-context](skills/filesystem-context/SKILL.md)
- [hosted-agents](skills/hosted-agents/SKILL.md)
- [context-optimization](skills/context-optimization/SKILL.md)
- [evaluation](skills/evaluation/SKILL.md)
- [project-development](skills/project-development/SKILL.md)
External resources on context engineering:
- Research on attention mechanisms and context window limitations
- Production experience from leading AI labs on agent system design
- Framework documentation for LangGraph, AutoGen, and CrewAI
---
## Skill Metadata
**Created**: 2025-12-20
**Last Updated**: 2025-12-25
**Author**: Agent Skills for Context Engineering Contributors
**Version**: 1.2.0
================================================
FILE: docs/agentskills.md
================================================
---
name: agent-skills-format
description: Official documentation for the Agent Skills format - a lightweight, open standard for extending AI agent capabilities with specialized knowledge and workflows.
doc_type: reference
source_url: No
---
Overview
A simple, open format for giving agents new capabilities and expertise.
Agent Skills are folders of instructions, scripts, and resources that agents can discover and use to do things more accurately and efficiently.
Why Agent Skills?
Agents are increasingly capable, but often don’t have the context they need to do real work reliably. Skills solve this by giving agents access to procedural knowledge and company-, team-, and user-specific context they can load on demand. Agents with access to a set of skills can extend their capabilities based on the task they’re working on.
For skill authors: Build capabilities once and deploy them across multiple agent products.
For compatible agents: Support for skills lets end users give agents new capabilities out of the box.
For teams and enterprises: Capture organizational knowledge in portable, version-controlled packages.
What can Agent Skills enable?
Domain expertise: Package specialized knowledge into reusable instructions, from legal review processes to data analysis pipelines.
New capabilities: Give agents new capabilities (e.g. creating presentations, building MCP servers, analyzing datasets).
Repeatable workflows: Turn multi-step tasks into consistent and auditable workflows.
Interoperability: Reuse the same skill across different skills-compatible agent products.
Adoption
Agent Skills are supported by leading AI development tools.
OpenCode
Cursor
Amp
Letta
Goose
GitHub
VS Code
Claude Code
Claude
OpenAI Codex
Open development
The Agent Skills format was originally developed by Anthropic, released as an open standard, and has been adopted by a growing number of agent products. The standard is open to contributions from the broader ecosystem.
What are skills?
Agent Skills are a lightweight, open format for extending AI agent capabilities with specialized knowledge and workflows.
At its core, a skill is a folder containing a SKILL.md file. This file includes metadata (name and description, at minimum) and instructions that tell an agent how to perform a specific task. Skills can also bundle scripts, templates, and reference materials.
my-skill/
├── SKILL.md # Required: instructions + metadata
├── scripts/ # Optional: executable code
├── references/ # Optional: documentation
└── assets/ # Optional: templates, resources
How skills work
Skills use progressive disclosure to manage context efficiently:
Discovery: At startup, agents load only the name and description of each available skill, just enough to know when it might be relevant.
Activation: When a task matches a skill’s description, the agent reads the full SKILL.md instructions into context.
Execution: The agent follows the instructions, optionally loading referenced files or executing bundled code as needed.
This approach keeps agents fast while giving them access to more context on demand.
The SKILL.md file
Every skill starts with a SKILL.md file containing YAML frontmatter and Markdown instructions:
---
name: pdf-processing
description: Extract text and tables from PDF files, fill forms, merge documents.
---
# PDF Processing
## When to use this skill
Use this skill when the user needs to work with PDF files...
## How to extract text
1. Use pdfplumber for text extraction...
## How to fill forms
...
The following frontmatter is required at the top of SKILL.md:
name: A short identifier
description: When to use this skill
The Markdown body contains the actual instructions and has no specific restrictions on structure or content.
This simple format has some key advantages:
Self-documenting: A skill author or user can read a SKILL.md and understand what it does, making skills easy to audit and improve.
Extensible: Skills can range in complexity from just text instructions to executable code, assets, and templates.
Portable: Skills are just files, so they’re easy to edit, version, and share.
Next steps
View the specification to understand the full format.
Add skills support to your agent to build a compatible client.
See example skills on GitHub.
Read authoring best practices for writing effective skills.
Use the reference library to validate skills and generate prompt XML.
Specification
The complete format specification for Agent Skills.
This document defines the Agent Skills format.
Directory structure
A skill is a directory containing at minimum a SKILL.md file:
skill-name/
└── SKILL.md # Required
You can optionally include additional directories such as scripts/, references/, and assets/ to support your skill.
SKILL.md format
The SKILL.md file must contain YAML frontmatter followed by Markdown content.
Frontmatter (required)
---
name: skill-name
description: A description of what this skill does and when to use it.
---
With optional fields:
---
name: pdf-processing
description: Extract text and tables from PDF files, fill forms, merge documents.
license: Apache-2.0
metadata:
  author: example-org
  version: "1.0"
---
| Field | Required | Constraints |
| --- | --- | --- |
| name | Yes | Max 64 characters. Lowercase letters, numbers, and hyphens only. Must not start or end with a hyphen. |
| description | Yes | Max 1024 characters. Non-empty. Describes what the skill does and when to use it. |
| license | No | License name or reference to a bundled license file. |
| compatibility | No | Max 500 characters. Indicates environment requirements (intended product, system packages, network access, etc.). |
| metadata | No | Arbitrary key-value mapping for additional metadata. |
| allowed-tools | No | Space-delimited list of pre-approved tools the skill may use. (Experimental) |
name field
The required name field:
Must be 1-64 characters
May only contain unicode lowercase alphanumeric characters and hyphens (a-z, 0-9, and -)
Must not start or end with -
Must not contain consecutive hyphens (--)
Must match the parent directory name
Valid examples:
name: pdf-processing
name: data-analysis
name: code-review
Invalid examples:
name: PDF-Processing # uppercase not allowed
name: -pdf # cannot start with hyphen
name: pdf--processing # consecutive hyphens not allowed
description field
The required description field:
Must be 1-1024 characters
Should describe both what the skill does and when to use it
Should include specific keywords that help agents identify relevant tasks
Good example:
description: Extracts text and tables from PDF files, fills PDF forms, and merges multiple PDFs. Use when working with PDF documents or when the user mentions PDFs, forms, or document extraction.
Poor example:
description: Helps with PDFs.
license field
The optional license field:
Specifies the license applied to the skill
We recommend keeping it short (either the name of a license or the name of a bundled license file)
Example:
license: Proprietary. LICENSE.txt has complete terms
compatibility field
The optional compatibility field:
Must be 1-500 characters if provided
Should only be included if your skill has specific environment requirements
Can indicate intended product, required system packages, network access needs, etc.
Examples:
compatibility: Designed for Claude Code (or similar products)
compatibility: Requires git, docker, jq, and access to the internet
Most skills do not need the compatibility field.
metadata field
The optional metadata field:
A map from string keys to string values
Clients can use this to store additional properties not defined by the Agent Skills spec
We recommend making your key names reasonably unique to avoid accidental conflicts
Example:
metadata:
  author: example-org
  version: "1.0"
allowed-tools field
The optional allowed-tools field:
A space-delimited list of tools that are pre-approved to run
Experimental. Support for this field may vary between agent implementations
Example:
allowed-tools: Bash(git:*) Bash(jq:*) Read
Body content
The Markdown body after the frontmatter contains the skill instructions. There are no format restrictions. Write whatever helps agents perform the task effectively.
Recommended sections:
Step-by-step instructions
Examples of inputs and outputs
Common edge cases
Note that the agent will load this entire file once it’s decided to activate a skill. Consider splitting longer SKILL.md content into referenced files.
Optional directories
scripts/
Contains executable code that agents can run. Scripts should:
Be self-contained or clearly document dependencies
Include helpful error messages
Handle edge cases gracefully
Supported languages depend on the agent implementation. Common options include Python, Bash, and JavaScript.
references/
Contains additional documentation that agents can read when needed:
REFERENCE.md - Detailed technical reference
FORMS.md - Form templates or structured data formats
Domain-specific files (finance.md, legal.md, etc.)
Keep individual reference files focused. Agents load these on demand, so smaller files mean less use of context.
assets/
Contains static resources:
Templates (document templates, configuration templates)
Images (diagrams, examples)
Data files (lookup tables, schemas)
Progressive disclosure
Skills should be structured for efficient use of context:
Metadata (~100 tokens): The name and description fields are loaded at startup for all skills
Instructions (< 5000 tokens recommended): The full SKILL.md body is loaded when the skill is activated
Resources (as needed): Files (e.g. those in scripts/, references/, or assets/) are loaded only when required
Keep your main SKILL.md under 500 lines. Move detailed reference material to separate files.
File references
When referencing other files in your skill, use relative paths from the skill root:
See [the reference guide](references/REFERENCE.md) for details.
Run the extraction script:
scripts/extract.py
Keep file references one level deep from SKILL.md. Avoid deeply nested reference chains.
Validation
Use the skills-ref reference library to validate your skills:
skills-ref validate ./my-skill
This checks that your SKILL.md frontmatter is valid and follows all naming conventions.
Integrate skills into your agent
How to add Agent Skills support to your agent or tool.
This guide explains how to add skills support to an AI agent or development tool.
Integration approaches
The two main approaches to integrating skills are:
Filesystem-based agents operate within a computer environment (bash/unix) and represent the most capable option. Skills are activated when models issue shell commands like cat /path/to/my-skill/SKILL.md. Bundled resources are accessed through shell commands.
Tool-based agents function without a dedicated computer environment. Instead, they implement tools allowing models to trigger skills and access bundled assets. The specific tool implementation is up to the developer.
Overview
A skills-compatible agent needs to:
Discover skills in configured directories
Load metadata (name and description) at startup
Match user tasks to relevant skills
Activate skills by loading full instructions
Execute scripts and access resources as needed
Skill discovery
Skills are folders containing a SKILL.md file. Your agent should scan configured directories for valid skills.
Loading metadata
At startup, parse only the frontmatter of each SKILL.md file. This keeps initial context usage low.
Parsing frontmatter
function parseMetadata(skillPath):
    content = readFile(skillPath + "/SKILL.md")
    frontmatter = extractYAMLFrontmatter(content)
    return {
        name: frontmatter.name,
        description: frontmatter.description,
        path: skillPath
    }
Injecting into context
Include skill metadata in the system prompt so the model knows what skills are available.
Follow your platform’s guidance for system prompt updates. For example, for Claude models, the recommended format uses XML:
<available_skills>
<skill>
<name>pdf-processing</name>
<description>Extracts text and tables from PDF files, fills forms, merges documents.</description>
<location>/path/to/skills/pdf-processing/SKILL.md</location>
</skill>
<skill>
<name>data-analysis</name>
<description>Analyzes datasets, generates charts, and creates summary reports.</description>
<location>/path/to/skills/data-analysis/SKILL.md</location>
</skill>
</available_skills>
For filesystem-based agents, include the location field with the absolute path to the SKILL.md file. For tool-based agents, the location can be omitted.
Keep metadata concise. Each skill should add roughly 50-100 tokens to the context.
Security considerations
Script execution introduces security risks. Consider:
Sandboxing: Run scripts in isolated environments
Allowlisting: Only execute scripts from trusted skills
Confirmation: Ask users before running potentially dangerous operations
Logging: Record all script executions for auditing
Reference implementation
The skills-ref library provides Python utilities and a CLI for working with skills.
For example:
Validate a skill directory:
skills-ref validate <path>
Generate <available_skills> XML for agent prompts:
skills-ref to-prompt <path>...
Use the library source code as a reference implementation.
Skill authoring best practices
Learn how to write effective Skills that Claude can discover and use successfully.
Good Skills are concise, well-structured, and tested with real usage. This guide provides practical authoring decisions to help you write Skills that Claude can discover and use effectively.
For conceptual background on how Skills work, see the Skills overview.
Core principles
Concise is key
The context window is a public good. Your Skill shares the context window with everything else Claude needs to know, including:
The system prompt
Conversation history
Other Skills' metadata
Your actual request
Not every token in your Skill has an immediate cost. At startup, only the metadata (name and description) from all Skills is pre-loaded. Claude reads SKILL.md only when the Skill becomes relevant, and reads additional files only as needed. However, being concise in SKILL.md still matters: once Claude loads it, every token competes with conversation history and other context.
Default assumption: Claude is already very smart
Only add context Claude doesn't already have. Challenge each piece of information:
"Does Claude really need this explanation?"
"Can I assume Claude knows this?"
"Does this paragraph justify its token cost?"
Good example: Concise (approximately 50 tokens):
## Extract PDF text
Use pdfplumber for text extraction:
```python
import pdfplumber
with pdfplumber.open("file.pdf") as pdf:
    text = pdf.pages[0].extract_text()
```
Bad example: Too verbose (approximately 150 tokens):
## Extract PDF text
PDF (Portable Document Format) files are a common file format that contains
text, images, and other content. To extract text from a PDF, you'll need to
use a library. There are many libraries available for PDF processing, but we
recommend pdfplumber because it's easy to use and handles most cases well.
First, you'll need to install it using pip. Then you can use the code below...
The concise version assumes Claude knows what PDFs are and how libraries work.
Set appropriate degrees of freedom
Match the level of specificity to the task's fragility and variability.
High freedom (text-based instructions):
Use when:
Multiple approaches are valid
Decisions depend on context
Heuristics guide the approach
Example:
## Code review process
1. Analyze the code structure and organization
2. Check for potential bugs or edge cases
3. Suggest improvements for readability and maintainability
4. Verify adherence to project conventions
Medium freedom (pseudocode or scripts with parameters):
Use when:
A preferred pattern exists
Some variation is acceptable
Configuration affects behavior
Example:
## Generate report
Use this template and customize as needed:
```python
def generate_report(data, format="markdown", include_charts=True):
    # Process data
    # Generate output in specified format
    # Optionally include visualizations
```
Low freedom (specific scripts, few or no parameters):
Use when:
Operations are fragile and error-prone
Consistency is critical
A specific sequence must be followed
Example:
## Database migration
Run exactly this script:
```bash
python scripts/migrate.py --verify --backup
```
Do not modify the command or add additional flags.
Analogy: Think of Claude as a robot exploring a path:
Narrow bridge with cliffs on both sides: There's only one safe way forward. Provide specific guardrails and exact instructions (low freedom). Example: database migrations that must run in exact sequence.
Open field with no hazards: Many paths lead to success. Give general direction and trust Claude to find the best route (high freedom). Example: code reviews where context determines the best approach.
Test with all models you plan to use
Skills act as additions to models, so effectiveness depends on the underlying model. Test your Skill with all the models you plan to use it with.
Testing considerations by model:
Claude Haiku (fast, economical): Does the Skill provide enough guidance?
Claude Sonnet (balanced): Is the Skill clear and efficient?
Claude Opus (powerful reasoning): Does the Skill avoid over-explaining?
What works perfectly for Opus might need more detail for Haiku. If you plan to use your Skill across multiple models, aim for instructions that work well with all of them.
Skill structure
YAML Frontmatter: The SKILL.md frontmatter requires two fields:
name:
Maximum 64 characters
Must contain only lowercase letters, numbers, and hyphens
Cannot contain XML tags
Cannot contain reserved words: "anthropic", "claude"
description:
Must be non-empty
Maximum 1024 characters
Cannot contain XML tags
Should describe what the Skill does and when to use it
For complete Skill structure details, see the Skills overview.
Naming conventions
Use consistent naming patterns to make Skills easier to reference and discuss. We recommend using gerund form (verb + -ing) for Skill names, as this clearly describes the activity or capability the Skill provides.
Remember that the name field must use lowercase letters, numbers, and hyphens only.
Good naming examples (gerund form):
processing-pdfs
analyzing-spreadsheets
managing-databases
testing-code
writing-documentation
Acceptable alternatives:
Noun phrases: pdf-processing, spreadsheet-analysis
Action-oriented: process-pdfs, analyze-spreadsheets
Avoid:
Vague names: helper, utils, tools
Overly generic: documents, data, files
Reserved words: anthropic-helper, claude-tools
Inconsistent patterns within your skill collection
Consistent naming makes it easier to:
Reference Skills in documentation and conversations
Understand what a Skill does at a glance
Organize and search through multiple Skills
Maintain a professional, cohesive skill library
Writing effective descriptions
The description field enables Skill discovery and should include both what the Skill does and when to use it.
Always write in third person. The description is injected into the system prompt, and inconsistent point-of-view can cause discovery problems.
Good: "Processes Excel files and generates reports"
Avoid: "I can help you process Excel files"
Avoid: "You can use this to process Excel files"
Be specific and include key terms. Include both what the Skill does and specific triggers/contexts for when to use it.
Each Skill has exactly one description field. The description is critical for skill selection: Claude uses it to choose the right Skill from potentially 100+ available Skills. Your description must provide enough detail for Claude to know when to select this Skill, while the rest of SKILL.md provides the implementation details.
Effective examples:
PDF Processing skill:
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
Excel Analysis skill:
description: Analyze Excel spreadsheets, create pivot tables, generate charts. Use when analyzing Excel files, spreadsheets, tabular data, or .xlsx files.
Git Commit Helper skill:
description: Generate descriptive commit messages by analyzing git diffs. Use when the user asks for help writing commit messages or reviewing staged changes.
Avoid vague descriptions like these:
description: Helps with documents
description: Processes data
description: Does stuff with files
Progressive disclosure patterns
SKILL.md serves as an overview that points Claude to detailed materials as needed, like a table of contents in an onboarding guide. For an explanation of how progressive disclosure works, see How Skills work in the overview.
Practical guidance:
Keep SKILL.md body under 500 lines for optimal performance
Split content into separate files when approaching this limit
Use the patterns below to organize instructions, code, and resources effectively
Visual overview: From simple to complex
A basic Skill starts with just a SKILL.md file containing metadata and instructions:
Simple SKILL.md file showing YAML frontmatter and markdown body
As your Skill grows, you can bundle additional content that Claude loads only when needed:
Bundling additional reference files like reference.md and forms.md.
The complete Skill directory structure might look like this:
pdf/
├── SKILL.md # Main instructions (loaded when triggered)
├── FORMS.md # Form-filling guide (loaded as needed)
├── reference.md # API reference (loaded as needed)
├── examples.md # Usage examples (loaded as needed)
└── scripts/
    ├── analyze_form.py # Utility script (executed, not loaded)
    ├── fill_form.py # Form filling script
    └── validate.py # Validation script
Pattern 1: High-level guide with references
---
name: pdf-processing
description: Extracts text and tables from PDF files, fills forms, and merges documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
---
# PDF Processing
## Quick start
Extract text with pdfplumber:
```python
import pdfplumber
with pdfplumber.open("file.pdf") as pdf:
    text = pdf.pages[0].extract_text()
```
## Advanced features
**Form filling**: See [FORMS.md](FORMS.md) for complete guide
**API reference**: See [REFERENCE.md](REFERENCE.md) for all methods
**Examples**: See [EXAMPLES.md](EXAMPLES.md) for common patterns
Claude loads FORMS.md, REFERENCE.md, or EXAMPLES.md only when needed.
Pattern 2: Domain-specific organization
For Skills with multiple domains, organize content by domain to avoid loading irrelevant context. When a user asks about sales metrics, Claude only needs to read sales-related schemas, not finance or marketing data. This keeps token usage low and context focused.
bigquery-skill/
├── SKILL.md (overview and navigation)
└── reference/
    ├── finance.md (revenue, billing metrics)
    ├── sales.md (opportunities, pipeline)
    ├── product.md (API usage, features)
    └── marketing.md (campaigns, attribution)
SKILL.md
# BigQuery Data Analysis
## Available datasets
**Finance**: Revenue, ARR, billing → See [reference/finance.md](reference/finance.md)
**Sales**: Opportunities, pipeline, accounts → See [reference/sales.md](reference/sales.md)
**Product**: API usage, features, adoption → See [reference/product.md](reference/product.md)
**Marketing**: Campaigns, attribution, email → See [reference/marketing.md](reference/marketing.md)
## Quick search
Find specific metrics using grep:
```bash
grep -i "revenue" reference/finance.md
grep -i "pipeline" reference/sales.md
grep -i "api usage" reference/product.md
```
Pattern 3: Conditional details
Show basic content, link to advanced content:
# DOCX Processing
## Creating documents
Use docx-js for new documents. See [DOCX-JS.md](DOCX-JS.md).
## Editing documents
For simple edits, modify the XML directly.
**For tracked changes**: See [REDLINING.md](REDLINING.md)
**For OOXML details**: See [OOXML.md](OOXML.md)
Claude reads REDLINING.md or OOXML.md only when the user needs those features.
Avoid deeply nested references
Claude may partially read files when they're referenced from other referenced files. When encountering nested references, Claude might use commands like head -100 to preview content rather than reading entire files, resulting in incomplete information.
Keep references one level deep from SKILL.md. All reference files should link directly from SKILL.md to ensure Claude reads complete files when needed.
Bad example: Too deep:
# SKILL.md
See [advanced.md](advanced.md)...
# advanced.md
See [details.md](details.md)...
# details.md
Here's the actual information...
Good example: One level deep:
# SKILL.md
**Basic usage**: [instructions in SKILL.md]
**Advanced features**: See [advanced.md](advanced.md)
**API reference**: See [reference.md](reference.md)
**Examples**: See [examples.md](examples.md)
Structure longer reference files with table of contents
For reference files longer than 100 lines, include a table of contents at the top. This ensures Claude can see the full scope of available information even when previewing with partial reads.
Example:
# API Reference
## Contents
- Authentication and setup
- Core methods (create, read, update, delete)
- Advanced features (batch operations, webhooks)
- Error handling patterns
- Code examples
## Authentication and setup
...
## Core methods
...
Claude can then read the complete file or jump to specific sections as needed.
For details on how this filesystem-based architecture enables progressive disclosure, see the Runtime environment section in the Advanced section below.
Workflows and feedback loops
Use workflows for complex tasks
Break complex operations into clear, sequential steps. For particularly complex workflows, provide a checklist that Claude can copy into its response and check off as it progresses.
Example 1: Research synthesis workflow (for Skills without code):
## Research synthesis workflow
Copy this checklist and track your progress:
```
Research Progress:
- [ ] Step 1: Read all source documents
- [ ] Step 2: Identify key themes
- [ ] Step 3: Cross-reference claims
- [ ] Step 4: Create structured summary
- [ ] Step 5: Verify citations
```
**Step 1: Read all source documents**
Review each document in the `sources/` directory. Note the main arguments and supporting evidence.
**Step 2: Identify key themes**
Look for patterns across sources. What themes appear repeatedly? Where do sources agree or disagree?
**Step 3: Cross-reference claims**
For each major claim, verify it appears in the source material. Note which source supports each point.
**Step 4: Create structured summary**
Organize findings by theme. Include:
- Main claim
- Supporting evidence from sources
- Conflicting viewpoints (if any)
**Step 5: Verify citations**
Check that every claim references the correct source document. If citations are incomplete, return to Step 3.
This example shows how workflows apply to analysis tasks that don't require code. The checklist pattern works for any complex, multi-step process.
Example 2: PDF form filling workflow (for Skills with code):
## PDF form filling workflow
Copy this checklist and check off items as you complete them:
```
Task Progress:
- [ ] Step 1: Analyze the form (run analyze_form.py)
- [ ] Step 2: Create field mapping (edit fields.json)
- [ ] Step 3: Validate mapping (run validate_fields.py)
- [ ] Step 4: Fill the form (run fill_form.py)
- [ ] Step 5: Verify output (run verify_output.py)
```
**Step 1: Analyze the form**
Run: `python scripts/analyze_form.py input.pdf`
This extracts form fields and their locations, saving to `fields.json`.
**Step 2: Create field mapping**
Edit `fields.json` to add values for each field.
**Step 3: Validate mapping**
Run: `python scripts/validate_fields.py fields.json`
Fix any validation errors before continuing.
**Step 4: Fill the form**
Run: `python scripts/fill_form.py input.pdf fields.json output.pdf`
**Step 5: Verify output**
Run: `python scripts/verify_output.py output.pdf`
If verification fails, return to Step 2.
Clear steps prevent Claude from skipping critical validation. The checklist helps both Claude and you track progress through multi-step workflows.
Implement feedback loops
Common pattern: Run validator → fix errors → repeat
This pattern greatly improves output quality.
Example 1: Style guide compliance (for Skills without code):
## Content review process
1. Draft your content following the guidelines in STYLE_GUIDE.md
2. Review against the checklist:
- Check terminology consistency
- Verify examples follow the standard format
- Confirm all required sections are present
3. If issues found:
- Note each issue with specific section reference
- Revise the content
- Review the checklist again
4. Only proceed when all requirements are met
5. Finalize and save the document
This shows the validation loop pattern using reference documents instead of scripts. The "validator" is STYLE_GUIDE.md, and Claude performs the check by reading and comparing.
Example 2: Document editing process (for Skills with code):
## Document editing process
1. Make your edits to `word/document.xml`
2. **Validate immediately**: `python ooxml/scripts/validate.py unpacked_dir/`
3. If validation fails:
- Review the error message carefully
- Fix the issues in the XML
- Run validation again
4. **Only proceed when validation passes**
5. Rebuild: `python ooxml/scripts/pack.py unpacked_dir/ output.docx`
6. Test the output document
The validation loop catches errors early.
Content guidelines
Avoid time-sensitive information
Don't include information that will become outdated:
Bad example: Time-sensitive (will become wrong):
If you're doing this before August 2025, use the old API.
After August 2025, use the new API.
Good example (use "old patterns" section):
## Current method
Use the v2 API endpoint: `api.example.com/v2/messages`
## Old patterns
<details>
<summary>Legacy v1 API (deprecated 2025-08)</summary>
The v1 API used: `api.example.com/v1/messages`
This endpoint is no longer supported.
</details>
The old patterns section provides historical context without cluttering the main content.
Use consistent terminology
Choose one term and use it throughout the Skill:
Good - Consistent:
Always "API endpoint"
Always "field"
Always "extract"
Bad - Inconsistent:
Mix "API endpoint", "URL", "API route", "path"
Mix "field", "box", "element", "control"
Mix "extract", "pull", "get", "retrieve"
Consistency helps Claude understand and follow instructions.
Common patterns
Template pattern
Provide templates for output format. Match the level of strictness to your needs.
For strict requirements (like API responses or data formats):
## Report structure
ALWAYS use this exact template structure:
```markdown
# [Analysis Title]
## Executive summary
[One-paragraph overview of key findings]
## Key findings
- Finding 1 with supporting data
- Finding 2 with supporting data
- Finding 3 with supporting data
## Recommendations
1. Specific actionable recommendation
2. Specific actionable recommendation
```
For flexible guidance (when adaptation is useful):
## Report structure
Here is a sensible default format, but use your best judgment based on the analysis:
```markdown
# [Analysis Title]
## Executive summary
[Overview]
## Key findings
[Adapt sections based on what you discover]
## Recommendations
[Tailor to the specific context]
```
Adjust sections as needed for the specific analysis type.
Examples pattern
For Skills where output quality depends on seeing examples, provide input/output pairs just like in regular prompting:
## Commit message format
Generate commit messages following these examples:
**Example 1:**
Input: Added user authentication with JWT tokens
Output:
```
feat(auth): implement JWT-based authentication
Add login endpoint and token validation middleware
```
**Example 2:**
Input: Fixed bug where dates displayed incorrectly in reports
Output:
```
fix(reports): correct date formatting in timezone conversion
Use UTC timestamps consistently across report generation
```
**Example 3:**
Input: Updated dependencies and refactored error handling
Output:
```
chore: update dependencies and refactor error handling
- Upgrade lodash to 4.17.21
- Standardize error response format across endpoints
```
Follow this style: type(scope): brief description, then detailed explanation.
Examples help Claude understand the desired style and level of detail more clearly than descriptions alone.
Conditional workflow pattern
Guide Claude through decision points:
## Document modification workflow
1. Determine the modification type:
**Creating new content?** → Follow "Creation workflow" below
**Editing existing content?** → Follow "Editing workflow" below
2. Creation workflow:
- Use docx-js library
- Build document from scratch
- Export to .docx format
3. Editing workflow:
- Unpack existing document
- Modify XML directly
- Validate after each change
- Repack when complete
If workflows become large or complicated with many steps, consider pushing them into separate files and tell Claude to read the appropriate file based on the task at hand.
Evaluation and iteration
Build evaluations first
Create evaluations BEFORE writing extensive documentation. This ensures your Skill solves real problems rather than documenting imagined ones.
Evaluation-driven development:
Identify gaps: Run Claude on representative tasks without a Skill. Document specific failures or missing context
Create evaluations: Build three scenarios that test these gaps
Establish baseline: Measure Claude's performance without the Skill
Write minimal instructions: Create just enough content to address the gaps and pass evaluations
Iterate: Execute evaluations, compare against baseline, and refine
This approach ensures you're solving actual problems rather than anticipating requirements that may never materialize.
Evaluation structure:
{
"skills": ["pdf-processing"],
"query": "Extract all text from this PDF file and save it to output.txt",
"files": ["test-files/document.pdf"],
"expected_behavior": [
"Successfully reads the PDF file using an appropriate PDF processing library or command-line tool",
"Extracts text content from all pages in the document without missing any pages",
"Saves the extracted text to a file named output.txt in a clear, readable format"
]
}
This example demonstrates a data-driven evaluation with a simple testing rubric. We do not currently provide a built-in way to run these evaluations. Users can create their own evaluation system. Evaluations are your source of truth for measuring Skill effectiveness.
Develop Skills iteratively with Claude
The most effective Skill development process involves Claude itself. Work with one instance of Claude ("Claude A") to create a Skill that will be used by other instances ("Claude B"). Claude A helps you design and refine instructions, while Claude B tests them in real tasks. This works because Claude models understand both how to write effective agent instructions and what information agents need.
Creating a new Skill:
Complete a task without a Skill: Work through a problem with Claude A using normal prompting. As you work, you'll naturally provide context, explain preferences, and share procedural knowledge. Notice what information you repeatedly provide.
Identify the reusable pattern: After completing the task, identify what context you provided that would be useful for similar future tasks.
Example: If you worked through a BigQuery analysis, you might have provided table names, field definitions, filtering rules (like "always exclude test accounts"), and common query patterns.
Ask Claude A to create a Skill: "Create a Skill that captures this BigQuery analysis pattern we just used. Include the table schemas, naming conventions, and the rule about filtering test accounts."
Claude models understand the Skill format and structure natively. You don't need special system prompts or a "writing skills" skill to get Claude to help create Skills. Simply ask Claude to create a Skill and it will generate properly structured SKILL.md content with appropriate frontmatter and body content.
Review for conciseness: Check that Claude A hasn't added unnecessary explanations. Ask: "Remove the explanation about what win rate means - Claude already knows that."
Improve information architecture: Ask Claude A to organize the content more effectively. For example: "Organize this so the table schema is in a separate reference file. We might add more tables later."
Test on similar tasks: Use the Skill with Claude B (a fresh instance with the Skill loaded) on related use cases. Observe whether Claude B finds the right information, applies rules correctly, and handles the task successfully.
Iterate based on observation: If Claude B struggles or misses something, return to Claude A with specifics: "When Claude used this Skill, it forgot to filter by date for Q4. Should we add a section about date filtering patterns?"
Iterating on existing Skills:
The same hierarchical pattern continues when improving Skills. You alternate between:
Working with Claude A (the expert who helps refine the Skill)
Testing with Claude B (the agent using the Skill to perform real work)
Observing Claude B's behavior and bringing insights back to Claude A
Use the Skill in real workflows: Give Claude B (with the Skill loaded) actual tasks, not test scenarios
Observe Claude B's behavior: Note where it struggles, succeeds, or makes unexpected choices
Example observation: "When I asked Claude B for a regional sales report, it wrote the query but forgot to filter out test accounts, even though the Skill mentions this rule."
Return to Claude A for improvements: Share the current SKILL.md and describe what you observed. Ask: "I noticed Claude B forgot to filter test accounts when I asked for a regional report. The Skill mentions filtering, but maybe it's not prominent enough?"
Review Claude A's suggestions: Claude A might suggest reorganizing to make rules more prominent, using stronger language like "MUST filter" instead of "always filter", or restructuring the workflow section.
Apply and test changes: Update the Skill with Claude A's refinements, then test again with Claude B on similar requests
Repeat based on usage: Continue this observe-refine-test cycle as you encounter new scenarios. Each iteration improves the Skill based on real agent behavior, not assumptions.
Gathering team feedback:
Share Skills with teammates and observe their usage
Ask: Does the Skill activate when expected? Are instructions clear? What's missing?
Incorporate feedback to address blind spots in your own usage patterns
Why this approach works: Claude A understands agent needs, you provide domain expertise, Claude B reveals gaps through real usage, and iterative refinement improves Skills based on observed behavior rather than assumptions.
Observe how Claude navigates Skills
As you iterate on Skills, pay attention to how Claude actually uses them in practice. Watch for:
Unexpected exploration paths: Does Claude read files in an order you didn't anticipate? This might indicate your structure isn't as intuitive as you thought
Missed connections: Does Claude fail to follow references to important files? Your links might need to be more explicit or prominent
Overreliance on certain sections: If Claude repeatedly reads the same file, consider whether that content should be in the main SKILL.md instead
Ignored content: If Claude never accesses a bundled file, it might be unnecessary or poorly signaled in the main instructions
Iterate based on these observations rather than assumptions. The 'name' and 'description' in your Skill's metadata are particularly critical. Claude uses these when deciding whether to trigger the Skill in response to the current task. Make sure they clearly describe what the Skill does and when it should be used.
Anti-patterns to avoid
Avoid Windows-style paths
Always use forward slashes in file paths, even on Windows:
✓ Good: scripts/helper.py, reference/guide.md
✗ Avoid: scripts\helper.py, reference\guide.md
Unix-style paths work across all platforms, while Windows-style paths cause errors on Unix systems.
Avoid offering too many options
Don't present multiple approaches unless necessary:
**Bad example: Too many choices** (confusing):
"You can use pypdf, or pdfplumber, or PyMuPDF, or pdf2image, or..."
**Good example: Provide a default** (with escape hatch):
"Use pdfplumber for text extraction:
```python
import pdfplumber
```
For scanned PDFs requiring OCR, use pdf2image with pytesseract instead."
Advanced: Skills with executable code
The sections below focus on Skills that include executable scripts. If your Skill uses only markdown instructions, skip to Checklist for effective Skills.
Solve, don't punt
When writing scripts for Skills, handle error conditions rather than punting to Claude.
Good example: Handle errors explicitly:
def process_file(path):
"""Process a file, creating it if it doesn't exist."""
try:
with open(path) as f:
return f.read()
except FileNotFoundError:
# Create file with default content instead of failing
print(f"File {path} not found, creating default")
with open(path, 'w') as f:
f.write('')
return ''
except PermissionError:
# Provide alternative instead of failing
print(f"Cannot access {path}, using default")
return ''
Bad example: Punt to Claude:
def process_file(path):
# Just fail and let Claude figure it out
return open(path).read()
Configuration parameters should also be justified and documented to avoid "voodoo constants" (Ousterhout's law). If you don't know the right value, how will Claude determine it?
Good example: Self-documenting:
# HTTP requests typically complete within 30 seconds
# Longer timeout accounts for slow connections
REQUEST_TIMEOUT = 30
# Three retries balances reliability vs speed
# Most intermittent failures resolve by the second retry
MAX_RETRIES = 3
Bad example: Magic numbers:
TIMEOUT = 47 # Why 47?
RETRIES = 5 # Why 5?
Provide utility scripts
Even if Claude could write a script, pre-made scripts offer advantages:
Benefits of utility scripts:
More reliable than generated code
Save tokens (no need to include code in context)
Save time (no code generation required)
Ensure consistency across uses
Bundling executable scripts alongside instruction files
The diagram above shows how executable scripts work alongside instruction files. The instruction file (forms.md) references the script, and Claude can execute it without loading its contents into context.
Important distinction: Make clear in your instructions whether Claude should:
Execute the script (most common): "Run analyze_form.py to extract fields"
Read it as reference (for complex logic): "See analyze_form.py for the field extraction algorithm"
For most utility scripts, execution is preferred because it's more reliable and efficient. See the Runtime environment section below for details on how script execution works.
Example:
## Utility scripts
**analyze_form.py**: Extract all form fields from PDF
```bash
python scripts/analyze_form.py input.pdf > fields.json
```
Output format:
```json
{
"field_name": {"type": "text", "x": 100, "y": 200},
"signature": {"type": "sig", "x": 150, "y": 500}
}
```
**validate_boxes.py**: Check for overlapping bounding boxes
```bash
python scripts/validate_boxes.py fields.json
# Returns: "OK" or lists conflicts
```
**fill_form.py**: Apply field values to PDF
```bash
python scripts/fill_form.py input.pdf fields.json output.pdf
```
Use visual analysis
When inputs can be rendered as images, have Claude analyze them:
## Form layout analysis
1. Convert PDF to images:
```bash
python scripts/pdf_to_images.py form.pdf
```
2. Analyze each page image to identify form fields
3. Claude can see field locations and types visually
In this example, you'd need to write the pdf_to_images.py script.
Claude's vision capabilities help understand layouts and structures.
Create verifiable intermediate outputs
When Claude performs complex, open-ended tasks, it can make mistakes. The "plan-validate-execute" pattern catches errors early by having Claude first create a plan in a structured format, then validate that plan with a script before executing it.
Example: Imagine asking Claude to update 50 form fields in a PDF based on a spreadsheet. Without validation, Claude might reference non-existent fields, create conflicting values, miss required fields, or apply updates incorrectly.
Solution: Use the workflow pattern shown above (PDF form filling), but add an intermediate changes.json file that gets validated before applying changes. The workflow becomes: analyze → create plan file → validate plan → execute → verify.
Why this pattern works:
Catches errors early: Validation finds problems before changes are applied
Machine-verifiable: Scripts provide objective verification
Reversible planning: Claude can iterate on the plan without touching originals
Clear debugging: Error messages point to specific problems
When to use: Batch operations, destructive changes, complex validation rules, high-stakes operations.
Implementation tip: Make validation scripts verbose with specific error messages like "Field 'signature_date' not found. Available fields: customer_name, order_total, signature_date_signed" to help Claude fix issues.
Package dependencies
Skills run in the code execution environment with platform-specific limitations:
claude.ai: Can install packages from npm and PyPI and pull from GitHub repositories
Anthropic API: Has no network access and no runtime package installation
List required packages in your SKILL.md and verify they're available in the code execution tool documentation.
Runtime environment
Skills run in a code execution environment with filesystem access, bash commands, and code execution capabilities. For the conceptual explanation of this architecture, see The Skills architecture in the overview.
How this affects your authoring:
How Claude accesses Skills:
Metadata pre-loaded: At startup, the name and description from all Skills' YAML frontmatter are loaded into the system prompt
Files read on-demand: Claude uses bash Read tools to access SKILL.md and other files from the filesystem when needed
Scripts executed efficiently: Utility scripts can be executed via bash without loading their full contents into context. Only the script's output consumes tokens
No context penalty for large files: Reference files, data, or documentation don't consume context tokens until actually read
File paths matter: Claude navigates your skill directory like a filesystem. Use forward slashes (reference/guide.md), not backslashes
Name files descriptively: Use names that indicate content: form_validation_rules.md, not doc2.md
Organize for discovery: Structure directories by domain or feature
Good: reference/finance.md, reference/sales.md
Bad: docs/file1.md, docs/file2.md
Bundle comprehensive resources: Include complete API docs, extensive examples, large datasets; no context penalty until accessed
Prefer scripts for deterministic operations: Write validate_form.py rather than asking Claude to generate validation code
Make execution intent clear:
"Run analyze_form.py to extract fields" (execute)
"See analyze_form.py for the extraction algorithm" (read as reference)
Test file access patterns: Verify Claude can navigate your directory structure by testing with real requests
Example:
bigquery-skill/
├── SKILL.md (overview, points to reference files)
└── reference/
├── finance.md (revenue metrics)
├── sales.md (pipeline data)
└── product.md (usage analytics)
When the user asks about revenue, Claude reads SKILL.md, sees the reference to reference/finance.md, and invokes bash to read just that file. The sales.md and product.md files remain on the filesystem, consuming zero context tokens until needed. This filesystem-based model is what enables progressive disclosure. Claude can navigate and selectively load exactly what each task requires.
For complete details on the technical architecture, see How Skills work in the Skills overview.
MCP tool references
If your Skill uses MCP (Model Context Protocol) tools, always use fully qualified tool names to avoid "tool not found" errors.
Format: ServerName:tool_name
Example:
Use the BigQuery:bigquery_schema tool to retrieve table schemas.
Use the GitHub:create_issue tool to create issues.
Where:
BigQuery and GitHub are MCP server names
bigquery_schema and create_issue are the tool names within those servers
Without the server prefix, Claude may fail to locate the tool, especially when multiple MCP servers are available.
Avoid assuming tools are installed
Don't assume packages are available:
**Bad example: Assumes installation**:
"Use the pdf library to process the file."
**Good example: Explicit about dependencies**:
"Install required package: `pip install pypdf`
Then use it:
```python
from pypdf import PdfReader
reader = PdfReader("file.pdf")
```"
Technical notes
YAML frontmatter requirements
The SKILL.md frontmatter requires name and description fields with specific validation rules:
name: Maximum 64 characters, lowercase letters/numbers/hyphens only, no XML tags, no reserved words
description: Maximum 1024 characters, non-empty, no XML tags
See the Skills overview for complete structure details.
Token budgets
Keep SKILL.md body under 500 lines for optimal performance. If your content exceeds this, split it into separate files using the progressive disclosure patterns described earlier. For architectural details, see the Skills overview.
Checklist for effective Skills
Before sharing a Skill, verify:
Core quality
Description is specific and includes key terms
Description includes both what the Skill does and when to use it
SKILL.md body is under 500 lines
Additional details are in separate files (if needed)
No time-sensitive information (or in "old patterns" section)
Consistent terminology throughout
Examples are concrete, not abstract
File references are one level deep
Progressive disclosure used appropriately
Workflows have clear steps
Code and scripts
Scripts solve problems rather than punt to Claude
Error handling is explicit and helpful
No "voodoo constants" (all values justified)
Required packages listed in instructions and verified as available
Scripts have clear documentation
No Windows-style paths (all forward slashes)
Validation/verification steps for critical operations
Feedback loops included for quality-critical tasks
Testing
At least three evaluations created
Tested with Haiku, Sonnet, and Opus
Tested with real usage scenarios
Team feedback incorporated (if applicable)
https://github.com/anthropics/skills
================================================
FILE: docs/blogs.md
================================================
---
name: context-engineering-blogs
description: Collection of technical blogs about context engineering, covering strategies for managing context windows in agent systems including write, select, compress, and isolate patterns.
doc_type: blog
source_url: No
---
Some technical blogs that I recently read and find valuable:
(Context Engineering
11 min read
Jul 2, 2025
TL;DR
Agents need context to perform tasks. Context engineering is the art and science of filling the context window with just the right information at each step of an agent’s trajectory. In this post, we break down some common strategies — write, select, compress, and isolate — for context engineering by reviewing various popular agents and papers. We then explain how LangGraph is designed to support them!
Also, see our video on context engineering here.
General categories of context engineering
Context Engineering
As Andrej Karpathy puts it, LLMs are like a new kind of operating system. The LLM is like the CPU and its context window is like the RAM, serving as the model’s working memory. Just like RAM, the LLM context window has limited capacity to handle various sources of context. And just as an operating system curates what fits into a CPU’s RAM, we can think about “context engineering” playing a similar role. Karpathy summarizes this well:
[Context engineering is the] “…delicate art and science of filling the context window with just the right information for the next step.”
Context types commonly used in LLM applications
What are the types of context that we need to manage when building LLM applications? Context engineering is an umbrella that applies across a few different context types:
Instructions – prompts, memories, few‑shot examples, tool descriptions, etc
Knowledge – facts, memories, etc
Tools – feedback from tool calls
Context Engineering for Agents
This year, interest in agents has grown tremendously as LLMs get better at reasoning and tool calling. Agents interleave LLM invocations and tool calls, often for long-running tasks, using tool feedback to decide the next step.
Agents interleave LLM calls and tool calls, using tool feedback to decide the next step
However, long-running tasks and accumulating feedback from tool calls mean that agents often utilize a large number of tokens. This can cause numerous problems: it can exceed the size of the context window, balloon cost / latency, or degrade agent performance. Drew Breunig nicely outlined a number of specific ways that longer context can cause performance problems, including:
Context Poisoning: When a hallucination makes it into the context
Context Distraction: When the context overwhelms the training
Context Confusion: When superfluous context influences the response
Context Clash: When parts of the context disagree
Context from tool calls accumulates over multiple agent turns
With this in mind, Cognition called out the importance of context engineering:
“Context engineering” … is effectively the #1 job of engineers building AI agents.
Anthropic also laid it out clearly:
Agents often engage in conversations spanning hundreds of turns, requiring careful context management strategies.
So, how are people tackling this challenge today? We group common strategies for agent context engineering into four buckets — write, select, compress, and isolate — and give examples of each from review of some popular agent products and papers. We then explain how LangGraph is designed to support them!
General categories of context engineering
Write Context
Writing context means saving it outside the context window to help an agent perform a task.
Scratchpads
When humans solve tasks, we take notes and remember things for future, related tasks. Agents are also gaining these capabilities! Note-taking via a “scratchpad” is one approach to persist information while an agent is performing a task. The idea is to save information outside of the context window so that it’s available to the agent. Anthropic’s multi-agent researcher illustrates a clear example of this:
The LeadResearcher begins by thinking through the approach and saving its plan to Memory to persist the context, since if the context window exceeds 200,000 tokens it will be truncated and it is important to retain the plan.
Scratchpads can be implemented in a few different ways. They can be a tool call that simply writes to a file. They can also be a field in a runtime state object that persists during the session. In either case, scratchpads let agents save useful information to help them accomplish a task.
Memories
Scratchpads help agents solve a task within a given session (or thread), but sometimes agents benefit from remembering things across many sessions! Reflexion introduced the idea of reflection following each agent turn and re-using these self-generated memories. Generative Agents created memories synthesized periodically from collections of past agent feedback.
An LLM can be used to update or create memories
These concepts made their way into popular products like ChatGPT, Cursor, and Windsurf, which all have mechanisms to auto-generate long-term memories that can persist across sessions based on user-agent interactions.
Select Context
Selecting context means pulling it into the context window to help an agent perform a task.
Scratchpad
The mechanism for selecting context from a scratchpad depends upon how the scratchpad is implemented. If it’s a tool, then an agent can simply read it by making a tool call. If it’s part of the agent’s runtime state, then the developer can choose what parts of state to expose to an agent each step. This provides a fine-grained level of control for exposing scratchpad context to the LLM at later turns.
Memories
If agents have the ability to save memories, they also need the ability to select memories relevant to the task they are performing. This can be useful for a few reasons. Agents might select few-shot examples (episodic memories) for examples of desired behavior, instructions (procedural memories) to steer behavior, or facts (semantic memories) for task-relevant context.
One challenge is ensuring that relevant memories are selected. Some popular agents simply use a narrow set of files that are always pulled into context. For example, many code agents use specific files to save instructions (“procedural” memories) or, in some cases, examples (“episodic” memories). Claude Code uses CLAUDE.md. Cursor and Windsurf use rules files.
But, if an agent is storing a larger collection of facts and / or relationships (e.g., semantic memories), selection is harder. ChatGPT is a good example of a popular product that stores and selects from a large collection of user-specific memories.
Embeddings and / or knowledge graphs for memory indexing are commonly used to assist with selection. Still, memory selection is challenging. At the AIEngineer World’s Fair, Simon Willison shared an example of selection gone wrong: ChatGPT fetched his location from memories and unexpectedly injected it into a requested image. This type of unexpected or undesired memory retrieval can make some users feel like the context window “no longer belongs to them”!
Tools
Agents use tools, but can become overloaded if they are provided with too many. This is often because the tool descriptions overlap, causing model confusion about which tool to use. One approach is to apply RAG (retrieval augmented generation) to tool descriptions in order to fetch only the most relevant tools for a task. Some recent papers have shown that this improves tool selection accuracy 3-fold.
Knowledge
RAG is a rich topic and it can be a central context engineering challenge. Code agents are some of the best examples of RAG in large-scale production. Varun from Windsurf captures some of these challenges well:
Indexing code ≠ context retrieval … [We are doing] indexing & embedding search … [with] AST parsing code and chunking along semantically meaningful boundaries … embedding search becomes unreliable as a retrieval heuristic as the size of the codebase grows … we must rely on a combination of techniques like grep/file search, knowledge graph based retrieval, and … a re-ranking step where [context] is ranked in order of relevance.
Compressing Context
Compressing context involves retaining only the tokens required to perform a task.
Context Summarization
Agent interactions can span hundreds of turns and use token-heavy tool calls. Summarization is one common way to manage these challenges. If you’ve used Claude Code, you’ve seen this in action. Claude Code runs “auto-compact” after you exceed 95% of the context window and it will summarize the full trajectory of user-agent interactions. This type of compression across an agent trajectory can use various strategies such as recursive or hierarchical summarization.
A few places where summarization can be applied
It can also be useful to add summarization at specific points in an agent’s design. For example, it can be used to post-process certain tool calls (e.g., token-heavy search tools). As a second example, Cognition mentioned summarization at agent-agent boundaries to reduce tokens during knowledge hand-off. Summarization can be a challenge if specific events or decisions need to be captured. Cognition uses a fine-tuned model for this, which underscores how much work can go into this step.
Context Trimming
Whereas summarization typically uses an LLM to distill the most relevant pieces of context, trimming can often filter or, as Drew Breunig points out, “prune” context. This can use hard-coded heuristics like removing older messages from a list. Drew also mentions Provence, a trained context pruner for Question-Answering.
Isolating Context
Isolating context involves splitting it up to help an agent perform a task.
Multi-agent
One of the most popular ways to isolate context is to split it across sub-agents. A motivation for the OpenAI Swarm library was separation of concerns, where a team of agents can handle specific sub-tasks. Each agent has a specific set of tools, instructions, and its own context window.
Split context across multiple agents
Anthropic’s multi-agent researcher makes a case for this: many agents with isolated contexts outperformed single-agent, largely because each subagent context window can be allocated to a more narrow sub-task. As the blog said:
[Subagents operate] in parallel with their own context windows, exploring different aspects of the question simultaneously.
Of course, the challenges with multi-agent include token use (e.g., up to 15× more tokens than chat as reported by Anthropic), the need for careful prompt engineering to plan sub-agent work, and coordination of sub-agents.
Context Isolation with Environments
HuggingFace’s deep researcher shows another interesting example of context isolation. Most agents use tool calling APIs, which return JSON objects (tool arguments) that can be passed to tools (e.g., a search API) to get tool feedback (e.g., search results). HuggingFace uses a CodeAgent, which outputs code that contains the desired tool calls. The code then runs in a sandbox. Selected context (e.g., return values) from the tool calls is then passed back to the LLM.
Sandboxes can isolate context from the LLM.
This allows context to be isolated from the LLM in the environment. Hugging Face noted that this is a great way to isolate token-heavy objects in particular:
[Code Agents allow for] a better handling of state … Need to store this image / audio / other for later use? No problem, just assign it as a variable in your state and you [use it later].
State
It’s worth calling out that an agent’s runtime state object can also be a great way to isolate context. This can serve the same purpose as sandboxing. A state object can be designed with a schema that has fields that context can be written to. One field of the schema (e.g., messages) can be exposed to the LLM at each turn of the agent, but the schema can isolate information in other fields for more selective use.
Context Engineering with LangSmith / LangGraph
So, how can you apply these ideas? Before you start, there are two foundational pieces that are helpful. First, ensure that you have a way to look at your data and track token-usage across your agent. This helps inform where best to apply context engineering effort. LangSmith is well-suited for agent tracing / observability, and offers a great way to do this. Second, be sure you have a simple way to test whether context engineering hurts or improves agent performance. LangSmith enables agent evaluation to test the impact of any context engineering effort.
Write context
LangGraph was designed with both thread-scoped (short-term) and long-term memory. Short-term memory uses checkpointing to persist agent state across all steps of an agent. This is extremely useful as a “scratchpad”, allowing you to write information to state and fetch it at any step in your agent trajectory.
LangGraph’s long-term memory lets you persist context across many sessions with your agent. It is flexible, allowing you to save small sets of files (e.g., a user profile or rules) or larger collections of memories. In addition, LangMem provides a broad set of useful abstractions to aid with LangGraph memory management.
Select context
Within each node (step) of a LangGraph agent, you can fetch state. This gives you fine-grained control over what context you present to the LLM at each agent step.
In addition, LangGraph’s long-term memory is accessible within each node and supports various types of retrieval (e.g., fetching files as well as embedding-based retrieval on a memory collection). For an overview of long-term memory, see our Deeplearning.ai course. And for an entry point to memory applied to a specific agent, see our Ambient Agents course. This shows how to use LangGraph memory in a long-running agent that can manage your email and learn from your feedback.
Email agent with user feedback and long-term memory
For tool selection, the LangGraph Bigtool library is a great way to apply semantic search over tool descriptions. This helps select the most relevant tools for a task when working with a large collection of tools. Finally, we have several tutorials and videos that show how to use various types of RAG with LangGraph.
Compressing context
Because LangGraph is a low-level orchestration framework, you lay out your agent as a set of nodes, define the logic within each one, and define a state object that is passed between them. This control offers several ways to compress context.
One common approach is to use a message list as your agent state and summarize or trim it periodically using a few built-in utilities. However, you can also add logic to post-process tool calls or work phases of your agent in a few different ways. You can add summarization nodes at specific points or also add summarization logic to your tool calling node in order to compress the output of specific tool calls.
Isolating context
LangGraph is designed around a state object, allowing you to specify a state schema and access state at each agent step. For example, you can store context from tool calls in certain fields in state, isolating them from the LLM until that context is required. In addition to state, LangGraph supports use of sandboxes for context isolation. See this repo for an example LangGraph agent that uses an E2B sandbox for tool calls. See this video for an example of sandboxing using Pyodide where state can be persisted. LangGraph also has a lot of support for building multi-agent architecture, such as the supervisor and swarm libraries. You can see these videos for more detail on using multi-agent with LangGraph.
Conclusion
Context engineering is becoming a craft that agent builders should aim to master. Here, we covered a few common patterns seen across many popular agents today:
Writing context - saving it outside the context window to help an agent perform a task.
Selecting context - pulling it into the context window to help an agent perform a task.
Compressing context - retaining only the tokens required to perform a task.
Isolating context - splitting it up to help an agent perform a task.
LangGraph makes it easy to implement each of them and LangSmith provides an easy way to test your agent and track context usage. Together, LangGraph and LangSmith enable a virtuous feedback loop for identifying the best opportunity to apply context engineering, implementing it, testing it, and repeating.
---------
Context Engineering in Manus
Oct 15, 2025
Lance Martin
Why Context Engineering
Earlier this week, I had a webinar with Manus co-founder and CSO Yichao “Peak” Ji. You can see the video here, my slides here, and Peak’s slides here. Below are my notes.
Anthropic defines agents as systems where LLMs direct their own processes and tool usage, maintaining control over how they accomplish tasks. In short, it’s an LLM calling tools in a loop.
Manus is one of the most popular general-purpose consumer agents. The typical Manus task uses 50 tool calls. Without context engineering, these tool call results would accumulate in the LLM context window. As the context window fills, many have observed that LLM performance degrades.
For example, Chroma has a great study on context rot and Anthropic has explained how growing context depletes an LLM’s attention budget. So, it’s important to carefully manage what goes into the LLM’s context window when building agents. Karpathy laid this out clearly:
Context engineering is the delicate art and science of filling the context window with just the right information for the next step (in an agent’s trajectory)
Context Engineering Approaches
Each Manus session uses a dedicated cloud-based virtual machine, giving the agent a virtual computer with a filesystem, tools to navigate it, and the ability to execute commands (e.g., provided utilities and standard shell commands) in that sandbox environment.
In this sandbox, Manus uses three primary strategies for context engineering, which align with approaches Anthropic covers here and I’ve seen across many projects:
Reduce Context
Offload Context
Isolate Context
Context Reduction
Tool calls in Manus have a “full” and “compact” representation. The full version contains the raw content from tool invocation (e.g., a complete search tool result), which is stored in the sandbox (e.g., filesystem). The compact version stores a reference to the full result (e.g., a file path).
Manus applies compaction to older (“stale”) tool results. This just means swapping out the full tool result for the compact version. This allows the agent to still fetch the full result if ever needed, but saves tokens by removing “stale” results that the agent has already used to make decisions.
Newer tool results remain in full to guide the agent’s next decision. This seems to be a generally useful strategy for context reduction, and I notice that it’s similar to Anthropic’s context editing feature:
Context editing automatically clears stale tool calls and results from within the context window when approaching token limits. As your agent executes tasks and accumulates tool results, context editing removes stale content while preserving the conversation flow, effectively extending how long agents can run without manual intervention.
When compaction reaches diminishing returns (see figure below), Manus applies summarization to the trajectory. Summaries are generated using full tool results and Manus uses a schema to define the summary fields. This creates a consistent summary object for any agent trajectory.
Context Isolation
Manus takes a pragmatic approach to multi-agent, avoiding anthropomorphized divisions of labor. While humans organize by role (designer, engineer, project manager) due to cognitive limitations, LLMs don’t necessarily share these same constraints.
With this in mind, the primary goal of sub-agents in Manus is to isolate context. For example, if there’s a task to be done, Manus will assign that task to a sub-agent with its own context window.
Manus uses multi-agent with a planner that assigns tasks, a knowledge manager that reviews conversations and determines what should be saved in the filesystem, and an executor sub-agent that performs tasks assigned by the planner.
Manus initially used a todo.md for task planning, but found that roughly one-third of all actions were spent updating the todo list, wasting valuable tokens. They shifted to a dedicated planner agent that calls executor sub-agents to perform tasks.
In a recent podcast, Erik Schluntz (multi-agent research at Anthropic) mentioned that they similarly design multi-agent systems with a planner to assign tasks and use function calling as the communication protocol to initiate sub-agents. A central challenge raised by Erik as well as Walden Yan (Cognition) is context sharing between planner and sub-agents.
Manus addresses this in two ways. For simple tasks (e.g., a discrete task where the planner only needs the output of the sub-agent), the planner simply creates instructions and passes them to the sub-agent via the function call. This resembles Claude Code’s task tool.
For more complex tasks (e.g., the sub-agent needs to write to files that the planner also uses), the planner shares its full context with the sub-agent. The sub-agent still has its own action space (tools) and instructions, but receives the full context that the planner also has access to.
In both cases, the planner defines the sub-agent’s output schema. Sub-agents have a submit results tool to populate this schema before returning results to the planner and Manus uses constrained decoding to ensure output adheres to the defined schema.
Context Offloading
Tools Definitions
We often want agents that can perform a wide range of actions. We can, of course, bind a large collection of tools to the LLM and provide detailed instructions on how to use all of them. But, tool descriptions use valuable tokens and many (often overlapping or ambiguous) tools can cause model confusion.
A trend I’m seeing is that agents use a small set of general tools that give the agent access to a computer. For example, with only a Bash tool and a few tools to access a filesystem, an agent can perform a wide range of actions!
Manus thinks about this as a layered action space with function/tool calling and its virtual computer sandbox. Peak mentioned that Manus uses a small set (< 20) of atomic functions; this includes things like a Bash tool, tools to manage the filesystem, and a code execution tool.
Rather than bloating the function calling layer, Manus offloads most actions to the sandbox layer. Manus can execute many utilities directly in the sandbox with its Bash tool and MCP tools are exposed through a CLI that the agent can also execute using the Bash tool.
Claude’s skills feature uses a similar idea: skills are stored in the filesystem, not as bound tools, and Claude only needs a few simple function calls (Bash, file system) to progressively discover and use them.
Progressive disclosure is the core design principle that makes Agent Skills flexible and scalable. Like a well-organized manual that starts with a table of contents, then specific chapters, and finally a detailed appendix, skills let Claude load information only as needed … agents with a filesystem and code execution tools don’t need to read the entirety of a skill into their context window when working on a particular task.
Tool Results
Because Manus has access to a filesystem, it can also offload context (e.g., tool results). As explained above, this is central for context reduction; tool results are offloaded to the filesystem in order to produce the compact version and this is used to prune stale tokens from the agent’s context window. Similar to Claude Code, Manus uses basic utilities (e.g., glob and grep) to search the filesystem without the need for indexing (e.g., vectorstores).
Model Choice
Rather than committing to a single model, Manus uses task-level routing: it might use Claude for coding, Gemini for multi-modal tasks, or OpenAI for math and reasoning. Broadly, Manus’s approach to model selection is driven by cost considerations, with KV cache efficiency playing a central role.
Manus uses caching (e.g., for system instructions, older tool results, etc) to reduce both cost and latency across many agent turns. Peak mentioned that distributed KV cache infrastructure is challenging to implement with open source models, but is well-supported by frontier providers. This caching support can make frontier models cheaper for certain (agent) use-cases in practice.
Build with the Bitter Lesson in Mind
We closed the discussion talking about the Bitter Lesson. I’ve been interested in its implications for AI engineering. Boris Cherny (creator of Claude Code) mentioned that The Bitter Lesson influenced his decision to keep Claude Code unopinionated, making it easier to adapt to model improvements.
Building on constantly improving models means accepting constant change. Peak mentioned that Manus has been refactored five times since their launch in March!
In addition, Peak warned that the agent’s harness can limit performance as models advance; this is exactly the challenge called out by the Bitter Lesson. We add structure to improve performance at a point in time, but this structure can limit performance as compute (models) grows.
To guard against this, Peak suggested running agent evaluations across varying model strengths. If performance doesn’t improve with stronger models, your harness may be hobbling the agent. This can help test whether your harness is “future proof”.
Hyung Won Chung’s (OpenAI/MSL) talk on this topic further emphasizes the need to consistently re-evaluate structure (e.g., your harness / assumptions) as models improve.
Add structures needed for the given level of compute and data available. Remove them later, because these shortcuts will bottleneck further improvement.
Conclusions
Giving agents access to a computer (e.g., filesystem, terminal, utilities) is a common pattern we see across many agents, including Manus. It enables a few context engineering strategies:
1. Offload Context
Store tool results externally: Save full tool results to the filesystem (not in context) and access on demand with utilities like glob and grep
Push actions to the sandbox: Use a small set of function calls (Bash, filesystem access) that can execute many utilities in the sandbox rather than binding every utility as a tool
2. Reduce Context
Compact stale results: Replace older tool results with references (e.g., file paths) as context fills; keep recent results in full to guide the next decision
Summarize when needed: Once compaction reaches diminishing returns, apply schema-based summarization to the full trajectory
3. Isolate Context
Use sub-agents for discrete tasks: Assign tasks to sub-agents with their own context windows, primarily to isolate context (not to divide labor by role)
Share context deliberately: Pass only instructions for simple tasks; pass full context (e.g., trajectory and shared filesystem) for complex tasks where sub-agents need more context
A final consideration is to ensure your harness is not limiting performance as models improve (e.g., be “Bitter Lesson-pilled”). Test across model strengths to verify this. Simple, unopinionated designs often adapt better to model improvements. Finally, don’t be afraid to re-build your agent as models improve (Manus refactored 5 times since March)!
-----
Context Engineering for AI Agents: Lessons from Building Manus
2025/7/18 --Yichao 'Peak' Ji
At the very beginning of the project, my team and I faced a key decision: should we train an end-to-end agentic model using open-source foundations, or build an agent on top of the abilities of frontier models?
Back in my first decade in NLP, we didn't have the luxury of that choice. In the distant days of BERT (yes, it's been seven years), models had to be fine-tuned—and evaluated—before they could transfer to a new task. That process often took weeks per iteration, even though the models were tiny compared to today's LLMs. For fast-moving applications, especially pre–PMF, such slow feedback loops are a deal-breaker. That was a bitter lesson from my last startup, where I trained models from scratch for open information extraction and semantic search. Then came GPT-3 and Flan-T5, and my in-house models became irrelevant overnight. Ironically, those same models marked the beginning of in-context learning—and a whole new path forward.
That hard-earned lesson made the choice clear: Manus would bet on context engineering. This allows us to ship improvements in hours instead of weeks, and keeps our product orthogonal to the underlying models: If model progress is the rising tide, we want Manus to be the boat, not the pillar stuck to the seabed.
Still, context engineering turned out to be anything but straightforward. It's an experimental science—and we've rebuilt our agent framework four times, each time after discovering a better way to shape context. We affectionately refer to this manual process of architecture searching, prompt fiddling, and empirical guesswork as "Stochastic Graduate Descent". It's not elegant, but it works.
This post shares the local optima we arrived at through our own "SGD". If you're building your own AI agent, I hope these principles help you converge faster.
Design Around the KV-Cache
If I had to choose just one metric, I'd argue that the KV-cache hit rate is the single most important metric for a production-stage AI agent. It directly affects both latency and cost. To understand why, let's look at how a typical agent operates:
After receiving a user input, the agent proceeds through a chain of tool uses to complete the task. In each iteration, the model selects an action from a predefined action space based on the current context. That action is then executed in the environment (e.g., Manus's virtual machine sandbox) to produce an observation. The action and observation are appended to the context, forming the input for the next iteration. This loop continues until the task is complete.
As you can imagine, the context grows with every step, while the output—usually a structured function call—remains relatively short. This makes the ratio between prefilling and decoding highly skewed in agents compared to chatbots. In Manus, for example, the average input-to-output token ratio is around 100:1.
Fortunately, contexts with identical prefixes can take advantage of KV-cache, which drastically reduces time-to-first-token (TTFT) and inference cost—whether you're using a self-hosted model or calling an inference API. And we're not talking about small savings: with Claude Sonnet, for instance, cached input tokens cost 0.30 USD/MTok, while uncached ones cost 3 USD/MTok—a 10x difference.
From a context engineering perspective, improving KV-cache hit rate involves a few key practices:
Keep your prompt prefix stable. Due to the nature of LLMs, even a single-token difference can invalidate the cache from that token onward. A common mistake is including a timestamp—especially one precise to the second—at the beginning of the system prompt. Sure, it lets the model tell you the current time, but it also kills your cache hit rate.
Make your context append-only. Avoid modifying previous actions or observations. Ensure your serialization is deterministic. Many programming languages and libraries don't guarantee stable key ordering when serializing JSON objects, which can silently break the cache.
Mark cache breakpoints explicitly when needed. Some model providers or inference frameworks don't support automatic incremental prefix caching, and instead require manual insertion of cache breakpoints in the context. When assigning these, account for potential cache expiration and at minimum, ensure the breakpoint includes the end of the system prompt.
Additionally, if you're self-hosting models using frameworks like vLLM, make sure prefix/prompt caching is enabled, and that you're using techniques like session IDs to route requests consistently across distributed workers.
Mask, Don't Remove
As your agent takes on more capabilities, its action space naturally grows more complex—in plain terms, the number of tools explodes. The recent popularity of MCP only adds fuel to the fire. If you allow user-configurable tools, trust me: someone will inevitably plug hundreds of mysterious tools into your carefully curated action space. As a result, the model is more likely to select the wrong action or take an inefficient path. In short, your heavily armed agent gets dumber.
A natural reaction is to design a dynamic action space—perhaps loading tools on demand using something RAG-like. We tried that in Manus too. But our experiments suggest a clear rule: unless absolutely necessary, avoid dynamically adding or removing tools mid-iteration. There are two main reasons for this:
In most LLMs, tool definitions live near the front of the context after serialization, typically before or after the system prompt. So any change will invalidate the KV-cache for all subsequent actions and observations.
When previous actions and observations still refer to tools that are no longer defined in the current context, the model gets confused. Without constrained decoding, this often leads to schema violations or hallucinated actions.
To solve this while still improving action selection, Manus uses a context-aware state machine to manage tool availability. Rather than removing tools, it masks the token logits during decoding to prevent (or enforce) the selection of certain actions based on the current context.
In practice, most model providers and inference frameworks support some form of response prefill, which allows you to constrain the action space without modifying the tool definitions. There are generally three modes of function calling (we'll use the Hermes format from NousResearch as an example):
Auto – The model may choose to call a function or not. Implemented by prefilling only the reply prefix: <|im_start|>assistant
Required – The model must call a function, but the choice is unconstrained. Implemented by prefilling up to tool call token: <|im_start|>assistant<tool_call>
Specified – The model must call a function from a specific subset. Implemented by prefilling up to the beginning of the function name: <|im_start|>assistant<tool_call>{"name": "browser_
Using this, we constrain action selection by masking token logits directly. For example, when the user provides a new input, Manus must reply immediately instead of taking an action. We've also deliberately designed action names with consistent prefixes—e.g., all browser-related tools start with browser_, and command-line tools with shell_. This allows us to easily enforce that the agent only chooses from a certain group of tools at a given state without using stateful logits processors.
These designs help ensure that the Manus agent loop remains stable—even under a model-driven architecture.
Use the File System as Context
Modern frontier LLMs now offer context windows of 128K tokens or more. But in real-world agentic scenarios, that's often not enough, and sometimes even a liability. There are three common pain points:
Observations can be huge, especially when agents interact with unstructured data like web pages or PDFs. It's easy to blow past the context limit.
Model performance tends to degrade beyond a certain context length, even if the window technically supports it.
Long inputs are expensive, even with prefix caching. You're still paying to transmit and prefill every token.
To deal with this, many agent systems implement context truncation or compression strategies. But overly aggressive compression inevitably leads to information loss. The problem is fundamental: an agent, by nature, must predict the next action based on all prior state—and you can't reliably predict which observation might become critical ten steps later. From a logical standpoint, any irreversible compression carries risk.
That's why we treat the file system as the ultimate context in Manus: unlimited in size, persistent by nature, and directly operable by the agent itself. The model learns to write to and read from files on demand—using the file system not just as storage, but as structured, externalized memory.
Our compression strategies are always designed to be restorable. For instance, the content of a web page can be dropped from the context as long as the URL is preserved, and a document's contents can be omitted if its path remains available in the sandbox. This allows Manus to shrink context length without permanently losing information.
While developing this feature, I found myself imagining what it would take for a State Space Model (SSM) to work effectively in an agentic setting. Unlike Transformers, SSMs lack full attention and struggle with long-range backward dependencies. But if they could master file-based memory—externalizing long-term state instead of holding it in context—then their speed and efficiency might unlock a new class of agents. Agentic SSMs could be the real successors to Neural Turing Machines.
Manipulate Attention Through Recitation
If you've worked with Manus, you've probably noticed something curious: when handling complex tasks, it tends to create a todo.md file—and update it step-by-step as the task progresses, checking off completed items.
That's not just cute behavior—it's a deliberate mechanism to manipulate attention.
A typical task in Manus requires around 50 tool calls on average. That's a long loop—and since Manus relies on LLMs for decision-making, it's vulnerable to drifting off-topic or forgetting earlier goals, especially in long contexts or complicated tasks.
By constantly rewriting the todo list, Manus is reciting its objectives into the end of the context. This pushes the global plan into the model's recent attention span, avoiding "lost-in-the-middle" issues and reducing goal misalignment. In effect, it's using natural language to bias its own focus toward the task objective—without needing special architectural changes.
Keep the Wrong Stuff In
Agents make mistakes. That's not a bug—it's reality. Language models hallucinate, environments return errors, external tools misbehave, and unexpected edge cases show up all the time. In multi-step tasks, failure is not the exception; it's part of the loop.
And yet, a common impulse is to hide these errors: clean up the trace, retry the action, or reset the model's state and leave it to the magical "temperature". That feels safer, more controlled. But it comes at a cost: Erasing failure removes evidence. And without evidence, the model can't adapt.
In our experience, one of the most effective ways to improve agent behavior is deceptively simple: leave the wrong turns in the context. When the model sees a failed action—and the resulting observation or stack trace—it implicitly updates its internal beliefs. This shifts its prior away from similar actions, reducing the chance of repeating the same mistake. In fact, we believe error recovery is one of the clearest indicators of true agentic behavior. Yet it's still underrepresented in most academic work and public benchmarks, which often focus on task success under ideal conditions.
Don't Get Few-Shotted
Few-shot prompting is a common technique for improving LLM outputs. But in agent systems, it can backfire in subtle ways.
Language models are excellent mimics; they imitate the pattern of behavior in the context. If your context is full of similar past action-observation pairs, the model will tend to follow that pattern, even when it's no longer optimal.
This can be dangerous in tasks that involve repetitive decisions or actions. For example, when using Manus to help review a batch of 20 resumes, the agent often falls into a rhythm—repeating similar actions simply because that's what it sees in the context. This leads to drift, overgeneralization, or sometimes hallucination.
The fix is to increase diversity. Manus introduces small amounts of structured variation in actions and observations—different serialization templates, alternate phrasing, minor noise in order or formatting. This controlled randomness helps break the pattern and tweaks the model's attention. In other words, don't few-shot yourself into a rut. The more uniform your context, the more brittle your agent becomes.
Conclusion
Context engineering is still an emerging science—but for agent systems, it's already essential. Models may be getting stronger, faster, and cheaper, but no amount of raw capability replaces the need for memory, environment, and feedback. How you shape the context ultimately defines how your agent behaves: how fast it runs, how well it recovers, and how far it scales.
At Manus, we've learned these lessons through repeated rewrites, dead ends, and real-world testing across millions of users. None of what we've shared here is universal truth—but these are the patterns that worked for us. If they help you avoid even one painful iteration, then this post did its job.
The agentic future will be built one context at a time. Engineer them well.
------
Wide Research: Beyond the Context Window
The promise of AI-driven research has always been compelling: delegate the tedious work of information gathering and synthesis to an intelligent system, freeing up human cognition for higher-order analysis and decision-making. Yet, anyone who has pushed these systems on non-trivial use cases has run into a frustrating reality: by the eighth or ninth item in a multi-subject research task, the AI starts fabricating.
Not just simplifying. Not just summarizing more concisely. Fabricating.
This isn't a prompt engineering problem. It's not a model capability problem. It is an architectural constraint that has quietly limited the utility of AI research tools since their inception. And it's the constraint that Wide Research is designed to overcome.
The Context Window: A Fundamental Bottleneck
Every large language model operates within a context window, a finite memory buffer that limits the amount of information the model can actively process at any given moment. Modern models have pushed this boundary impressively: from 4K tokens to 32K, 128K, and even 1M tokens in recent versions.
Yet the problem persists.
When you ask an AI to research multiple entities—say, fifty companies, thirty research papers, or twenty competing products—the context window fills up rapidly. It's not just the raw information about each entity, but also:
The original task specification and requirements
The structural template for consistent output formatting
Intermediate reasoning and analysis for each item
Cross-referencing and comparative notes
The cumulative context of all preceding items
By the time the model reaches the eighth or ninth item, the context window is under immense strain. The model faces an impossible choice: fail explicitly, or start cutting corners.
It always chooses the latter.
The Fabrication Threshold
Here's what happens in practice:
Items 1-5: The model performs genuine research. It retrieves information, cross-references sources, and produces detailed, accurate analysis.
Items 6-8: The quality begins to subtly degrade. Descriptions become slightly more generic. The model starts relying more on prior patterns than fresh research.
Items 9+: The model enters fabrication mode. Unable to maintain the cognitive load of thorough research while managing an overflowing context, it begins generating plausible-sounding content based on statistical patterns, not actual investigation.
These fabrications are sophisticated. They sound authoritative. They follow the established format perfectly. They are often grammatically flawless and stylistically consistent with the earlier, legitimate entries.
They are also frequently wrong.
A competitor analysis might attribute features to companies that don't offer them. A literature review might cite papers with fabricated findings. A product comparison might invent pricing tiers or specifications.
The insidious part is that these fabrications are difficult to detect without manual verification—which defeats the entire purpose of automated research.
Why Bigger Context Windows Can't Fix This
The intuitive response is to simply expand the context window. If 32K tokens aren't enough, use 128K. If that's not enough, push to 200K or beyond.
This approach misunderstands the problem.
First, context decay is not binary. A model does not maintain perfect recall across its entire context window. Studies have shown that retrieval accuracy degrades with distance from the current position—the "lost in the middle" phenomenon. Information at the beginning and end of the context is recalled more reliably than information in the middle.
Second, the processing cost grows disproportionately. The cost to process a 400K token context isn't just double the cost of 200K—it increases exponentially in both time and computing resources. This makes massive-context processing economically impractical for many use cases.
Third, the problem is cognitive load. Even with an infinite context, asking a single model to maintain consistent quality across dozens of independent research tasks creates a cognitive bottleneck. The model must constantly switch context between items, maintain a comparative framework, and ensure stylistic consistency—all while performing the core research task.
Fourth, context length pressure. The model’s “patience” is, to some extent, determined by the length distribution of samples in its training data. However, the post-training data mixture of current language models is still dominated by relatively short trajectories designed for chatbot-style interactions. As a result, when the length of an assistant message’s content exceeds a certain threshold, the model naturally experiences a kind of context length pressure, prompting it to hasten toward summarizing or to resort to incomplete expression forms such as bullet points.
The context window is a constraint, yes. But it's a symptom of a deeper architectural limitation: the single-processor, sequential paradigm.
The Architectural Shift: Parallel Processing
Wide Research represents a fundamental rethinking of how an AI system should approach large-scale research tasks. Instead of asking one processor to handle n items sequentially, we deploy n parallel sub-agents to process n items simultaneously.
The Wide Research Architecture
When you launch a Wide Research task, the system operates as follows:
1. Intelligent Decomposition
The main controller analyzes your request and breaks it down into independent, parallelizable sub-tasks. This involves understanding the task structure, identifying dependencies, and creating coherent sub-specifications.
2. Sub-agent Delegation
For each sub-task, the system spins up a dedicated sub-agent. Crucially, these are not lightweight processes—they are full-featured Manus instances, each with:
A complete virtual machine environment
Access to the full tool library (search, browsing, code execution, file handling)
An independent internet connection
A fresh, empty context window
3. Parallel Execution
All sub-agents execute simultaneously. Each one focuses exclusively on its assigned item, performing the same depth of research and analysis it would for a single-item task.
4. Centralized Coordination
The main controller maintains oversight, collecting results as the sub-agents complete their jobs. Importantly, the sub-agents do not communicate with each other; all coordination flows through the main controller. This prevents context pollution and maintains independence.
5. Synthesis and Integration
Once all sub-agents have reported back, the main controller synthesizes the results into a single, coherent, and comprehensive report. This synthesis step leverages the full context capacity of the main controller, as it is not burdened with the original research effort.
Why This Changes Everything
Consistent Quality at Scale
Every item gets the same treatment. The 50th item is researched just as thoroughly as the first. There is no degradation curve, no fabrication threshold, and no quality cliff.
True Horizontal Scalability
Need to analyze 10 items? The system deploys 10 sub-agents. Need to analyze 500? It deploys 500. The architecture scales linearly with the size of the task, not exponentially like context-based approaches.
Significant Speed-Up
Because the sub-agents operate in parallel, the real-world time required to analyze 50 items is roughly the same as the time to analyze 5. The bottleneck shifts from sequential processing time to synthesis time—a much smaller component of the overall task.
Reduced Hallucination Rate
Each sub-agent operates within its cognitive comfort zone. With a fresh context and a single, focused task, there is no pressure to fabricate. The sub-agent can perform genuine research, verify facts, and maintain accuracy.
Independence and Reliability
Because the sub-agents do not share context, an error or hallucination in one sub-agent's job does not propagate to the others. Each analysis stands on its own, reducing systemic risk.
Beyond the Single-Processor Paradigm
Wide Research is more than a feature—it represents a fundamental shift away from the single-processor paradigm and toward an orchestrated, parallel architecture. The future of AI systems lies not in ever-larger context windows, but in intelligent task decomposition and parallel execution.
We are moving from the era of the "AI assistant" to the era of the "AI workforce."
When to use Wide Research: Any task involving multiple, similar items that require consistent analysis—competitive research, literature reviews, bulk processing, multi-asset generation.
When not to use: Deeply sequential tasks where each step heavily depends on the prior result, or small tasks (fewer than 10 items) where single-processor handling is more cost-effective.
--------
How we built our multi-agent research system
Published Jun 13, 2025
Our Research feature uses multiple Claude agents to explore complex topics more effectively. We share the engineering challenges and the lessons we learned from building this system.
Claude now has Research capabilities that allow it to search across the web, Google Workspace, and any integrations to accomplish complex tasks.
The journey of this multi-agent system from prototype to production taught us critical lessons about system architecture, tool design, and prompt engineering. A multi-agent system consists of multiple agents (LLMs autonomously using tools in a loop) working together. Our Research feature involves an agent that plans a research process based on user queries, and then uses tools to create parallel agents that search for information simultaneously. Systems with multiple agents introduce new challenges in agent coordination, evaluation, and reliability.
This post breaks down the principles that worked for us—we hope you'll find them useful to apply when building your own multi-agent systems.
Benefits of a multi-agent system
Research work involves open-ended problems where it’s very difficult to predict the required steps in advance. You can’t hardcode a fixed path for exploring complex topics, as the process is inherently dynamic and path-dependent. When people conduct research, they tend to continuously update their approach based on discoveries, following leads that emerge during investigation.
This unpredictability makes AI agents particularly well-suited for research tasks. Research demands the flexibility to pivot or explore tangential connections as the investigation unfolds. The model must operate autonomously for many turns, making decisions about which directions to pursue based on intermediate findings. A linear, one-shot pipeline cannot handle these tasks.
The essence of search is compression: distilling insights from a vast corpus. Subagents facilitate compression by operating in parallel with their own context windows, exploring different aspects of the question simultaneously before condensing the most important tokens for the lead research agent. Each subagent also provides separation of concerns—distinct tools, prompts, and exploration trajectories—which reduces path dependency and enables thorough, independent investigations.
Once intelligence reaches a threshold, multi-agent systems become a vital way to scale performance. For instance, although individual humans have become more intelligent in the last 100,000 years, human societies have become exponentially more capable in the information age because of our collective intelligence and ability to coordinate. Even generally-intelligent agents face limits when operating as individuals; groups of agents can accomplish far more.
Our internal evaluations show that multi-agent research systems excel especially for breadth-first queries that involve pursuing multiple independent directions simultaneously. We found that a multi-agent system with Claude Opus 4 as the lead agent and Claude Sonnet 4 subagents outperformed single-agent Claude Opus 4 by 90.2% on our internal research eval. For example, when asked to identify all the board members of the companies in the Information Technology S&P 500, the multi-agent system found the correct answers by decomposing this into tasks for subagents, while the single agent system failed to find the answer with slow, sequential searches.
Multi-agent systems work mainly because they help spend enough tokens to solve the problem. In our analysis, three factors explained 95% of the performance variance in the BrowseComp evaluation (which tests the ability of browsing agents to locate hard-to-find information). We found that token usage by itself explains 80% of the variance, with the number of tool calls and the model choice as the two other explanatory factors. This finding validates our architecture that distributes work across agents with separate context windows to add more capacity for parallel reasoning. The latest Claude models act as large efficiency multipliers on token use, as upgrading to Claude Sonnet 4 is a larger performance gain than doubling the token budget on Claude Sonnet 3.7. Multi-agent architectures effectively scale token usage for tasks that exceed the limits of single agents.
There is a downside: in practice, these architectures burn through tokens fast. In our data, agents typically use about 4× more tokens than chat interactions, and multi-agent systems use about 15× more tokens than chats. For economic viability, multi-agent systems require tasks where the value of the task is high enough to pay for the increased performance. Further, some domains that require all agents to share the same context or involve many dependencies between agents are not a good fit for multi-agent systems today. For instance, most coding tasks involve fewer truly parallelizable tasks than research, and LLM agents are not yet great at coordinating and delegating to other agents in real time. We’ve found that multi-agent systems excel at valuable tasks that involve heavy parallelization, information that exceeds single context windows, and interfacing with numerous complex tools.
Architecture overview for Research
Our Research system uses a multi-agent architecture with an orchestrator-worker pattern, where a lead agent coordinates the process while delegating to specialized subagents that operate in parallel.
The multi-agent architecture in action: user queries flow through a lead agent that creates specialized subagents to search for different aspects in parallel.
When a user submits a query, the lead agent analyzes it, develops a strategy, and spawns subagents to explore different aspects simultaneously. As shown in the diagram above, the subagents act as intelligent filters by iteratively using search tools to gather information, in this case on AI agent companies in 2025, and then returning a list of companies to the lead agent so it can compile a final answer.
Traditional approaches using Retrieval Augmented Generation (RAG) use static retrieval. That is, they fetch some set of chunks that are most similar to an input query and use these chunks to generate a response. In contrast, our architecture uses a multi-step search that dynamically finds relevant information, adapts to new findings, and analyzes results to formulate high-quality answers.
Process diagram showing the complete workflow of our multi-agent Research system. When a user submits a query, the system creates a LeadResearcher agent that enters an iterative research process. The LeadResearcher begins by thinking through the approach and saving its plan to Memory to persist the context, since if the context window exceeds 200,000 tokens it will be truncated and it is important to retain the plan. It then creates specialized Subagents (two are shown here, but it can be any number) with specific research tasks. Each Subagent independently performs web searches, evaluates tool results using interleaved thinking, and returns findings to the LeadResearcher. The LeadResearcher synthesizes these results and decides whether more research is needed—if so, it can create additional subagents or refine its strategy. Once sufficient information is gathered, the system exits the research loop and passes all findings to a CitationAgent, which processes the documents and research report to identify specific locations for citations. This ensures all claims are properly attributed to their sources. The final research results, complete with citations, are then returned to the user.
Prompt engineering and evaluations for research agents
Multi-agent systems have key differences from single-agent systems, including a rapid growth in coordination complexity. Early agents made errors like spawning 50 subagents for simple queries, scouring the web endlessly for nonexistent sources, and distracting each other with excessive updates. Since each agent is steered by a prompt, prompt engineering was our primary lever for improving these behaviors. Below are some principles we learned for prompting agents:
Think like your agents. To iterate on prompts, you must understand their effects. To help us do this, we built simulations using our Console with the exact prompts and tools from our system, then watched agents work step-by-step. This immediately revealed failure modes: agents continuing when they already had sufficient results, using overly verbose search queries, or selecting incorrect tools. Effective prompting relies on developing an accurate mental model of the agent, which can make the most impactful changes obvious.
Teach the orchestrator how to delegate. In our system, the lead agent decomposes queries into subtasks and describes them to subagents. Each subagent needs an objective, an output format, guidance on the tools and sources to use, and clear task boundaries. Without detailed task descriptions, agents duplicate work, leave gaps, or fail to find necessary information. We started by allowing the lead agent to give simple, short instructions like 'research the semiconductor shortage,' but found these instructions often were vague enough that subagents misinterpreted the task or performed the exact same searches as other agents. For instance, one subagent explored the 2021 automotive chip crisis while 2 others duplicated work investigating current 2025 supply chains, without an effective division of labor.
Scale effort to query complexity. Agents struggle to judge appropriate effort for different tasks, so we embedded scaling rules in the prompts. Simple fact-finding requires just 1 agent with 3-10 tool calls, direct comparisons might need 2-4 subagents with 10-15 calls each, and complex research might use more than 10 subagents with clearly divided responsibilities. These explicit guidelines help the lead agent allocate resources efficiently and prevent overinvestment in simple queries, which was a common failure mode in our early versions.
Tool design and selection are critical. Agent-tool interfaces are as critical as human-computer interfaces. Using the right tool is efficient—often, it’s strictly necessary. For instance, an agent searching the web for context that only exists in Slack is doomed from the start. With MCP servers that give the model access to external tools, this problem compounds, as agents encounter unseen tools with descriptions of wildly varying quality. We gave our agents explicit heuristics: for example, examine all available tools first, match tool usage to user intent, search the web for broad external exploration, or prefer specialized tools over generic ones. Bad tool descriptions can send agents down completely wrong paths, so each tool needs a distinct purpose and a clear description.
Let agents improve themselves. We found that the Claude 4 models can be excellent prompt engineers. When given a prompt and a failure mode, they are able to diagnose why the agent is failing and suggest improvements. We even created a tool-testing agent—when given a flawed MCP tool, it attempts to use the tool and then rewrites the tool description to avoid failures. By testing the tool dozens of times, this agent found key nuances and bugs. This process for improving tool ergonomics resulted in a 40% decrease in task completion time for future agents using the new description, because they were able to avoid most mistakes.
Start wide, then narrow down. Search strategy should mirror expert human research: explore the landscape before drilling into specifics. Agents often default to overly long, specific queries that return few results. We counteracted this tendency by prompting agents to start with short, broad queries, evaluate what’s available, then progressively narrow focus.
Guide the thinking process. Extended thinking mode, which leads Claude to output additional tokens in a visible thinking process, can serve as a controllable scratchpad. The lead agent uses thinking to plan its approach, assessing which tools fit the task, determining query complexity and subagent count, and defining each subagent’s role. Our testing showed that extended thinking improved instruction-following, reasoning, and efficiency. Subagents also plan, then use interleaved thinking after tool results to evaluate quality, identify gaps, and refine their next query. This makes subagents more effective in adapting to any task.
Parallel tool calling transforms speed and performance. Complex research tasks naturally involve exploring many sources. Our early agents executed sequential searches, which was painfully slow. For speed, we introduced two kinds of parallelization: (1) the lead agent spins up 3-5 subagents in parallel rather than serially; (2) the subagents use 3+ tools in parallel. These changes cut research time by up to 90% for complex queries, allowing Research to do more work in minutes instead of hours while covering more information than other systems.
Our prompting strategy focuses on instilling good heuristics rather than rigid rules. We studied how skilled humans approach research tasks and encoded these strategies in our prompts—strategies like decomposing difficult questions into smaller tasks, carefully evaluating the quality of sources, adjusting search approaches based on new information, and recognizing when to focus on depth (investigating one topic in detail) vs. breadth (exploring many topics in parallel). We also proactively mitigated unintended side effects by setting explicit guardrails to prevent the agents from spiraling out of control. Finally, we focused on a fast iteration loop with observability and test cases.
Effective evaluation of agents
Good evaluations are essential for building reliable AI applications, and agents are no different. However, evaluating multi-agent systems presents unique challenges. Traditional evaluations often assume that the AI follows the same steps each time: given input X, the system should follow path Y to produce output Z. But multi-agent systems don't work this way. Even with identical starting points, agents might take completely different valid paths to reach their goal. One agent might search three sources while another searches ten, or they might use different tools to find the same answer. Because we don’t always know what the right steps are, we usually can't just check if agents followed the “correct” steps we prescribed in advance. Instead, we need flexible evaluation methods that judge whether agents achieved the right outcomes while also following a reasonable process.
Start evaluating immediately with small samples. In early agent development, changes tend to have dramatic impacts because there is abundant low-hanging fruit. A prompt tweak might boost success rates from 30% to 80%. With effect sizes this large, you can spot changes with just a few test cases. We started with a set of about 20 queries representing real usage patterns. Testing these queries often allowed us to clearly see the impact of changes. We often hear that AI developer teams delay creating evals because they believe that only large evals with hundreds of test cases are useful. However, it’s best to start with small-scale testing right away with a few examples, rather than delaying until you can build more thorough evals.
LLM-as-judge evaluation scales when done well. Research outputs are difficult to evaluate programmatically, since they are free-form text and rarely have a single correct answer. LLMs are a natural fit for grading outputs. We used an LLM judge that evaluated each output against criteria in a rubric: factual accuracy (do claims match sources?), citation accuracy (do the cited sources match the claims?), completeness (are all requested aspects covered?), source quality (did it use primary sources over lower-quality secondary sources?), and tool efficiency (did it use the right tools a reasonable number of times?). We experimented with multiple judges to evaluate each component, but found that a single LLM call with a single prompt outputting scores from 0.0-1.0 and a pass-fail grade was the most consistent and aligned with human judgements. This method was especially effective when the eval test cases did have a clear answer, and we could use the LLM judge to simply check if the answer was correct (i.e. did it accurately list the pharma companies with the top 3 largest R&D budgets?). Using an LLM as a judge allowed us to scalably evaluate hundreds of outputs.
Human evaluation catches what automation misses. People testing agents find edge cases that evals miss. These include hallucinated answers on unusual queries, system failures, or subtle source selection biases. In our case, human testers noticed that our early agents consistently chose SEO-optimized content farms over authoritative but less highly-ranked sources like academic PDFs or personal blogs. Adding source quality heuristics to our prompts helped resolve this issue. Even in a world of automated evaluations, manual testing remains essential.
Multi-agent systems have emergent behaviors, which arise without specific programming. For instance, small changes to the lead agent can unpredictably change how subagents behave. Success requires understanding interaction patterns, not just individual agent behavior. Therefore, the best prompts for these agents are not just strict instructions, but frameworks for collaboration that define the division of labor, problem-solving approaches, and effort budgets. Getting this right relies on careful prompting and tool design, solid heuristics, observability, and tight feedback loops. See the open-source prompts in our Cookbook for example prompts from our system.
Production reliability and engineering challenges
In traditional software, a bug might break a feature, degrade performance, or cause outages. In agentic systems, minor changes cascade into large behavioral changes, which makes it remarkably difficult to write code for complex agents that must maintain state in a long-running process.
Agents are stateful and errors compound. Agents can run for long periods of time, maintaining state across many tool calls. This means we need to durably execute code and handle errors along the way. Without effective mitigations, minor system failures can be catastrophic for agents. When errors occur, we can't just restart from the beginning: restarts are expensive and frustrating for users. Instead, we built systems that can resume from where the agent was when the errors occurred. We also use the model’s intelligence to handle issues gracefully: for instance, letting the agent know when a tool is failing and letting it adapt works surprisingly well. We combine the adaptability of AI agents built on Claude with deterministic safeguards like retry logic and regular checkpoints.
Debugging benefits from new approaches. Agents make dynamic decisions and are non-deterministic between runs, even with identical prompts. This makes debugging harder. For instance, users would report agents “not finding obvious information,” but we couldn't see why. Were the agents using bad search queries? Choosing poor sources? Hitting tool failures? Adding full production tracing let us diagnose why agents failed and fix issues systematically. Beyond standard observability, we monitor agent decision patterns and interaction structures—all without monitoring the contents of individual conversations, to maintain user privacy. This high-level observability helped us diagnose root causes, discover unexpected behaviors, and fix common failures.
Deployment needs careful coordination. Agent systems are highly stateful webs of prompts, tools, and execution logic that run almost continuously. This means that whenever we deploy updates, agents might be anywhere in their process. We therefore need to prevent our well-meaning code changes from breaking existing agents. We can’t update every agent to the new version at the same time. Instead, we use rainbow deployments to avoid disrupting running agents, by gradually shifting traffic from old to new versions while keeping both running simultaneously.
Synchronous execution creates bottlenecks. Currently, our lead agents execute subagents synchronously, waiting for each set of subagents to complete before proceeding. This simplifies coordination, but creates bottlenecks in the information flow between agents. For instance, the lead agent can’t steer subagents, subagents can’t coordinate, and the entire system can be blocked while waiting for a single subagent to finish searching. Asynchronous execution would enable additional parallelism: agents working concurrently and creating new subagents when needed. But this asynchronicity adds challenges in result coordination, state consistency, and error propagation across the subagents. As models can handle longer and more complex research tasks, we expect the performance gains will justify the complexity.
Conclusion
When building AI agents, the last mile often becomes most of the journey. Codebases that work on developer machines require significant engineering to become reliable production systems. The compound nature of errors in agentic systems means that minor issues for traditional software can derail agents entirely. One step failing can cause agents to explore entirely different trajectories, leading to unpredictable outcomes. For all the reasons described in this post, the gap between prototype and production is often wider than anticipated.
Despite these challenges, multi-agent systems have proven valuable for open-ended research tasks. Users have said that Claude helped them find business opportunities they hadn’t considered, navigate complex healthcare options, resolve thorny technical bugs, and save up to days of work by uncovering research connections they wouldn't have found alone. Multi-agent research systems can operate reliably at scale with careful engineering, comprehensive testing, detail-oriented prompt and tool design, robust operational practices, and tight collaboration between research, product, and engineering teams who have a strong understanding of current agent capabilities. We're already seeing these systems transform how people solve complex problems.
A Clio embedding plot showing the most common ways people are using the Research feature today. The top use case categories are developing software systems across specialized domains (10%), develop and optimize professional and technical content (8%), develop business growth and revenue generation strategies (8%), assist with academic research and educational material development (7%), and research and verify information about people, places, or organizations (5%).
Acknowledgements
Written by Jeremy Hadfield, Barry Zhang, Kenneth Lien, Florian Scholz, Jeremy Fox, and Daniel Ford. This work reflects the collective efforts of several teams across Anthropic who made the Research feature possible. Special thanks go to the Anthropic apps engineering team, whose dedication brought this complex multi-agent system to production. We're also grateful to our early users for their excellent feedback.
Appendix
Below are some additional miscellaneous tips for multi-agent systems.
End-state evaluation of agents that mutate state over many turns. Evaluating agents that modify persistent state across multi-turn conversations presents unique challenges. Unlike read-only research tasks, each action can change the environment for subsequent steps, creating dependencies that traditional evaluation methods struggle to handle. We found success focusing on end-state evaluation rather than turn-by-turn analysis. Instead of judging whether the agent followed a specific process, evaluate whether it achieved the correct final state. This approach acknowledges that agents may find alternative paths to the same goal while still ensuring they deliver the intended outcome. For complex workflows, break evaluation into discrete checkpoints where specific state changes should have occurred, rather than attempting to validate every intermediate step.
Long-horizon conversation management. Production agents often engage in conversations spanning hundreds of turns, requiring careful context management strategies. As conversations extend, standard context windows become insufficient, necessitating intelligent compression and memory mechanisms. We implemented patterns where agents summarize completed work phases and store essential information in external memory before proceeding to new tasks. When context limits approach, agents can spawn fresh subagents with clean contexts while maintaining continuity through careful handoffs. Further, they can retrieve stored context like the research plan from their memory rather than losing previous work when reaching the context limit. This distributed approach prevents context overflow while preserving conversation coherence across extended interactions.
Subagent output to a filesystem to minimize the ‘game of telephone.’ Direct subagent outputs can bypass the main coordinator for certain types of results, improving both fidelity and performance. Rather than requiring subagents to communicate everything through the lead agent, implement artifact systems where specialized agents can create outputs that persist independently. Subagents call tools to store their work in external systems, then pass lightweight references back to the coordinator. This prevents information loss during multi-stage processing and reduces token overhead from copying large outputs through conversation history. The pattern works particularly well for structured outputs like code, reports, or data visualizations where the subagent's specialized prompt produces better results than filtering through a general coordinator.
-------
Writing effective tools for agents — with agents
Published Sep 11, 2025
Agents are only as effective as the tools we give them. We share how to write high-quality tools and evaluations, and how you can boost performance by using Claude to optimize its tools for itself.
The Model Context Protocol (MCP) can empower LLM agents with potentially hundreds of tools to solve real-world tasks. But how do we make those tools maximally effective?
In this post, we describe our most effective techniques for improving performance in a variety of agentic AI systems.
We begin by covering how you can:
Build and test prototypes of your tools
Create and run comprehensive evaluations of your tools with agents
Collaborate with agents like Claude Code to automatically increase the performance of your tools
We conclude with key principles for writing high-quality tools we’ve identified along the way:
Choosing the right tools to implement (and not to implement)
Namespacing tools to define clear boundaries in functionality
Returning meaningful context from tools back to agents
Optimizing tool responses for token efficiency
Prompt-engineering tool descriptions and specs
This is an image depicting how an engineer might use Claude Code to evaluate the efficacy of agentic tools.
Building an evaluation allows you to systematically measure the performance of your tools. You can use Claude Code to automatically optimize your tools against this evaluation.
What is a tool?
In computing, deterministic systems produce the same output every time given identical inputs, while non-deterministic systems—like agents—can generate varied responses even with the same starting conditions.
When we traditionally write software, we’re establishing a contract between deterministic systems. For instance, a function call like getWeather("NYC") will always fetch the weather in New York City in the exact same manner every time it is called.
Tools are a new kind of software which reflects a contract between deterministic systems and non-deterministic agents. When a user asks “Should I bring an umbrella today?”, an agent might call the weather tool, answer from general knowledge, or even ask a clarifying question about location first. Occasionally, an agent might hallucinate or even fail to grasp how to use a tool.
This means fundamentally rethinking our approach when writing software for agents: instead of writing tools and MCP servers the way we’d write functions and APIs for other developers or systems, we need to design them for agents.
Our goal is to increase the surface area over which agents can be effective in solving a wide range of tasks by using tools to pursue a variety of successful strategies. Fortunately, in our experience, the tools that are most “ergonomic” for agents also end up being surprisingly intuitive to grasp as humans.
How to write tools
In this section, we describe how you can collaborate with agents both to write and to improve the tools you give them. Start by standing up a quick prototype of your tools and testing them locally. Next, run a comprehensive evaluation to measure subsequent changes. Working alongside agents, you can repeat the process of evaluating and improving your tools until your agents achieve strong performance on real-world tasks.
Building a prototype
It can be difficult to anticipate which tools agents will find ergonomic and which tools they won’t without getting hands-on yourself. Start by standing up a quick prototype of your tools. If you’re using Claude Code to write your tools (potentially in one-shot), it helps to give Claude documentation for any software libraries, APIs, or SDKs (including potentially the MCP SDK) your tools will rely on. LLM-friendly documentation can commonly be found in flat llms.txt files on official documentation sites (here’s our API’s).
Wrapping your tools in a local MCP server or Desktop extension (DXT) will allow you to connect and test your tools in Claude Code or the Claude Desktop app.
To connect your local MCP server to Claude Code, run claude mcp add <name> <command> [args...].
To connect your local MCP server or DXT to the Claude Desktop app, navigate to Settings > Developer or Settings > Extensions, respectively.
Tools can also be passed directly into Anthropic API calls for programmatic testing.
Test the tools yourself to identify any rough edges. Collect feedback from your users to build an intuition around the use-cases and prompts you expect your tools to enable.
Running an evaluation
Next, you need to measure how well Claude uses your tools by running an evaluation. Start by generating lots of evaluation tasks, grounded in real world uses. We recommend collaborating with an agent to help analyze your results and determine how to improve your tools. See this process end-to-end in our tool evaluation cookbook.
This graph measures the test set accuracy of human-written vs. Claude-optimized Slack MCP servers.
Held-out test set performance of our internal Slack tools
Generating evaluation tasks
With your early prototype, Claude Code can quickly explore your tools and create dozens of prompt and response pairs. Prompts should be inspired by real-world uses and be based on realistic data sources and services (for example, internal knowledge bases and microservices). We recommend you avoid overly simplistic or superficial “sandbox” environments that don’t stress-test your tools with sufficient complexity. Strong evaluation tasks might require multiple tool calls—potentially dozens.
Here are some examples of strong tasks:
Schedule a meeting with Jane next week to discuss our latest Acme Corp project. Attach the notes from our last project planning meeting and reserve a conference room.
Customer ID 9182 reported that they were charged three times for a single purchase attempt. Find all relevant log entries and determine if any other customers were affected by the same issue.
Customer Sarah Chen just submitted a cancellation request. Prepare a retention offer. Determine: (1) why they're leaving, (2) what retention offer would be most compelling, and (3) any risk factors we should be aware of before making an offer.
And here are some weaker tasks:
Schedule a meeting with jane@acme.corp next week.
Search the payment logs for purchase_complete and customer_id=9182.
Find the cancellation request by Customer ID 45892.
Each evaluation prompt should be paired with a verifiable response or outcome. Your verifier can be as simple as an exact string comparison between ground truth and sampled responses, or as advanced as enlisting Claude to judge the response. Avoid overly strict verifiers that reject correct responses due to spurious differences like formatting, punctuation, or valid alternative phrasings.
For each prompt-response pair, you can optionally also specify the tools you expect an agent to call in solving the task, to measure whether or not agents are successful in grasping each tool’s purpose during evaluation. However, because there might be multiple valid paths to solving tasks correctly, try to avoid overspecifying or overfitting to strategies.
Running the evaluation
We recommend running your evaluation programmatically with direct LLM API calls. Use simple agentic loops (while-loops wrapping alternating LLM API and tool calls): one loop for each evaluation task. Each evaluation agent should be given a single task prompt and your tools.
In your evaluation agents’ system prompts, we recommend instructing agents to output not just structured response blocks (for verification), but also reasoning and feedback blocks. Instructing agents to output these before tool call and response blocks may increase LLMs’ effective intelligence by triggering chain-of-thought (CoT) behaviors.
If you’re running your evaluation with Claude, you can turn on interleaved thinking for similar functionality “off-the-shelf”. This will help you probe why agents do or don’t call certain tools and highlight specific areas of improvement in tool descriptions and specs.
As well as top-level accuracy, we recommend collecting other metrics like the total runtime of individual tool calls and tasks, the total number of tool calls, the total token consumption, and tool errors. Tracking tool calls can help reveal common workflows that agents pursue and offer some opportunities for tools to consolidate.
This graph measures the test set accuracy of human-written vs. Claude-optimized Asana MCP servers.
Held-out test set performance of our internal Asana tools
Analyzing results
Agents are your helpful partners in spotting issues and providing feedback on everything from contradictory tool descriptions to inefficient tool implementations and confusing tool schemas. However, keep in mind that what agents omit in their feedback and responses can often be more important than what they include. LLMs don’t always say what they mean.
Observe where your agents get stumped or confused. Read through your evaluation agents’ reasoning and feedback (or CoT) to identify rough edges. Review the raw transcripts (including tool calls and tool responses) to catch any behavior not explicitly described in the agent’s CoT. Read between the lines; remember that your evaluation agents don’t necessarily know the correct answers and strategies.
Analyze your tool calling metrics. Lots of redundant tool calls might suggest some rightsizing of pagination or token limit parameters is warranted; lots of tool errors for invalid parameters might suggest tools could use clearer descriptions or better examples. When we launched Claude’s web search tool, we identified that Claude was needlessly appending 2025 to the tool’s query parameter, biasing search results and degrading performance (we steered Claude in the right direction by improving the tool description).
Collaborating with agents
You can even let agents analyze your results and improve your tools for you. Simply concatenate the transcripts from your evaluation agents and paste them into Claude Code. Claude is an expert at analyzing transcripts and refactoring lots of tools all at once—for example, to ensure tool implementations and descriptions remain self-consistent when new changes are made.
In fact, most of the advice in this post came from repeatedly optimizing our internal tool implementations with Claude Code. Our evaluations were created on top of our internal workspace, mirroring the complexity of our internal workflows, including real projects, documents, and messages.
We relied on held-out test sets to ensure we did not overfit to our “training” evaluations. These test sets revealed that we could extract additional performance improvements even beyond what we achieved with "expert" tool implementations—whether those tools were manually written by our researchers or generated by Claude itself.
In the next section, we’ll share some of what we learned from this process.
Principles for writing effective tools
In this section, we distill our learnings into a few guiding principles for writing effective tools.
Choosing the right tools for agents
More tools don’t always lead to better outcomes. A common error we’ve observed is tools that merely wrap existing software functionality or API endpoints—whether or not the tools are appropriate for agents. This is because agents have distinct “affordances” to traditional software—that is, they have different ways of perceiving the potential actions they can take with those tools.
LLM agents have limited "context" (that is, there are limits to how much information they can process at once), whereas computer memory is cheap and abundant. Consider the task of searching for a contact in an address book. Traditional software programs can efficiently store and process a list of contacts one at a time, checking each one before moving on.
However, if an LLM agent uses a tool that returns ALL contacts and then has to read through each one token-by-token, it's wasting its limited context space on irrelevant information (imagine searching for a contact in your address book by reading each page from top-to-bottom—that is, via brute-force search). The better and more natural approach (for agents and humans alike) is to skip to the relevant page first (perhaps finding it alphabetically).
We recommend building a few thoughtful tools targeting specific high-impact workflows, which match your evaluation tasks, and scaling up from there. In the address book case, you might choose to implement a search_contacts or message_contact tool instead of a list_contacts tool.
Tools can consolidate functionality, handling potentially multiple discrete operations (or API calls) under the hood. For example, tools can enrich tool responses with related metadata or handle frequently chained, multi-step tasks in a single tool call.
Here are some examples:
Instead of implementing list_users, list_events, and create_event tools, consider implementing a schedule_event tool which finds availability and schedules an event.
Instead of implementing a read_logs tool, consider implementing a search_logs tool which only returns relevant log lines and some surrounding context.
Instead of implementing get_customer_by_id, list_transactions, and list_notes tools, implement a get_customer_context tool which compiles all of a customer’s recent & relevant information all at once.
Make sure each tool you build has a clear, distinct purpose. Tools should enable agents to subdivide and solve tasks in much the same way that a human would, given access to the same underlying resources, and simultaneously reduce the context that would have otherwise been consumed by intermediate outputs.
Too many tools or overlapping tools can also distract agents from pursuing efficient strategies. Careful, selective planning of the tools you build (or don’t build) can really pay off.
Namespacing your tools
Your AI agents will potentially gain access to dozens of MCP servers and hundreds of different tools–including those by other developers. When tools overlap in function or have a vague purpose, agents can get confused about which ones to use.
Namespacing (grouping related tools under common prefixes) can help delineate boundaries between lots of tools; MCP clients sometimes do this by default. For example, namespacing tools by service (e.g., asana_search, jira_search) and by resource (e.g., asana_projects_search, asana_users_search), can help agents select the right tools at the right time.
We have found selecting between prefix- and suffix-based namespacing to have non-trivial effects on our tool-use evaluations. Effects vary by LLM and we encourage you to choose a naming scheme according to your own evaluations.
Agents might call the wrong tools, call the right tools with the wrong parameters, call too few tools, or process tool responses incorrectly. By selectively implementing tools whose names reflect natural subdivisions of tasks, you simultaneously reduce the number of tools and tool descriptions loaded into the agent’s context and offload agentic computation from the agent’s context back into the tool calls themselves. This reduces an agent’s overall risk of making mistakes.
Returning meaningful context from your tools
In the same vein, tool implementations should take care to return only high signal information back to agents. They should prioritize contextual relevance over flexibility, and eschew low-level technical identifiers (for example: uuid, 256px_image_url, mime_type). Fields like name, image_url, and file_type are much more likely to directly inform agents’ downstream actions and responses.
Agents also tend to grapple with natural language names, terms, or identifiers significantly more successfully than they do with cryptic identifiers. We’ve found that merely resolving arbitrary alphanumeric UUIDs to more semantically meaningful and interpretable language (or even a 0-indexed ID scheme) significantly improves Claude’s precision in retrieval tasks by reducing hallucinations.
In some instances, agents may require the flexibility to interact with both natural language and technical identifier outputs, if only to trigger downstream tool calls (for example, search_user(name='jane') → send_message(id=12345)). You can enable both by exposing a simple response_format enum parameter in your tool, allowing your agent to control whether tools return “concise” or “detailed” responses (images below).
You can add more formats for even greater flexibility, similar to GraphQL where you can choose exactly which pieces of information you want to receive. Here is an example ResponseFormat enum to control tool response verbosity:
enum ResponseFormat {
DETAILED = "detailed",
CONCISE = "concise"
}
Copy
Here’s an example of a detailed tool response (206 tokens):
This code snippet depicts an example of a detailed tool response.
Here’s an example of a concise tool response (72 tokens):
This code snippet depicts a concise tool response.
Slack threads and thread replies are identified by unique thread_ts which are required to fetch thread replies. thread_ts and other IDs (channel_id, user_id) can be retrieved from a “detailed” tool response to enable further tool calls that require these. “concise” tool responses return only thread content and exclude IDs. In this example, we use ~⅓ of the tokens with “concise” tool responses.
Even your tool response structure—for example XML, JSON, or Markdown—can have an impact on evaluation performance: there is no one-size-fits-all solution. This is because LLMs are trained on next-token prediction and tend to perform better with formats that match their training data. The optimal response structure will vary widely by task and agent. We encourage you to select the best response structure based on your own evaluation.
Optimizing tool responses for token efficiency
Optimizing the quality of context is important. But so is optimizing the quantity of context returned back to agents in tool responses.
We suggest implementing some combination of pagination, range selection, filtering, and/or truncation with sensible default parameter values for any tool responses that could use up lots of context. For Claude Code, we restrict tool responses to 25,000 tokens by default. We expect the effective context length of agents to grow over time, but the need for context-efficient tools to remain.
If you choose to truncate responses, be sure to steer agents with helpful instructions. You can directly encourage agents to pursue more token-efficient strategies, like making many small and targeted searches instead of a single, broad search for a knowledge retrieval task. Similarly, if a tool call raises an error (for example, during input validation), you can prompt-engineer your error responses to clearly communicate specific and actionable improvements, rather than opaque error codes or tracebacks.
Here’s an example of a truncated tool response:
This image depicts an example of a truncated tool response.
Here’s an example of an unhelpful error response:
This image depicts an example of an unhelpful tool response.
Here’s an example of a helpful error response:
This image depicts an example of a helpful error response.
Tool truncation and error responses can steer agents towards more token-efficient tool-use behaviors (using filters or pagination) or give examples of correctly formatted tool inputs.
Prompt-engineering your tool descriptions
We now come to one of the most effective methods for improving tools: prompt-engineering your tool descriptions and specs. Because these are loaded into your agents’ context, they can collectively steer agents toward effective tool-calling behaviors.
When writing tool descriptions and specs, think of how you would describe your tool to a new hire on your team. Consider the context that you might implicitly bring—specialized query formats, definitions of niche terminology, relationships between underlying resources—and make it explicit. Avoid ambiguity by clearly describing (and enforcing with strict data models) expected inputs and outputs. In particular, input parameters should be unambiguously named: instead of a parameter named user, try a parameter named user_id.
With your evaluation you can measure the impact of your prompt engineering with greater confidence. Even small refinements to tool descriptions can yield dramatic improvements. Claude Sonnet 3.5 achieved state-of-the-art performance on the SWE-bench Verified evaluation after we made precise refinements to tool descriptions, dramatically reducing error rates and improving task completion.
You can find other best practices for tool definitions in our Developer Guide. If you’re building tools for Claude, we also recommend reading about how tools are dynamically loaded into Claude’s system prompt. Lastly, if you’re writing tools for an MCP server, tool annotations help disclose which tools require open-world access or make destructive changes.
Looking ahead
To build effective tools for agents, we need to re-orient our software development practices from predictable, deterministic patterns to non-deterministic ones.
Through the iterative, evaluation-driven process we’ve described in this post, we've identified consistent patterns in what makes tools successful: Effective tools are intentionally and clearly defined, use agent context judiciously, can be combined together in diverse workflows, and enable agents to intuitively solve real-world tasks.
In the future, we expect the specific mechanisms through which agents interact with the world to evolve—from updates to the MCP protocol to upgrades to the underlying LLMs themselves. With a systematic, evaluation-driven approach to improving tools for agents, we can ensure that as agents become more capable, the tools they use will evolve alongside them.
Acknowledgements
Written by Ken Aizawa with valuable contributions from colleagues across Research (Barry Zhang, Zachary Witten, Daniel Jiang, Sami Al-Sheikh, Matt Bell, Maggie Vo), MCP (Theodora Chu, John Welsh, David Soria Parra, Adam Jones), Product Engineering (Santiago Seira), Marketing (Molly Vorwerck), Design (Drew Roper), and Applied AI (Christian Ryan, Alexander Bricken).
1. Beyond training the underlying LLMs themselves.
-------
Effective context engineering for AI agents
Published Sep 29, 2025
Context is a critical but finite resource for AI agents. In this post, we explore strategies for effectively curating and managing the context that powers them.
After a few years of prompt engineering being the focus of attention in applied AI, a new term has come to prominence: context engineering. Building with language models is becoming less about finding the right words and phrases for your prompts, and more about answering the broader question of “what configuration of context is most likely to generate our model’s desired behavior?”
Context refers to the set of tokens included when sampling from a large-language model (LLM). The engineering problem at hand is optimizing the utility of those tokens against the inherent constraints of LLMs in order to consistently achieve a desired outcome. Effectively wrangling LLMs often requires thinking in context — in other words: considering the holistic state available to the LLM at any given time and what potential behaviors that state might yield.
In this post, we’ll explore the emerging art of context engineering and offer a refined mental model for building steerable, effective agents.
Context engineering vs. prompt engineering
At Anthropic, we view context engineering as the natural progression of prompt engineering. Prompt engineering refers to methods for writing and organizing LLM instructions for optimal outcomes (see our docs for an overview and useful prompt engineering strategies). Context engineering refers to the set of strategies for curating and maintaining the optimal set of tokens (information) during LLM inference, including all the other information that may land there outside of the prompts.
In the early days of engineering with LLMs, prompting was the biggest component of AI engineering work, as the majority of use cases outside of everyday chat interactions required prompts optimized for one-shot classification or text generation tasks. As the term implies, the primary focus of prompt engineering is how to write effective prompts, particularly system prompts. However, as we move towards engineering more capable agents that operate over multiple turns of inference and longer time horizons, we need strategies for managing the entire context state (system instructions, tools, Model Context Protocol (MCP), external data, message history, etc).
An agent running in a loop generates more and more data that could be relevant for the next turn of inference, and this information must be cyclically refined. Context engineering is the art and science of curating what will go into the limited context window from that constantly evolving universe of possible information.
Prompt engineering vs. context engineering
In contrast to the discrete task of writing a prompt, context engineering is iterative and the curation phase happens each time we decide what to pass to the model.
Why context engineering is important to building capable agents
Despite their speed and ability to manage larger and larger volumes of data, we’ve observed that LLMs, like humans, lose focus or experience confusion at a certain point. Studies on needle-in-a-haystack style benchmarking have uncovered the concept of context rot: as the number of tokens in the context window increases, the model’s ability to accurately recall information from that context decreases.
While some models exhibit more gentle degradation than others, this characteristic emerges across all models. Context, therefore, must be treated as a finite resource with diminishing marginal returns. Like humans, who have limited working memory capacity, LLMs have an “attention budget” that they draw on when parsing large volumes of context. Every new token introduced depletes this budget by some amount, increasing the need to carefully curate the tokens available to the LLM.
This attention scarcity stems from architectural constraints of LLMs. LLMs are based on the transformer architecture, which enables every token to attend to every other token across the entire context. This results in n² pairwise relationships for n tokens.
As its context length increases, a model's ability to capture these pairwise relationships gets stretched thin, creating a natural tension between context size and attention focus. Additionally, models develop their attention patterns from training data distributions where shorter sequences are typically more common than longer ones. This means models have less experience with, and fewer specialized parameters for, context-wide dependencies.
Techniques like position encoding interpolation allow models to handle longer sequences by adapting them to the originally trained smaller context, though with some degradation in token position understanding. These factors create a performance gradient rather than a hard cliff: models remain highly capable at longer contexts but may show reduced precision for information retrieval and long-range reasoning compared to their performance on shorter contexts.
These realities mean that thoughtful context engineering is essential for building capable agents.
The anatomy of effective context
Given that LLMs are constrained by a finite attention budget, good context engineering means finding the smallest possible set of high-signal tokens that maximize the likelihood of some desired outcome. Implementing this practice is much easier said than done, but in the following section, we outline what this guiding principle means in practice across the different components of context.
System prompts should be extremely clear and use simple, direct language that presents ideas at the right altitude for the agent. The right altitude is the Goldilocks zone between two common failure modes. At one extreme, we see engineers hardcoding complex, brittle logic in their prompts to elicit exact agentic behavior. This approach creates fragility and increases maintenance complexity over time. At the other extreme, engineers sometimes provide vague, high-level guidance that fails to give the LLM concrete signals for desired outputs or falsely assumes shared context. The optimal altitude strikes a balance: specific enough to guide behavior effectively, yet flexible enough to provide the model with strong heuristics to guide behavior.
Calibrating the system prompt in the process of context engineering.
At one end of the spectrum, we see brittle if-else hardcoded prompts, and at the other end we see prompts that are overly general or falsely assume shared context.
We recommend organizing prompts into distinct sections (like <background_information>, <instructions>, ## Tool guidance, ## Output description, etc) and using techniques like XML tagging or Markdown headers to delineate these sections, although the exact formatting of prompts is likely becoming less important as models become more capable.
Regardless of how you decide to structure your system prompt, you should be striving for the minimal set of information that fully outlines your expected behavior. (Note that minimal does not necessarily mean short; you still need to give the agent sufficient information up front to ensure it adheres to the desired behavior.) It’s best to start by testing a minimal prompt with the best model available to see how it performs on your task, and then add clear instructions and examples to improve performance based on failure modes found during initial testing.
Tools allow agents to operate with their environment and pull in new, additional context as they work. Because tools define the contract between agents and their information/action space, it’s extremely important that tools promote efficiency, both by returning information that is token efficient and by encouraging efficient agent behaviors.
In Writing tools for AI agents – with AI agents, we discussed building tools that are well understood by LLMs and have minimal overlap in functionality. Similar to the functions of a well-designed codebase, tools should be self-contained, robust to error, and extremely clear with respect to their intended use. Input parameters should similarly be descriptive, unambiguous, and play to the inherent strengths of the model.
One of the most common failure modes we see is bloated tool sets that cover too much functionality or lead to ambiguous decision points about which tool to use. If a human engineer can’t definitively say which tool should be used in a given situation, an AI agent can’t be expected to do better. As we’ll discuss later, curating a minimal viable set of tools for the agent can also lead to more reliable maintenance and pruning of context over long interactions.
Providing examples, otherwise known as few-shot prompting, is a well known best practice that we continue to strongly advise. However, teams will often stuff a laundry list of edge cases into a prompt in an attempt to articulate every possible rule the LLM should follow for a particular task. We do not recommend this. Instead, we recommend working to curate a set of diverse, canonical examples that effectively portray the expected behavior of the agent. For an LLM, examples are the “pictures” worth a thousand words.
Our overall guidance across the different components of context (system prompts, tools, examples, message history, etc) is to be thoughtful and keep your context informative, yet tight. Now let's dive into dynamically retrieving context at runtime.
Context retrieval and agentic search
In Building effective AI agents, we highlighted the differences between LLM-based workflows and agents. Since we wrote that post, we’ve gravitated towards a simple definition for agents: LLMs autonomously using tools in a loop.
Working alongside our customers, we’ve seen the field converging on this simple paradigm. As the underlying models become more capable, the level of autonomy of agents can scale: smarter models allow agents to independently navigate nuanced problem spaces and recover from errors.
We’re now seeing a shift in how engineers think about designing context for agents. Today, many AI-native applications employ some form of embedding-based pre-inference time retrieval to surface important context for the agent to reason over. As the field transitions to more agentic approaches, we increasingly see teams augmenting these retrieval systems with “just in time” context strategies.
Rather than pre-processing all relevant data up front, agents built with the “just in time” approach maintain lightweight identifiers (file paths, stored queries, web links, etc.) and use these references to dynamically load data into context at runtime using tools. Anthropic’s agentic coding solution Claude Code uses this approach to perform complex data analysis over large databases. The model can write targeted queries, store results, and leverage Bash commands like head and tail to analyze large volumes of data without ever loading the full data objects into context. This approach mirrors human cognition: we generally don’t memorize entire corpuses of information, but rather introduce external organization and indexing systems like file systems, inboxes, and bookmarks to retrieve relevant information on demand.
Beyond storage efficiency, the metadata of these references provides a mechanism to efficiently refine behavior, whether explicitly provided or intuitive. To an agent operating in a file system, the presence of a file named test_utils.py in a tests folder implies a different purpose than a file with the same name located in src/core_logic/. Folder hierarchies, naming conventions, and timestamps all provide important signals that help both humans and agents understand how and when to utilize information.
Letting agents navigate and retrieve data autonomously also enables progressive disclosure—in other words, allows agents to incrementally discover relevant context through exploration. Each interaction yields context that informs the next decision: file sizes suggest complexity; naming conventions hint at purpose; timestamps can be a proxy for relevance. Agents can assemble understanding layer by layer, maintaining only what's necessary in working memory and leveraging note-taking strategies for additional persistence. This self-managed context window keeps the agent focused on relevant subsets rather than drowning in exhaustive but potentially irrelevant information.
Of course, there's a trade-off: runtime exploration is slower than retrieving pre-computed data. Not only that, but opinionated and thoughtful engineering is required to ensure that an LLM has the right tools and heuristics for effectively navigating its information landscape. Without proper guidance, an agent can waste context by misusing tools, chasing dead-ends, or failing to identify key information.
In certain settings, the most effective agents might employ a hybrid strategy, retrieving some data up front for speed, and pursuing further autonomous exploration at their discretion. The decision boundary for the ‘right’ level of autonomy depends on the task. Claude Code is an agent that employs this hybrid model: CLAUDE.md files are naively dropped into context up front, while primitives like glob and grep allow it to navigate its environment and retrieve files just-in-time, effectively bypassing the issues of stale indexing and complex syntax trees.
The hybrid strategy might be better suited for contexts with less dynamic content, such as legal or finance work. As model capabilities improve, agentic design will trend towards letting intelligent models act intelligently, with progressively less human curation. Given the rapid pace of progress in the field, "do the simplest thing that works" will likely remain our best advice for teams building agents on top of Claude.
Context engineering for long-horizon tasks
Long-horizon tasks require agents to maintain coherence, context, and goal-directed behavior over sequences of actions where the token count exceeds the LLM’s context window. For tasks that span tens of minutes to multiple hours of continuous work, like large codebase migrations or comprehensive research projects, agents require specialized techniques to work around the context window size limitation.
Waiting for larger context windows might seem like an obvious tactic. But it's likely that for the foreseeable future, context windows of all sizes will be subject to context pollution and information relevance concerns—at least for situations where the strongest agent performance is desired. To enable agents to work effectively across extended time horizons, we've developed a few techniques that address these context pollution constraints directly: compaction, structured note-taking, and multi-agent architectures.
Compaction
Compaction is the practice of taking a conversation nearing the context window limit, summarizing its contents, and reinitiating a new context window with the summary. Compaction typically serves as the first lever in context engineering to drive better long-term coherence. At its core, compaction distills the contents of a context window in a high-fidelity manner, enabling the agent to continue with minimal performance degradation.
In Claude Code, for example, we implement this by passing the message history to the model to summarize and compress the most critical details. The model preserves architectural decisions, unresolved bugs, and implementation details while discarding redundant tool outputs or messages. The agent can then continue with this compressed context plus the five most recently accessed files. Users get continuity without worrying about context window limitations.
The art of compaction lies in the selection of what to keep versus what to discard, as overly aggressive compaction can result in the loss of subtle but critical context whose importance only becomes apparent later. For engineers implementing compaction systems, we recommend carefully tuning your prompt on complex agent traces. Start by maximizing recall to ensure your compaction prompt captures every relevant piece of information from the trace, then iterate to improve precision by eliminating superfluous content.
An example of low-hanging superfluous content is clearing tool calls and results – once a tool has been called deep in the message history, why would the agent need to see the raw result again? One of the safest, lightest-touch forms of compaction is tool result clearing, most recently launched as a feature on the Claude Developer Platform.
Structured note-taking
Structured note-taking, or agentic memory, is a technique where the agent regularly writes notes persisted to memory outside of the context window. These notes get pulled back into the context window at later times.
This strategy provides persistent memory with minimal overhead. Like Claude Code creating a to-do list, or your custom agent maintaining a NOTES.md file, this simple pattern allows the agent to track progress across complex tasks, maintaining critical context and dependencies that would otherwise be lost across dozens of tool calls.
Claude playing Pokémon demonstrates how memory transforms agent capabilities in non-coding domains. The agent maintains precise tallies across thousands of game steps—tracking objectives like "for the last 1,234 steps I've been training my Pokémon in Route 1, Pikachu has gained 8 levels toward the target of 10." Without any prompting about memory structure, it develops maps of explored regions, remembers which key achievements it has unlocked, and maintains strategic notes of combat strategies that help it learn which attacks work best against different opponents.
After context resets, the agent reads its own notes and continues multi-hour training sequences or dungeon explorations. This coherence across summarization steps enables long-horizon strategies that would be impossible when keeping all the information in the LLM’s context window alone.
As part of our Sonnet 4.5 launch, we released a memory tool in public beta on the Claude Developer Platform that makes it easier to store and consult information outside the context window through a file-based system. This allows agents to build up knowledge bases over time, maintain project state across sessions, and reference previous work without keeping everything in context.
Sub-agent architectures
Sub-agent architectures provide another way around context limitations. Rather than one agent attempting to maintain state across an entire project, specialized sub-agents can handle focused tasks with clean context windows. The main agent coordinates with a high-level plan while sub-agents perform deep technical work or use tools to find relevant information. Each sub-agent might explore extensively, using tens of thousands of tokens or more, but returns only a condensed, distilled summary of its work (often 1,000-2,000 tokens).
This approach achieves a clear separation of concerns—the detailed search context remains isolated within sub-agents, while the lead agent focuses on synthesizing and analyzing the results. This pattern, discussed in How we built our multi-agent research system, showed a substantial improvement over single-agent systems on complex research tasks.
The choice between these approaches depends on task characteristics. For example:
Compaction maintains conversational flow for tasks requiring extensive back-and-forth;
Note-taking excels for iterative development with clear milestones;
Multi-agent architectures handle complex research and analysis where parallel exploration pays dividends.
Even as models continue to improve, the challenge of maintaining coherence across extended interactions will remain central to building more effective agents.
Conclusion
Context engineering represents a fundamental shift in how we build with LLMs. As models become more capable, the challenge isn't just crafting the perfect prompt—it's thoughtfully curating what information enters the model's limited attention budget at each step. Whether you're implementing compaction for long-horizon tasks, designing token-efficient tools, or enabling agents to explore their environment just-in-time, the guiding principle remains the same: find the smallest set of high-signal tokens that maximize the likelihood of your desired outcome.
The techniques we've outlined will continue evolving as models improve. We're already seeing that smarter models require less prescriptive engineering, allowing agents to operate with more autonomy. But even as capabilities scale, treating context as a precious, finite resource will remain central to building reliable, effective agents.
Get started with context engineering in the Claude Developer Platform today, and access helpful tips and best practices via our memory and context management cookbook.
---------
Effective harnesses for long-running agents
Published Nov 26, 2025
Agents still face challenges working across many context windows. We
gitextract_kaaor9qo/
├── .claude-plugin/
│ └── marketplace.json
├── .cursorindexingignore
├── .gitignore
├── .plugin/
│ └── plugin.json
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SKILL.md
├── docs/
│ ├── agentskills.md
│ ├── blogs.md
│ ├── claude_research.md
│ ├── compression.md
│ ├── gemini_research.md
│ ├── hncapsule.md
│ ├── netflix_context.md
│ ├── skills-improvement-analysis.md
│ └── vercel_tool.md
├── examples/
│ ├── book-sft-pipeline/
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── examples/
│ │ │ └── gertrude-stein/
│ │ │ ├── README.md
│ │ │ ├── dataset_sample.jsonl
│ │ │ ├── sample_outputs.md
│ │ │ └── training_config.json
│ │ ├── references/
│ │ │ ├── segmentation-strategies.md
│ │ │ ├── tinker-format.md
│ │ │ └── tinker.txt
│ │ └── scripts/
│ │ └── pipeline_example.py
│ ├── digital-brain-skill/
│ │ ├── .gitignore
│ │ ├── AGENT.md
│ │ ├── HOW-SKILLS-BUILT-THIS.md
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── SKILLS-MAPPING.md
│ │ ├── agents/
│ │ │ ├── AGENTS.md
│ │ │ └── scripts/
│ │ │ ├── content_ideas.py
│ │ │ ├── idea_to_draft.py
│ │ │ ├── stale_contacts.py
│ │ │ └── weekly_review.py
│ │ ├── content/
│ │ │ ├── CONTENT.md
│ │ │ ├── calendar.md
│ │ │ ├── engagement.jsonl
│ │ │ ├── ideas.jsonl
│ │ │ ├── posts.jsonl
│ │ │ └── templates/
│ │ │ ├── linkedin-post.md
│ │ │ ├── newsletter.md
│ │ │ └── thread.md
│ │ ├── examples/
│ │ │ ├── content-workflow.md
│ │ │ └── meeting-prep.md
│ │ ├── identity/
│ │ │ ├── IDENTITY.md
│ │ │ ├── bio-variants.md
│ │ │ ├── brand.md
│ │ │ ├── prompts/
│ │ │ │ ├── content-generation.xml
│ │ │ │ └── reply-generator.xml
│ │ │ ├── values.yaml
│ │ │ └── voice.md
│ │ ├── knowledge/
│ │ │ ├── KNOWLEDGE.md
│ │ │ ├── bookmarks.jsonl
│ │ │ ├── competitors.md
│ │ │ ├── learning.yaml
│ │ │ └── research/
│ │ │ └── _template.md
│ │ ├── network/
│ │ │ ├── NETWORK.md
│ │ │ ├── circles.yaml
│ │ │ ├── contacts.jsonl
│ │ │ ├── interactions.jsonl
│ │ │ └── intros.md
│ │ ├── operations/
│ │ │ ├── OPERATIONS.md
│ │ │ ├── goals.yaml
│ │ │ ├── meetings.jsonl
│ │ │ ├── metrics.jsonl
│ │ │ ├── reviews/
│ │ │ │ └── _weekly_template.md
│ │ │ └── todos.md
│ │ ├── package.json
│ │ ├── references/
│ │ │ └── file-formats.md
│ │ └── scripts/
│ │ └── install.sh
│ ├── interleaved-thinking/
│ │ ├── README.md
│ │ ├── SKILL.md
│ │ ├── docs/
│ │ │ ├── agentthinking.md
│ │ │ ├── interleavedthinking.md
│ │ │ └── m2-1.md
│ │ ├── examples/
│ │ │ ├── 01_basic_capture.py
│ │ │ ├── 02_tool_usage.py
│ │ │ └── 03_full_optimization.py
│ │ ├── generated_skills/
│ │ │ └── comprehensive-research-agent/
│ │ │ ├── SKILL.md
│ │ │ └── references/
│ │ │ ├── optimization_summary.json
│ │ │ ├── optimized_prompt.txt
│ │ │ └── patterns_found.json
│ │ ├── optimization_artifacts/
│ │ │ ├── final_prompt.txt
│ │ │ ├── iteration_1/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_10/
│ │ │ │ ├── analysis.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_2/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_3/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_4/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_5/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_6/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_7/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_8/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ ├── iteration_9/
│ │ │ │ ├── analysis.txt
│ │ │ │ ├── optimization.txt
│ │ │ │ ├── optimized_prompt.txt
│ │ │ │ └── trace.txt
│ │ │ └── summary.json
│ │ ├── pyproject.toml
│ │ ├── reasoning_trace_optimizer/
│ │ │ ├── __init__.py
│ │ │ ├── analyzer.py
│ │ │ ├── capture.py
│ │ │ ├── cli.py
│ │ │ ├── loop.py
│ │ │ ├── models.py
│ │ │ ├── optimizer.py
│ │ │ └── skill_generator.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ └── test_models.py
│ ├── llm-as-judge-skills/
│ │ ├── .gitignore
│ │ ├── .prettierrc
│ │ ├── CONTRIBUTING.md
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── agents/
│ │ │ ├── evaluator-agent/
│ │ │ │ └── evaluator-agent.md
│ │ │ ├── index.md
│ │ │ ├── orchestrator-agent/
│ │ │ │ └── orchestrator-agent.md
│ │ │ └── research-agent/
│ │ │ └── research-agent.md
│ │ ├── env.example
│ │ ├── eslint.config.js
│ │ ├── examples/
│ │ │ ├── basic-evaluation.ts
│ │ │ ├── full-evaluation-workflow.ts
│ │ │ ├── generate-rubric.ts
│ │ │ └── pairwise-comparison.ts
│ │ ├── package.json
│ │ ├── prompts/
│ │ │ ├── agent-system/
│ │ │ │ └── orchestrator-prompt.md
│ │ │ ├── evaluation/
│ │ │ │ ├── direct-scoring-prompt.md
│ │ │ │ └── pairwise-comparison-prompt.md
│ │ │ ├── index.md
│ │ │ └── research/
│ │ │ └── research-synthesis-prompt.md
│ │ ├── skills/
│ │ │ ├── context-fundamentals/
│ │ │ │ └── context-fundamentals.md
│ │ │ ├── index.md
│ │ │ ├── llm-evaluator/
│ │ │ │ └── llm-evaluator.md
│ │ │ └── tool-design/
│ │ │ └── tool-design.md
│ │ ├── src/
│ │ │ ├── agents/
│ │ │ │ ├── evaluator.ts
│ │ │ │ └── index.ts
│ │ │ ├── config/
│ │ │ │ └── index.ts
│ │ │ ├── index.ts
│ │ │ └── tools/
│ │ │ └── evaluation/
│ │ │ ├── direct-score.ts
│ │ │ ├── generate-rubric.ts
│ │ │ ├── index.ts
│ │ │ └── pairwise-compare.ts
│ │ ├── tests/
│ │ │ ├── evaluation.test.ts
│ │ │ ├── setup.ts
│ │ │ └── skills.test.ts
│ │ ├── tools/
│ │ │ ├── evaluation/
│ │ │ │ ├── direct-score.md
│ │ │ │ ├── generate-rubric.md
│ │ │ │ └── pairwise-compare.md
│ │ │ ├── index.md
│ │ │ ├── orchestration/
│ │ │ │ └── delegate-to-agent.md
│ │ │ └── research/
│ │ │ ├── read-url.md
│ │ │ └── web-search.md
│ │ ├── tsconfig.json
│ │ └── vitest.config.ts
│ └── x-to-book-system/
│ ├── PRD.md
│ ├── README.md
│ └── SKILLS-MAPPING.md
├── researcher/
│ ├── example_output.md
│ └── llm-as-a-judge.md
├── skills/
│ ├── advanced-evaluation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ ├── bias-mitigation.md
│ │ │ ├── evaluation-pipeline.md
│ │ │ ├── implementation-patterns.md
│ │ │ └── metrics-guide.md
│ │ └── scripts/
│ │ └── evaluation_example.py
│ ├── bdi-mental-states/
│ │ ├── SKILL.md
│ │ └── references/
│ │ ├── bdi-ontology-core.md
│ │ ├── framework-integration.md
│ │ ├── rdf-examples.md
│ │ └── sparql-competency.md
│ ├── context-compression/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── evaluation-framework.md
│ │ └── scripts/
│ │ └── compression_evaluator.py
│ ├── context-degradation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── patterns.md
│ │ └── scripts/
│ │ └── degradation_detector.py
│ ├── context-fundamentals/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── context-components.md
│ │ └── scripts/
│ │ └── context_manager.py
│ ├── context-optimization/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── optimization_techniques.md
│ │ └── scripts/
│ │ └── compaction.py
│ ├── evaluation/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── metrics.md
│ │ └── scripts/
│ │ └── evaluator.py
│ ├── filesystem-context/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── implementation-patterns.md
│ │ └── scripts/
│ │ └── filesystem_context.py
│ ├── hosted-agents/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── infrastructure-patterns.md
│ │ └── scripts/
│ │ └── sandbox_manager.py
│ ├── memory-systems/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── implementation.md
│ │ └── scripts/
│ │ └── memory_store.py
│ ├── multi-agent-patterns/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ └── frameworks.md
│ │ └── scripts/
│ │ └── coordination.py
│ ├── project-development/
│ │ ├── SKILL.md
│ │ ├── references/
│ │ │ ├── case-studies.md
│ │ │ └── pipeline-patterns.md
│ │ └── scripts/
│ │ └── pipeline_template.py
│ └── tool-design/
│ ├── SKILL.md
│ ├── references/
│ │ ├── architectural_reduction.md
│ │ └── best_practices.md
│ └── scripts/
│ └── description_generator.py
└── template/
└── SKILL.md
SYMBOL INDEX (431 symbols across 38 files)
FILE: examples/book-sft-pipeline/scripts/pipeline_example.py
class Chunk (line 17) | class Chunk:
class TrainingExample (line 23) | class TrainingExample:
method to_messages (line 28) | def to_messages(self) -> dict:
function segment_text (line 41) | def segment_text(text: str, min_words: int = 150, max_words: int = 400) ...
function build_examples (line 92) | def build_examples(chunk: Chunk, instruction: str, author: str, variants...
function generate_instruction (line 119) | def generate_instruction(chunk: Chunk, llm_call) -> str:
function build_tinker_datum (line 137) | def build_tinker_datum(example: dict, tokenizer, renderer):
function validate_style_transfer (line 163) | def validate_style_transfer(output: str, training_data_path: str) -> dict:
FILE: examples/digital-brain-skill/agents/scripts/content_ideas.py
function load_jsonl (line 14) | def load_jsonl(filepath):
function get_top_performing_content (line 32) | def get_top_performing_content():
function get_recent_bookmarks (line 48) | def get_recent_bookmarks(category=None):
function get_undeveloped_ideas (line 59) | def get_undeveloped_ideas():
function generate_suggestions (line 66) | def generate_suggestions(pillar=None, count=5):
FILE: examples/digital-brain-skill/agents/scripts/idea_to_draft.py
function load_jsonl (line 14) | def load_jsonl(filepath):
function find_idea (line 32) | def find_idea(idea_id):
function find_related_bookmarks (line 47) | def find_related_bookmarks(tags, pillar):
function find_similar_posts (line 63) | def find_similar_posts(pillar):
function generate_draft_scaffold (line 70) | def generate_draft_scaffold(idea_id):
FILE: examples/digital-brain-skill/agents/scripts/stale_contacts.py
function load_jsonl (line 21) | def load_jsonl(filepath):
function days_since (line 39) | def days_since(date_str):
function find_stale_contacts (line 49) | def find_stale_contacts():
function generate_report (line 82) | def generate_report():
FILE: examples/digital-brain-skill/agents/scripts/weekly_review.py
function load_jsonl (line 15) | def load_jsonl(filepath):
function get_week_range (line 34) | def get_week_range():
function analyze_content (line 41) | def analyze_content(week_start):
function analyze_network (line 55) | def analyze_network(week_start):
function analyze_metrics (line 66) | def analyze_metrics():
function generate_review (line 73) | def generate_review():
FILE: examples/interleaved-thinking/examples/01_basic_capture.py
function main (line 20) | def main():
FILE: examples/interleaved-thinking/examples/02_tool_usage.py
function execute_tool (line 61) | def execute_tool(name: str, input_data: dict) -> str:
function main (line 107) | def main():
FILE: examples/interleaved-thinking/examples/03_full_optimization.py
function execute_tool (line 838) | def execute_tool(name: str, input_data: dict) -> str:
function main (line 1015) | def main():
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/analyzer.py
class TraceAnalyzer (line 123) | class TraceAnalyzer:
method __init__ (line 142) | def __init__(
method analyze (line 162) | def analyze(
method analyze_batch (line 216) | def analyze_batch(
method quick_score (line 223) | def quick_score(
method _format_trace_for_analysis (line 266) | def _format_trace_for_analysis(self, trace: ReasoningTrace) -> str:
method _format_tool_calls (line 276) | def _format_tool_calls(self, trace: ReasoningTrace) -> str:
method _get_thinking_excerpts (line 291) | def _get_thinking_excerpts(self, trace: ReasoningTrace, max_chars: int...
method _parse_analysis_response (line 305) | def _parse_analysis_response(
method _fallback_parse_analysis (line 367) | def _fallback_parse_analysis(
method _extract_fallback_score (line 404) | def _extract_fallback_score(self, response_text: str) -> float:
function format_analysis_report (line 426) | def format_analysis_report(analysis: AnalysisResult) -> str:
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/capture.py
class TraceCapture (line 23) | class TraceCapture:
method __init__ (line 42) | def __init__(
method run (line 62) | def run(
method _process_response (line 159) | def _process_response(
method _execute_tool (line 191) | def _execute_tool(
method run_streaming (line 231) | def run_streaming(
function format_trace_for_display (line 379) | def format_trace_for_display(trace: ReasoningTrace) -> str:
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/cli.py
function cmd_capture (line 23) | def cmd_capture(args: argparse.Namespace) -> None:
function cmd_analyze (line 48) | def cmd_analyze(args: argparse.Namespace) -> None:
function cmd_optimize (line 82) | def cmd_optimize(args: argparse.Namespace) -> None:
function cmd_generate_skill (line 127) | def cmd_generate_skill(args: argparse.Namespace) -> None:
function main (line 172) | def main() -> None:
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/loop.py
class LoopConfig (line 35) | class LoopConfig:
class OptimizationLoop (line 58) | class OptimizationLoop:
method __init__ (line 84) | def __init__(
method run (line 111) | def run(
method run_single (line 307) | def run_single(
method _calculate_score (line 332) | def _calculate_score(
method _check_convergence (line 349) | def _check_convergence(
method _print_iteration_summary (line 378) | def _print_iteration_summary(self, iteration: LoopIteration) -> None:
method _print_final_summary (line 398) | def _print_final_summary(self, result: LoopResult) -> None:
method _save_iteration_artifacts (line 410) | def _save_iteration_artifacts(self, iteration: LoopIteration, num: int...
method _save_final_artifacts (line 431) | def _save_final_artifacts(self, result: LoopResult) -> None:
function run_quick_optimization (line 453) | def run_quick_optimization(
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/models.py
class PatternType (line 11) | class PatternType(Enum):
class Severity (line 26) | class Severity(Enum):
class ThinkingBlock (line 36) | class ThinkingBlock:
class ToolCall (line 52) | class ToolCall:
class ReasoningTrace (line 65) | class ReasoningTrace:
method get_thinking_at_turn (line 84) | def get_thinking_at_turn(self, turn: int) -> ThinkingBlock | None:
method get_tool_calls_at_turn (line 91) | def get_tool_calls_at_turn(self, turn: int) -> list[ToolCall]:
class Pattern (line 97) | class Pattern:
class AnalysisResult (line 110) | class AnalysisResult:
class PromptDiff (line 134) | class PromptDiff:
class OptimizationResult (line 144) | class OptimizationResult:
class LoopIteration (line 161) | class LoopIteration:
class LoopResult (line 176) | class LoopResult:
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/optimizer.py
class PromptOptimizer (line 99) | class PromptOptimizer:
method __init__ (line 120) | def __init__(
method optimize (line 140) | def optimize(
method optimize_iterative (line 198) | def optimize_iterative(
method suggest_tool_improvements (line 247) | def suggest_tool_improvements(
method _format_patterns (line 304) | def _format_patterns(self, analysis: AnalysisResult) -> str:
method _format_patterns_for_tools (line 319) | def _format_patterns_for_tools(self, patterns: list) -> str:
method _parse_optimization_response (line 325) | def _parse_optimization_response(
method _fallback_extract_prompt (line 372) | def _fallback_extract_prompt(self, response_text: str) -> str | None:
function format_optimization_report (line 414) | def format_optimization_report(result: OptimizationResult) -> str:
FILE: examples/interleaved-thinking/reasoning_trace_optimizer/skill_generator.py
function _format_list_to_markdown (line 83) | def _format_list_to_markdown(items: list | str) -> str:
function _format_numbered_list_to_markdown (line 99) | def _format_numbered_list_to_markdown(items: list | str) -> str:
function _format_examples_to_markdown (line 115) | def _format_examples_to_markdown(examples: list | str) -> str:
class SkillGenerator (line 138) | class SkillGenerator:
method __init__ (line 157) | def __init__(
method generate (line 177) | def generate(
method generate_from_analysis (line 253) | def generate_from_analysis(
method _collect_patterns (line 321) | def _collect_patterns(self, result: LoopResult) -> list[Pattern]:
method _collect_recommendations (line 335) | def _collect_recommendations(self, result: LoopResult) -> list[str]:
method _collect_key_changes (line 348) | def _collect_key_changes(self, result: LoopResult) -> list[str]:
method _generate_skill_content (line 358) | def _generate_skill_content(
method _save_references (line 442) | def _save_references(
function generate_skill_from_loop (line 485) | def generate_skill_from_loop(
FILE: examples/interleaved-thinking/tests/test_models.py
function test_thinking_block_creation (line 19) | def test_thinking_block_creation():
function test_tool_call_creation (line 31) | def test_tool_call_creation():
function test_reasoning_trace_creation (line 45) | def test_reasoning_trace_creation():
function test_pattern_creation (line 73) | def test_pattern_creation():
function test_analysis_result_creation (line 89) | def test_analysis_result_creation():
function test_optimization_result_creation (line 97) | def test_optimization_result_creation():
function test_loop_result_creation (line 113) | def test_loop_result_creation():
function test_pattern_types (line 121) | def test_pattern_types():
function test_severity_levels (line 139) | def test_severity_levels():
FILE: examples/llm-as-judge-skills/examples/basic-evaluation.ts
function main (line 13) | async function main() {
FILE: examples/llm-as-judge-skills/examples/full-evaluation-workflow.ts
function main (line 16) | async function main() {
FILE: examples/llm-as-judge-skills/examples/generate-rubric.ts
function main (line 13) | async function main() {
FILE: examples/llm-as-judge-skills/examples/pairwise-comparison.ts
function main (line 13) | async function main() {
FILE: examples/llm-as-judge-skills/src/agents/evaluator.ts
type EvaluatorAgentConfig (line 13) | interface EvaluatorAgentConfig {
class EvaluatorAgent (line 19) | class EvaluatorAgent {
method constructor (line 23) | constructor(agentConfig?: EvaluatorAgentConfig) {
method score (line 31) | async score(input: DirectScoreInput) {
method compare (line 38) | async compare(input: PairwiseCompareInput) {
method generateRubric (line 45) | async generateRubric(input: GenerateRubricInput) {
method evaluateWithGeneratedRubric (line 52) | async evaluateWithGeneratedRubric(
method chat (line 93) | async chat(userMessage: string) {
FILE: examples/llm-as-judge-skills/src/config/index.ts
function validateConfig (line 13) | function validateConfig(): void {
FILE: examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts
type DirectScoreInput (line 26) | type DirectScoreInput = z.infer<typeof DirectScoreInputSchema>;
type DirectScoreOutput (line 53) | type DirectScoreOutput = z.infer<typeof DirectScoreOutputSchema>;
function executeDirectScore (line 55) | async function executeDirectScore(input: DirectScoreInput): Promise<Dire...
FILE: examples/llm-as-judge-skills/src/tools/evaluation/generate-rubric.ts
type GenerateRubricInput (line 16) | type GenerateRubricInput = z.infer<typeof GenerateRubricInputSchema>;
type GenerateRubricOutput (line 48) | type GenerateRubricOutput = z.infer<typeof GenerateRubricOutputSchema>;
function executeGenerateRubric (line 50) | async function executeGenerateRubric(input: GenerateRubricInput): Promis...
FILE: examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts
type PairwiseCompareInput (line 17) | type PairwiseCompareInput = z.infer<typeof PairwiseCompareInputSchema>;
type PairwiseCompareOutput (line 53) | type PairwiseCompareOutput = z.infer<typeof PairwiseCompareOutputSchema>;
function evaluatePair (line 55) | async function evaluatePair(
function executePairwiseCompare (line 124) | async function executePairwiseCompare(input: PairwiseCompareInput): Prom...
FILE: examples/llm-as-judge-skills/tests/evaluation.test.ts
constant TEST_PROMPT (line 11) | const TEST_PROMPT = 'Explain quantum entanglement to a high school stude...
constant GOOD_RESPONSE (line 13) | const GOOD_RESPONSE = `Quantum entanglement is like having two magical c...
constant POOR_RESPONSE (line 26) | const POOR_RESPONSE = `Quantum entanglement is when particles are connec...
constant MEDIUM_RESPONSE (line 29) | const MEDIUM_RESPONSE = `Quantum entanglement happens when two particles...
FILE: skills/advanced-evaluation/scripts/evaluation_example.py
function direct_scoring_example (line 28) | def direct_scoring_example() -> dict[str, Any]:
function pairwise_comparison_example (line 143) | def pairwise_comparison_example() -> dict[str, Any]:
function rubric_generation_example (line 261) | def rubric_generation_example() -> dict[str, Any]:
FILE: skills/context-compression/scripts/compression_evaluator.py
class ProbeType (line 56) | class ProbeType(Enum):
class Probe (line 65) | class Probe:
class CriterionResult (line 79) | class CriterionResult:
class EvaluationResult (line 87) | class EvaluationResult:
class ProbeGenerator (line 188) | class ProbeGenerator:
method __init__ (line 200) | def __init__(self, conversation_history: str) -> None:
method generate_probes (line 206) | def generate_probes(self) -> List[Probe]:
method _extract_facts (line 252) | def _extract_facts(self) -> Dict[str, str]:
method _extract_files (line 284) | def _extract_files(self) -> List[Dict[str, str]]:
method _extract_decisions (line 306) | def _extract_decisions(self) -> List[Dict[str, str]]:
class CompressionEvaluator (line 328) | class CompressionEvaluator:
method __init__ (line 341) | def __init__(self, model: str = "gpt-5.2") -> None:
method evaluate (line 345) | def evaluate(self,
method get_summary (line 396) | def get_summary(self) -> Dict:
method _get_criteria_for_probe (line 434) | def _get_criteria_for_probe(self, probe_type: ProbeType) -> List[Dict]:
method _evaluate_criterion (line 457) | def _evaluate_criterion(self,
method _heuristic_score (line 489) | def _heuristic_score(self,
method _calculate_dimension_scores (line 515) | def _calculate_dimension_scores(self,
class StructuredSummarizer (line 542) | class StructuredSummarizer:
method __init__ (line 574) | def __init__(self) -> None:
method update_from_span (line 584) | def update_from_span(self, new_content: str) -> str:
method _extract_from_content (line 607) | def _extract_from_content(self, content: str) -> Dict:
method _merge_sections (line 640) | def _merge_sections(self, new_info: Dict) -> None:
method _format_summary (line 667) | def _format_summary(self) -> str:
function evaluate_compression_quality (line 696) | def evaluate_compression_quality(
function mock_model_response (line 786) | def mock_model_response(context: str, question: str) -> str:
FILE: skills/context-degradation/scripts/degradation_detector.py
function measure_attention_distribution (line 45) | def measure_attention_distribution(
function _estimate_attention (line 81) | def _estimate_attention(
function detect_lost_in_middle (line 112) | def detect_lost_in_middle(
function analyze_context_structure (line 166) | def analyze_context_structure(context: str) -> Dict[str, object]:
class PoisoningDetector (line 225) | class PoisoningDetector:
method __init__ (line 233) | def __init__(self) -> None:
method extract_claims (line 245) | def extract_claims(self, text: str) -> List[Dict[str, object]]:
method detect_poisoning (line 279) | def detect_poisoning(self, context: str) -> Dict[str, object]:
method _detect_contradictions (line 340) | def _detect_contradictions(self, text: str) -> List[str]:
method _detect_hallucination_markers (line 363) | def _detect_hallucination_markers(self, text: str) -> List[str]:
class ContextHealthAnalyzer (line 385) | class ContextHealthAnalyzer:
method __init__ (line 396) | def __init__(self, context_limit: int = 100_000) -> None:
method analyze (line 400) | def analyze(
method _calculate_health_score (line 463) | def _calculate_health_score(
method _interpret_score (line 477) | def _interpret_score(self, score: float) -> str:
method _generate_recommendations (line 488) | def _generate_recommendations(
function analyze_agent_context (line 519) | def analyze_agent_context(
FILE: skills/context-fundamentals/scripts/context_manager.py
function estimate_token_count (line 62) | def estimate_token_count(text: str) -> int:
function estimate_message_tokens (line 79) | def estimate_message_tokens(messages: List[Dict[str, Any]]) -> int:
function count_tokens_by_type (line 94) | def count_tokens_by_type(context: Dict[str, Any]) -> Dict[str, int]:
class ContextBuilder (line 133) | class ContextBuilder:
method __init__ (line 149) | def __init__(self, context_limit: int = 100_000) -> None:
method add_section (line 154) | def add_section(
method build (line 175) | def build(self, max_tokens: Optional[int] = None) -> str:
method get_usage_report (line 203) | def get_usage_report(self) -> Dict[str, Any]:
method _get_status (line 220) | def _get_status(self, total: int) -> str:
function truncate_context (line 235) | def truncate_context(
function truncate_messages (line 261) | def truncate_messages(
function validate_context_structure (line 320) | def validate_context_structure(context: Dict[str, Any]) -> Dict[str, Any]:
class ProgressiveDisclosureManager (line 377) | class ProgressiveDisclosureManager:
method __init__ (line 393) | def __init__(self, base_dir: str = ".") -> None:
method load_summary (line 397) | def load_summary(self, summary_path: str) -> str:
method load_detail (line 409) | def load_detail(self, detail_path: str, force: bool = False) -> str:
method get_contextual_info (line 425) | def get_contextual_info(self, reference: Dict[str, Any]) -> str:
function build_agent_context (line 448) | def build_agent_context(
FILE: skills/context-optimization/scripts/compaction.py
function estimate_token_count (line 61) | def estimate_token_count(text: str) -> int:
function estimate_message_tokens (line 84) | def estimate_message_tokens(messages: List[Dict[str, str]]) -> int:
function categorize_messages (line 104) | def categorize_messages(messages: List[Dict]) -> Dict[str, List[Dict]]:
function summarize_content (line 141) | def summarize_content(content: str, category: str, max_length: int = 500...
function summarize_tool_output (line 159) | def summarize_tool_output(content: str, max_length: int = 500) -> str:
function summarize_conversation (line 186) | def summarize_conversation(content: str, max_length: int = 500) -> str:
function summarize_document (line 214) | def summarize_document(content: str, max_length: int = 500) -> str:
function summarize_general (line 231) | def summarize_general(content: str, max_length: int = 500) -> str:
class ObservationStore (line 245) | class ObservationStore:
method __init__ (line 263) | def __init__(self, max_size: int = 1000) -> None:
method store (line 268) | def store(self, content: str, metadata: Optional[Dict] = None) -> str:
method retrieve (line 287) | def retrieve(self, ref_id: str) -> Optional[str]:
method mask (line 294) | def mask(self, content: str, max_length: int = 200) -> Tuple[str, Opti...
method _generate_ref_id (line 310) | def _generate_ref_id(self, content: str) -> str:
method _extract_key_point (line 315) | def _extract_key_point(self, content: str) -> str:
class ContextBudget (line 330) | class ContextBudget:
method __init__ (line 350) | def __init__(self, total_limit: int) -> None:
method allocate (line 363) | def allocate(self, category: str, amount: int) -> bool:
method remaining (line 380) | def remaining(self) -> int:
method get_usage (line 385) | def get_usage(self) -> Dict[str, object]:
method should_optimize (line 401) | def should_optimize(
function design_stable_prompt (line 434) | def design_stable_prompt(template: str, dynamic_values: Optional[Dict] =...
function calculate_cache_metrics (line 460) | def calculate_cache_metrics(
function generate_cache_recommendations (line 492) | def generate_cache_recommendations(hits: int, misses: int) -> List[str]:
FILE: skills/evaluation/scripts/evaluator.py
class ScoreLevel (line 33) | class ScoreLevel(Enum):
class RubricDimension (line 44) | class RubricDimension:
class AgentEvaluator (line 125) | class AgentEvaluator:
method __init__ (line 132) | def __init__(self, rubric: Optional[Dict[str, RubricDimension]] = None...
method evaluate (line 136) | def evaluate(
method _evaluate_dimension (line 182) | def _evaluate_dimension(
method _check_factual_accuracy (line 243) | def _check_factual_accuracy(
method _estimate_expected_tools (line 269) | def _estimate_expected_tools(self, task_type: str) -> int:
method _score_to_level (line 279) | def _score_to_level(self, score: float) -> str:
class TestSet (line 298) | class TestSet:
method __init__ (line 306) | def __init__(self, name: str) -> None:
method add_test (line 311) | def add_test(self, test: Dict[str, Any]) -> None:
method filter (line 324) | def filter(self, **criteria: Any) -> List[Dict[str, Any]]:
method get_complexity_distribution (line 340) | def get_complexity_distribution(self) -> Dict[str, int]:
method create_standard_tests (line 351) | def create_standard_tests(self) -> "TestSet":
class EvaluationRunner (line 395) | class EvaluationRunner:
method __init__ (line 402) | def __init__(self, evaluator: AgentEvaluator, test_set: TestSet) -> None:
method run_all (line 407) | def run_all(self, verbose: bool = False) -> Dict[str, Any]:
method run_test (line 425) | def run_test(self, test: Dict[str, Any]) -> Dict[str, Any]:
method summarize (line 449) | def summarize(self) -> Dict[str, Any]:
class ProductionMonitor (line 496) | class ProductionMonitor:
method __init__ (line 504) | def __init__(self, sample_rate: float = 0.01) -> None:
method should_sample (line 515) | def should_sample(self) -> bool:
method record_sample (line 522) | def record_sample(
method get_metrics (line 538) | def get_metrics(self) -> Dict[str, Any]:
method _generate_alerts (line 564) | def _generate_alerts(
FILE: skills/filesystem-context/scripts/filesystem_context.py
class ScratchPadManager (line 49) | class ScratchPadManager:
method __init__ (line 57) | def __init__(self, base_path: str = "scratch", token_threshold: int = ...
method estimate_tokens (line 62) | def estimate_tokens(self, content: str) -> int:
method should_offload (line 70) | def should_offload(self, content: str) -> bool:
method offload (line 77) | def offload(self, content: str, source: str) -> Dict[str, Any]:
method format_reference (line 104) | def format_reference(self, ref: Dict[str, Any]) -> str:
method cleanup (line 116) | def cleanup(self, max_age_seconds: int = 3600) -> int:
class PlanStep (line 139) | class PlanStep:
class AgentPlan (line 153) | class AgentPlan:
method to_dict (line 165) | def to_dict(self) -> Dict[str, Any]:
method save (line 181) | def save(self, path: str = "scratch/current_plan.json") -> None:
method load (line 193) | def load(cls, path: str = "scratch/current_plan.json") -> AgentPlan:
method current_step (line 216) | def current_step(self) -> Optional[PlanStep]:
method complete_step (line 226) | def complete_step(self, step_id: int, notes: Optional[str] = None) -> ...
method progress_summary (line 240) | def progress_summary(self) -> str:
class ToolOutputHandler (line 265) | class ToolOutputHandler:
method __init__ (line 272) | def __init__(self, scratch_pad: Optional[ScratchPadManager] = None) ->...
method process_output (line 275) | def process_output(self, tool_name: str, output: str) -> str:
function _demo_scratch_pad (line 292) | def _demo_scratch_pad() -> None:
function _demo_plan_persistence (line 341) | def _demo_plan_persistence() -> None:
function _demo_tool_handler (line 377) | def _demo_tool_handler() -> None:
function _cleanup_demo (line 410) | def _cleanup_demo() -> None:
FILE: skills/hosted-agents/scripts/sandbox_manager.py
class SandboxState (line 36) | class SandboxState(Enum):
class UserIdentity (line 47) | class UserIdentity:
class SandboxConfig (line 60) | class SandboxConfig:
class Sandbox (line 75) | class Sandbox:
method execute_command (line 91) | async def execute_command(self, command: str) -> dict[str, Any]:
method read_file (line 103) | async def read_file(self, path: str) -> str:
method write_file (line 111) | async def write_file(self, path: str, content: str) -> None:
method snapshot (line 119) | async def snapshot(self) -> str:
method _create_snapshot (line 131) | async def _create_snapshot(self) -> str:
method restore (line 135) | async def restore(self, snapshot_id: str) -> None:
method terminate (line 139) | async def terminate(self) -> None:
class RepositoryImage (line 145) | class RepositoryImage:
method is_stale (line 156) | def is_stale(self, max_age: timedelta = timedelta(minutes=30)) -> bool:
class ImageBuilder (line 161) | class ImageBuilder:
method __init__ (line 168) | def __init__(self, github_app_token_provider: Callable[[], str]) -> None:
method build_image (line 172) | async def build_image(self, repo_url: str) -> RepositoryImage:
method get_latest_image (line 218) | def get_latest_image(self, repo_url: str) -> Optional[RepositoryImage]:
method _execute_build_step (line 222) | async def _execute_build_step(self, command: str) -> None:
method _get_commit_sha (line 226) | async def _get_commit_sha(self) -> str:
method _finalize_image (line 230) | async def _finalize_image(self) -> str:
class WarmSandbox (line 236) | class WarmSandbox:
class WarmPoolManager (line 250) | class WarmPoolManager:
method __init__ (line 257) | def __init__(
method get_warm_sandbox (line 268) | async def get_warm_sandbox(self, repo_url: str) -> Optional[WarmSandbox]:
method _is_valid (line 284) | def _is_valid(self, warm: WarmSandbox) -> bool:
method maintain_pool (line 297) | async def maintain_pool(self, repo_url: str) -> None:
method _create_warm_sandbox (line 319) | async def _create_warm_sandbox(self, repo_url: str) -> WarmSandbox:
method _sync_to_latest (line 341) | async def _sync_to_latest(self, warm: WarmSandbox) -> None:
method _create_sandbox_from_image (line 347) | async def _create_sandbox_from_image(self, image: RepositoryImage) -> ...
class SandboxManager (line 352) | class SandboxManager:
method __init__ (line 360) | def __init__(
method start_build_loop (line 372) | async def start_build_loop(self) -> None:
method start_session (line 388) | async def start_session(
method on_user_typing (line 423) | async def on_user_typing(self, user: UserIdentity, repo_url: str) -> N...
method end_session (line 435) | async def end_session(self, session_id: str) -> Optional[str]:
method _configure_for_user (line 456) | async def _configure_for_user(
method _wait_for_sync (line 472) | async def _wait_for_sync(self, warm: WarmSandbox) -> None:
method _restore_from_snapshot (line 477) | async def _restore_from_snapshot(self, snapshot_id: str) -> Sandbox:
method _cold_start (line 481) | async def _cold_start(self, repo_url: str) -> Sandbox:
class AgentSession (line 486) | class AgentSession:
method __init__ (line 494) | def __init__(self, sandbox: Sandbox) -> None:
method read_file (line 499) | async def read_file(self, path: str) -> str:
method write_file (line 508) | async def write_file(self, path: str, content: str) -> None:
method mark_sync_complete (line 521) | def mark_sync_complete(self) -> None:
method _wait_for_sync (line 525) | async def _wait_for_sync(self) -> None:
function _demo (line 537) | async def _demo() -> None:
FILE: skills/memory-systems/scripts/memory_store.py
class VectorStore (line 39) | class VectorStore:
method __init__ (line 46) | def __init__(self, dimension: int = 768) -> None:
method add (line 53) | def add(self, text: str, metadata: Optional[Dict[str, Any]] = None) ->...
method search (line 82) | def search(
method search_by_entity (line 124) | def search_by_entity(
method _embed (line 159) | def _embed(self, text: str) -> np.ndarray:
method _time_key (line 170) | def _time_key(self, timestamp: Any) -> str:
method _matches_filters (line 176) | def _matches_filters(self, metadata: Dict[str, Any], filters: Dict[str...
class PropertyGraph (line 189) | class PropertyGraph:
method __init__ (line 197) | def __init__(self) -> None:
method get_or_create_node (line 204) | def get_or_create_node(
method create_node (line 222) | def create_node(self, label: str, properties: Optional[Dict[str, Any]]...
method create_relationship (line 243) | def create_relationship(
method query (line 279) | def query(self, pattern: Dict[str, Any]) -> List[Dict[str, Any]]:
method get_node (line 309) | def get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
method get_relationships (line 313) | def get_relationships(
class TemporalKnowledgeGraph (line 344) | class TemporalKnowledgeGraph(PropertyGraph):
method create_temporal_relationship (line 352) | def create_temporal_relationship(
method query_at_time (line 378) | def query_at_time(
method query_time_range (line 411) | def query_time_range(
class IntegratedMemorySystem (line 455) | class IntegratedMemorySystem:
method __init__ (line 464) | def __init__(self) -> None:
method start_session (line 469) | def start_session(self, session_id: str) -> None:
method store_fact (line 477) | def store_fact(
method retrieve_memories (line 516) | def retrieve_memories(
method retrieve_entity_context (line 548) | def retrieve_entity_context(self, entity: str) -> Dict[str, Any]:
method consolidate (line 578) | def consolidate(self) -> None:
FILE: skills/multi-agent-patterns/scripts/coordination.py
class MessageType (line 32) | class MessageType(Enum):
class AgentMessage (line 43) | class AgentMessage:
class AgentCommunication (line 60) | class AgentCommunication:
method __init__ (line 67) | def __init__(self) -> None:
method send (line 72) | def send(self, message: AgentMessage) -> None:
method receive (line 80) | def receive(self, agent_id: str) -> List[AgentMessage]:
method broadcast (line 86) | def broadcast(
class SupervisorAgent (line 110) | class SupervisorAgent:
method __init__ (line 117) | def __init__(self, name: str, communication: AgentCommunication) -> None:
method register_worker (line 125) | def register_worker(self, worker_id: str, capabilities: List[str]) -> ...
method decompose_task (line 134) | def decompose_task(self, task: Dict[str, Any]) -> List[Dict[str, Any]]:
method assign_task (line 169) | def assign_task(self, subtask: Dict[str, Any], worker_id: str) -> None:
method select_worker (line 188) | def select_worker(self, subtask: Dict[str, Any]) -> str:
method aggregate_results (line 218) | def aggregate_results(
method run_workflow (line 238) | def run_workflow(self, task: Dict[str, Any]) -> Dict[str, Any]:
method _simulate_worker_response (line 283) | def _simulate_worker_response(
method _send (line 298) | def _send(self, message: AgentMessage) -> None:
class HandoffProtocol (line 308) | class HandoffProtocol:
method __init__ (line 315) | def __init__(self, communication: AgentCommunication) -> None:
method create_handoff (line 318) | def create_handoff(
method accept_handoff (line 338) | def accept_handoff(self, agent_id: str) -> Optional[AgentMessage]:
method transfer_with_state (line 348) | def transfer_with_state(
class ConsensusManager (line 391) | class ConsensusManager:
method __init__ (line 399) | def __init__(self) -> None:
method initiate_vote (line 403) | def initiate_vote(
method submit_vote (line 417) | def submit_vote(
method calculate_weighted_consensus (line 435) | def calculate_weighted_consensus(self, topic_id: str) -> Dict[str, Any]:
class AgentFailureHandler (line 490) | class AgentFailureHandler:
method __init__ (line 498) | def __init__(
method handle_failure (line 508) | def handle_failure(
method _activate_circuit_breaker (line 535) | def _activate_circuit_breaker(self, agent_id: str) -> None:
method _find_alternative_agent (line 539) | def _find_alternative_agent(self, failed_agent: str) -> str:
method is_available (line 546) | def is_available(self, agent_id: str) -> bool:
method record_success (line 555) | def record_success(self, agent_id: str) -> None:
FILE: skills/project-development/scripts/pipeline_template.py
class Item (line 99) | class Item:
class ParsedResult (line 113) | class ParsedResult:
function get_batch_dir (line 132) | def get_batch_dir(batch_id: str) -> Path:
function get_item_dir (line 140) | def get_item_dir(batch_id: str, item_id: str) -> Path:
function get_output_dir (line 148) | def get_output_dir(batch_id: str) -> Path:
function stage_acquire (line 160) | def stage_acquire(batch_id: str, limit: int | None = None) -> list[Path]:
function fetch_items_from_source (line 194) | def fetch_items_from_source(limit: int | None = None) -> list[Item]:
function stage_prepare (line 216) | def stage_prepare(batch_id: str) -> int:
function generate_prompt (line 256) | def generate_prompt(item_data: dict[str, Any]) -> str:
function stage_process (line 271) | def stage_process(
function call_llm (line 336) | def call_llm(prompt: str, model: str) -> str:
function stage_parse (line 379) | def stage_parse(batch_id: str) -> list[dict[str, Any]]:
function parse_response (line 424) | def parse_response(text: str) -> ParsedResult:
function extract_section (line 465) | def extract_section(text: str, section_name: str) -> str | None:
function extract_field (line 475) | def extract_field(text: str, field_name: str) -> str | None:
function extract_list_items (line 485) | def extract_list_items(text: str, section_name: str) -> list[str]:
function extract_score (line 498) | def extract_score(
function stage_render (line 521) | def stage_render(batch_id: str) -> Path | None:
function render_html (line 554) | def render_html(results: list[dict[str, Any]], batch_id: str) -> str:
function stage_clean (line 603) | def stage_clean(batch_id: str, from_stage: str | None = None) -> int:
function stage_estimate (line 662) | def stage_estimate(batch_id: str) -> dict[str, Any] | None:
function main (line 726) | def main() -> None:
FILE: skills/tool-design/scripts/description_generator.py
class ToolSpec (line 48) | class ToolSpec(Protocol):
class _BuiltToolSpec (line 65) | class _BuiltToolSpec:
function generate_tool_description (line 111) | def generate_tool_description(tool_spec: ToolSpec) -> str:
function generate_usage_context (line 128) | def generate_usage_context(tool_spec: ToolSpec) -> str:
function _generate_parameters (line 151) | def _generate_parameters(parameters: Sequence[Dict[str, Any]]) -> str:
function _generate_returns (line 166) | def _generate_returns(returns: Optional[Dict[str, Any]]) -> str:
function _generate_errors (line 175) | def _generate_errors(errors: Sequence[Dict[str, Any]]) -> str:
class ToolDescriptionEvaluator (line 189) | class ToolDescriptionEvaluator:
method evaluate (line 204) | def evaluate(self, description: str, tool_spec: ToolSpec) -> Dict[str,...
method _check_clarity (line 221) | def _check_clarity(self, description: str) -> float:
method _check_completeness (line 236) | def _check_completeness(self, description: str, tool_spec: ToolSpec) -...
method _check_accuracy (line 253) | def _check_accuracy(self, description: str, tool_spec: ToolSpec) -> fl...
method _check_actionability (line 271) | def _check_actionability(self, description: str) -> float:
method _check_consistency (line 281) | def _check_consistency(self, description: str, tool_spec: ToolSpec) ->...
class ErrorMessageGenerator (line 299) | class ErrorMessageGenerator:
method generate (line 329) | def generate(self, error_type: str, context: Dict[str, str]) -> str:
class ToolSchemaBuilder (line 343) | class ToolSchemaBuilder:
method __init__ (line 350) | def __init__(self, name: str) -> None:
method set_description (line 360) | def set_description(self, short: str, detailed: str) -> "ToolSchemaBui...
method add_parameter (line 370) | def add_parameter(
method set_returns (line 393) | def set_returns(
method add_error (line 410) | def add_error(
method build (line 428) | def build(self) -> "_BuiltToolSpec":
method add_trigger (line 449) | def add_trigger(self, trigger: str) -> "ToolSchemaBuilder":
method add_example (line 457) | def add_example(
Condensed preview — 234 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,275K chars).
[
{
"path": ".claude-plugin/marketplace.json",
"chars": 2180,
"preview": "{\n \"name\": \"context-engineering-marketplace\",\n \"owner\": {\n \"name\": \"Muratcan Koylan\",\n \"email\": \"muratcan.koylan"
},
{
"path": ".cursorindexingignore",
"chars": 110,
"preview": "\n# Don't index SpecStory auto-save files, but allow explicit context inclusion via @ references\n.specstory/**\n"
},
{
"path": ".gitignore",
"chars": 534,
"preview": "# Python\n__pycache__/\n*.py[cod]\n*$py.class\n*.so\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\np"
},
{
"path": ".plugin/plugin.json",
"chars": 352,
"preview": "{\n \"name\": \"context-engineering\",\n \"description\": \"Context engineering skills for building production-grade AI agent s"
},
{
"path": "CONTRIBUTING.md",
"chars": 2444,
"preview": "# Contributing to Agent Skills for Context Engineering\n\nThank you for your interest in contributing to this collection o"
},
{
"path": "LICENSE",
"chars": 1103,
"preview": "MIT License\n\nCopyright (c) 2025 Context Engineering Agent Skills Contributors\n\nPermission is hereby granted, free of cha"
},
{
"path": "README.md",
"chars": 13266,
"preview": "# Agent Skills for Context Engineering\n\nA comprehensive, open collection of Agent Skills focused on context engineering "
},
{
"path": "SKILL.md",
"chars": 7750,
"preview": "---\nname: context-engineering-collection\ndescription: A comprehensive collection of Agent Skills for context engineering"
},
{
"path": "docs/agentskills.md",
"chars": 51360,
"preview": "---\nname: agent-skills-format\ndescription: Official documentation for the Agent Skills format - a lightweight, open stan"
},
{
"path": "docs/blogs.md",
"chars": 132136,
"preview": "---\nname: context-engineering-blogs\ndescription: Collection of technical blogs about context engineering, covering strat"
},
{
"path": "docs/claude_research.md",
"chars": 15416,
"preview": "---\nname: production-grade-llm-agents\ndescription: Comprehensive technical analysis of production-grade LLM agents cover"
},
{
"path": "docs/compression.md",
"chars": 20447,
"preview": "---\nname: context-compression-evaluation\ndescription: Evaluation framework for measuring how much context different comp"
},
{
"path": "docs/gemini_research.md",
"chars": 32273,
"preview": "---\nname: advanced-agentic-architectures\ndescription: Comprehensive technical analysis of advanced architectures in agen"
},
{
"path": "docs/hncapsule.md",
"chars": 7853,
"preview": "---\nname: karpathy-hn-time-capsule\ndescription: Andrej Karpathy's project auto-grading decade-old Hacker News discussion"
},
{
"path": "docs/netflix_context.md",
"chars": 20579,
"preview": "---\nname: netflix-context-compression\ndescription: Video transcript from Netflix engineer discussing context compression"
},
{
"path": "docs/skills-improvement-analysis.md",
"chars": 7980,
"preview": "# Skills Improvement Analysis: Lessons from Anthropic's \"Building Claude Code\" Article\n\n*Analysis date: 2026-03-17*\n*Sou"
},
{
"path": "docs/vercel_tool.md",
"chars": 7638,
"preview": "---\nname: vercel-tool-reduction\ndescription: Vercel's case study on removing 80% of their agent's specialized tools and "
},
{
"path": "examples/book-sft-pipeline/README.md",
"chars": 2317,
"preview": "# Book SFT Pipeline\n\nA standalone skill for training language models to write in any author's style. This is a **separat"
},
{
"path": "examples/book-sft-pipeline/SKILL.md",
"chars": 13308,
"preview": "---\nname: book-sft-pipeline\ndescription: This skill should be used when the user asks to \"fine-tune on books\", \"create S"
},
{
"path": "examples/book-sft-pipeline/examples/gertrude-stein/README.md",
"chars": 5459,
"preview": "# Example: Gertrude Stein Style Transfer\n\nA complete case study of training Qwen3-8B-Base to write in Gertrude Stein's s"
},
{
"path": "examples/book-sft-pipeline/examples/gertrude-stein/dataset_sample.jsonl",
"chars": 3968,
"preview": "{\"messages\":[{\"role\":\"system\",\"content\":\"You are an expert creative writer capable of emulating specific literary styles"
},
{
"path": "examples/book-sft-pipeline/examples/gertrude-stein/sample_outputs.md",
"chars": 3564,
"preview": "# Sample Model Outputs\n\nRaw outputs from the trained model for AI detector testing and style analysis.\n\n## 1. Real Estat"
},
{
"path": "examples/book-sft-pipeline/examples/gertrude-stein/training_config.json",
"chars": 2814,
"preview": "{\n \"project\": {\n \"name\": \"Gertrude Stein Style Transfer\",\n \"source_book\": \"Three Lives (1909)\",\n \"author\": \"Ge"
},
{
"path": "examples/book-sft-pipeline/references/segmentation-strategies.md",
"chars": 10813,
"preview": "# Segmentation Strategies\n\nAdvanced patterns for splitting books into training chunks while preserving narrative coheren"
},
{
"path": "examples/book-sft-pipeline/references/tinker-format.md",
"chars": 6068,
"preview": "# Tinker Format Specification\n\nThis reference documents the exact data structures required for Tinker supervised fine-tu"
},
{
"path": "examples/book-sft-pipeline/references/tinker.txt",
"chars": 144931,
"preview": "# TINKER DOCUMENTATION\nThis file contains the complete Tinker documentation and SDK reference.\n\n## Table of Contents\n\n1."
},
{
"path": "examples/book-sft-pipeline/scripts/pipeline_example.py",
"chars": 6695,
"preview": "\"\"\"\nBook SFT Pipeline - Conceptual Implementation\n\nThis demonstrates the core patterns for building book-to-SFT pipeline"
},
{
"path": "examples/digital-brain-skill/.gitignore",
"chars": 442,
"preview": "# OS generated files\n.DS_Store\n.DS_Store?\n._*\n.Spotlight-V100\n.Trashes\nehthumbs.db\nThumbs.db\n\n# Editor directories\n.idea"
},
{
"path": "examples/digital-brain-skill/AGENT.md",
"chars": 1555,
"preview": "# Digital Brain - Claude Instructions\n\nThis is a Digital Brain personal operating system. When working in this project:\n"
},
{
"path": "examples/digital-brain-skill/HOW-SKILLS-BUILT-THIS.md",
"chars": 12855,
"preview": "# How Agent Skills for Context Engineering Built Digital Brain\n\n> This document demonstrates how the [Agent Skills for C"
},
{
"path": "examples/digital-brain-skill/README.md",
"chars": 7032,
"preview": "# Digital Brain\n\n> A personal operating system for founders, creators, and builders. Part of the [Agent Skills for Conte"
},
{
"path": "examples/digital-brain-skill/SKILL.md",
"chars": 6985,
"preview": "---\nname: digital-brain\ndescription: This skill should be used when the user asks to \"write a post\", \"check my voice\", \""
},
{
"path": "examples/digital-brain-skill/SKILLS-MAPPING.md",
"chars": 8790,
"preview": "# Skills Mapping: Digital Brain\n\nThis document maps how [Agent Skills for Context Engineering](https://github.com/muratc"
},
{
"path": "examples/digital-brain-skill/agents/AGENTS.md",
"chars": 2599,
"preview": "---\nname: agents-module\ndescription: Automation scripts and agent helpers for the Digital Brain. Use these scripts for r"
},
{
"path": "examples/digital-brain-skill/agents/scripts/content_ideas.py",
"chars": 4325,
"preview": "#!/usr/bin/env python3\n\"\"\"\nContent Ideas Generator\nGenerates content ideas based on knowledge base and past successful c"
},
{
"path": "examples/digital-brain-skill/agents/scripts/idea_to_draft.py",
"chars": 4164,
"preview": "#!/usr/bin/env python3\n\"\"\"\nIdea to Draft Expander\nTakes an idea ID and creates a draft scaffold with relevant context.\n\""
},
{
"path": "examples/digital-brain-skill/agents/scripts/stale_contacts.py",
"chars": 3942,
"preview": "#!/usr/bin/env python3\n\"\"\"\nStale Contacts Finder\nIdentifies contacts that haven't been reached out to recently.\n\"\"\"\n\nimp"
},
{
"path": "examples/digital-brain-skill/agents/scripts/weekly_review.py",
"chars": 3224,
"preview": "#!/usr/bin/env python3\n\"\"\"\nWeekly Review Generator\nCompiles data from Digital Brain into a weekly review document.\n\"\"\"\n\n"
},
{
"path": "examples/digital-brain-skill/content/CONTENT.md",
"chars": 2242,
"preview": "---\nname: content-module\ndescription: Content creation hub - ideas, drafts, calendar, and published posts. Use for conte"
},
{
"path": "examples/digital-brain-skill/content/calendar.md",
"chars": 2096,
"preview": "# Content Calendar\n\n## Publishing Schedule\n\n### Weekly Cadence\n```yaml\nmonday:\n platform: \"[PLACEHOLDER: e.g., Twitter]"
},
{
"path": "examples/digital-brain-skill/content/engagement.jsonl",
"chars": 663,
"preview": "{\"_schema\": \"saved_content\", \"_version\": \"1.0\", \"_description\": \"Content from others that inspired you, engaged with, or"
},
{
"path": "examples/digital-brain-skill/content/ideas.jsonl",
"chars": 486,
"preview": "{\"_schema\": \"content_idea\", \"_version\": \"1.0\", \"_description\": \"Append new ideas below this line. Never delete entries -"
},
{
"path": "examples/digital-brain-skill/content/posts.jsonl",
"chars": 627,
"preview": "{\"_schema\": \"published_post\", \"_version\": \"1.0\", \"_description\": \"Log of all published content with performance metrics."
},
{
"path": "examples/digital-brain-skill/content/templates/linkedin-post.md",
"chars": 1605,
"preview": "# LinkedIn Post Template\n\n## Metadata\n```yaml\ntopic: \"[PLACEHOLDER]\"\npillar: \"[PLACEHOLDER: Content pillar]\"\nformat: \"st"
},
{
"path": "examples/digital-brain-skill/content/templates/newsletter.md",
"chars": 1580,
"preview": "# Newsletter Template\n\n## Metadata\n```yaml\nissue_number: \"[X]\"\ntitle: \"[PLACEHOLDER]\"\nsubtitle: \"[PLACEHOLDER: One-liner"
},
{
"path": "examples/digital-brain-skill/content/templates/thread.md",
"chars": 1170,
"preview": "# Thread Template\n\n## Metadata\n```yaml\ntopic: \"[PLACEHOLDER]\"\npillar: \"[PLACEHOLDER: Content pillar]\"\ntarget_platform: \""
},
{
"path": "examples/digital-brain-skill/examples/content-workflow.md",
"chars": 4510,
"preview": "# Example: Content Creation Workflow\n\nA complete walkthrough of using Digital Brain for content creation.\n\n---\n\n## Scena"
},
{
"path": "examples/digital-brain-skill/examples/meeting-prep.md",
"chars": 5749,
"preview": "# Example: Meeting Preparation Workflow\n\nA complete walkthrough of using Digital Brain for meeting preparation.\n\n---\n\n##"
},
{
"path": "examples/digital-brain-skill/identity/IDENTITY.md",
"chars": 1457,
"preview": "---\nname: identity-module\ndescription: Personal brand, voice, values, and positioning. Reference before creating any con"
},
{
"path": "examples/digital-brain-skill/identity/bio-variants.md",
"chars": 1696,
"preview": "# Bio Variants\n\nPre-written bios for different platforms and contexts.\n\n---\n\n## Twitter/X Bio (160 chars)\n```\n[PLACEHOLD"
},
{
"path": "examples/digital-brain-skill/identity/brand.md",
"chars": 3730,
"preview": "# Personal Brand Strategy\n\nYour positioning, audience, and strategic narrative.\n\n---\n\n## Brand Positioning\n\n### One-Line"
},
{
"path": "examples/digital-brain-skill/identity/prompts/content-generation.xml",
"chars": 1439,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<prompt name=\"content-generation\" version=\"1.0\">\n <description>\n Master promp"
},
{
"path": "examples/digital-brain-skill/identity/prompts/reply-generator.xml",
"chars": 1318,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<prompt name=\"reply-generator\" version=\"1.0\">\n <description>\n Generate authen"
},
{
"path": "examples/digital-brain-skill/identity/values.yaml",
"chars": 1960,
"preview": "# Core Values & Principles\n# These guide decision-making and content creation\n\ncore_values:\n - name: \"[PLACEHOLDER: e.g"
},
{
"path": "examples/digital-brain-skill/identity/voice.md",
"chars": 3377,
"preview": "# Voice & Tone Guide\n\nYour authentic voice captured for consistent communication across all platforms.\n\n---\n\n## Core Voi"
},
{
"path": "examples/digital-brain-skill/knowledge/KNOWLEDGE.md",
"chars": 2439,
"preview": "---\nname: knowledge-module\ndescription: Personal knowledge base - research, bookmarks, learning resources, and notes. Us"
},
{
"path": "examples/digital-brain-skill/knowledge/bookmarks.jsonl",
"chars": 559,
"preview": "{\"_schema\": \"bookmark\", \"_version\": \"1.0\", \"_description\": \"Saved resources and links. Append only. Use categories: ai_a"
},
{
"path": "examples/digital-brain-skill/knowledge/competitors.md",
"chars": 2285,
"preview": "# Competitive Landscape\n\nUnderstanding your market and differentiating your positioning.\n\n---\n\n## Direct Competitors\n<!-"
},
{
"path": "examples/digital-brain-skill/knowledge/learning.yaml",
"chars": 2120,
"preview": "# Learning & Skills Development\n# Track skills you're building and resources for each\n\ncurrent_focus:\n skill: \"[PLACEHO"
},
{
"path": "examples/digital-brain-skill/knowledge/research/_template.md",
"chars": 1285,
"preview": "# Research: [TOPIC]\n\n## Metadata\n```yaml\ncreated: \"[DATE]\"\nupdated: \"[DATE]\"\nstatus: \"exploring|synthesizing|complete\"\nr"
},
{
"path": "examples/digital-brain-skill/network/NETWORK.md",
"chars": 3096,
"preview": "---\nname: network-module\ndescription: Relationship and contact management - people you know, interaction history, and ne"
},
{
"path": "examples/digital-brain-skill/network/circles.yaml",
"chars": 2020,
"preview": "# Relationship Circles\n# Organize your network by relationship depth\n\ncircles:\n inner:\n description: \"Close relation"
},
{
"path": "examples/digital-brain-skill/network/contacts.jsonl",
"chars": 897,
"preview": "{\"_schema\": \"contact\", \"_version\": \"1.0\", \"_description\": \"Personal contact database. Append new contacts, update existi"
},
{
"path": "examples/digital-brain-skill/network/interactions.jsonl",
"chars": 454,
"preview": "{\"_schema\": \"interaction\", \"_version\": \"1.0\", \"_description\": \"Log of all meaningful interactions. Append only.\"}\n{\"id\":"
},
{
"path": "examples/digital-brain-skill/network/intros.md",
"chars": 2177,
"preview": "# Introductions Tracker\n\nManaging introductions - made and received.\n\n---\n\n## Pending Introductions\n\n### To Make\n<!-- In"
},
{
"path": "examples/digital-brain-skill/operations/OPERATIONS.md",
"chars": 2058,
"preview": "---\nname: operations-module\ndescription: Personal productivity - todos, goals, meetings, and metrics. Use for task manag"
},
{
"path": "examples/digital-brain-skill/operations/goals.yaml",
"chars": 2132,
"preview": "# Goals & OKRs\n# Review quarterly, update progress weekly\n\ncurrent_period:\n quarter: \"[PLACEHOLDER: e.g., Q1 2025]\"\n t"
},
{
"path": "examples/digital-brain-skill/operations/meetings.jsonl",
"chars": 635,
"preview": "{\"_schema\": \"meeting\", \"_version\": \"1.0\", \"_description\": \"Meeting log with notes and action items. Append after each me"
},
{
"path": "examples/digital-brain-skill/operations/metrics.jsonl",
"chars": 707,
"preview": "{\"_schema\": \"metrics_snapshot\", \"_version\": \"1.0\", \"_description\": \"Weekly metrics snapshot. Append a new entry each wee"
},
{
"path": "examples/digital-brain-skill/operations/reviews/_weekly_template.md",
"chars": 1720,
"preview": "# Weekly Review: Week of [DATE]\n\n## Metrics Snapshot\n<!-- Pull from metrics.jsonl or fill manually -->\n\n| Metric | Last "
},
{
"path": "examples/digital-brain-skill/operations/todos.md",
"chars": 1293,
"preview": "# Task List\n\n## Today's Focus\n\n### P0 - Must Do Today\n<!-- Blocking, urgent, non-negotiable -->\n- [ ] [PLACEHOLDER: Crit"
},
{
"path": "examples/digital-brain-skill/package.json",
"chars": 1287,
"preview": "{\n \"name\": \"digital-brain-skill\",\n \"version\": \"1.0.0\",\n \"description\": \"A structured personal operating system for fo"
},
{
"path": "examples/digital-brain-skill/references/file-formats.md",
"chars": 7478,
"preview": "# File Format Reference\n\nDetailed specifications for each file format used in Digital Brain.\n\n---\n\n## JSONL Files\n\n### S"
},
{
"path": "examples/digital-brain-skill/scripts/install.sh",
"chars": 1993,
"preview": "#!/bin/bash\n# Digital Brain Installation Script\n# Installs Digital Brain as a Claude Code skill\n\nset -e\n\nSKILL_NAME=\"dig"
},
{
"path": "examples/interleaved-thinking/README.md",
"chars": 18978,
"preview": "# Reasoning Trace Optimizer\n\n<p align=\"center\">\n <strong>Debug and optimize AI agents by analyzing reasoning traces wit"
},
{
"path": "examples/interleaved-thinking/SKILL.md",
"chars": 6418,
"preview": "---\nname: reasoning-trace-optimizer\ndescription: \"Debug and optimize AI agents by analyzing reasoning traces. Activates "
},
{
"path": "examples/interleaved-thinking/docs/agentthinking.md",
"chars": 6506,
"preview": "# Aligning to What? Rethinking Agent Generalization in MiniMax M2\n\nIt's been fantastic to see the community dive into ou"
},
{
"path": "examples/interleaved-thinking/docs/interleavedthinking.md",
"chars": 30584,
"preview": "# M2.1 Tool Use & Interleaved Thinking\n\n> MiniMax-M2.1 is an Agentic Model with exceptional Tool Use capabilities.\n\nM2.1"
},
{
"path": "examples/interleaved-thinking/docs/m2-1.md",
"chars": 9539,
"preview": "# Compatible Anthropic API\n\n> Call MiniMax models using the Anthropic SDK\n\nTo meet developers' needs for the Anthropic A"
},
{
"path": "examples/interleaved-thinking/examples/01_basic_capture.py",
"chars": 2293,
"preview": "\"\"\"\nExample 1: Basic Trace Capture\n\nDemonstrates capturing reasoning traces from M2.1 for a simple task.\nThis shows how "
},
{
"path": "examples/interleaved-thinking/examples/02_tool_usage.py",
"chars": 5870,
"preview": "\"\"\"\nExample 2: Tool Usage with Trace Capture\n\nDemonstrates how M2.1's interleaved thinking reasons between tool calls.\nT"
},
{
"path": "examples/interleaved-thinking/examples/03_full_optimization.py",
"chars": 39936,
"preview": "\"\"\"\nExample 3: Full Optimization Loop with Comprehensive Tools\n\nDemonstrates the complete optimization cycle with realis"
},
{
"path": "examples/interleaved-thinking/generated_skills/comprehensive-research-agent/SKILL.md",
"chars": 8513,
"preview": "---\nname: comprehensive-research-agent\ndescription: \"Ensure thorough validation, error recovery, and transparent reasoni"
},
{
"path": "examples/interleaved-thinking/generated_skills/comprehensive-research-agent/references/optimization_summary.json",
"chars": 899,
"preview": "{\n \"task\": \"Research the topic of \\\"context engineering for AI agents\\\" and create a comprehensive summary.\\n\\nYour res"
},
{
"path": "examples/interleaved-thinking/generated_skills/comprehensive-research-agent/references/optimized_prompt.txt",
"chars": 81,
"preview": "You are a research assistant. Help with research tasks using the available tools."
},
{
"path": "examples/interleaved-thinking/generated_skills/comprehensive-research-agent/references/patterns_found.json",
"chars": 12732,
"preview": "[\n {\n \"type\": \"tool_confusion\",\n \"severity\": \"medium\",\n \"description\": \"Agent attempted to fetch non-existent "
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/final_prompt.txt",
"chars": 3256,
"preview": "You are a research assistant specializing in thorough, rigorous research with explicit validation and error handling.\n\n#"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_1/analysis.txt",
"chars": 2823,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_1/optimization.txt",
"chars": 485,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_1/optimized_prompt.txt",
"chars": 81,
"preview": "You are a research assistant. Help with research tasks using the available tools."
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_1/trace.txt",
"chars": 28468,
"preview": "Session: 1570d534-052d-42da-9ef4-0d89fce103a7\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_10/analysis.txt",
"chars": 3764,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_10/trace.txt",
"chars": 28925,
"preview": "Session: e9508c77-db61-48bf-b084-e34c6fd28c4e\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_2/analysis.txt",
"chars": 3455,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_2/optimization.txt",
"chars": 6968,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_2/optimized_prompt.txt",
"chars": 3529,
"preview": "You are an expert research assistant specializing in technology and AI topics. Your task is to conduct thorough, verifia"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_2/trace.txt",
"chars": 22276,
"preview": "Session: 0b509787-5513-4917-a2c0-32c48e99a3cc\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_3/analysis.txt",
"chars": 3221,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_3/optimization.txt",
"chars": 7888,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_3/optimized_prompt.txt",
"chars": 4408,
"preview": "You are a Research Analyst AI, specialized in conducting thorough, validated research on technical topics with rigorous "
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_3/trace.txt",
"chars": 28967,
"preview": "Session: f3608c7c-f4e9-490f-9917-0c2c790d8827\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_4/analysis.txt",
"chars": 3062,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_4/optimization.txt",
"chars": 6559,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_4/optimized_prompt.txt",
"chars": 3256,
"preview": "You are a research assistant specializing in thorough, rigorous research with explicit validation and error handling.\n\n#"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_4/trace.txt",
"chars": 23899,
"preview": "Session: ed58e1ce-f051-483e-9d2b-675b53eb14e0\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_5/analysis.txt",
"chars": 3646,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_5/optimization.txt",
"chars": 6426,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_5/optimized_prompt.txt",
"chars": 3061,
"preview": "You are a Research Specialist focused on thorough, methodical investigation and clear documentation of findings.\n\n## Res"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_5/trace.txt",
"chars": 31337,
"preview": "Session: f97188f0-71cb-40eb-8693-1f897d88654d\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_6/analysis.txt",
"chars": 460,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_6/optimization.txt",
"chars": 485,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_6/optimized_prompt.txt",
"chars": 81,
"preview": "You are a research assistant. Help with research tasks using the available tools."
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_6/trace.txt",
"chars": 19325,
"preview": "Session: 40b94c14-2980-4d65-9c0b-5852abf330ad\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_7/analysis.txt",
"chars": 2659,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_7/optimization.txt",
"chars": 5490,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_7/optimized_prompt.txt",
"chars": 2613,
"preview": "You are a research assistant specializing in thorough, accurate information gathering and synthesis.\n\n## Research Method"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_7/trace.txt",
"chars": 21242,
"preview": "Session: ff663ef2-21ac-4e69-ab45-21dadb54d687\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_8/analysis.txt",
"chars": 3188,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_8/optimization.txt",
"chars": 6468,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_8/optimized_prompt.txt",
"chars": 3385,
"preview": "You are an expert research assistant specializing in comprehensive topic research and synthesis.\n\n## Core Principles\n\n**"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_8/trace.txt",
"chars": 22080,
"preview": "Session: d7f77aa7-b68a-4d12-a2b7-206bfe732990\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_9/analysis.txt",
"chars": 2768,
"preview": "============================================================\nREASONING TRACE ANALYSIS REPORT\n==========================="
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_9/optimization.txt",
"chars": 5954,
"preview": "============================================================\nPROMPT OPTIMIZATION REPORT\n================================"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_9/optimized_prompt.txt",
"chars": 2673,
"preview": "You are a research assistant helping users conduct thorough, well-documented research on complex topics.\n\n## Your Workfl"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/iteration_9/trace.txt",
"chars": 26144,
"preview": "Session: c173a6b7-aa94-41f7-8eb5-0e2684ada38e\nTask: Research the topic of \"context engineering for AI agents\" and create"
},
{
"path": "examples/interleaved-thinking/optimization_artifacts/summary.json",
"chars": 1002,
"preview": "{\n \"task\": \"Research the topic of \\\"context engineering for AI agents\\\" and create a comprehensive summary.\\n\\nYour res"
},
{
"path": "examples/interleaved-thinking/pyproject.toml",
"chars": 1931,
"preview": "[build-system]\nrequires = [\"setuptools>=61.0\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"reaso"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/__init__.py",
"chars": 1188,
"preview": "\"\"\"\nReasoning Trace Optimizer\n\nDebug and optimize AI agents by analyzing reasoning traces\nusing MiniMax M2.1's interleav"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/analyzer.py",
"chars": 16141,
"preview": "\"\"\"\nTraceAnalyzer: Analyzes reasoning traces to detect patterns and issues.\n\nUses M2.1's own interleaved thinking to ana"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/capture.py",
"chars": 14610,
"preview": "\"\"\"\nTraceCapture: Wraps M2.1 API to capture interleaved thinking traces.\n\nThis module provides the core functionality fo"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/cli.py",
"chars": 8612,
"preview": "\"\"\"\nCLI interface for Reasoning Trace Optimizer.\n\nProvides command-line access to the optimization tools.\n\"\"\"\n\nimport ar"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/loop.py",
"chars": 18042,
"preview": "\"\"\"\nOptimizationLoop: Orchestrates the full capture → analyze → improve → re-run cycle.\n\nThis is the main entry point fo"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/models.py",
"chars": 4955,
"preview": "\"\"\"\nCore data models for reasoning trace optimization.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime impor"
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/optimizer.py",
"chars": 14558,
"preview": "\"\"\"\nPromptOptimizer: Generates improved prompts based on trace analysis.\n\nUses M2.1 to synthesize analysis results into "
},
{
"path": "examples/interleaved-thinking/reasoning_trace_optimizer/skill_generator.py",
"chars": 16392,
"preview": "\"\"\"\nSkillGenerator: Converts optimization insights into shareable Agent Skills.\n\nTransforms the learnings from optimizat"
},
{
"path": "examples/interleaved-thinking/tests/__init__.py",
"chars": 43,
"preview": "\"\"\"Tests for Reasoning Trace Optimizer.\"\"\"\n"
},
{
"path": "examples/interleaved-thinking/tests/test_models.py",
"chars": 3936,
"preview": "\"\"\"Tests for data models.\"\"\"\n\nfrom datetime import datetime\n\nfrom reasoning_trace_optimizer.models import (\n Analysis"
},
{
"path": "examples/llm-as-judge-skills/.gitignore",
"chars": 239,
"preview": "# Dependencies\nnode_modules/\n\n# Build output\ndist/\n\n# Environment files\n.env\n.env.local\n.env.*.local\n\n# IDE\n.idea/\n.vsco"
},
{
"path": "examples/llm-as-judge-skills/.prettierrc",
"chars": 107,
"preview": "{\n \"semi\": true,\n \"singleQuote\": true,\n \"tabWidth\": 2,\n \"trailingComma\": \"es5\",\n \"printWidth\": 100\n}\n\n"
},
{
"path": "examples/llm-as-judge-skills/CONTRIBUTING.md",
"chars": 1926,
"preview": "# Contributing to LLM-as-a-Judge Skills\n\nThank you for your interest in contributing! This project is part of the [Agent"
},
{
"path": "examples/llm-as-judge-skills/LICENSE",
"chars": 1072,
"preview": "MIT License\n\nCopyright (c) 2025 Muratcan Koylan\n\nPermission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "examples/llm-as-judge-skills/README.md",
"chars": 22167,
"preview": "# LLM-as-a-Judge Skills\n\n> A practical implementation of LLM evaluation skills built using insights from [Eugene Yan's L"
},
{
"path": "examples/llm-as-judge-skills/agents/evaluator-agent/evaluator-agent.md",
"chars": 4360,
"preview": "# Evaluator Agent\n\n## Purpose\n\nThe Evaluator Agent assesses the quality of LLM-generated responses using configurable ev"
},
{
"path": "examples/llm-as-judge-skills/agents/index.md",
"chars": 2501,
"preview": "# Agents Index\n\nAgents are reusable AI components with defined capabilities, tools, and instructions.\n\n## Available Agen"
},
{
"path": "examples/llm-as-judge-skills/agents/orchestrator-agent/orchestrator-agent.md",
"chars": 4856,
"preview": "# Orchestrator Agent\n\n## Purpose\n\nThe Orchestrator Agent manages complex workflows by delegating tasks to specialized ag"
},
{
"path": "examples/llm-as-judge-skills/agents/research-agent/research-agent.md",
"chars": 4442,
"preview": "# Research Agent\n\n## Purpose\n\nThe Research Agent gathers, synthesizes, and summarizes information from multiple sources "
},
{
"path": "examples/llm-as-judge-skills/env.example",
"chars": 177,
"preview": "# OpenAI Configuration\nOPENAI_API_KEY=your_openai_api_key_here\nOPENAI_MODEL=gpt-4o\n\n# Optional: Anthropic for alternativ"
},
{
"path": "examples/llm-as-judge-skills/eslint.config.js",
"chars": 465,
"preview": "import eslint from '@eslint/js';\nimport tseslint from 'typescript-eslint';\n\nexport default tseslint.config(\n eslint.con"
},
{
"path": "examples/llm-as-judge-skills/examples/basic-evaluation.ts",
"chars": 2752,
"preview": "/**\n * Basic Evaluation Example\n * \n * Demonstrates how to use the EvaluatorAgent to score responses.\n * \n * Run: npx ts"
},
{
"path": "examples/llm-as-judge-skills/examples/full-evaluation-workflow.ts",
"chars": 4616,
"preview": "/**\n * Full Evaluation Workflow Example\n * \n * Demonstrates a complete evaluation workflow:\n * 1. Generate rubrics for c"
},
{
"path": "examples/llm-as-judge-skills/examples/generate-rubric.ts",
"chars": 2041,
"preview": "/**\n * Rubric Generation Example\n * \n * Demonstrates how to generate evaluation rubrics for custom criteria.\n * \n * Run:"
},
{
"path": "examples/llm-as-judge-skills/examples/pairwise-comparison.ts",
"chars": 3103,
"preview": "/**\n * Pairwise Comparison Example\n * \n * Demonstrates how to compare two responses and pick the better one.\n * \n * Run:"
},
{
"path": "examples/llm-as-judge-skills/package.json",
"chars": 2105,
"preview": "{\n \"name\": \"llm-as-judge-skills\",\n \"version\": \"1.0.0\",\n \"description\": \"LLM-as-a-Judge evaluation skills built with A"
},
{
"path": "examples/llm-as-judge-skills/prompts/agent-system/orchestrator-prompt.md",
"chars": 5466,
"preview": "# Orchestrator System Prompt\n\n## Purpose\n\nSystem prompt for the Orchestrator Agent that manages multi-agent workflows.\n\n"
},
{
"path": "examples/llm-as-judge-skills/prompts/evaluation/direct-scoring-prompt.md",
"chars": 3812,
"preview": "# Direct Scoring Prompt\n\n## Purpose\n\nSystem prompt for evaluating a single LLM response using direct scoring methodology"
},
{
"path": "examples/llm-as-judge-skills/prompts/evaluation/pairwise-comparison-prompt.md",
"chars": 4858,
"preview": "# Pairwise Comparison Prompt\n\n## Purpose\n\nSystem prompt for comparing two LLM responses and selecting the better one.\n\n#"
},
{
"path": "examples/llm-as-judge-skills/prompts/index.md",
"chars": 3064,
"preview": "# Prompts Index\n\nPrompts are reusable templates that define how agents and tools interact with LLMs.\n\n## Prompt Categori"
},
{
"path": "examples/llm-as-judge-skills/prompts/research/research-synthesis-prompt.md",
"chars": 4169,
"preview": "# Research Synthesis Prompt\n\n## Purpose\n\nSystem prompt for synthesizing research findings from multiple sources into a c"
},
{
"path": "examples/llm-as-judge-skills/skills/context-fundamentals/context-fundamentals.md",
"chars": 2964,
"preview": "# Context Fundamentals Skill\n\n## Overview\n\nContext engineering is the systematic approach to managing what information a"
},
{
"path": "examples/llm-as-judge-skills/skills/index.md",
"chars": 2305,
"preview": "# Skills Index\n\nSkills are foundational knowledge modules that inform the design and implementation of agents, tools, an"
},
{
"path": "examples/llm-as-judge-skills/skills/llm-evaluator/llm-evaluator.md",
"chars": 2534,
"preview": "# LLM-Evaluator Skill\n\n## Overview\n\nLLM-Evaluators (LLM-as-a-Judge) are large language models designed to evaluate the q"
},
{
"path": "examples/llm-as-judge-skills/skills/tool-design/tool-design.md",
"chars": 4999,
"preview": "# Agent Tool Design Skill\n\n## Overview\n\nTools are the foundation of an agent's capabilities. An agent's ability to take "
},
{
"path": "examples/llm-as-judge-skills/src/agents/evaluator.ts",
"chars": 2805,
"preview": "import { openai } from '@ai-sdk/openai';\nimport { generateText } from 'ai';\nimport { config } from '../config/index.js';"
},
{
"path": "examples/llm-as-judge-skills/src/agents/index.ts",
"chars": 126,
"preview": "export { EvaluatorAgent, evaluatorAgent } from './evaluator.js';\nexport type { EvaluatorAgentConfig } from './evaluator."
},
{
"path": "examples/llm-as-judge-skills/src/config/index.ts",
"chars": 409,
"preview": "import 'dotenv/config';\n\nexport const config = {\n openai: {\n apiKey: process.env.OPENAI_API_KEY || '',\n model: pr"
},
{
"path": "examples/llm-as-judge-skills/src/index.ts",
"chars": 405,
"preview": "// Configuration\nexport { config, validateConfig } from './config/index.js';\n\n// Tools\nexport * from './tools/evaluation"
},
{
"path": "examples/llm-as-judge-skills/src/tools/evaluation/direct-score.ts",
"chars": 5311,
"preview": "import { tool } from 'ai';\nimport { z } from 'zod';\nimport { openai } from '@ai-sdk/openai';\nimport { generateText } fro"
},
{
"path": "examples/llm-as-judge-skills/src/tools/evaluation/generate-rubric.ts",
"chars": 4694,
"preview": "import { tool } from 'ai';\nimport { z } from 'zod';\nimport { openai } from '@ai-sdk/openai';\nimport { generateText } fro"
},
{
"path": "examples/llm-as-judge-skills/src/tools/evaluation/index.ts",
"chars": 659,
"preview": "export { directScoreTool, executeDirectScore, DirectScoreInputSchema, DirectScoreOutputSchema } from './direct-score.js'"
},
{
"path": "examples/llm-as-judge-skills/src/tools/evaluation/pairwise-compare.ts",
"chars": 7623,
"preview": "import { tool } from 'ai';\nimport { z } from 'zod';\nimport { openai } from '@ai-sdk/openai';\nimport { generateText } fro"
},
{
"path": "examples/llm-as-judge-skills/tests/evaluation.test.ts",
"chars": 7548,
"preview": "import { describe, it, expect, beforeAll } from 'vitest';\nimport { \n executeDirectScore, \n executePairwiseCompare, \n "
},
{
"path": "examples/llm-as-judge-skills/tests/setup.ts",
"chars": 560,
"preview": "/**\n * Test setup file\n * \n * This file runs before all tests to configure the test environment.\n */\n\nimport { beforeAll"
},
{
"path": "examples/llm-as-judge-skills/tests/skills.test.ts",
"chars": 7603,
"preview": "import { describe, it, expect, beforeAll, beforeEach } from 'vitest';\nimport { EvaluatorAgent } from '../src/agents/eval"
},
{
"path": "examples/llm-as-judge-skills/tools/evaluation/direct-score.md",
"chars": 4227,
"preview": "# Direct Score Tool\n\n## Purpose\n\nEvaluate a single LLM response against defined criteria using a scoring rubric.\n\n## Too"
},
{
"path": "examples/llm-as-judge-skills/tools/evaluation/generate-rubric.md",
"chars": 5276,
"preview": "# Generate Rubric Tool\n\n## Purpose\n\nAutomatically generate a scoring rubric for a given evaluation criterion. Creates de"
},
{
"path": "examples/llm-as-judge-skills/tools/evaluation/pairwise-compare.md",
"chars": 4677,
"preview": "# Pairwise Compare Tool\n\n## Purpose\n\nCompare two LLM responses and determine which one better satisfies the given criter"
},
{
"path": "examples/llm-as-judge-skills/tools/index.md",
"chars": 3575,
"preview": "# Tools Index\n\nTools provide specific capabilities that agents can use to accomplish tasks.\n\n## Tool Categories\n\n### Eva"
},
{
"path": "examples/llm-as-judge-skills/tools/orchestration/delegate-to-agent.md",
"chars": 4430,
"preview": "# Delegate to Agent Tool\n\n## Purpose\n\nRoute a task to a specialized agent for execution. Handles context passing, result"
},
{
"path": "examples/llm-as-judge-skills/tools/research/read-url.md",
"chars": 4060,
"preview": "# Read URL Tool\n\n## Purpose\n\nExtract and parse content from a given URL. Returns structured text content with metadata a"
},
{
"path": "examples/llm-as-judge-skills/tools/research/web-search.md",
"chars": 3275,
"preview": "# Web Search Tool\n\n## Purpose\n\nSearch the web for relevant information on a given topic. Returns structured results with"
},
{
"path": "examples/llm-as-judge-skills/tsconfig.json",
"chars": 655,
"preview": "{\n \"compilerOptions\": {\n \"target\": \"ES2022\",\n \"module\": \"ESNext\",\n \"moduleResolution\": \"bundler\",\n \"lib\": ["
},
{
"path": "examples/llm-as-judge-skills/vitest.config.ts",
"chars": 541,
"preview": "import { defineConfig } from 'vitest/config';\n\nexport default defineConfig({\n test: {\n globals: true,\n environmen"
},
{
"path": "examples/x-to-book-system/PRD.md",
"chars": 21471,
"preview": "# PRD: X-to-Book Multi-Agent System\n\n## Overview\n\nA multi-agent system that monitors target X (Twitter) accounts daily, "
},
{
"path": "examples/x-to-book-system/README.md",
"chars": 8928,
"preview": "# Example: X-to-Book Multi-Agent System\n\nThis example demonstrates how the Agent Skills for Context Engineering can be a"
},
{
"path": "examples/x-to-book-system/SKILLS-MAPPING.md",
"chars": 10101,
"preview": "# Skills Mapping: X-to-Book System\n\nThis document provides a detailed mapping between the Agent Skills for Context Engin"
},
{
"path": "researcher/example_output.md",
"chars": 8413,
"preview": "# The Infinite Software Crisis – Jake Nations, Netflix\n\n**Evaluation ID:** `a7f3c8e1-4b2d-4f9a-8c1e-3d5f7a9b2c4e`\n**Time"
},
{
"path": "researcher/llm-as-a-judge.md",
"chars": 19687,
"preview": "You are a Principal Research Curator for the Agent-Skills-for-Context-Engineering repository.\n\n## YOUR MISSION\n\nIdentify"
},
{
"path": "skills/advanced-evaluation/SKILL.md",
"chars": 16250,
"preview": "---\nname: advanced-evaluation\ndescription: This skill should be used when the user asks to \"implement LLM-as-judge\", \"co"
},
{
"path": "skills/advanced-evaluation/references/bias-mitigation.md",
"chars": 9076,
"preview": "# Bias Mitigation Techniques for LLM Evaluation\n\nThis reference details specific techniques for mitigating known biases "
},
{
"path": "skills/advanced-evaluation/references/evaluation-pipeline.md",
"chars": 2049,
"preview": "# Evaluation Pipeline Diagram\n\nVisual layout of a production evaluation pipeline.\n\n```\n┌────────────────────────────────"
},
{
"path": "skills/advanced-evaluation/references/implementation-patterns.md",
"chars": 9042,
"preview": "# LLM-as-Judge Implementation Patterns\n\nThis reference provides detailed implementation patterns for building production"
},
{
"path": "skills/advanced-evaluation/references/metrics-guide.md",
"chars": 9220,
"preview": "# Metric Selection Guide for LLM Evaluation\n\nThis reference provides guidance on selecting appropriate metrics for diffe"
},
{
"path": "skills/advanced-evaluation/scripts/evaluation_example.py",
"chars": 13695,
"preview": "\"\"\"Advanced Evaluation Example\n\nUse when: building LLM-as-judge evaluation pipelines, comparing model outputs\nwith posit"
},
{
"path": "skills/bdi-mental-states/SKILL.md",
"chars": 14608,
"preview": "---\nname: bdi-mental-states\ndescription: This skill should be used when the user asks to \"model agent mental states\", \"i"
},
{
"path": "skills/bdi-mental-states/references/bdi-ontology-core.md",
"chars": 6422,
"preview": "# BDI Ontology Core Patterns\n\nCore ontology design patterns for Belief-Desire-Intention mental state modeling.\n\n## Class"
},
{
"path": "skills/bdi-mental-states/references/framework-integration.md",
"chars": 20423,
"preview": "# BDI Framework Integration Patterns\n\nIntegration patterns for connecting BDI ontology with executable agent frameworks."
},
{
"path": "skills/bdi-mental-states/references/rdf-examples.md",
"chars": 11122,
"preview": "# BDI RDF Examples\n\nComplete RDF/Turtle examples for BDI mental state modeling.\n\n## Complete Cognitive Workflow\n\n```turt"
},
{
"path": "skills/bdi-mental-states/references/sparql-competency.md",
"chars": 9382,
"preview": "# SPARQL Competency Queries\n\nValidation queries for BDI ontology implementations based on competency questions.\n\n## Ment"
},
{
"path": "skills/context-compression/SKILL.md",
"chars": 17747,
"preview": "---\nname: context-compression\ndescription: This skill should be used when the user asks to \"compress context\", \"summariz"
},
{
"path": "skills/context-compression/references/evaluation-framework.md",
"chars": 8449,
"preview": "# Context Compression Evaluation Framework\n\nThis document provides the complete evaluation framework for measuring conte"
}
]
// ... and 34 more files (download for full content)
About this extraction
This page contains the full source code of the muratcankoylan/Agent-Skills-for-Context-Engineering GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 234 files (2.1 MB), approximately 557.2k tokens, and a symbol index with 431 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.