Repository: ntegrals/openbrowser Branch: master Commit: 622f36985df6 Files: 119 Total size: 697.5 KB Directory structure: gitextract_rxlca7z1/ ├── .github/ │ ├── CONTRIBUTING.md │ └── workflows/ │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── biome.json ├── bunfig.toml ├── package.json ├── packages/ │ ├── cli/ │ │ ├── package.json │ │ ├── src/ │ │ │ ├── commands/ │ │ │ │ ├── click.ts │ │ │ │ ├── eval.ts │ │ │ │ ├── extract.ts │ │ │ │ ├── interactive.ts │ │ │ │ ├── open.ts │ │ │ │ ├── run.ts │ │ │ │ ├── screenshot.ts │ │ │ │ ├── sessions.ts │ │ │ │ ├── state.ts │ │ │ │ └── type.ts │ │ │ ├── display.ts │ │ │ ├── globals.ts │ │ │ ├── index.ts │ │ │ ├── protocol.ts │ │ │ ├── server.ts │ │ │ └── sessions.ts │ │ └── tsconfig.json │ ├── core/ │ │ ├── package.json │ │ ├── src/ │ │ │ ├── agent/ │ │ │ │ ├── agent.test.ts │ │ │ │ ├── agent.ts │ │ │ │ ├── conversation/ │ │ │ │ │ ├── service.ts │ │ │ │ │ ├── types.ts │ │ │ │ │ └── utils.ts │ │ │ │ ├── conversation.test.ts │ │ │ │ ├── evaluator.ts │ │ │ │ ├── index.ts │ │ │ │ ├── instructions/ │ │ │ │ │ ├── instructions-compact.md │ │ │ │ │ ├── instructions-direct.md │ │ │ │ │ └── instructions.md │ │ │ │ ├── instructions.ts │ │ │ │ ├── replay-recorder.ts │ │ │ │ ├── stall-detector.test.ts │ │ │ │ ├── stall-detector.ts │ │ │ │ └── types.ts │ │ │ ├── bridge/ │ │ │ │ ├── adapter.ts │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ ├── mcp-types.ts │ │ │ │ ├── server.test.ts │ │ │ │ └── server.ts │ │ │ ├── commands/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── catalog.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── catalog.test.ts │ │ │ │ ├── executor.test.ts │ │ │ │ ├── executor.ts │ │ │ │ ├── extraction/ │ │ │ │ │ └── extractor.ts │ │ │ │ ├── index.ts │ │ │ │ ├── types.ts │ │ │ │ └── utils.ts │ │ │ ├── config/ │ │ │ │ ├── config.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── errors.ts │ │ │ ├── index.ts │ │ │ ├── logging.ts │ │ │ ├── metering/ │ │ │ │ ├── index.ts │ │ │ │ ├── tracker.test.ts │ │ │ │ ├── tracker.ts │ │ │ │ └── 
types.ts │ │ │ ├── model/ │ │ │ │ ├── adapters/ │ │ │ │ │ └── vercel.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface.ts │ │ │ │ ├── messages.ts │ │ │ │ ├── schema-optimizer.ts │ │ │ │ └── types.ts │ │ │ ├── page/ │ │ │ │ ├── content-extractor.ts │ │ │ │ ├── index.ts │ │ │ │ ├── page-analyzer.test.ts │ │ │ │ ├── page-analyzer.ts │ │ │ │ ├── renderer/ │ │ │ │ │ ├── interactive-elements.ts │ │ │ │ │ ├── layer-order.ts │ │ │ │ │ └── tree-renderer.ts │ │ │ │ ├── renderer.test.ts │ │ │ │ ├── snapshot-builder.ts │ │ │ │ └── types.ts │ │ │ ├── sandbox/ │ │ │ │ ├── file-access.ts │ │ │ │ └── index.ts │ │ │ ├── telemetry.ts │ │ │ ├── types.ts │ │ │ ├── utils.ts │ │ │ └── viewport/ │ │ │ ├── event-hub.ts │ │ │ ├── events.ts │ │ │ ├── guard-base.ts │ │ │ ├── guards/ │ │ │ │ ├── blank-page.ts │ │ │ │ ├── crash.ts │ │ │ │ ├── default-handler.ts │ │ │ │ ├── downloads.ts │ │ │ │ ├── har-capture.ts │ │ │ │ ├── local-instance.ts │ │ │ │ ├── page-ready.ts │ │ │ │ ├── permissions.ts │ │ │ │ ├── persistence.ts │ │ │ │ ├── popups.ts │ │ │ │ ├── screenshot.ts │ │ │ │ ├── url-policy.ts │ │ │ │ └── video-capture.ts │ │ │ ├── index.ts │ │ │ ├── launch-profile.test.ts │ │ │ ├── launch-profile.ts │ │ │ ├── types.ts │ │ │ ├── viewport.ts │ │ │ └── visual-tracer.ts │ │ └── tsconfig.json │ └── sandbox/ │ ├── package.json │ ├── src/ │ │ ├── index.ts │ │ ├── sandbox.ts │ │ └── types.ts │ └── tsconfig.json ├── tsconfig.base.json └── tsconfig.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/CONTRIBUTING.md ================================================ # Contributing to Open Browser Thank you for your interest in contributing! ## Getting Started 1. Fork the repository 2. Clone your fork: `git clone https://github.com/YOUR_USERNAME/openbrowser.git` 3. Install dependencies: `bun install` 4. Create a branch: `git checkout -b my-feature` 5. Make your changes and add tests 6. 
Run tests: `bun run test` 7. Submit a pull request ## Code Style We use [Biome](https://biomejs.dev/) for formatting and linting. Run `bun run format` before committing. ## Reporting Issues Please use GitHub Issues to report bugs or request features. Include: - Steps to reproduce - Expected vs actual behavior - Browser and OS version ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [main] pull_request: branches: [main] jobs: test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: oven-sh/setup-bun@v2 - run: bun install - run: bun run build - run: bun run test - run: bun run lint ================================================ FILE: .gitignore ================================================ node_modules/ dist/ .env *.tsbuildinfo .DS_Store traces/ coverage/ recordings/ tmp/ *.log ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024-2026 Open Browser Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

Open Browser

AI-powered autonomous web browsing framework for TypeScript.

License GitHub stars

Header --- Give an AI agent a browser. It clicks, types, navigates, and extracts data — autonomously completing tasks on any website. Built on Playwright with first-class support for OpenAI, Anthropic, and Google models. > **Production-ready since v1.0.** Contributions welcome. ## Why Open Browser? - **Autonomous agents**: Describe a task in natural language, and an AI agent navigates the web to complete it — clicking, typing, scrolling, and extracting data without manual scripting - **Multi-model support**: Works with OpenAI, Anthropic, and Google out of the box via the Vercel AI SDK — swap models with a single flag - **Interactive REPL**: Drop into a live browser session and issue commands interactively — great for debugging, prototyping, and exploration - **Sandboxed execution**: Run agents in resource-limited environments with CPU/memory monitoring, timeouts, and domain restrictions - **Production-ready**: Stall detection, cost tracking, session management, replay recording, and comprehensive error handling - **Open source**: MIT licensed, fully extensible, bring your own API keys ## Quick Start ```bash # Install dependencies bun install # Set up your API keys cp .env.example .env # Edit .env with your API keys # Run an agent bun run open-browser run "Find the top story on Hacker News and summarize it" # Or open a browser interactively bun run open-browser interactive ``` ## Architecture Open Browser is a monorepo with three packages: | Package | Description | | --------------------------- | -------------------------------------------------------------------------- | | **`open-browser`** | Core library — agent logic, browser control, DOM analysis, LLM integration | | **`@open-browser/cli`** | Command-line interface for running agents and browser commands | | **`@open-browser/sandbox`** | Sandboxed execution with resource limits and monitoring | ## CLI Commands ### Run an AI Agent ```bash open-browser run [options] ``` Describe what you want done. 
The agent figures out the rest. ```bash # Search and extract information open-browser run "Find the price of the MacBook Pro on apple.com" # Fill out forms open-browser run "Sign up for the newsletter on example.com with test@email.com" # Multi-step workflows open-browser run "Go to GitHub, find the open-browser repo, and star it" ``` | Option | Description | | ---------------------------- | ----------------------------------------- | | `-m, --model <model>` | Model to use (default: `gpt-4o`) | | `-p, --provider <provider>` | Provider: `openai`, `anthropic`, `google` | | `--headless / --no-headless` | Show or hide the browser window | | `--max-steps <n>` | Max agent steps (default: `25`) | | `-v, --verbose` | Show detailed step info | | `--no-cost` | Hide cost tracking | ### Browser Commands ```bash open-browser open <url> # Open a URL open-browser click <selector> # Click an element open-browser type <selector> <text> # Type into an input open-browser screenshot [output] # Capture a screenshot open-browser eval <expression> # Run JavaScript on the page open-browser extract <goal> # Extract content as markdown open-browser state # Show current URL, title, and tabs open-browser sessions # List active browser sessions ``` ### Interactive REPL ```bash open-browser interactive ``` Drop into a live `browser>` prompt with full control: ``` browser> open https://news.ycombinator.com browser> extract "top 5 stories with titles and points" browser> click .morelink browser> screenshot front-page.png browser> help ``` ## Using as a Library ```typescript import { Agent, createViewport, createModel } from 'open-browser' const viewport = await createViewport({ headless: true }) const model = createModel('openai', 'gpt-4o') const agent = new Agent({ viewport, model, task: 'Go to example.com and extract the main heading', settings: { stepLimit: 50, enableScreenshots: true, }, }) const result = await agent.run() console.log(result) ``` ### Sandboxed Execution Run agents with resource limits and monitoring: ```typescript import { Sandbox } from
'@open-browser/sandbox' const sandbox = new Sandbox({ timeout: 300_000, // 5 minute timeout maxMemoryMB: 512, // Memory limit allowedDomains: ['example.com'], stepLimit: 100, captureOutput: true, }) const result = await sandbox.run({ task: 'Complete the checkout form', model: languageModel, }) console.log(result.metrics) // steps, URLs visited, CPU time ``` ## Configuration ### Environment Variables ```bash # LLM Provider Keys (at least one required) OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... GOOGLE_GENERATIVE_AI_API_KEY=... # Browser BROWSER_HEADLESS=true BROWSER_DISABLE_SECURITY=false # Recording & Debugging OPEN_BROWSER_TRACE_PATH=./traces OPEN_BROWSER_SAVE_RECORDING_PATH=./recordings ``` ### Agent Configuration | Setting | Default | Description | | ------------------- | -------- | ----------------------------------------- | | `stepLimit` | `100` | Maximum agent iterations | | `commandsPerStep` | `10` | Actions per agent step | | `failureThreshold` | `5` | Consecutive failures before stopping | | `enableScreenshots` | `true` | Include page screenshots in agent context | | `contextWindowSize` | `128000` | Token budget for conversation | | `allowedUrls` | `[]` | Restrict navigation to specific URLs | | `blockedUrls` | `[]` | Block navigation to specific URLs | ### Viewport Configuration | Setting | Default | Description | | ------------------ | --------------- | ------------------------------------------- | | `headless` | `true` | Run browser without visible window | | `width` / `height` | `1280` / `1100` | Browser window dimensions | | `relaxedSecurity` | `false` | Disable browser security features | | `proxy` | — | Proxy server configuration | | `cookieFile` | — | Path to cookie file for persistent sessions | ## How It Works ``` ┌─────────────┐ "Book a flight" │ │ ───────────────► │ Agent │ ◄── LLM (OpenAI / Anthropic / Google) │ │ └──────┬──────┘ │ ┌──────▼──────┐ │ Commands │ click, type, scroll, extract, navigate... 
└──────┬──────┘ │ ┌──────▼──────┐ │ Viewport │ Playwright browser instance └──────┬──────┘ │ ┌──────▼──────┐ │ DOM / Page │ Snapshot, interactive elements, content └─────────────┘ ``` 1. You describe a **task** in natural language 2. The **Agent** sends the current page state + task to an LLM 3. The LLM decides what **commands** to execute (click, type, navigate, extract...) 4. Commands execute against the **Viewport** (Playwright browser) 5. The agent observes the result, detects stalls, and loops until the task is complete ## Model Support | Provider | Example Models | Flag | | ------------- | ----------------------------------------------- | -------------- | | **OpenAI** | `gpt-4o`, `gpt-4o-mini`, `o1` | `-p openai` | | **Anthropic** | `claude-sonnet-4-5-20250929`, `claude-opus-4-6` | `-p anthropic` | | **Google** | `gemini-2.0-flash`, `gemini-2.5-pro` | `-p google` | ## Project Structure ``` packages/ ├── core/ # Core library (open-browser) │ └── src/ │ ├── agent/ # Agent logic, conversation, stall detection │ ├── commands/ # Action schemas and executor (25+ commands) │ ├── viewport/ # Browser control, events, guards │ ├── page/ # DOM analysis, content extraction │ ├── model/ # LLM adapter and message formatting │ ├── metering/ # Cost tracking │ ├── bridge/ # IPC server/client │ └── config/ # Configuration types ├── cli/ # CLI (@open-browser/cli) │ └── src/ │ ├── commands/ # CLI command implementations │ └── index.ts # Entry point └── sandbox/ # Sandbox (@open-browser/sandbox) └── src/ └── sandbox.ts # Resource-limited execution ``` ## Development ```bash # Install dependencies bun install # Type check bun run build # Run tests bun run test # Lint bun run lint # Format bun run format ``` ## Contributing Contributions are welcome! Please see [CONTRIBUTING.md](.github/CONTRIBUTING.md) for guidelines. 
## License [MIT](LICENSE) ================================================ FILE: biome.json ================================================ { "$schema": "https://biomejs.dev/schemas/1.9.0/schema.json", "organizeImports": { "enabled": true }, "linter": { "enabled": true, "rules": { "recommended": true, "complexity": { "noForEach": "off" }, "style": { "noNonNullAssertion": "off", "useConst": "warn" }, "suspicious": { "noExplicitAny": "off" } } }, "formatter": { "enabled": true, "indentStyle": "tab", "indentWidth": 2, "lineWidth": 120 }, "javascript": { "formatter": { "quoteStyle": "single", "semicolons": "always", "trailingCommas": "all" } }, "files": { "ignore": ["node_modules", "dist", "*.json", "*.d.ts"] } } ================================================ FILE: bunfig.toml ================================================ [install] peer = false [test] timeout = 60000 ================================================ FILE: package.json ================================================ { "name": "open-browser-monorepo", "private": true, "workspaces": ["packages/*"], "scripts": { "build": "bun run --filter '*' build", "test": "bun run --filter '*' test", "lint": "biome check .", "format": "biome format --write ." 
}, "devDependencies": { "@biomejs/biome": "^1.9.4", "@types/bun": "^1.2.0", "typescript": "^5.8.0" }, "trustedDependencies": [ "@biomejs/biome" ] }
================================================ FILE: packages/cli/package.json ================================================
{ "name": "@open-browser/cli", "version": "1.1.0", "description": "CLI for Open Browser - AI-powered autonomous web browsing", "type": "module", "main": "src/index.ts", "bin": { "open-browser": "src/index.ts" }, "scripts": { "build": "tsc --noEmit", "test": "bun test", "start": "bun run src/index.ts" }, "dependencies": { "open-browser": "workspace:*", "commander": "^12.1.0", "chalk": "^5.4.0" }, "license": "MIT" }
================================================ FILE: packages/cli/src/commands/click.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import { sessionManager } from '../globals.js';

/**
 * Registers the `click` sub-command on the given commander program.
 *
 * The action resolves the session named by `--session` (or the default
 * session), clicks the element matching the selector and prints a
 * confirmation. The process exits with code 1 when no session is available
 * or when the click throws.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerClickCommand(program: Command): void {
  program
    .command('click')
    .description('Click on an element matching the given CSS selector')
    // `<selector>` marks a required positional argument.
    .argument('<selector>', 'CSS selector of the element to click')
    // `<id>` is required here: without a value placeholder commander treats
    // the option as a boolean flag and `options.session` would be `true`.
    .option('-s, --session <id>', 'Session ID to use')
    .action(async (selector: string, options: { session?: string }) => {
      try {
        const browser = options.session
          ? sessionManager.get(options.session)
          : sessionManager.getDefault();
        if (!browser) {
          console.error(chalk.red('No active session. Use "open" command first.'));
          process.exit(1);
        }
        await browser.click(selector);
        console.log(chalk.green('Clicked:'), selector);
      } catch (error) {
        console.error(chalk.red('Failed to click:'), error instanceof Error ? error.message : String(error));
        process.exit(1);
      }
    });
}
================================================ FILE: packages/cli/src/commands/eval.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import { sessionManager } from '../globals.js';

/**
 * Registers the `eval` sub-command on the given commander program.
 *
 * The action evaluates the expression through the session's `evaluate()` and
 * prints the result: `undefined` / `null` dimmed, objects as pretty-printed
 * JSON, everything else via `String()`. The process exits with code 1 when no
 * session is available or when evaluation throws.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerEvalCommand(program: Command): void {
  program
    .command('eval')
    .description('Evaluate a JavaScript expression in the browser')
    .argument('<expression>', 'JavaScript expression to evaluate')
    // Value placeholder required so the option carries the session id.
    .option('-s, --session <id>', 'Session ID to use')
    .action(async (expression: string, options: { session?: string }) => {
      try {
        const browser = options.session
          ? sessionManager.get(options.session)
          : sessionManager.getDefault();
        if (!browser) {
          console.error(chalk.red('No active session. Use "open" command first.'));
          process.exit(1);
        }
        const result = await browser.evaluate(expression);
        if (result === undefined) {
          console.log(chalk.dim('undefined'));
        } else if (result === null) {
          console.log(chalk.dim('null'));
        } else if (typeof result === 'object') {
          console.log(JSON.stringify(result, null, 2));
        } else {
          console.log(String(result));
        }
      } catch (error) {
        console.error(chalk.red('Evaluation failed:'), error instanceof Error ? error.message : String(error));
        process.exit(1);
      }
    });
}
================================================ FILE: packages/cli/src/commands/extract.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import { extractMarkdown } from 'open-browser';
import { sessionManager } from '../globals.js';

/**
 * Registers the `extract` sub-command on the given commander program.
 *
 * The action prints the current page as markdown via `extractMarkdown()`.
 * The `goal` argument is only echoed as a label; it is not passed to the
 * extractor.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerExtractCommand(program: Command): void {
  program
    .command('extract')
    .description('Extract content from the current page as markdown')
    .argument('<goal>', 'Description of what to extract (used as a label)')
    // Value placeholder required so the option carries the session id.
    .option('-s, --session <id>', 'Session ID to use')
    .action(async (goal: string, options: { session?: string }) => {
      try {
        const browser = options.session ?
sessionManager.get(options.session) : sessionManager.getDefault();
        if (!browser) {
          console.error(chalk.red('No active session. Use "open" command first.'));
          process.exit(1);
        }
        console.log(chalk.dim(`Extracting: ${goal}`));
        const markdown = await extractMarkdown(browser.currentPage);
        if (!markdown) {
          console.log(chalk.yellow('No content extracted from the page.'));
        } else {
          console.log(markdown);
        }
      } catch (error) {
        console.error(chalk.red('Extraction failed:'), error instanceof Error ? error.message : String(error));
        process.exit(1);
      }
    });
}
================================================ FILE: packages/cli/src/commands/interactive.ts ================================================
import * as readline from 'node:readline';
import type { Command } from 'commander';
import chalk from 'chalk';
import {
  Viewport,
  extractMarkdown,
} from 'open-browser';
import {
  Spinner,
  displayInfo,
  displayError,
  displaySeparator,
} from '../display.js';

interface InteractiveOptions {
  headless: boolean;
}

/**
 * Interactive REPL-like session for browser automation.
 * Supports commands: open, click, type, eval, extract, screenshot, state, back, forward, tabs, help, quit
 *
 * Registers the `interactive` command (alias `repl`). The action starts a
 * `Viewport`, opens a readline prompt and dispatches every entered line to
 * `handleCommand`. Closing the prompt closes the browser and exits with code
 * 0; a failure during start-up closes the browser and exits with code 1.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerInteractiveCommand(program: Command): void {
  program
    .command('interactive')
    .alias('repl')
    .description('Start an interactive browser session (REPL mode)')
    .option('--headless', 'Run browser in headless mode', false)
    .action(async (options: InteractiveOptions) => {
      console.log(chalk.bold.white('Interactive Browser Session'));
      console.log(chalk.dim('Type "help" for available commands, "quit" to exit.'));
      displaySeparator();

      let browser: Viewport | null = null;

      try {
        const spinner = new Spinner('Starting browser...');
        spinner.start();

        browser = new Viewport({
          headless: options.headless,
        });
        await browser.start();

        spinner.stop(chalk.green('Browser ready.'));
        console.log('');

        const rl = readline.createInterface({
          input: process.stdin,
          output: process.stdout,
          prompt: chalk.cyan('browser> '),
          terminal: true,
        });

        rl.prompt();

        rl.on('line', async (line) => {
          const trimmed = line.trim();
          if (!trimmed) {
            rl.prompt();
            return;
          }

          const [command, ...args] = parseCommandLine(trimmed);
          // A line made only of quotes (e.g. `""`) tokenises to nothing;
          // treat it like an empty line instead of dereferencing `undefined`.
          if (command === undefined) {
            rl.prompt();
            return;
          }

          try {
            const shouldQuit = await handleCommand(
              command.toLowerCase(),
              args,
              browser!,
            );
            if (shouldQuit) {
              rl.close();
              return;
            }
          } catch (error) {
            // Command failures are reported and the session keeps running.
            displayError(
              error instanceof Error ? error.message : String(error),
            );
          }

          rl.prompt();
        });

        rl.on('close', async () => {
          console.log('');
          displayInfo('Closing browser session...');
          if (browser) {
            // Best-effort shutdown: a failing close must not block the exit.
            await browser.close().catch(() => {});
          }
          process.exit(0);
        });
      } catch (error) {
        displayError(
          error instanceof Error ? error.message : String(error),
        );
        if (browser) {
          await browser.close().catch(() => {});
        }
        process.exit(1);
      }
    });
}

// ── Command Parsing ──

/**
 * Splits a REPL input line into tokens.
 *
 * Tokens are separated by spaces or tabs. Text wrapped in single or double
 * quotes is kept together and the quote characters are dropped. There is no
 * escape-character handling, an unterminated quote runs to the end of the
 * line, and an empty quoted string produces no token.
 *
 * @param input - Raw line typed by the user.
 * @returns The non-empty tokens in input order.
 */
function parseCommandLine(input: string): string[] {
  const tokens: string[] = [];
  let current = '';
  let inQuote: string | null = null;

  for (const char of input) {
    if (inQuote) {
      if (char === inQuote) {
        inQuote = null;
      } else {
        current += char;
      }
    } else if (char === '"' || char === "'") {
      inQuote = char;
    } else if (char === ' ' || char === '\t') {
      if (current) {
        tokens.push(current);
        current = '';
      }
    } else {
      current += char;
    }
  }

  if (current) {
    tokens.push(current);
  }

  return tokens;
}

// ── Command Handler ──

/**
 * Executes one REPL command against the browser.
 *
 * @param command - Lower-cased command name.
 * @param args - Remaining tokens of the input line.
 * @param browser - Viewport the command operates on.
 * @returns `true` when the session should end (`quit` / `exit` / `q`),
 *   `false` otherwise.
 */
async function handleCommand(
  command: string,
  args: string[],
  browser: Viewport,
): Promise<boolean> {
  switch (command) {
    case 'open':
    case 'goto':
    case 'navigate': {
      const url = args[0];
      if (!url) {
        displayError('Usage: open <url>');
        return false;
      }
      const spinner = new Spinner(`Navigating to ${url}...`);
      spinner.start();
      await browser.navigate(url);
      const finalUrl = browser.currentPage.url();
      spinner.stop(`${chalk.green('Loaded:')} ${finalUrl}`);
      return false;
    }

    // `click` is the name advertised by the help text and the usage message;
    // `tap` is kept as an alias so existing usage keeps working.
    case 'click':
    case 'tap': {
      const selector = args.join(' ');
      if (!selector) {
        displayError('Usage: click <selector>');
        return false;
      }
      await browser.click(selector);
      console.log(chalk.green('Clicked:'), selector);
      return false;
    }

    case 'type': {
      const selector = args[0];
      const text = args.slice(1).join(' ');
      if (!selector || !text) {
        displayError('Usage: type <selector> <text>');
        return false;
      }
      await browser.type(selector, text);
      console.log(chalk.green('Typed:'), text);
      return false;
    }

    case 'eval':
    case 'js': {
      const expression = args.join(' ');
      if (!expression) {
        displayError('Usage: eval <expression>');
        return false;
      }
      const result = await browser.evaluate(expression);
      if (result === undefined) {
        console.log(chalk.dim('undefined'));
      } else if (result === null) {
        console.log(chalk.dim('null'));
      } else if (typeof result === 'object') {
        console.log(JSON.stringify(result, null, 2));
      } else {
        console.log(String(result));
      }
      return false;
    }

    case 'extract':
    case 'markdown': {
      const spinner = new Spinner('Extracting page content...');
      spinner.start();
      const markdown = await extractMarkdown(browser.currentPage);
      spinner.stop();
      if (markdown) {
        // Show first 2000 chars
        const preview = markdown.length > 2000
          ? `${markdown.slice(0, 2000)}\n${chalk.dim(`... (${markdown.length} chars total)`)}`
          : markdown;
        console.log(preview);
      } else {
        console.log(chalk.yellow('No content found.'));
      }
      return false;
    }

    // `screenshot` is the name advertised by the help text; `capture` is
    // kept as an alias so existing usage keeps working.
    case 'screenshot':
    case 'capture': {
      const outputPath = args[0] || 'screenshot.png';
      const result = await browser.screenshot(false);
      const fs = await import('node:fs');
      const path = await import('node:path');
      const buffer = Buffer.from(result.base64, 'base64');
      const resolved = path.resolve(outputPath);
      fs.writeFileSync(resolved, buffer);
      console.log(chalk.green('Screenshot saved:'), resolved);
      console.log(chalk.dim(`${result.width}x${result.height}`));
      return false;
    }

    case 'state':
    case 'info': {
      const state = await browser.getState();
      console.log(`${chalk.white('URL:')} ${state.url}`);
      console.log(`${chalk.white('Title:')} ${state.title}`);
      if (state.tabs.length > 1) {
        console.log(`${chalk.white('Tabs:')}`);
        for (const tab of state.tabs) {
          const marker = tab.isActive ? chalk.cyan(' > ') : ' ';
          console.log(`${marker}[${tab.tabId}] ${tab.title || '(untitled)'} - ${tab.url}`);
        }
      }
      return false;
    }

    case 'back': {
      // Navigation errors (including the 5s timeout) are deliberately ignored.
      await browser.currentPage.goBack({ timeout: 5000 }).catch(() => {});
      console.log(chalk.green('Navigated back'));
      return false;
    }

    case 'forward': {
      await browser.currentPage.goForward({ timeout: 5000 }).catch(() => {});
      console.log(chalk.green('Navigated forward'));
      return false;
    }

    case 'tabs': {
      const state = await browser.getState();
      for (const tab of state.tabs) {
        const marker = tab.isActive ? chalk.cyan(' > ') : ' ';
        console.log(`${marker}[${tab.tabId}] ${tab.title || '(untitled)'} - ${tab.url}`);
      }
      return false;
    }

    case 'url': {
      console.log(browser.currentPage.url());
      return false;
    }

    case 'title': {
      const title = await browser.currentPage.title();
      console.log(title);
      return false;
    }

    case 'reload':
    case 'refresh': {
      await browser.currentPage.reload({ timeout: 10000 }).catch(() => {});
      console.log(chalk.green('Page reloaded'));
      return false;
    }

    case 'wait': {
      const ms = Number.parseInt(args[0] || '1000', 10);
      // Reject non-numeric or negative input instead of sleeping for `NaN` ms.
      if (!Number.isFinite(ms) || ms < 0) {
        displayError('Usage: wait [ms]');
        return false;
      }
      console.log(chalk.dim(`Waiting ${ms}ms...`));
      await new Promise((resolve) => setTimeout(resolve, ms));
      return false;
    }

    case 'help': {
      printHelp();
      return false;
    }

    case 'quit':
    case 'exit':
    case 'q': {
      return true;
    }

    default: {
      console.log(chalk.yellow(`Unknown command: ${command}`));
      console.log(chalk.dim('Type "help" for available commands.'));
      return false;
    }
  }
}

/**
 * Prints the list of REPL commands with a one-line description each.
 */
function printHelp(): void {
  console.log(chalk.bold('Available commands:'));
  console.log('');
  const commands: ReadonlyArray<readonly [string, string]> = [
    ['open <url>', 'Navigate to a URL'],
    ['click <selector>', 'Click an element'],
    ['type <selector> <text>', 'Type text into an element'],
    ['eval <expression>', 'Run JavaScript in the browser'],
    ['extract', 'Extract page content as markdown'],
    ['screenshot [path]', 'Take a screenshot'],
    ['state', 'Show current browser state'],
    ['back', 'Navigate back'],
    ['forward', 'Navigate forward'],
    ['tabs', 'List open tabs'],
    ['url', 'Show current URL'],
    ['title', 'Show current page title'],
    ['reload', 'Reload the current page'],
    ['wait [ms]', 'Wait for the specified time'],
    ['help', 'Show this help message'],
    ['quit', 'Exit the interactive session'],
  ];

  for (const [cmd, desc] of commands) {
    console.log(` ${chalk.cyan(cmd.padEnd(25))} ${desc}`);
  }
}
================================================ FILE: packages/cli/src/commands/open.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import { sessionManager } from '../globals.js';

export function
registerOpenCommand(program: Command): void {
  // Registers the `open` command: navigates an existing session (given by
  // `--session`, else the default one) or creates a new session first.
  program
    .command('open')
    .description('Open a URL in the browser')
    .argument('<url>', 'URL to navigate to')
    .option('--headless', 'Run in headless mode', false)
    // Value placeholder required so the option carries the session id.
    .option('-s, --session <id>', 'Reuse an existing session')
    .action(async (url: string, options: { headless: boolean; session?: string }) => {
      try {
        let sessionId = options.session;

        if (sessionId) {
          const browser = sessionManager.get(sessionId);
          if (!browser) {
            console.error(chalk.red(`Session "${sessionId}" not found.`));
            process.exit(1);
          }
          await browser.navigate(url);
        } else {
          // Try to reuse the default session, or create a new one
          sessionId = sessionManager.getDefaultId();
          if (!sessionId) {
            sessionId = await sessionManager.create({
              headless: options.headless,
            });
          }
          const browser = sessionManager.get(sessionId)!;
          await browser.navigate(url);
        }

        const browser = sessionManager.get(sessionId)!;
        const finalUrl = browser.currentPage.url();
        console.log(chalk.green('Session:'), sessionId);
        console.log(chalk.green('URL:'), finalUrl);
      } catch (error) {
        console.error(chalk.red('Failed to open URL:'), error instanceof Error ? error.message : String(error));
        process.exit(1);
      }
    });
}
================================================ FILE: packages/cli/src/commands/run.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import {
  Agent,
  Viewport,
  VercelModelAdapter,
  type LanguageModel,
  type CommandResult,
  type StepRecord,
} from 'open-browser';
import {
  Spinner,
  displayStep,
  displayTotalCost,
  displayResult,
  displayHeader,
  displaySeparator,
  displayError,
} from '../display.js';

/**
 * Options as commander stores them for the `run` command.
 *
 * Commander camel-cases multi-word flags (`--max-steps` becomes `maxSteps`)
 * and stores a negatable flag under its positive name (`--no-cost` becomes
 * `cost`, which is `true` unless the flag is given).
 */
interface RunOptions {
  model: string;
  provider: string;
  headless: boolean;
  maxSteps: string;
  verbose: boolean;
  cost: boolean;
}

/**
 * Dynamically import and create a Vercel AI SDK language model
 * based on the provider and model ID strings.
 *
 * @param provider - One of `openai`, `anthropic`, `google`.
 * @param modelId - Model identifier passed to the provider factory.
 * @returns The model wrapped in a `VercelModelAdapter`.
 * @throws Error when the provider is not one of the supported values.
 */
async function createModel(provider: string, modelId: string): Promise<LanguageModel> {
  let languageModel: import('ai').LanguageModelV1;

  switch (provider) {
    case 'openai': {
      const { createOpenAI } = await import('@ai-sdk/openai');
      const openai = createOpenAI({});
      languageModel = openai(modelId);
      break;
    }
    case 'anthropic': {
      const { createAnthropic } = await import('@ai-sdk/anthropic');
      const anthropic = createAnthropic({});
      languageModel = anthropic(modelId);
      break;
    }
    case 'google': {
      const { createGoogleGenerativeAI } = await import('@ai-sdk/google');
      const google = createGoogleGenerativeAI({});
      languageModel = google(modelId);
      break;
    }
    default:
      throw new Error(
        `Unsupported provider: ${provider}. ` +
          'Supported: openai, anthropic, google',
      );
  }

  return new VercelModelAdapter({ model: languageModel });
}

/**
 * Registers the `run` command, which drives an `Agent` through a task.
 *
 * The action loads the model, starts a `Viewport`, runs the agent while
 * printing each step, then prints the result, a cost/timing summary (unless
 * `--no-cost` is given) and any collected errors. The browser is closed
 * before the process exits with 0 on success and 1 otherwise.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerRunCommand(program: Command): void {
  program
    .command('run')
    .description('Run an AI agent to complete a browser task')
    .argument('<task>', 'Description of the task for the agent to complete')
    .option('-m, --model <model>', 'Model ID to use', 'gpt-4o')
    .option('-p, --provider <provider>', 'LLM provider (openai, anthropic, google)', 'openai')
    .option('--headless', 'Run browser in headless mode', true)
    .option('--no-headless', 'Show the browser window')
    .option('--max-steps <n>', 'Maximum number of agent steps', '25')
    .option('-v, --verbose', 'Show detailed step information', false)
    .option('--no-cost', 'Hide cost tracking information')
    .action(async (task: string, options: RunOptions) => {
      // `--max-steps` arrives as a string under `maxSteps`.
      const stepLimit = Number.parseInt(String(options.maxSteps), 10);
      if (!Number.isInteger(stepLimit) || stepLimit < 1) {
        displayError(`Invalid --max-steps value: ${options.maxSteps}`);
        process.exit(1);
      }

      displayHeader(`Agent Task: ${task}`);
      console.log(
        `${chalk.dim('model:')} ${options.model} ` +
          `${chalk.dim('provider:')} ${options.provider} ` +
          `${chalk.dim('max steps:')} ${stepLimit}`,
      );
      displaySeparator();

      const spinner = new Spinner('Starting browser...');
      spinner.start();

      let browser: Viewport | null = null;
      // The exit is deferred until after `finally`: calling process.exit()
      // inside `try` would terminate before the browser is closed.
      let exitCode = 1;

      try {
        // Initialize the LLM
        spinner.update('Loading model...');
        const model = await createModel(options.provider, options.model);

        // Initialize the browser
        spinner.update('Starting browser...');
        browser = new Viewport({
          headless: options.headless,
        });
        await browser.start();
        spinner.update('Browser ready, starting agent...');

        // Track per-step timing
        const stepTimings = new Map<number, number>();
        let currentStepStart = 0;

        // Create the agent
        // NOTE(review): the README library example passes the viewport as
        // `viewport`, not `browser` — confirm the Agent option name.
        const agent = new Agent({
          task,
          model,
          browser,
          settings: {
            stepLimit,
          },
          onStepStart: (step) => {
            currentStepStart = Date.now();
            stepTimings.set(step, currentStepStart);
            spinner.update(`Step ${step}: thinking...`);
          },
          onStepEnd: (step, results) => {
            const durationMs = Date.now() - (stepTimings.get(step) ?? currentStepStart);
            spinner.stop();

            // Display each action result for this step
            for (const result of results) {
              displayStep({
                step,
                action: extractActionName(result),
                target: extractActionTarget(result),
                durationMs,
                success: result.success,
                error: result.error,
                extractedContent: result.extractedContent,
              });
            }

            if (options.verbose) {
              displaySeparator();
            }

            // Restart spinner for next step
            spinner.start();
            spinner.update(`Step ${step + 1}: thinking...`);
          },
        });

        spinner.update('Agent running...');

        // Execute the agent
        const result = await agent.run();
        spinner.stop();

        // Display result
        displayResult(result.success, result.finalResult);

        // Display cost summary (`options.cost` is false only with --no-cost)
        if (options.cost && result.totalCost) {
          displayTotalCost({
            steps: result.history.entries.length,
            inputTokens: result.totalCost.totalInputTokens,
            outputTokens: result.totalCost.totalOutputTokens,
            totalCost: result.totalCost.totalCost,
            durationMs: computeTotalDuration(result.history.entries),
          });
        } else if (options.cost) {
          // Show basic timing even without cost data
          const totalMs = computeTotalDuration(result.history.entries);
          console.log('');
          console.log(
            chalk.dim(
              `Completed in ${result.history.entries.length} step(s), ` +
                `${(totalMs / 1000).toFixed(1)}s`,
            ),
          );
        }

        // Display errors if any
        if (result.errors.length > 0) {
          console.log('');
          console.log(chalk.bold.yellow('Errors encountered:'));
          for (const err of result.errors) {
            console.log(` ${chalk.red('-')} ${err}`);
          }
        }

        exitCode = result.success ? 0 : 1;
      } catch (error) {
        spinner.stop();
        displayError(
          error instanceof Error ? error.message : String(error),
        );
      } finally {
        if (browser) {
          // Best-effort shutdown: a failing close must not mask the result.
          await browser.close().catch(() => {});
        }
      }

      // Exit with appropriate code
      process.exit(exitCode);
    });
}

// ── Helpers ──

/**
 * Derives a display label for a command result: `done`, `extract`, `action`
 * or `failed_action`, checked in that order.
 */
function extractActionName(result: CommandResult): string {
  if (result.isDone) return 'done';
  if (result.extractedContent) return 'extract';
  return result.success ? 'action' : 'failed_action';
}

/**
 * Returns the first 80 characters of the extracted content, or `undefined`
 * when the result carries none.
 */
function extractActionTarget(result: CommandResult): string | undefined {
  if (result.extractedContent) {
    return result.extractedContent.slice(0, 80);
  }
  return undefined;
}

/** Sums the `duration` field of all step records. */
function computeTotalDuration(entries: StepRecord[]): number {
  return entries.reduce((sum, e) => sum + e.duration, 0);
}
================================================ FILE: packages/cli/src/commands/screenshot.ts ================================================
import type { Command } from 'commander';
import chalk from 'chalk';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { sessionManager } from '../globals.js';

/**
 * Registers the `screenshot` command, which writes a PNG of the current page
 * of the selected session to the given output path.
 *
 * @param program - Commander program to attach the command to.
 */
export function registerScreenshotCommand(program: Command): void {
  program
    .command('screenshot')
    .description('Take a screenshot of the current page')
    .argument('[output]', 'Output file path', 'screenshot.png')
    // Value placeholder required so the option carries the session id.
    .option('-s, --session <id>', 'Session ID to use')
    .option('--full-page', 'Capture the full page', false)
    .action(async (output: string, options: { session?: string; fullPage: boolean }) => {
      try {
        const browser = options.session
          ? sessionManager.get(options.session)
          : sessionManager.getDefault();
        if (!browser) {
          console.error(chalk.red('No active session.
/**
 * Register the session housekeeping commands:
 *
 * - `sessions` prints every active session with its creation and last-use time.
 * - `sessions:close [id]` closes one session, or all of them when no id is given.
 *
 * Both commands exit with status 1 when they fail; `sessions:close` also does
 * so when the given id is unknown.
 */
export function registerSessionsCommand(program: Command): void {
  // Turn whatever was thrown into printable text.
  const describeFailure = (failure: unknown): string =>
    failure instanceof Error ? failure.message : String(failure);

  const listCommand = program
    .command('sessions')
    .description('List all active browser sessions');

  listCommand.action(() => {
    try {
      const active = sessionManager.list();

      if (active.length > 0) {
        console.log(chalk.bold(`Active Sessions (${active.length}):`));
        active.forEach(({ id, createdAt, lastAccessedAt }) => {
          const created = new Date(createdAt).toLocaleTimeString();
          const accessed = new Date(lastAccessedAt).toLocaleTimeString();
          console.log(` ${chalk.cyan(id)} created ${created} last used ${accessed}`);
        });
      } else {
        console.log(chalk.yellow('No active sessions.'));
      }
    } catch (error) {
      console.error(chalk.red('Failed to list sessions:'), describeFailure(error));
      process.exit(1);
    }
  });

  const closeCommand = program
    .command('sessions:close')
    .description('Close a specific session or all sessions')
    .argument('[id]', 'Session ID to close (omit to close all)');

  closeCommand.action(async (id?: string) => {
    try {
      if (!id) {
        // Read the count first: closing empties the manager.
        const total = sessionManager.activeCount;
        await sessionManager.closeAll();
        console.log(chalk.green(`Closed ${total} session(s).`));
        return;
      }

      const wasOpen = await sessionManager.close(id);
      if (wasOpen) {
        console.log(chalk.green('Closed session:'), id);
        return;
      }

      console.error(chalk.red(`Session "${id}" not found.`));
      process.exit(1);
    } catch (error) {
      console.error(chalk.red('Failed to close session:'), describeFailure(error));
      process.exit(1);
    }
  });
}
error.message : String(error)); process.exit(1); } }); } ================================================ FILE: packages/cli/src/commands/type.ts ================================================ import type { Command } from 'commander'; import chalk from 'chalk'; import { sessionManager } from '../globals.js'; export function registerTypeCommand(program: Command): void { program .command('type') .description('Type text into an element matching the given CSS selector') .argument('', 'CSS selector of the input element') .argument('', 'Text to type into the element') .option('-s, --session ', 'Session ID to use') .action(async (selector: string, text: string, options: { session?: string }) => { try { const browser = options.session ? sessionManager.get(options.session) : sessionManager.getDefault(); if (!browser) { console.error(chalk.red('No active session. Use "open" command first.')); process.exit(1); } await browser.type(selector, text); console.log(chalk.green('Typed into:'), selector); } catch (error) { console.error(chalk.red('Failed to type:'), error instanceof Error ? 
// ── Spinner ──

const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];

/**
 * Single-line terminal spinner written to stdout.
 *
 * `start()` begins redrawing a frame plus the current message every 80 ms,
 * `update()` swaps the message shown on the next redraw, and `stop()` halts
 * the redraw and clears the line.
 */
export class Spinner {
  private intervalId: ReturnType<typeof setInterval> | null = null;
  private frameIndex = 0;
  private message: string;

  constructor(message: string) {
    this.message = message;
  }

  /** Start animating; does nothing when the spinner is already running. */
  start(): void {
    if (this.intervalId) return;
    this.frameIndex = 0;
    this.intervalId = setInterval(() => {
      const frame = SPINNER_FRAMES[this.frameIndex % SPINNER_FRAMES.length];
      process.stdout.write(`\r${chalk.cyan(frame)} ${this.message}`);
      this.frameIndex++;
    }, 80);
  }

  /** Replace the message; it appears on the next frame. */
  update(message: string): void {
    this.message = message;
  }

  /**
   * Stop animating and erase the spinner line.
   *
   * @param finalMessage - Optional line printed after the spinner is cleared.
   */
  stop(finalMessage?: string): void {
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }
    // Clear the spinner line
    process.stdout.write('\r\x1b[K');
    if (finalMessage) {
      console.log(finalMessage);
    }
  }
}

// ── Step Display ──

export interface StepDisplayInfo {
  step: number;
  action: string;
  target?: string;
  durationMs: number;
  success: boolean;
  error?: string;
  extractedContent?: string;
}

/**
 * Format and display a single agent step with its result.
 *
 * Prints a headline (step, status icon, action, duration) followed by the
 * optional target, error and an output preview capped at 120 characters.
 */
export function displayStep(info: StepDisplayInfo): void {
  const stepLabel = chalk.bold.white(`Step ${info.step}`);
  const actionLabel = chalk.yellow(info.action);
  const durationLabel = chalk.dim(`${info.durationMs}ms`);
  const statusIcon = info.success ? chalk.green('✓') : chalk.red('✗');

  console.log(`${stepLabel} ${statusIcon} ${actionLabel} ${durationLabel}`);

  if (info.target) {
    console.log(` ${chalk.dim('target:')} ${info.target}`);
  }
  if (info.error) {
    console.log(` ${chalk.red('error:')} ${info.error}`);
  }
  if (info.extractedContent) {
    const preview =
      info.extractedContent.length > 120
        ? `${info.extractedContent.slice(0, 120)}...`
        : info.extractedContent;
    console.log(` ${chalk.dim('output:')} ${preview}`);
  }
}

// ── Cost Display ──

export interface CostDisplayInfo {
  inputTokens: number;
  outputTokens: number;
  totalCost: number;
}

/**
 * Display token usage and cost for a single step.
 */
export function displayStepCost(info: CostDisplayInfo): void {
  const tokens = chalk.dim(
    `tokens: ${info.inputTokens.toLocaleString()} in / ${info.outputTokens.toLocaleString()} out`,
  );
  const cost = chalk.dim(`cost: $${info.totalCost.toFixed(4)}`);
  console.log(` ${tokens} ${cost}`);
}

/**
 * Display a summary of total cost and token usage.
 */
export function displayTotalCost(info: CostDisplayInfo & { steps: number; durationMs: number }): void {
  console.log('');
  console.log(chalk.bold('Summary'));
  console.log(chalk.dim('─'.repeat(50)));
  console.log(` ${chalk.white('Steps:')} ${info.steps}`);
  console.log(` ${chalk.white('Duration:')} ${(info.durationMs / 1000).toFixed(1)}s`);
  console.log(` ${chalk.white('Input tokens:')} ${info.inputTokens.toLocaleString()}`);
  console.log(` ${chalk.white('Output tokens:')} ${info.outputTokens.toLocaleString()}`);
  console.log(` ${chalk.white('Total tokens:')} ${(info.inputTokens + info.outputTokens).toLocaleString()}`);
  console.log(` ${chalk.white('Total cost:')} $${info.totalCost.toFixed(4)}`);
  console.log(chalk.dim('─'.repeat(50)));
}

// ── Progress Bar ──

/**
 * Compute how a progress bar of `width` cells splits into filled/empty cells.
 *
 * The ratio is clamped to [0, 1]; an undefined ratio (0/0 or NaN input) counts
 * as 0 so the cell counts are never negative or NaN.
 */
function progressSegments(
  current: number,
  total: number,
  width: number,
): { ratio: number; filled: number; empty: number } {
  const raw = current / total;
  const ratio = Number.isNaN(raw) ? 0 : Math.min(Math.max(raw, 0), 1);
  const filled = Math.round(ratio * width);
  return { ratio, filled, empty: width - filled };
}

/**
 * Redraw an in-place progress bar on the current stdout line.
 *
 * @param current - Units completed so far.
 * @param total - Units expected in total.
 * @param width - Bar width in cells (default 30).
 */
export function displayProgressBar(current: number, total: number, width = 30): void {
  // Clamping keeps String.prototype.repeat from throwing on a negative count
  // and avoids printing "NaN%" when total is 0.
  const { ratio, filled, empty } = progressSegments(current, total, width);
  const bar = chalk.green('█'.repeat(filled)) + chalk.dim('░'.repeat(empty));
  const pct = (ratio * 100).toFixed(0).padStart(3);
  process.stdout.write(`\r [${bar}] ${pct}% (${current}/${total})`);
}

// ── Result Display ──

/**
 * Print the final task verdict and, when provided, the result text.
 */
export function displayResult(success: boolean, output?: string): void {
  console.log('');
  if (success) {
    console.log(chalk.bold.green('Task completed successfully'));
  } else {
    console.log(chalk.bold.red('Task failed'));
  }
  if (output) {
    console.log('');
    console.log(chalk.bold('Result:'));
    console.log(output);
  }
}

// ── Helpers ──

/** Print an error line to stderr. */
export function displayError(message: string): void {
  console.error(chalk.red('Error:'), message);
}

/** Print a warning line via console.warn. */
export function displayWarning(message: string): void {
  console.warn(chalk.yellow('Warning:'), message);
}

/** Print an informational line to stdout. */
export function displayInfo(message: string): void {
  console.log(chalk.blue('Info:'), message);
}

/** Print a dim horizontal rule, 60 characters wide. */
export function displaySeparator(): void {
  console.log(chalk.dim('─'.repeat(60)));
}

/** Print a blank line, a bold title and a double-line rule. */
export function displayHeader(title: string): void {
  console.log('');
  console.log(chalk.bold.white(title));
  console.log(chalk.dim('═'.repeat(60)));
}
/** A request sent to the CLI server as one newline-terminated JSON line. */
export interface CLIRequest {
  id: string;
  command: string;
  args: Record<string, unknown>;
}

/** The server's reply to a request, correlated by `id`. */
export interface CLIResponse {
  id: string;
  success: boolean;
  data?: unknown;
  error?: string;
}

/** True for non-null, non-array objects (the only JSON shape a message may have). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Parse one JSON line, returning `undefined` when it is not valid JSON. */
function parseJsonLine(data: string): unknown {
  try {
    return JSON.parse(data.trim());
  } catch {
    return undefined;
  }
}

/** Encode a request as a single newline-terminated JSON line. */
export function serializeRequest(req: CLIRequest): string {
  return JSON.stringify(req) + '\n';
}

/**
 * Decode a request line.
 *
 * @returns The request, or `null` when the line is not valid JSON or does not
 *   have the shape of a `CLIRequest` (string `id` and `command`, object
 *   `args`). A missing `args` is treated as an empty object.
 */
export function parseRequest(data: string): CLIRequest | null {
  const parsed = parseJsonLine(data);
  if (!isRecord(parsed)) return null;

  const { id, command } = parsed;
  if (typeof id !== 'string' || typeof command !== 'string') return null;

  // Commands that take no arguments may omit `args` entirely.
  const args = parsed.args === undefined ? {} : parsed.args;
  if (!isRecord(args)) return null;

  return { ...parsed, id, command, args };
}

/** Encode a response as a single newline-terminated JSON line. */
export function serializeResponse(res: CLIResponse): string {
  return JSON.stringify(res) + '\n';
}

/**
 * Decode a response line.
 *
 * @returns The response, or `null` when the line is not valid JSON or does
 *   not have the shape of a `CLIResponse` (string `id`, boolean `success`,
 *   optional string `error`).
 */
export function parseResponse(data: string): CLIResponse | null {
  const parsed = parseJsonLine(data);
  if (!isRecord(parsed)) return null;

  const { id, success, error } = parsed;
  if (typeof id !== 'string' || typeof success !== 'boolean') return null;
  if (error !== undefined && typeof error !== 'string') return null;

  return { ...parsed, id, success, error } as CLIResponse;
}
Promise((resolve, reject) => { this.server = net.createServer((socket) => { let buffer = ''; socket.on('data', async (data) => { buffer += data.toString(); const lines = buffer.split('\n'); buffer = lines.pop() ?? ''; for (const line of lines) { if (!line.trim()) continue; const request = parseRequest(line); if (request) { const response = await this.handleRequest(request); socket.write(serializeResponse(response)); } } }); socket.on('error', () => { // Client disconnected }); }); this.server.on('error', reject); this.server.listen(SOCKET_PATH, () => { resolve(SOCKET_PATH); }); }); } private async handleRequest(request: CLIRequest): Promise { try { switch (request.command) { case 'open': { const url = request.args.url as string; let sessionId = request.args.session as string | undefined; if (!sessionId) { sessionId = this.sessions.getDefaultId(); } if (!sessionId) { sessionId = await this.sessions.create({ headless: request.args.headless as boolean | undefined, }); } const browser = this.sessions.get(sessionId)!; await browser.navigate(url); return { id: request.id, success: true, data: { sessionId, url: browser.currentPage.url() }, }; } case 'tap': { const browser = this.getSessionBrowser(request); const selector = request.args.selector as string; await browser.click(selector); return { id: request.id, success: true }; } case 'type': { const browser = this.getSessionBrowser(request); const selector = request.args.selector as string; const text = request.args.text as string; await browser.type(selector, text); return { id: request.id, success: true }; } case 'state': { const browser = this.getSessionBrowser(request); const state = await browser.getState(); return { id: request.id, success: true, data: state }; } case 'capture': { const browser = this.getSessionBrowser(request); const result = await browser.screenshot(request.args.fullPage as boolean); return { id: request.id, success: true, data: result }; } case 'eval': { const browser = 
/** Bookkeeping record for one managed browser. */
interface ManagedSession {
  id: string;
  browser: Viewport;
  createdAt: number;
  lastAccessedAt: number;
}

/**
 * Registry of running `Viewport` browsers keyed by a short random id.
 *
 * The "default" session is the first one in insertion order.
 */
export class SessionManager {
  private sessions = new Map<string, ManagedSession>();

  /**
   * Start a new browser and register it.
   *
   * @returns The id of the new session.
   */
  async create(options?: ViewportOptions): Promise<string> {
    const id = nanoid(8);
    const browser = new Viewport(options);
    await browser.start();

    const now = Date.now();
    this.sessions.set(id, { id, browser, createdAt: now, lastAccessedAt: now });
    return id;
  }

  /** Look up a session's browser and mark the session as just used. */
  get(id: string): Viewport | undefined {
    const session = this.sessions.get(id);
    if (session) {
      session.lastAccessedAt = Date.now();
      return session.browser;
    }
    return undefined;
  }

  /**
   * Close one session.
   *
   * The session is unregistered before its browser is closed, so a failing
   * `close()` cannot leave a dead browser behind as a usable session.
   *
   * @returns `false` when no session has that id, `true` otherwise.
   * @throws Whatever the browser's `close()` rejects with.
   */
  async close(id: string): Promise<boolean> {
    const session = this.sessions.get(id);
    if (!session) return false;

    this.sessions.delete(id);
    await session.browser.close();
    return true;
  }

  /**
   * Close every session.
   *
   * All browsers are asked to close even when some of them fail; the registry
   * is emptied either way.
   *
   * @throws The first rejection reason, after every close has settled.
   */
  async closeAll(): Promise<void> {
    const pending = [...this.sessions.values()];
    this.sessions.clear();

    // The async wrapper turns a synchronous throw into a rejection as well.
    const outcomes = await Promise.allSettled(
      pending.map(async (session) => session.browser.close()),
    );

    const failure = outcomes.find(
      (outcome): outcome is PromiseRejectedResult => outcome.status === 'rejected',
    );
    if (failure) throw failure.reason;
  }

  /** Snapshot of all sessions without their browser handles. */
  list(): Array<{ id: string; createdAt: number; lastAccessedAt: number }> {
    return [...this.sessions.values()].map((s) => ({
      id: s.id,
      createdAt: s.createdAt,
      lastAccessedAt: s.lastAccessedAt,
    }));
  }

  /** Number of registered sessions. */
  get activeCount(): number {
    return this.sessions.size;
  }

  /** Browser of the first registered session, marked as just used. */
  getDefault(): Viewport | undefined {
    const first = this.sessions.values().next();
    if (first.done) return undefined;
    first.value.lastAccessedAt = Date.now();
    return first.value.browser;
  }

  /** Id of the first registered session, if any. */
  getDefaultId(): string | undefined {
    const first = this.sessions.keys().next();
    return first.done ? undefined : first.value;
  }
}
"@types/turndown": "^5.0.5" }, "peerDependencies": { "sharp": ">=0.33.0" }, "peerDependenciesMeta": { "sharp": { "optional": true } }, "license": "MIT" } ================================================ FILE: packages/core/src/agent/agent.test.ts ================================================ import { test, expect, describe, beforeEach, mock } from 'bun:test'; import { Agent, type AgentOptions } from '../agent/agent.js'; import type { PageAnalyzer } from '../page/page-analyzer.js'; // ── Mock PageAnalyzer factory (injected via AgentOptions.domService) ── const mockExtractState = mock(async () => ({ tree: '
[1]
', selectorMap: { 1: 'button' }, elementCount: 10, interactiveElementCount: 1, scrollPosition: { x: 0, y: 0 }, viewportSize: { width: 1280, height: 1100 }, documentSize: { width: 1280, height: 2000 }, pixelsAbove: 0, pixelsBelow: 900, })); function createMockPageAnalyzer(): PageAnalyzer { return { extractState: mockExtractState, clickElementByIndex: mock(async () => {}), getCachedTree: mock(() => null), getCachedSelectorMap: mock(() => null), clearCache: mock(() => {}), getInteractedElements: mock(() => []), clearInteractedElements: mock(() => {}), getElementSelector: mock(async () => undefined), getElementByBackendNodeId: mock(async () => null), clickAtCoordinates: mock(async () => {}), inputTextByIndex: mock(async () => {}), extractWithIframes: mock(async () => ({ mainTree: null, iframeTrees: [] })), } as unknown as PageAnalyzer; } import type { RunOutcome } from './types.js'; import type { LanguageModel, InferenceOptions } from '../model/interface.js'; import type { InferenceResult, InferenceUsage } from '../model/types.js'; import type { Viewport } from '../viewport/viewport.js'; import type { ViewportSnapshot } from '../viewport/types.js'; import type { CommandExecutor } from '../commands/executor.js'; import type { Command, CommandResult, ExecutionContext } from '../commands/types.js'; import type { CommandCatalog } from '../commands/catalog/catalog.js'; // ── Mock Factories ── function createMockUsage(input = 100, output = 50): InferenceUsage { return { inputTokens: input, outputTokens: output, totalTokens: input + output }; } function createMockModel(options?: { responses?: Array<{ currentState: { evaluation: string; memory: string; nextGoal: string }; actions: Command[]; }>; modelId?: string; }): LanguageModel { let callCount = 0; const responses = options?.responses ?? 
/**
 * Build a mock model whose scripted responses tap elements 1..doneOnStep-1 and
 * then issue a `finish` action carrying `result` with `success: true`.
 *
 * @param doneOnStep - 1-based position of the `finish` response in the script.
 * @param result - Text placed in the `finish` action.
 */
function createDoneOnStepModel(doneOnStep: number, result = 'Task completed'): LanguageModel {
  const responses: Array<{
    currentState: { evaluation: string; memory: string; nextGoal: string };
    actions: Command[];
  }> = [];

  // One `tap` response for every step before the final one.
  for (let i = 1; i < doneOnStep; i++) {
    responses.push({
      currentState: {
        evaluation: `Step ${i} assessment`,
        memory: '',
        nextGoal: `Goal for step ${i + 1}`,
      },
      actions: [{ action: 'tap', index: i, clickCount: 1 } as Command],
    });
  }

  // The last scripted response ends the task.
  responses.push({
    currentState: {
      evaluation: 'Task done',
      memory: '',
      nextGoal: 'Report result',
    },
    actions: [{ action: 'finish', text: result, success: true } as Command],
  });

  return createMockModel({ responses });
}

/** Fixed snapshot of a browser with a single active tab on example.com. */
function createMockBrowserState(): ViewportSnapshot {
  return {
    url: 'https://example.com',
    title: 'Example Page',
    tabs: [
      { tabId: 0 as any, url: 'https://example.com', title: 'Example Page', isActive: true },
    ],
    activeTabIndex: 0,
  };
}

/** Command catalog stub: every method is a bun `mock` returning a fixed value. */
function createMockRegistry(): CommandCatalog{
  return {
    register: mock(() => {}),
    get: mock(() => undefined),
    getAll: mock(() => []),
    getActionDescriptions: mock(() => 'click: Click on an element'),
    getPromptDescription: mock(() => 'click: Click on an element by its index\ngo_to_url: Navigate to a URL'),
    has: mock(() => false),
  } as unknown as CommandCatalog;
}

/**
 * Command executor stub.
 *
 * @param actionResults - Results returned by `executeActions`; `executeAction`
 *   returns the first of them. Defaults to a single `{ success: true }`.
 */
function createMockTools(actionResults?: CommandResult[]): CommandExecutor {
  const defaultResults: CommandResult[] = [{ success: true }];
  return {
    registry: createMockRegistry(),
    commandsPerStep: 10,
    setCoordinateClicking: mock(() => {}),
    executeActions: mock(async (_actions: Command[], _ctx: ExecutionContext) => {
      return actionResults ?? defaultResults;
    }),
    executeAction: mock(async (_action: Command, _ctx: ExecutionContext) => {
      return (actionResults ?? defaultResults)[0];
    }),
  } as unknown as CommandExecutor;
}

/**
 * Viewport stub exposing only the members listed below.
 *
 * @param overrides - Optional browser state returned by `getState` and the
 *   value of `isConnected` (defaults: `createMockBrowserState()` and `true`).
 */
function createMockBrowser(overrides?: {
  browserState?: ViewportSnapshot;
  isConnected?: boolean;
}): Viewport {
  const state = overrides?.browserState ?? createMockBrowserState();
  return {
    isConnected: overrides?.isConnected ?? true,
    start: mock(async () => {}),
    getState: mock(async () => state),
    screenshot: mock(async () => ({ base64: 'fake_screenshot', width: 1280, height: 1100 })),
    navigate: mock(async () => {}),
    currentPage: {
      viewportSize: () => ({ width: 1280, height: 1100 }),
      evaluate: mock(async () => ({})),
    } as any,
    cdp: {
      send: mock(async () => ({})),
    } as any,
  } as unknown as Viewport;
}

/**
 * Baseline `AgentOptions` for the tests: a model that finishes on step 2,
 * mock browser/tools/page analyzer, and small, delay-free settings. Any key in
 * `overrides` replaces the corresponding default wholesale (shallow spread).
 */
// NOTE(review): the type argument of `Partial` looks lost in this copy of the
// file (presumably `Partial<AgentOptions>`) — confirm against the repository.
function createDefaultAgentOptions(overrides?: Partial): AgentOptions {
  return {
    task: 'Find the price of the product',
    model: createDoneOnStepModel(2),
    browser: createMockBrowser(),
    tools: createMockTools([{ success: true, isDone: false }]),
    domService: createMockPageAnalyzer(),
    settings: {
      stepLimit: 5,
      enableScreenshots: false,
      commandDelayMs: 0,
      retryDelay: 0,
      autoNavigateToUrls: false,
      contextWindowSize: 50000,
    },
    ...overrides,
  };
}
const state = agent.getState(); expect(state.stepLimit).toBe(50); }); test('initializes cost tracking to zero', () => { const agent = new Agent(createDefaultAgentOptions()); const cost = agent.getAccumulatedCost(); expect(cost.totalCost).toBe(0); expect(cost.totalInputTokens).toBe(0); expect(cost.totalOutputTokens).toBe(0); }); test('initializes empty history', () => { const agent = new Agent(createDefaultAgentOptions()); const history = agent.getHistory(); expect(history.entries).toHaveLength(0); expect(history.task).toBe('Find the price of the product'); }); test('uses custom tools when provided', () => { const customTools = createMockTools(); const agent = new Agent(createDefaultAgentOptions({ tools: customTools })); expect(agent).toBeDefined(); }); }); describe('run() basic flow', () => { test('completes when done action is returned', async () => { const doneModel = createDoneOnStepModel(1, 'The price is $42'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'The price is $42' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); const result = await agent.run(); expect(result.finalResult).toBe('The price is $42'); expect(result.success).toBe(true); expect(result.errors).toHaveLength(0); }); test('sets isRunning to false after completion', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); await agent.run(); const state = agent.getState(); expect(state.isRunning).toBe(false); }); test('calls onStepStart callback', async () => { const stepStarts: number[] = []; const doneModel = createDoneOnStepModel(2, 'Result'); let callCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { callCount++; if (callCount >= 2) { return [{ success: true, isDone: true, 
extractedContent: 'Result' }]; } return [{ success: true }]; }); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools, onStepStart: (step) => stepStarts.push(step), }), ); await agent.run(); expect(stepStarts.length).toBeGreaterThan(0); expect(stepStarts[0]).toBe(1); }); test('calls onDone callback with result', async () => { let doneResult: RunOutcome | undefined; const doneModel = createDoneOnStepModel(1, 'Final answer'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Final answer' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools, onDone: (r) => { doneResult = r; }, }), ); await agent.run(); expect(doneResult).toBeDefined(); expect(doneResult!.finalResult).toBe('Final answer'); }); test('starts browser if not connected', async () => { const browser = createMockBrowser({ isConnected: false }); const doneModel = createDoneOnStepModel(1, 'Result'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Result' }, ]); const agent = new Agent( createDefaultAgentOptions({ browser, model: doneModel, tools }), ); await agent.run(); expect(browser.start).toHaveBeenCalled(); }); }); describe('step execution', () => { test('invokes browser.getState() on each step', async () => { const browser = createMockBrowser(); const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( createDefaultAgentOptions({ browser, model: doneModel, tools }), ); await agent.run(); expect(browser.getState).toHaveBeenCalled(); }); test('invokes PageAnalyzer.extractState on each step', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); mockExtractState.mockClear(); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); await 
agent.run(); expect(mockExtractState).toHaveBeenCalled(); }); test('records history entries for each step', async () => { let callCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { callCount++; if (callCount >= 3) { return [{ success: true, isDone: true, extractedContent: 'Done' }]; } return [{ success: true }]; }); const model = createDoneOnStepModel(3, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); await agent.run(); const history = agent.getHistory(); expect(history.entries.length).toBeGreaterThanOrEqual(1); }); test('token usage is tracked across steps', async () => { let callCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { callCount++; if (callCount >= 2) { return [{ success: true, isDone: true, extractedContent: 'Done' }]; } return [{ success: true }]; }); const model = createDoneOnStepModel(2, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); await agent.run(); const state = agent.getState(); expect(state.totalInputTokens).toBeGreaterThan(0); expect(state.totalOutputTokens).toBeGreaterThan(0); }); }); describe('failure recovery', () => { test('consecutive failures increment failure count', async () => { let callCount = 0; const errorModel: LanguageModel = { modelId: 'test-model', provider: 'custom', invoke: async (): Promise> => { callCount++; throw new Error(`Simulated error ${callCount}`); }, }; const agent = new Agent( createDefaultAgentOptions({ model: errorModel, settings: { stepLimit: 10, failureThreshold: 3, retryDelay: 0, enableScreenshots: false, commandDelayMs: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const result = await agent.run(); expect(result.errors.length).toBeGreaterThan(0); }); test('agent records error about consecutive failures after failureThreshold', async () => { let callCount = 0; const errorModel: LanguageModel = { modelId: 'test-model', provider: 
'custom', invoke: async (): Promise> => { callCount++; throw new Error(`Error ${callCount}`); }, }; const agent = new Agent( createDefaultAgentOptions({ model: errorModel, settings: { stepLimit: 20, failureThreshold: 3, retryDelay: 0, enableScreenshots: false, commandDelayMs: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const result = await agent.run(); const hasFailureError = result.errors.some( (e) => e.includes('consecutive failures'), ); expect(hasFailureError).toBe(true); }); test('successful step resets consecutive failure count', async () => { let callCount = 0; const model: LanguageModel = { modelId: 'test-model', provider: 'custom', invoke: async (): Promise> => { callCount++; if (callCount === 1) { throw new Error('Transient error'); } return { parsed: { currentState: { evaluation: 'Done', memory: '', nextGoal: '' }, actions: [{ action: 'finish', text: 'Success', success: true }], } as unknown as T, usage: createMockUsage(), finishReason: 'stop', }; }, }; const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Success' }, ]); const agent = new Agent( createDefaultAgentOptions({ model, tools, settings: { stepLimit: 10, failureThreshold: 5, retryDelay: 0, enableScreenshots: false, commandDelayMs: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const result = await agent.run(); expect(result.finalResult).toBe('Success'); }); }); describe('done action detection and result extraction', () => { test('detects done action and extracts result text', async () => { const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Product costs $99' }, ]); const model = createDoneOnStepModel(1, 'Product costs $99'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); const result = await agent.run(); expect(result.finalResult).toBe('Product costs $99'); expect(result.success).toBe(true); }); test('handles done action with success=false', async () => { const model = 
createMockModel({ responses: [{ currentState: { evaluation: 'Cannot find', memory: '', nextGoal: '' }, actions: [{ action: 'finish', text: 'Could not find', success: false } as Command], }], }); const tools = createMockTools([ { success: false, isDone: true, extractedContent: 'Could not find' }, ]); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); const result = await agent.run(); expect(result.finalResult).toBe('Could not find'); expect(result.success).toBe(false); }); }); describe('pause / resume / stop', () => { test('pause sets isPaused flag', () => { const agent = new Agent(createDefaultAgentOptions()); agent.pause(); expect(agent.getState().isPaused).toBe(true); }); test('resume clears isPaused flag', () => { const agent = new Agent(createDefaultAgentOptions()); agent.pause(); agent.resume(); expect(agent.getState().isPaused).toBe(false); }); test('stop sets isRunning to false', async () => { let stepCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { stepCount++; return [{ success: true }]; }); const model = createMockModel(); const agent = new Agent( createDefaultAgentOptions({ model, tools, settings: { stepLimit: 100, enableScreenshots: false, commandDelayMs: 0, retryDelay: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const runPromise = agent.run(); // Stop after a brief moment await new Promise((r) => setTimeout(r, 50)); agent.stop(); await runPromise; const state = agent.getState(); expect(state.isRunning).toBe(false); }); }); describe('max steps reached', () => { test('returns error when max steps exceeded without done', async () => { const model = createMockModel(); const tools = createMockTools([{ success: true }]); const agent = new Agent( createDefaultAgentOptions({ model, tools, settings: { stepLimit: 3, enableScreenshots: false, commandDelayMs: 0, retryDelay: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const result = await agent.run(); 
const hasMaxStepsError = result.errors.some( (e) => e.includes('maximum steps'), ); expect(hasMaxStepsError).toBe(true); }); test('run() accepts stepLimit parameter to override settings', async () => { const model = createMockModel(); const tools = createMockTools([{ success: true }]); const agent = new Agent( createDefaultAgentOptions({ model, tools, settings: { stepLimit: 100, enableScreenshots: false, commandDelayMs: 0, retryDelay: 0, autoNavigateToUrls: false, contextWindowSize: 50000, }, }), ); const result = await agent.run(2); const hasMaxStepsError = result.errors.some( (e) => e.includes('maximum steps'), ); expect(hasMaxStepsError).toBe(true); }); }); describe('sensitive data filtering', () => { test('filters sensitive values from action results', async () => { const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Your API key is sk-12345 and password is hunter2', }, ]); const model = createDoneOnStepModel(1, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools, settings: { stepLimit: 5, enableScreenshots: false, commandDelayMs: 0, retryDelay: 0, autoNavigateToUrls: false, contextWindowSize: 50000, maskedValues: { apiKey: 'sk-12345', password: 'hunter2', }, }, }), ); const result = await agent.run(); const history = agent.getHistory(); for (const entry of history.entries) { for (const ar of entry.actionResults) { if (ar.extractedContent) { expect(ar.extractedContent).not.toContain('sk-12345'); expect(ar.extractedContent).not.toContain('hunter2'); } } } }); test('returns unmodified results when no sensitive data configured', async () => { const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Plain text result', }, ]); const model = createDoneOnStepModel(1, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); const result = await agent.run(); expect(result.finalResult).toBe('Plain text result'); }); }); describe('history recording', () => { 
test('history entries contain step number', async () => { let callCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { callCount++; if (callCount >= 2) { return [{ success: true, isDone: true, extractedContent: 'Done' }]; } return [{ success: true }]; }); const model = createDoneOnStepModel(2, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); await agent.run(); const history = agent.getHistory(); expect(history.entries.length).toBeGreaterThanOrEqual(1); expect(history.entries[0].step).toBe(1); }); test('history entries contain browser state info', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); await agent.run(); const history = agent.getHistory(); expect(history.entries.length).toBeGreaterThanOrEqual(1); expect(history.entries[0].browserState.url).toBe('https://example.com'); expect(history.entries[0].browserState.title).toBe('Example Page'); }); test('history entries contain usage info', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); await agent.run(); const history = agent.getHistory(); expect(history.entries.length).toBeGreaterThanOrEqual(1); expect(history.entries[0].usage).toBeDefined(); expect(history.entries[0].usage!.inputTokens).toBe(100); expect(history.entries[0].usage!.outputTokens).toBe(50); }); test('history is finalized after run', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); await 
agent.run(); const history = agent.getHistory(); expect(history.endTime).toBeDefined(); expect(history.totalDuration).toBeDefined(); }); }); describe('cost tracking', () => { test('cumulative cost accumulates across steps', async () => { let callCount = 0; const tools = createMockTools(); (tools.executeActions as any) = mock(async () => { callCount++; if (callCount >= 3) { return [{ success: true, isDone: true, extractedContent: 'Done' }]; } return [{ success: true }]; }); const model = createDoneOnStepModel(3, 'Done'); const agent = new Agent( createDefaultAgentOptions({ model, tools }), ); await agent.run(); const cost = agent.getAccumulatedCost(); expect(cost.totalInputTokens).toBeGreaterThanOrEqual(100); expect(cost.totalOutputTokens).toBeGreaterThanOrEqual(50); }); }); describe('follow-up tasks', () => { test('addNewTask stores follow-up tasks', () => { const agent = new Agent(createDefaultAgentOptions()); agent.addNewTask('Follow up: check price again'); agent.addNewTask('Follow up: compare with competitor'); const tasks = agent.getFollowUpTasks(); expect(tasks).toHaveLength(2); expect(tasks[0]).toBe('Follow up: check price again'); expect(tasks[1]).toBe('Follow up: compare with competitor'); }); test('getFollowUpTasks returns a copy', () => { const agent = new Agent(createDefaultAgentOptions()); agent.addNewTask('Task 1'); const tasks1 = agent.getFollowUpTasks(); const tasks2 = agent.getFollowUpTasks(); expect(tasks1).toEqual(tasks2); expect(tasks1).not.toBe(tasks2); }); }); describe('getState', () => { test('returns a copy of the state', () => { const agent = new Agent(createDefaultAgentOptions()); const state1 = agent.getState(); const state2 = agent.getState(); expect(state1).toEqual(state2); expect(state1).not.toBe(state2); }); test('tracks current URL after run', async () => { const doneModel = createDoneOnStepModel(1, 'Done'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Done' }, ]); const agent = new Agent( 
createDefaultAgentOptions({ model: doneModel, tools }), ); await agent.run(); const state = agent.getState(); expect(state.currentUrl).toBe('https://example.com'); }); }); describe('getAccumulatedCost', () => { test('returns a copy of cost data', () => { const agent = new Agent(createDefaultAgentOptions()); const cost1 = agent.getAccumulatedCost(); const cost2 = agent.getAccumulatedCost(); expect(cost1).toEqual(cost2); expect(cost1).not.toBe(cost2); }); }); describe('run result structure', () => { test('result contains all expected fields', async () => { const doneModel = createDoneOnStepModel(1, 'Answer'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Answer' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); const result = await agent.run(); expect(result).toHaveProperty('finalResult'); expect(result).toHaveProperty('success'); expect(result).toHaveProperty('history'); expect(result).toHaveProperty('errors'); expect(result).toHaveProperty('totalCost'); }); test('result.history is an ExecutionLog', async () => { const doneModel = createDoneOnStepModel(1, 'Answer'); const tools = createMockTools([ { success: true, isDone: true, extractedContent: 'Answer' }, ]); const agent = new Agent( createDefaultAgentOptions({ model: doneModel, tools }), ); const result = await agent.run(); expect(result.history).toBeDefined(); expect(result.history.task).toBe('Find the price of the product'); expect(typeof result.history.finalResult).toBe('function'); }); }); }); ================================================ FILE: packages/core/src/agent/agent.ts ================================================ import { z, ZodError } from 'zod'; import type { LanguageModel, InferenceOptions } from '../model/interface.js'; import type { Viewport } from '../viewport/viewport.js'; import type { FileAccess } from '../sandbox/file-access.js'; import { PageAnalyzer } from '../page/page-analyzer.js'; import { 
CommandExecutor } from '../commands/executor.js'; import type { Command, CommandResult, ExecutionContext } from '../commands/types.js'; import { CommandSchema } from '../commands/types.js'; import { InstructionBuilder } from './instructions.js'; import { ConversationManager } from './conversation/service.js'; import { StallDetector, hashPageTree, hashTextContent } from './stall-detector.js'; import { ReplayRecorder } from './replay-recorder.js'; import { ResultEvaluator } from './evaluator.js'; import { type AgentConfig, type AgentState, type AgentDecision, type StepRecord, ExecutionLog, type RunOutcome, type AccumulatedCost, type EvaluationResult, type QuickCheckResult, ReasoningSchema, AgentDecisionCompactSchema, AgentDecisionDirectSchema, PlanRevisionSchema, DEFAULT_AGENT_CONFIG, calculateStepCost, supportsDeepReasoning, supportsCoordinateMode, isCompactModel, } from './types.js'; import { AgentError, StepLimitExceededError, AgentStalledError, ModelThrottledError, } from '../errors.js'; import { Timer, sleep, truncateText, withDeadline, extractUrls, escapeRegExp, } from '../utils.js'; import { createLogger } from '../logging.js'; const logger = createLogger('agent'); // ── Agent Options ── export interface AgentOptions { task: string; model: LanguageModel; browser: Viewport; tools?: CommandExecutor; /** Pre-configured PageAnalyzer instance (defaults to a new PageAnalyzer) */ domService?: PageAnalyzer; settings?: Partial; /** Separate model for the judge (defaults to main model) */ judgeModel?: LanguageModel; /** Separate model for extraction actions (defaults to main model) */ extractionModel?: LanguageModel; /** File system access for sandbox operations */ fileSystem?: FileAccess; onStepStart?: (step: number) => void; onStepEnd?: (step: number, result: CommandResult[]) => void; onDone?: (result: RunOutcome) => void; } // ── Agent ── export class Agent { private model: LanguageModel; private browser: Viewport; private tools: CommandExecutor; private domService: 
PageAnalyzer;
  /** Conversation/context manager; built in the constructor from the context-window, screenshot, masking and compaction settings. */
  private messageManager: ConversationManager;
  /** Stall detector consulted by the run loop and by each step. */
  private loopDetector: StallDetector;
  /** Only created when `settings.replayOutputPath` is set. */
  private gifRecorder?: ReplayRecorder;
  /** Only created when `enableEvaluation` or `enableSimpleJudge` is set. */
  private judge?: ResultEvaluator;
  /** `DEFAULT_AGENT_CONFIG` overlaid with `options.settings` and the task. */
  private settings: AgentConfig;
  private extractionModel?: LanguageModel;
  private fileSystem?: FileAccess;

  /** Mutable run state (step counters, flags, token and cost totals). */
  private state: AgentState;
  /** Per-step execution log for the configured task. */
  private historyList: ExecutionLog;
  private startTime = 0;
  private followUpTasks: string[] = [];

  // Optional lifecycle callbacks supplied through AgentOptions.
  private onStepStart?: (step: number) => void;
  private onStepEnd?: (step: number, result: CommandResult[]) => void;
  private onDone?: (result: RunOutcome) => void;

  /**
   * Wires up the agent's collaborators from `options`.
   *
   * Ordering matters: `settings` and `extractionModel` are assigned first
   * because the default `CommandExecutor`, `PageAnalyzer` and
   * `ConversationManager` are all configured from them.
   *
   * @param options Task, model, browser and optional overrides.
   */
  constructor(options: AgentOptions) {
    this.model = options.model;
    this.browser = options.browser;
    // Caller settings override the defaults; the task always comes from options.task.
    this.settings = { ...DEFAULT_AGENT_CONFIG, ...options.settings, task: options.task };
    this.extractionModel = options.extractionModel;
    this.fileSystem = options.fileSystem;

    // Default executor uses the extraction model when given, else the main model.
    this.tools = options.tools ?? new CommandExecutor({
      model: this.extractionModel ?? this.model,
      allowedUrls: this.settings.allowedUrls,
      blockedUrls: this.settings.blockedUrls,
      commandsPerStep: this.settings.commandsPerStep,
    });

    this.domService = options.domService ?? new PageAnalyzer({
      capturedAttributes: this.settings.capturedAttributes,
    });

    this.messageManager = new ConversationManager({
      contextWindowSize: this.settings.contextWindowSize,
      includeLastScreenshot: this.settings.enableScreenshots,
      maskedValues: this.settings.maskedValues,
      compaction: this.settings.conversationCompaction,
    });

    this.loopDetector = new StallDetector();

    // Replay recording is opt-in via an output path.
    if (this.settings.replayOutputPath) {
      this.gifRecorder = new ReplayRecorder({
        outputPath: this.settings.replayOutputPath,
      });
    }

    // Judge setup: one evaluator serves both the quick check and the full evaluation.
    if (this.settings.enableEvaluation || this.settings.enableSimpleJudge) {
      const judgeModel = options.judgeModel ??
this.model;
      this.judge = new ResultEvaluator(judgeModel);
    }

    // Auto-enable coordinate clicking for supported models
    if (this.settings.autoEnableCoordinateClicking) {
      if (supportsCoordinateMode(this.model.modelId)) {
        this.tools.setCoordinateClicking(true);
        logger.info(`Coordinate clicking auto-enabled for model ${this.model.modelId}`);
      }
    }

    // Initialize state: all counters zeroed, all flags cleared.
    this.state = {
      step: 0,
      stepLimit: this.settings.stepLimit,
      failureCount: 0,
      consecutiveFailures: 0,
      isRunning: false,
      isPaused: false,
      isDone: false,
      totalInputTokens: 0,
      totalOutputTokens: 0,
      cumulativeCost: {
        totalInputTokens: 0,
        totalOutputTokens: 0,
        totalInputCost: 0,
        totalOutputCost: 0,
        totalCost: 0,
      },
    };

    this.historyList = new ExecutionLog({
      task: this.settings.task,
    });

    this.onStepStart = options.onStepStart;
    this.onStepEnd = options.onStepEnd;
    this.onDone = options.onDone;
  }

  // ────────────────────────────────────────
  //  Main run loop
  // ────────────────────────────────────────

  /**
   * Runs the agent until it finishes, is stopped, or exhausts its steps.
   *
   * @param stepLimit Optional per-run override; falls back to
   *   `settings.stepLimit` when omitted.
   */
  // NOTE(review): the return type's generic argument looks lost in this copy of
  // the file; the method returns a `RunOutcome` object, so presumably
  // `Promise<RunOutcome>` - confirm against the repository.
  async run(stepLimit?: number): Promise {
    const effectiveMaxSteps = stepLimit ??
this.settings.stepLimit;
    // Body of run(): prepare the session, drive the step loop, then evaluate
    // and package the outcome. Expected terminal conditions (step limit, stall,
    // too many failures) are reported through `errors`; anything else rethrows.
    this.state.stepLimit = effectiveMaxSteps;
    this.state.isRunning = true;
    this.startTime = Date.now();

    // Ensure browser is started
    if (!this.browser.isConnected) {
      await this.browser.start();
    }

    // Build system prompt (may be rebuilt per step if dynamicCommandSchema is on)
    this.rebuildInstructionBuilder();

    // URL extraction: auto-navigate to first URL found in task text
    if (this.settings.autoNavigateToUrls) {
      await this.autoNavigateFromTask();
    }

    // Execute initial actions before the main loop
    if (this.settings.preflightCommands.length > 0) {
      await this.executeInitialActions();
    }

    const errors: string[] = [];
    let finalResult: string | undefined;
    let success = false;
    let judgement: EvaluationResult | undefined;
    let simpleJudgement: QuickCheckResult | undefined;
    // Consecutive throttled attempts. Drives the backoff exponent only, so
    // rate limiting never pushes the agent towards `failureThreshold`.
    let throttleStreak = 0;

    try {
      for (let step = 1; step <= effectiveMaxSteps; step++) {
        if (!this.state.isRunning || this.state.isDone) break;

        // Pause support. Also watch isRunning so that stopping a paused agent
        // ends the run instead of spinning here forever.
        while (this.state.isPaused && this.state.isRunning) {
          await sleep(100);
        }
        if (!this.state.isRunning) break;

        this.state.step = step;
        this.onStepStart?.(step);

        try {
          // Wrap step execution in optional timeout
          const stepPromise = this.executeStep(step, effectiveMaxSteps);
          const result =
            this.settings.stepDeadlineMs > 0
              ? await withDeadline(
                  stepPromise,
                  this.settings.stepDeadlineMs,
                  `Step ${step} timed out after ${this.settings.stepDeadlineMs}ms`,
                )
              : await stepPromise;

          this.state.consecutiveFailures = 0;
          throttleStreak = 0;

          // Check if done
          const doneResult = result.find((r) => r.isDone);
          if (doneResult) {
            finalResult = doneResult.extractedContent;
            success = doneResult.success;

            // Simple judge: quick validation before accepting the result
            if (this.settings.enableSimpleJudge && this.judge && finalResult) {
              simpleJudgement = await this.judge.simpleEvaluate(
                this.settings.task,
                finalResult,
              );

              if (simpleJudgement.shouldRetry && step < effectiveMaxSteps) {
                logger.info(
                  `Simple judge suggests retry: ${simpleJudgement.reason}`,
                );
                this.messageManager.addCommandResultMessage(
                  `The result was reviewed and found lacking: ${simpleJudgement.reason}. ` +
                    'Please try a different approach to complete the task.',
                  step,
                );
                // Don't mark as done -- continue the loop
                continue;
              }
            }

            this.state.isDone = true;
            break;
          }

          this.onStepEnd?.(step, result);

          // Planning: periodically update the plan
          if (this.settings.enableStrategy && this.shouldUpdatePlan(step)) {
            await this.updatePlan(step);
          }

          // Replan on stall: if loop detector shows stuck + planning enabled
          if (this.settings.restrategizeOnStall && this.settings.enableStrategy) {
            const loopCheck = this.loopDetector.isStuck();
            if (loopCheck.stuck && loopCheck.severity >= 2) {
              logger.info('Agent stalled, triggering replan');
              await this.updatePlan(step);
            }
          }

          // Message compaction: every N steps (LLM-based)
          if (this.messageManager.shouldCompactWithLlm()) {
            const compacted = await this.messageManager.compactWithLlm(this.model);
            if (compacted) {
              logger.debug(`Messages compacted at step ${step}`);
            }
          }

          // Save conversation per step if configured
          if (this.settings.conversationOutputPath) {
            await this.saveConversation(step);
          }
        } catch (error) {
          // A severe stall is raised by executeStep before any new action is
          // recorded, so retrying the step would only hit the same detector
          // state again. Escalate to the outer handler, which reports it.
          if (error instanceof AgentStalledError) {
            throw error;
          }

          // Rate limit retry with exponential backoff
          if (error instanceof ModelThrottledError) {
            const waitMs =
              error.retryAfterMs ??
              Math.min(
                60_000,
                this.settings.retryDelay * 1000 * 2 ** throttleStreak,
              );
            logger.warn(`Rate limited, waiting ${waitMs}ms before retry`);
            await sleep(waitMs);
            // Don't count rate limits toward max failures
            throttleStreak++;
            continue;
          }

          const message = error instanceof Error ? error.message : String(error);
          errors.push(`Step ${step}: ${message}`);
          this.state.failureCount++;
          this.state.consecutiveFailures++;

          if (this.state.consecutiveFailures >= this.settings.failureThreshold) {
            // Failure recovery: make one final LLM call to diagnose
            const failureSummary = await this.makeFailureRecoveryCall(errors);
            if (failureSummary) {
              finalResult = failureSummary;
            }

            throw new AgentError(
              `Too many consecutive failures (${this.state.consecutiveFailures})`,
            );
          }

          // Add error message to conversation
          this.messageManager.addCommandResultMessage(
            `Error: ${truncateText(message, 400)}`,
            step,
          );

          // Wait before retry
          await sleep(this.settings.retryDelay * 1000);
        }
      }

      if (!this.state.isDone && this.state.step >= effectiveMaxSteps) {
        throw new StepLimitExceededError(this.state.step, effectiveMaxSteps);
      }
    } catch (error) {
      // Known terminal conditions become entries in `errors`; anything
      // unexpected still propagates to the caller.
      if (
        error instanceof StepLimitExceededError ||
        error instanceof AgentStalledError ||
        error instanceof AgentError
      ) {
        errors.push(error.message);
      } else {
        throw error;
      }
    } finally {
      this.state.isRunning = false;

      // Save recording
      if (this.gifRecorder) {
        await this.gifRecorder.save();
      }
    }

    // Full judge evaluation after completion
    if (this.settings.enableEvaluation && this.judge && finalResult) {
      judgement = await this.judge.evaluate(
        this.settings.task,
        finalResult,
        this.historyList.entries,
        {
          expectedOutcome: this.settings.expectedOutcome,
          includeScreenshots: this.settings.enableScreenshots,
        },
      );
    }

    // Finalize history
    this.historyList.finish();

    const runResult: RunOutcome = {
      finalResult,
      success,
      history: this.historyList,
      errors,
      judgement,
      simpleJudgement,
      // Copy so later state mutations don't alter the returned totals.
      totalCost: { ...this.state.cumulativeCost },
    };

    this.onDone?.(runResult);
    return
runResult;
  }

  // ────────────────────────────────────────
  //  Step Execution
  // ────────────────────────────────────────

  /**
   * Executes one agent step: observe the page, ask the model for a decision,
   * run the returned commands, and record the step in the history.
   *
   * @param step      1-based step number, used to tag messages and history.
   * @param stepLimit Effective step budget, shown to the model in the state prompt.
   * @throws AgentStalledError when the stall detector reports severity >= 3.
   */
  // NOTE(review): the generic argument of the return type looks lost in this
  // copy; callers treat the result as an array of command results - confirm.
  private async executeStep(step: number, stepLimit: number): Promise {
    const timer = new Timer();

    // Get browser state
    const browserState = await this.browser.getState();
    this.state.currentUrl = browserState.url;

    // Dynamic action schema: rebuild system prompt per step based on current URL
    if (this.settings.dynamicCommandSchema) {
      this.rebuildInstructionBuilder(browserState.url);
    }

    // Extract DOM
    const domState = await this.domService.extractState(
      this.browser.currentPage,
      this.browser.cdp!,
    );

    // Take screenshot if using vision
    let screenshot: string | undefined;
    if (this.settings.enableScreenshots) {
      const screenshotResult = await this.browser.screenshot();
      screenshot = screenshotResult.base64;

      // The replay frame is labelled with the current URL.
      if (this.gifRecorder) {
        const actionLabel = browserState.url;
        this.gifRecorder.addFrame(screenshot, step, actionLabel);
      }
    }

    // Build state message
    const stateText = InstructionBuilder.buildStatePrompt(
      browserState.url,
      browserState.title,
      browserState.tabs,
      domState.tree,
      step,
      stepLimit,
      domState.pixelsAbove,
      domState.pixelsBelow,
    );

    // Check for loop. This happens before any action of this step is recorded,
    // so the verdict reflects previous steps only.
    const loopCheck = this.loopDetector.isStuck();
    let additionalContext = '';
    if (loopCheck.stuck) {
      additionalContext = InstructionBuilder.buildLoopNudge(
        this.loopDetector.getLoopNudgeMessage(),
      );

      // Severe loop: throw stuck error
      if (loopCheck.severity >= 3) {
        throw new AgentStalledError(
          `Agent stuck: ${loopCheck.reason} (severity ${loopCheck.severity})`,
        );
      }
    }

    // Add plan context if planning is enabled
    if (this.settings.enableStrategy && this.state.currentPlan) {
      additionalContext += InstructionBuilder.buildPlanPrompt(this.state.currentPlan);
    }

    // Add messages
    this.messageManager.addStateMessage(
      stateText + additionalContext,
      screenshot,
      step,
    );

    // Determine output schema based on mode
    const outputSchema = this.getOutputSchema();

    // Invoke LLM with optional timeout and Zod recovery
    const completion =
await this.invokeLlmWithRecovery(outputSchema, step);

    // Update token tracking
    this.state.totalInputTokens += completion.usage.inputTokens;
    this.state.totalOutputTokens += completion.usage.outputTokens;

    // Cost tracking
    this.updateCostTracking(completion.usage.inputTokens, completion.usage.outputTokens, step);

    const output = completion.parsed;

    // Normalize output to standard AgentDecision shape
    const normalizedOutput = this.normalizeOutput(output);

    // Add assistant response (only the reasoning state is serialized, not the actions)
    this.messageManager.addAssistantMessage(
      JSON.stringify(normalizedOutput.currentState),
      step,
    );

    // Execute actions
    const context: ExecutionContext = {
      page: this.browser.currentPage,
      cdpSession: this.browser.cdp!,
      domService: this.domService,
      browserSession: this.browser,
      extractionLlm: this.extractionModel,
      fileSystem: this.fileSystem,
      maskedValues: this.settings.maskedValues,
    };

    const actions = normalizedOutput.actions as Command[];
    const results = await this.tools.executeActions(actions, context);

    // Record for loop detection (with enhanced fingerprint).
    // The fingerprint uses the state observed before the actions ran.
    this.loopDetector.recordAction(actions);
    this.loopDetector.recordFingerprint({
      url: browserState.url,
      domHash: hashPageTree(domState.tree),
      scrollY: domState.scrollPosition.y,
      elementCount: domState.elementCount,
      textHash: hashTextContent(domState.tree.slice(0, 2000)),
    });

    // Filter sensitive data from results
    const filteredResults = this.filterSensitiveData(results);

    // Add action results to conversation: one "<action>: <status>" line per
    // result, followed by any extracted content.
    const resultText = filteredResults
      .map((r, i) => {
        const actionName = actions[i]?.action ?? 'unknown';
        const status = r.success ? 'success' : `error: ${r.error}`;
        const content = r.extractedContent ?
`\nContent: ${r.extractedContent}`
          : '';
        return `${actionName}: ${status}${content}`;
      })
      .join('\n');

    if (resultText) {
      this.messageManager.addCommandResultMessage(resultText, step);
    }

    // Wait between actions
    // NOTE(review): `commandDelayMs` is named as milliseconds but is scaled by
    // 1000 here, unlike `stepDeadlineMs` / `modelDeadlineMs`, which are passed
    // through as-is. Confirm the intended unit against DEFAULT_AGENT_CONFIG.
    if (this.settings.commandDelayMs > 0) {
      await sleep(this.settings.commandDelayMs * 1000);
    }

    // Sample the timer and the clock once so that duration, durationMs,
    // startedAt and completedAt describe the same instant
    // (startedAt + durationMs === completedAt).
    const elapsedMs = timer.elapsed();
    const completedAt = Date.now();

    // Record history entry
    const entry: StepRecord = {
      step,
      timestamp: completedAt,
      browserState: {
        url: browserState.url,
        title: browserState.title,
        tabs: browserState.tabs,
        // Only commands that target an indexed element are listed here.
        interactedElements: actions
          .filter((a): a is Command & { index: number } => 'index' in a)
          .map((a) => ({
            index: a.index,
            description: '',
            action: a.action,
          })),
        screenshot,
      },
      agentOutput: normalizedOutput as AgentDecision,
      actionResults: filteredResults,
      usage: completion.usage,
      duration: elapsedMs,
      metadata: {
        stepNumber: step,
        durationMs: elapsedMs,
        inputTokens: completion.usage.inputTokens,
        outputTokens: completion.usage.outputTokens,
        actionCount: actions.length,
        url: browserState.url,
        startedAt: completedAt - elapsedMs,
        completedAt,
      },
    };
    this.historyList.addEntry(entry);

    // The raw (unfiltered) results go back to the run loop; only the history
    // and the conversation receive the masked copies.
    return results;
  }

  // ────────────────────────────────────────
  //  LLM Invocation with Zod Recovery
  // ────────────────────────────────────────

  /**
   * Invokes the main model with the current conversation and the given
   * response schema.
   *
   * If the call fails with a `ZodError`, the validation issues are appended to
   * the conversation and the call is retried, up to two extra attempts. Every
   * other error - including `ModelThrottledError`, which the run loop handles
   * with backoff - propagates unchanged.
   *
   * @param outputSchema Schema the model's response must satisfy.
   * @param step         Current step number, used to tag recovery messages.
   * @param retryCount   Validation retries already used (internal).
   * @returns The parsed decision and the token usage of the successful call.
   */
  private async invokeLlmWithRecovery(
    outputSchema: z.ZodType,
    step: number,
    retryCount = 0,
  ): Promise<{
    parsed: Record<string, unknown>;
    usage: { inputTokens: number; outputTokens: number; totalTokens: number };
  }> {
    const messages = this.messageManager.getMessages();

    const invokeOptions: InferenceOptions = {
      messages,
      responseSchema: outputSchema,
      schemaName: this.getSchemaName(),
      schemaDescription: 'Agent decision with current state assessment and actions to take',
    };

    // Extended thinking: pass thinking budget as maxTokens
    if (
      this.settings.enableDeepReasoning &&
      supportsDeepReasoning(this.model.modelId)
    ) {
      invokeOptions.maxTokens = this.settings.reasoningBudget;
    }

    try {
      // Wrap LLM call in optional timeout
      const invokePromise = this.model.invoke(invokeOptions);
      const completion =
        this.settings.modelDeadlineMs > 0
          ? await withDeadline(
              invokePromise,
              this.settings.modelDeadlineMs,
              `LLM call timed out after ${this.settings.modelDeadlineMs}ms`,
            )
          : await invokePromise;

      return {
        parsed: completion.parsed as Record<string, unknown>,
        usage: completion.usage,
      };
    } catch (error) {
      // Zod validation error recovery: re-prompt with the error details
      if (error instanceof ZodError && retryCount < 2) {
        logger.warn(
          `Zod validation failed (attempt ${retryCount + 1}), re-prompting LLM`,
        );

        const issues = error.issues
          .map((issue) => `- ${issue.path.join('.')}: ${issue.message}`)
          .join('\n');

        this.messageManager.addCommandResultMessage(
          'Your previous response had a validation error. ' +
            'Please fix the following issues and respond again:\n' +
            `${issues}\n\n` +
            'Make sure your response matches the expected JSON schema exactly.',
          step,
        );

        return this.invokeLlmWithRecovery(outputSchema, step, retryCount + 1);
      }

      // Everything else (rate limits included) is handled by the main loop.
      throw error;
    }
  }

  // ────────────────────────────────────────
  //  Output Schema Selection
  // ────────────────────────────────────────

  /**
   * Picks the response schema for the current mode. Compact mode takes
   * precedence over deep reasoning; otherwise the full schema is used.
   * Must stay in step with `getSchemaName()`.
   */
  private getOutputSchema(): z.ZodType {
    // Flash mode: simpler schema for cheaper / faster models
    if (this.settings.compactMode || isCompactModel(this.model.modelId)) {
      return AgentDecisionCompactSchema as z.ZodType;
    }

    // Extended thinking: model reasons internally, skip brain schema
    if (
      this.settings.enableDeepReasoning &&
      supportsDeepReasoning(this.model.modelId)
    ) {
      return AgentDecisionDirectSchema as z.ZodType;
    }

    // Default full schema with brain + typed action union
    return z.object({
      currentState: ReasoningSchema,
      actions: z.array(CommandSchema),
    }) as z.ZodType;
  }

  /** Name reported to the model for the schema chosen by `getOutputSchema()`. */
  private getSchemaName(): string {
    if (this.settings.compactMode || isCompactModel(this.model.modelId)) {
      return 'AgentDecisionCompact';
    }
    if (
      this.settings.enableDeepReasoning &&
      supportsDeepReasoning(this.model.modelId)
    ) {
      return
'AgentDecisionDirect';
    }
    return 'AgentDecision';
  }

  /**
   * Normalize the various output schema shapes into the standard AgentDecision.
   *
   * - Compact shape `{ goal, actions }`: the goal fills both `evaluation` and `nextGoal`.
   * - Direct shape `{ actions }`: the reasoning fields are left empty.
   * - Anything carrying `currentState` is passed through unchanged.
   */
  private normalizeOutput(output: Record<string, unknown>): AgentDecision {
    // Flash schema: { goal, actions }
    if ('goal' in output && !('currentState' in output)) {
      return {
        currentState: {
          evaluation: String(output.goal ?? ''),
          memory: '',
          nextGoal: String(output.goal ?? ''),
        },
        actions: (output.actions ?? []) as Record<string, unknown>[],
      };
    }

    // No-thinking schema: { actions } only
    if (!('currentState' in output) && 'actions' in output) {
      return {
        currentState: {
          evaluation: '',
          memory: '',
          nextGoal: '',
        },
        actions: (output.actions ?? []) as Record<string, unknown>[],
      };
    }

    // Standard schema passthrough
    return output as AgentDecision;
  }

  // ────────────────────────────────────────
  //  Planning System
  // ────────────────────────────────────────

  /**
   * Whether enough steps have passed since the last plan revision.
   * A non-positive `strategyInterval` falls back to every 5 steps.
   */
  private shouldUpdatePlan(step: number): boolean {
    if (!this.settings.enableStrategy) return false;
    const interval = this.settings.strategyInterval > 0
      ? this.settings.strategyInterval
      : 5;
    const lastPlan = this.state.lastPlanStep ?? 0;
    return step - lastPlan >= interval;
  }

  /**
   * Asks the model for a revised plan based on the last five history entries
   * and stores it in the agent state. Best effort: a failed call is logged
   * and the previous plan is kept.
   */
  private async updatePlan(step: number): Promise<void> {
    try {
      const recentHistory = this.historyList.entries
        .slice(-5)
        .map(
          (e) =>
            `Step ${e.step}: ${e.agentOutput.currentState?.evaluation ?? '(no eval)'}`,
        )
        .join('\n');

      const planPrompt =
        `Task: ${this.settings.task}\n\n` +
        `Current step: ${step}/${this.state.stepLimit}\n` +
        (this.state.currentPlan
          ? `Current plan:\n${this.state.currentPlan}\n\n`
          : '') +
        `Recent progress:\n${recentHistory}\n\n` +
        'Based on the current progress, provide an updated plan. ' +
        'Include what has been accomplished and what remains.';

      // Use ephemeral message so the plan prompt doesn't persist
      this.messageManager.addEphemeralMessage(planPrompt);

      const completion = await this.model.invoke({
        messages: this.messageManager.getMessages(),
        responseSchema: PlanRevisionSchema,
        schemaName: 'PlanRevision',
        temperature: 0.3,
      });

      this.state.currentPlan = completion.parsed.plan;
      this.state.lastPlanStep = step;

      logger.info(`Plan updated at step ${step}: ${completion.parsed.reasoning}`);
    } catch (error) {
      logger.warn(
        `Plan update failed at step ${step}: ${
          error instanceof Error ? error.message : String(error)
        }`,
      );
    }
  }

  // ────────────────────────────────────────
  //  System Prompt Management
  // ────────────────────────────────────────

  /**
   * (Re)build the system prompt. When `pageUrl` is provided, the registry
   * can filter action descriptions to show only domain-relevant actions.
   */
  private rebuildInstructionBuilder(pageUrl?: string): void {
    const systemPrompt = InstructionBuilder.fromSettings(
      this.settings,
      this.tools.registry,
      pageUrl,
    );
    this.messageManager.setInstructionBuilder(systemPrompt.build());
  }

  // ────────────────────────────────────────
  //  URL Extraction from Task Text
  // ────────────────────────────────────────

  /**
   * Navigates to the first URL mentioned in the task text, if any.
   * Navigation failures are logged and otherwise ignored.
   */
  private async autoNavigateFromTask(): Promise<void> {
    const [firstUrl] = extractUrls(this.settings.task);
    if (!firstUrl) return;

    logger.info(`Auto-navigating to URL found in task: ${firstUrl}`);

    try {
      await this.browser.navigate(firstUrl);
      // Give the page a moment to load
      await sleep(1000);
    } catch (error) {
      logger.warn(
        `Auto-navigation to ${firstUrl} failed: ${
          error instanceof Error ? error.message : String(error)
        }`,
      );
    }
  }

  // ────────────────────────────────────────
  //  Initial Actions
  // ────────────────────────────────────────

  /**
   * Runs the configured preflight commands in order before the main loop.
   * A failing command is logged and the remaining ones still run.
   */
  private async executeInitialActions(): Promise<void> {
    logger.info(
      `Executing ${this.settings.preflightCommands.length} initial action(s)`,
    );

    const context: ExecutionContext = {
      page: this.browser.currentPage,
      cdpSession: this.browser.cdp!,
      domService: this.domService,
      browserSession: this.browser,
      extractionLlm: this.extractionModel,
      fileSystem: this.fileSystem,
      maskedValues: this.settings.maskedValues,
    };

    for (const action of this.settings.preflightCommands) {
      try {
        await this.tools.executeAction(action, context);
        logger.debug(`Initial action ${action.action} completed`);
      } catch (error) {
        logger.warn(
          `Initial action ${action.action} failed: ${
            error instanceof Error ? error.message : String(error)
          }`,
        );
      }
    }

    // Short fixed pause after the preflight commands.
    await sleep(500);
  }

  // ────────────────────────────────────────
  //  Failure Recovery
  // ────────────────────────────────────────

  /**
   * On max failures, make one final LLM call to produce a diagnostic
   * summary. Returns a description of what went wrong, or undefined
   * if the recovery call itself fails.
   *
   * @param errors Error messages collected so far; only the last five are sent.
   */
  private async makeFailureRecoveryCall(
    errors: string[],
  ): Promise<string | undefined> {
    try {
      const errorSummary = errors.slice(-5).join('\n');

      const recoverySchema = z.object({
        diagnosis: z.string().describe('What went wrong'),
        suggestion: z.string().describe('What could be tried differently'),
      });

      const completion = await this.model.invoke({
        messages: [
          {
            role: 'system' as const,
            content:
              'You are a diagnostic assistant. Analyze the errors that occurred during ' +
              'a web browsing automation task and provide a brief diagnosis.',
          },
          {
            role: 'user' as const,
            content:
              `Task: ${this.settings.task}\n\n` +
              `Errors encountered:\n${errorSummary}\n\n` +
              'Provide a brief diagnosis of what went wrong and what could be tried differently.',
          },
        ],
        responseSchema: recoverySchema,
        schemaName: 'FailureRecovery',
        temperature: 0,
      });

      const result =
        `Task failed. Diagnosis: ${completion.parsed.diagnosis}. ` +
        `Suggestion: ${completion.parsed.suggestion}`;

      logger.info(`Failure recovery: ${result}`);
      return result;
    } catch (error) {
      // Best effort only: keep the reason in the log instead of discarding it.
      logger.debug(
        `Failure recovery call itself failed: ${
          error instanceof Error ? error.message : String(error)
        }`,
      );
      return undefined;
    }
  }

  // ────────────────────────────────────────
  //  Cost Tracking
  // ────────────────────────────────────────

  /**
   * Adds one step's token usage to the cumulative totals. Monetary totals are
   * only updated when `calculateStepCost` returns a value for this model.
   */
  private updateCostTracking(
    inputTokens: number,
    outputTokens: number,
    step: number,
  ): void {
    const stepCost = calculateStepCost(
      inputTokens,
      outputTokens,
      this.model.modelId,
    );

    this.state.cumulativeCost.totalInputTokens += inputTokens;
    this.state.cumulativeCost.totalOutputTokens += outputTokens;

    if (stepCost) {
      this.state.cumulativeCost.totalInputCost += stepCost.inputCost;
      this.state.cumulativeCost.totalOutputCost += stepCost.outputCost;
      this.state.cumulativeCost.totalCost += stepCost.totalCost;

      logger.debug(
        `Step ${step} cost: $${stepCost.totalCost.toFixed(4)} ` +
          `(cumulative: $${this.state.cumulativeCost.totalCost.toFixed(4)})`,
      );
    }
  }

  // ────────────────────────────────────────
  //  Sensitive Data Filtering
  // ────────────────────────────────────────

  /**
   * Returns copies of `results` in which every configured masked value found
   * in `extractedContent` is replaced by `<key>`.
   *
   * Results without extracted content are returned as-is, and the input array
   * is returned untouched when there is nothing to mask.
   */
  private filterSensitiveData(results: CommandResult[]): CommandResult[] {
    const maskedValues = this.settings.maskedValues;
    if (!maskedValues) return results;

    // Build the matchers once rather than per result.
    const maskers = Object.entries(maskedValues)
      // An empty value would compile to an empty pattern that matches at every
      // position and would splice the placeholder between all characters.
      .filter(([, value]) => typeof value === 'string' && value.length > 0)
      // Longest first, so a secret that contains another one is masked as a
      // whole instead of leaving its remainder behind.
      .sort(([, a], [, b]) => b.length - a.length)
      .map(([key, value]) => ({
        pattern: new RegExp(escapeRegExp(value), 'g'),
        placeholder: `<${key}>`,
      }));
    if (maskers.length === 0) return results;

    return results.map((r) => {
      if (!r.extractedContent) return r;

      let content = r.extractedContent;
      for (const { pattern, placeholder } of maskers) {
        // Function replacer: a `$` in the key is inserted literally instead of
        // being read as a replacement pattern such as `$&`.
        content = content.replace(pattern, () => placeholder);
      }
      return { ...r, extractedContent: content };
    });
  }

  // 
// ── Save Conversation ──

/**
 * Write the conversation to disk for the given step.
 *
 * Does nothing when `settings.conversationOutputPath` is not set. Every
 * `{step}` token in that path is replaced by the step number. Errors are
 * logged at debug level and never propagated.
 */
private async saveConversation(step: number): Promise<void> {
  const pathTemplate = this.settings.conversationOutputPath;
  if (!pathTemplate) return;

  try {
    const target = pathTemplate.replace(/\{step\}/g, step.toString());
    await this.messageManager.saveToFile(target);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.debug(`Failed to save conversation at step ${step}: ${reason}`);
  }
}

// ── Follow-up Tasks ──

/**
 * Queue a follow-up task. The task is only stored here; read the queue
 * back with getFollowUpTasks().
 */
addNewTask(task: string): void {
  this.followUpTasks.push(task);
  const preview = truncateText(task, 100);
  logger.info(`Follow-up task added: ${preview}`);
}

/** Return a copy of the queued follow-up tasks. */
getFollowUpTasks(): string[] {
  return [...this.followUpTasks];
}

// ── Control Methods ──

/** Set the paused flag on the agent state. */
pause(): void {
  this.state.isPaused = true;
}

/** Clear the paused flag on the agent state. */
resume(): void {
  this.state.isPaused = false;
}

/** Clear the running flag on the agent state. */
stop(): void {
  this.state.isRunning = false;
}

/** Return a shallow copy of the agent state (nested objects are shared). */
getState(): AgentState {
  const snapshot: AgentState = { ...this.state };
  return snapshot;
}

/** Return the live execution log (not a copy). */
getHistory(): ExecutionLog {
  return this.historyList;
}

/** Return a shallow copy of the cumulative cost counters. */
getAccumulatedCost(): AccumulatedCost {
  const totals: AccumulatedCost = { ...this.state.cumulativeCost };
  return totals;
}
// ── LLM Compaction Summary Schema ──

/** Structured output requested from the summarization model in compactWithLlm(). */
const CompactionSummarySchema = z.object({
  summary: z.string().describe('Concise summary of the conversation so far'),
});

// ── ConversationManager ──

/**
 * Holds the messages that are sent to the model, together with a parallel,
 * human-readable history of what was added at each step.
 *
 * Responsibilities visible in this class:
 *  - collecting state / assistant / action-result / user / ephemeral messages;
 *  - keeping the estimated token count under `options.contextWindowSize`
 *    (basic compaction in getMessages(), optional LLM compaction);
 *  - masking `options.maskedValues` in outgoing messages;
 *  - saving and restoring the conversation.
 */
export class ConversationManager {
  private messages: TrackedMessage[] = [];
  private systemPromptMessage: Message | null = null;
  private systemPromptText: string | null = null;
  private options: ConversationManagerOptions;
  private historyItems: ConversationEntry[] = [];
  private currentStep = 0;
  private lastCompactionStep = 0;

  constructor(options: ConversationManagerOptions) {
    this.options = options;
  }

  // ── System Prompt ──

  /** Set (or replace) the system prompt that getMessages() emits first. */
  setInstructionBuilder(prompt: string): void {
    this.systemPromptText = prompt;
    this.systemPromptMessage = systemMessage(prompt);
  }

  // ── Add Messages ──

  /**
   * Add a page-state message as a user message.
   *
   * @param stateText  Textual description of the page state.
   * @param screenshot Optional screenshot; attached as a PNG image part only
   *                   when `options.includeLastScreenshot` is enabled.
   * @param step       Step number; when given it also becomes the current step.
   */
  addStateMessage(
    stateText: string,
    screenshot?: string,
    step?: number,
  ): void {
    const content: ContentPart[] = [textContent(stateText)];
    if (screenshot && this.options.includeLastScreenshot) {
      content.push(imageContent(screenshot, 'image/png'));
    }

    if (step !== undefined) this.currentStep = step;

    this.messages.push({
      message: userMessage(content),
      isCompactable: true,
      tokenEstimate: estimateMessageTokens(content),
      step,
      category: 'state',
      addedAt: Date.now(),
    });

    this.recordConversationEntry(step ?? this.currentStep, 'state', stateText, !!screenshot);
  }

  /** Add the agent's own output as an assistant message. */
  addAssistantMessage(text: string, step?: number): void {
    if (step !== undefined) this.currentStep = step;

    this.messages.push({
      message: assistantMessage(text),
      isCompactable: true,
      tokenEstimate: estimateTokens(text),
      step,
      category: 'assistant',
      addedAt: Date.now(),
    });

    this.recordConversationEntry(step ?? this.currentStep, 'assistant', text);
  }

  /** Add the result of an executed command as a user message. */
  addCommandResultMessage(text: string, step?: number): void {
    if (step !== undefined) this.currentStep = step;

    this.messages.push({
      message: userMessage(text),
      isCompactable: true,
      tokenEstimate: estimateTokens(text),
      step,
      category: 'action_result',
      addedAt: Date.now(),
    });

    this.recordConversationEntry(step ?? this.currentStep, 'action_result', text);
  }

  /** Add a user message that basic compaction never replaces. */
  addUserMessage(text: string): void {
    this.messages.push({
      message: userMessage(text),
      isCompactable: false,
      tokenEstimate: estimateTokens(text),
      category: 'user',
      addedAt: Date.now(),
    });

    this.recordConversationEntry(this.currentStep, 'user', text);
  }

  /**
   * Add an ephemeral message that is included in the next getMessages() call
   * and then automatically removed. Useful for one-shot instructions or
   * temporary context that should not persist across steps.
   *
   * Ephemeral messages are not recorded in the history items.
   */
  addEphemeralMessage(text: string, role: 'user' | 'assistant' = 'user'): void {
    const msg = role === 'user' ? userMessage(text) : assistantMessage(text);
    this.messages.push({
      message: msg,
      isCompactable: false,
      tokenEstimate: estimateTokens(text),
      category: role === 'user' ? 'user' : 'assistant',
      ephemeral: true,
      ephemeralRead: false,
      addedAt: Date.now(),
    });
  }

  // ── Get Messages (with compaction + filtering) ──

  /**
   * Build the message list for the next model call.
   *
   * Order: system prompt (if set) followed by all tracked messages. Basic
   * compaction runs first when the token estimate exceeds the context window.
   * Ephemeral messages are part of the returned list and are removed from the
   * manager afterwards. When `options.maskedValues` is non-empty the returned
   * messages are redacted copies.
   */
  getMessages(): Message[] {
    const result: Message[] = [];

    if (this.systemPromptMessage) {
      result.push(this.systemPromptMessage);
    }

    // Check if we need to compact
    if (this.estimateTotalTokens() > this.options.contextWindowSize) {
      this.compact();
    }

    for (const managed of this.messages) {
      result.push(managed.message);
    }

    // Everything ephemeral has now been delivered once; drop it.
    this.consumeEphemeralMessages();

    // Apply sensitive data filtering
    const masked = this.options.maskedValues;
    if (masked && Object.keys(masked).length > 0) {
      return redactMessages(result, masked);
    }

    return result;
  }

  // ── Ephemeral Message Lifecycle ──

  /**
   * Remove the ephemeral messages that getMessages() has just delivered.
   *
   * Called only after the result list was built, so every ephemeral message
   * still stored here has been handed out exactly once. Keeping it any longer
   * would send it again with the following getMessages() call, contradicting
   * the "next call only" contract of addEphemeralMessage().
   */
  private consumeEphemeralMessages(): void {
    this.messages = this.messages.filter((m) => !m.ephemeral);
  }

  // ── Token Estimation ──

  /**
   * Estimated token count of the system prompt plus all tracked messages.
   * A system prompt whose content is not a plain string counts as zero.
   */
  estimateTotalTokens(): number {
    let total = 0;
    if (this.systemPromptMessage) {
      total += estimateTokens(
        typeof this.systemPromptMessage.content === 'string'
          ? this.systemPromptMessage.content
          : '',
      );
    }
    for (const managed of this.messages) {
      total += managed.tokenEstimate;
    }
    return total;
  }

  // ── Basic Compaction (image removal + old message replacement) ──

  /**
   * Shrink the conversation without calling a model.
   *
   * Pass 1 strips image parts from every compactable message except the most
   * recent one that has an image. Pass 2, while still over budget, replaces
   * compactable messages (oldest first) with a short "omitted" placeholder.
   */
  private compact(): void {
    // Remove screenshots from older messages (keep only last)
    let foundLast = false;
    for (let i = this.messages.length - 1; i >= 0; i--) {
      const msg = this.messages[i];
      if (!msg.isCompactable) continue;

      const content = msg.message.content;
      if (!Array.isArray(content)) continue;

      const isImagePart = (p: unknown): boolean =>
        typeof p === 'object' && p !== null && (p as ContentPart).type === 'image';

      if (!content.some(isImagePart)) continue;

      if (!foundLast) {
        // Walking backwards: the first image found is the newest one; keep it.
        foundLast = true;
        continue;
      }

      // Remove images from this message
      const filtered = content.filter((p) => !isImagePart(p));
      if (filtered.length > 0) {
        msg.message = userMessage(filtered as ContentPart[]);
        msg.tokenEstimate = estimateMessageTokens(filtered);
      }
    }

    // If still over budget, remove old compactable state messages
    while (
      this.estimateTotalTokens() > this.options.contextWindowSize &&
      this.messages.length > 4
    ) {
      // Find the first compactable message that is not already a placeholder.
      // Placeholders stay compactable, so without this exclusion the same
      // index would be selected on every iteration and the loop would never end.
      const idx = this.messages.findIndex(
        (m) => m.isCompactable && m.category !== 'compaction_summary',
      );
      if (idx === -1) break;

      // Replace with a summary
      const removed = this.messages[idx];
      const summary = `[Step ${removed.step ?? '?'} state omitted to save tokens]`;
      this.messages.splice(idx, 1, {
        message: userMessage(summary),
        isCompactable: true,
        tokenEstimate: estimateTokens(summary),
        step: removed.step,
        category: 'compaction_summary',
        addedAt: Date.now(),
      });
    }
  }

  // ── LLM-Based Compaction ──

  /**
   * Run LLM-based message compaction: send the older portion of the conversation
   * to a summarization model and replace it with a single summary message.
   *
   * Call this periodically (e.g. every N steps as configured in compaction.interval).
   * Returns true if compaction was performed, false if skipped.
   *
   * Skipped when compaction is not configured, no model is available, the
   * interval has not elapsed, the conversation is already at or below the
   * target size, there is nothing to summarize, or the model call fails.
   *
   * @param model Model to use; defaults to `options.compactionModel`.
   */
  async compactWithLlm(model?: LanguageModel): Promise<boolean> {
    const compactionConfig = this.options.compaction;
    if (!compactionConfig) return false;

    const llm = model ?? this.options.compactionModel;
    if (!llm) return false;

    // Only compact if enough steps have passed since last compaction
    if (
      compactionConfig.interval > 0 &&
      this.currentStep - this.lastCompactionStep < compactionConfig.interval
    ) {
      return false;
    }

    const targetTokens =
      compactionConfig.targetTokens ?? Math.floor(this.options.contextWindowSize * 0.6);

    // If we're under the target, no need to compact
    if (this.estimateTotalTokens() <= targetTokens) return false;

    // Split messages: keep the last few messages intact, summarize the rest
    const keepCount = Math.min(6, Math.floor(this.messages.length / 2));
    const toSummarize = this.messages.slice(0, this.messages.length - keepCount);
    const toKeep = this.messages.slice(this.messages.length - keepCount);

    if (toSummarize.length === 0) return false;

    // Build a transcript of the messages to summarize
    const transcript = toSummarize
      .map((m) => {
        const role = m.message.role;
        const text = extractTextContent(m.message);
        const stepLabel = m.step !== undefined ? ` (step ${m.step})` : '';
        return `[${role}${stepLabel}]: ${truncate(text, 500)}`;
      })
      .join('\n');

    const prompt = [
      systemMessage(
        'You are a conversation summarizer. Summarize the following agent-browser conversation transcript. ' +
          'Preserve key facts: URLs visited, actions taken, errors encountered, extracted data, and the current task state. ' +
          'Be concise but complete.',
      ),
      userMessage(
        `Summarize this conversation transcript:\n\n${transcript}`,
      ),
    ];

    try {
      const completion = await llm.invoke({
        messages: prompt,
        responseSchema: CompactionSummarySchema,
        schemaName: 'CompactionSummary',
        schemaDescription: 'A concise summary of the conversation so far',
        maxTokens: compactionConfig.maxTokens,
        temperature: 0,
      });

      const summaryText = `[Conversation summary of steps 1-${toSummarize[toSummarize.length - 1]?.step ?? '?'}]\n${completion.parsed.summary}`;

      // Replace the summarized messages with a single summary
      this.messages = [
        {
          message: userMessage(summaryText),
          isCompactable: false, // Don't re-compact the summary
          tokenEstimate: estimateTokens(summaryText),
          category: 'compaction_summary',
          addedAt: Date.now(),
        },
        ...toKeep,
      ];

      this.lastCompactionStep = this.currentStep;
      return true;
    } catch {
      // Deliberately silent: basic compaction in getMessages() remains the fallback.
      return false;
    }
  }

  /**
   * Check whether LLM compaction should run at the current step.
   * This is a convenience check; the caller can use it to decide whether
   * to call compactWithLlm().
   */
  shouldCompactWithLlm(): boolean {
    const config = this.options.compaction;
    if (!config || config.interval <= 0) return false;

    return (
      this.currentStep - this.lastCompactionStep >= config.interval &&
      this.estimateTotalTokens() > (config.targetTokens ?? this.options.contextWindowSize * 0.6)
    );
  }

  // ── History Items & Description ──

  /**
   * Append one entry to the structured history. The summary is the content
   * truncated to 120 characters; the stored content is capped at 2000.
   */
  private recordConversationEntry(
    step: number,
    category: MessageCategory,
    content: string,
    hasScreenshot?: boolean,
  ): void {
    this.historyItems.push({
      step,
      category,
      summary: truncate(content, 120),
      content: truncate(content, 2000),
      hasScreenshot,
      timestamp: Date.now(),
    });
  }

  /**
   * Build a human-readable description of the agent's history,
   * with "N steps omitted" truncation for long histories.
   *
   * @param stepLimitShown Maximum number of steps to show in full detail.
   *   If the history is longer, middle steps are replaced with a "N steps omitted" line.
   * @returns The description, or '(no history)' when nothing was recorded.
   */
  agentHistoryDescription(stepLimitShown = 10): string {
    // Group history items by step
    const byStep = new Map<number, ConversationEntry[]>();
    for (const item of this.historyItems) {
      const existing = byStep.get(item.step);
      if (existing) {
        existing.push(item);
      } else {
        byStep.set(item.step, [item]);
      }
    }

    const stepNumbers = [...byStep.keys()].sort((a, b) => a - b);
    if (stepNumbers.length === 0) return '(no history)';

    const describe = (stepNum: number): string =>
      this.formatStepDescription(stepNum, byStep.get(stepNum) ?? []);

    const lines: string[] = [];

    if (stepNumbers.length <= stepLimitShown) {
      // Show all steps
      for (const stepNum of stepNumbers) {
        lines.push(describe(stepNum));
      }
    } else {
      // Show first few, omitted middle, last few
      const headCount = Math.ceil(stepLimitShown / 2);
      const tailCount = stepLimitShown - headCount;

      const headSteps = stepNumbers.slice(0, headCount);
      const tailSteps = stepNumbers.slice(stepNumbers.length - tailCount);
      const omittedCount = stepNumbers.length - headCount - tailCount;

      for (const stepNum of headSteps) {
        lines.push(describe(stepNum));
      }
      lines.push(` ... (${omittedCount} steps omitted) ...`);
      for (const stepNum of tailSteps) {
        lines.push(describe(stepNum));
      }
    }

    return lines.join('\n');
  }

  /** Render one step as a "Step N:" header followed by one line per entry. */
  private formatStepDescription(step: number, items: ConversationEntry[]): string {
    const parts = items.map((item) => {
      const prefix =
        item.category === 'state'
          ? 'State'
          : item.category === 'assistant'
            ? 'Agent'
            : item.category === 'action_result'
              ? 'Result'
              : item.category === 'user'
                ? 'User'
                : item.category;
      return `${prefix}: ${item.summary}`;
    });
    return `Step ${step}:\n ${parts.join('\n ')}`;
  }

  /** Get all recorded history items. */
  getConversationEntrys(): readonly ConversationEntry[] {
    return this.historyItems;
  }

  // ── Save / Load (Conversation Persistence) ──

  /**
   * Serialize the current state to a persistence-friendly snapshot.
   *
   * Only the text parts of each message are kept (joined with newlines);
   * image parts such as screenshots are dropped to keep the size manageable.
   * The ephemeral flags and `addedAt` are not part of the snapshot.
   */
  save(): ConversationManagerState {
    const serialized: SerializedTrackedMessage[] = this.messages.map((m) => ({
      role: m.message.role,
      content: extractTextContent(m.message),
      isCompactable: m.isCompactable,
      tokenEstimate: m.tokenEstimate,
      step: m.step,
      category: m.category,
    }));

    return {
      systemPrompt: this.systemPromptText,
      messages: serialized,
      historyItems: [...this.historyItems],
      currentStep: this.currentStep,
    };
  }

  /**
   * Restore the ConversationManager from a previously saved state.
   * This replaces all current messages and history.
   *
   * Messages with role 'assistant' are restored as assistant messages; every
   * other role is restored as a user message.
   */
  load(state: ConversationManagerState): void {
    if (state.systemPrompt) {
      this.setInstructionBuilder(state.systemPrompt);
    } else {
      this.systemPromptMessage = null;
      this.systemPromptText = null;
    }

    this.messages = state.messages.map((s) => ({
      message: s.role === 'assistant'
        ? assistantMessage(s.content)
        : userMessage(s.content),
      isCompactable: s.isCompactable,
      tokenEstimate: s.tokenEstimate,
      step: s.step,
      category: s.category,
      addedAt: Date.now(),
    }));

    this.historyItems = [...state.historyItems];
    this.currentStep = state.currentStep;
    // The compaction marker belongs to the conversation being replaced; reset it
    // like clear() and resetMessages() do, so a stale value cannot delay
    // LLM compaction of the loaded conversation.
    this.lastCompactionStep = 0;
  }

  /**
   * Save the conversation state to a JSON file.
   * Missing parent directories are created.
   *
   * @returns The path that was written.
   */
  async saveToFile(filePath: string): Promise<string> {
    const { writeFile, mkdir } = await import('node:fs/promises');
    const { dirname } = await import('node:path');

    await mkdir(dirname(filePath), { recursive: true });
    const json = JSON.stringify(this.save(), null, 2);
    await writeFile(filePath, json, 'utf-8');
    return filePath;
  }

  /**
   * Load conversation state from a JSON file.
   *
   * @throws SyntaxError when the file is not valid JSON.
   * @throws TypeError when the JSON lacks the `messages` or `historyItems` arrays.
   */
  async loadFromFile(filePath: string): Promise<void> {
    const { readFile } = await import('node:fs/promises');
    const raw = await readFile(filePath, 'utf-8');
    const parsed: unknown = JSON.parse(raw);

    // Fail with a clear message before any existing state is overwritten.
    const candidate = parsed as Partial<ConversationManagerState> | null;
    if (
      typeof candidate !== 'object' ||
      candidate === null ||
      !Array.isArray(candidate.messages) ||
      !Array.isArray(candidate.historyItems)
    ) {
      throw new TypeError(`Invalid conversation state file: ${filePath}`);
    }

    this.load(candidate as ConversationManagerState);
  }

  // ── Accessors ──

  /** Number of tracked messages, plus one when a system prompt is set. */
  get messageCount(): number {
    return this.messages.length + (this.systemPromptMessage ? 1 : 0);
  }

  /** The most recent step number passed to one of the add*Message methods. */
  get step(): number {
    return this.currentStep;
  }

  /** Drop all messages and history and reset the step counters. The system prompt is kept. */
  clear(): void {
    this.messages = [];
    this.historyItems = [];
    this.currentStep = 0;
    this.lastCompactionStep = 0;
  }

  /**
   * Remove all messages but preserve history items and step counter.
   * Useful when restarting message context without losing the history summary.
   */
  resetMessages(): void {
    this.messages = [];
    this.lastCompactionStep = 0;
  }
}
// ── History Item ──

/**
 * A structured entry in the agent's conversation history, richer than TrackedMessage.
 * Used for building human-readable summaries and for save/load.
 *
 * ConversationManager appends one entry for every state, assistant,
 * action-result and user message it is given; ephemeral messages are not recorded.
 */
export interface ConversationEntry {
  /** Step number this item belongs to. */
  step: number;
  /** Category of this history item. */
  category: MessageCategory;
  /**
   * Brief human-readable summary of this item. ConversationManager fills it
   * with the message text truncated to 120 characters.
   */
  summary: string;
  /** The text content; ConversationManager truncates it to 2000 characters. */
  content?: string;
  /**
   * Whether a screenshot was supplied with this item. ConversationManager sets
   * it from the screenshot argument, independent of `includeLastScreenshot`.
   */
  hasScreenshot?: boolean;
  /** Creation time in milliseconds since the epoch (Date.now()). */
  timestamp: number;
}

// ── Message Manager State (persistence) ──

/**
 * Serializable snapshot of the ConversationManager for save/load.
 */
export interface ConversationManagerState {
  /** System prompt text, or null when none was set. */
  systemPrompt: string | null;
  /** Tracked messages in conversation order, reduced to their text. */
  messages: SerializedTrackedMessage[];
  /** Copy of the structured history entries. */
  historyItems: ConversationEntry[];
  /** Step count at the time of snapshot. */
  currentStep: number;
}

/**
 * Serializable form of TrackedMessage. Only the text parts of the message
 * content are kept; image parts (e.g. base64 screenshots) are dropped during
 * serialization. The ephemeral flags and `addedAt` are not serialized.
 */
export interface SerializedTrackedMessage {
  /** Message role. On load, 'assistant' is restored as an assistant message and any other value as a user message. */
  role: string;
  /** Text content; multiple text parts are joined with newlines. */
  content: string;
  /** Whether basic compaction may replace this message. */
  isCompactable: boolean;
  /** Token estimate recorded when the message was tracked. */
  tokenEstimate: number;
  /** Step the message belongs to, when known. */
  step?: number;
  /** Semantic category of the message. */
  category?: MessageCategory;
}
/**
 * Rough token estimation: ~4 characters per token.
 */
export function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

/**
 * Estimate the token cost of message content.
 *
 * A string is estimated directly. For an array, text parts are estimated from
 * their text and every image part counts as a flat 1000 tokens; parts of any
 * other shape are ignored.
 */
export function estimateMessageTokens(content: string | unknown[]): number {
  if (typeof content === 'string') {
    return estimateTokens(content);
  }

  let total = 0;
  for (const part of content) {
    if (typeof part !== 'object' || part === null) continue;

    const p = part as Record<string, unknown>;
    if (p.type === 'text' && typeof p.text === 'string') {
      total += estimateTokens(p.text);
    } else if (p.type === 'image') {
      total += 1000; // Approximate cost for an image
    }
  }
  return total;
}

// ── Sensitive Data Filtering ──

const MASK = '***';

/**
 * Replace all occurrences of each sensitive value in `text` with a mask.
 * Keys are used only for logging context; values are the secrets to redact.
 *
 * Empty values are ignored. All secrets are matched in a single pass with the
 * longest secret tried first, so a secret that is a prefix of another one
 * cannot leave the rest of the longer secret readable.
 */
export function redactSensitiveValues(
  text: string,
  maskedValues: Record<string, string>,
): string {
  const secrets = Object.values(maskedValues)
    .filter((value) => typeof value === 'string' && value.length > 0)
    .sort((a, b) => b.length - a.length);
  if (secrets.length === 0) return text;

  // Escape regex special characters so every secret is matched literally.
  const alternatives = secrets.map((value) =>
    value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
  );
  return text.replace(new RegExp(alternatives.join('|'), 'g'), MASK);
}

/**
 * Deep-filter a Message, masking any sensitive values found in text content.
 * Returns a new message (does not mutate the original).
 *
 * The message itself is returned when there is nothing to mask or when its
 * content is neither a string nor an array.
 */
export function redactMessage(
  message: Message,
  maskedValues: Record<string, string>,
): Message {
  if (Object.keys(maskedValues).length === 0) return message;

  const content = message.content;

  if (typeof content === 'string') {
    return {
      ...message,
      content: redactSensitiveValues(content, maskedValues),
    } as Message;
  }

  if (Array.isArray(content)) {
    const filtered = (content as ContentPart[]).map((part) => {
      if (part.type === 'text') {
        return {
          ...part,
          text: redactSensitiveValues(part.text, maskedValues),
        };
      }
      // Images are left as-is (binary data)
      return part;
    });
    return {
      ...message,
      content: filtered,
    } as Message;
  }

  return message;
}

/**
 * Filter an array of Messages, masking sensitive data in each.
 * The input array is returned unchanged when there is nothing to mask.
 */
export function redactMessages(
  messages: Message[],
  maskedValues: Record<string, string>,
): Message[] {
  if (Object.keys(maskedValues).length === 0) return messages;
  return messages.map((m) => redactMessage(m, maskedValues));
}

/**
 * Extract the text content from a Message as a plain string.
 * For multi-part content, concatenates all text parts (separated by newlines);
 * non-text parts are skipped.
 */
export function extractTextContent(message: Message): string {
  const content = message.content;
  if (typeof content === 'string') return content;
  if (Array.isArray(content)) {
    return (content as ContentPart[])
      .filter((p): p is Extract<ContentPart, { type: 'text' }> => p.type === 'text')
      .map((p) => p.text)
      .join('\n');
  }
  return '';
}

/**
 * Truncate a string to maxLen characters, appending an ellipsis if truncated.
 *
 * The result never exceeds `maxLen` characters. When `maxLen` is smaller than
 * the ellipsis itself (below 3) the text is cut hard without an ellipsis, and
 * a `maxLen` of zero or less yields an empty string.
 */
export function truncate(text: string, maxLen: number): string {
  if (text.length <= maxLen) return text;
  // With maxLen < 3 the slice end below would be negative and count from the
  // end of the string, producing output longer than maxLen.
  if (maxLen < 3) return text.slice(0, Math.max(0, maxLen));
  return `${text.slice(0, maxLen - 3)}...`;
}
=> { mm = createManager(); }); describe('system prompt', () => { test('setInstructionBuilder stores the system prompt', () => { mm.setInstructionBuilder('You are a helpful assistant'); const messages = mm.getMessages(); expect(messages[0]).toEqual({ role: 'system', content: 'You are a helpful assistant', }); }); test('system prompt appears first in getMessages', () => { mm.setInstructionBuilder('System'); mm.addStateMessage('State text', undefined, 1); const messages = mm.getMessages(); expect(messages[0].role).toBe('system'); expect(messages[1].role).toBe('user'); }); test('changing system prompt replaces the previous one', () => { mm.setInstructionBuilder('First'); mm.setInstructionBuilder('Second'); const messages = mm.getMessages(); const systemMessages = messages.filter((m) => m.role === 'system'); expect(systemMessages).toHaveLength(1); expect(systemMessages[0].content).toBe('Second'); }); }); describe('addStateMessage', () => { test('adds a user message with state text', () => { mm.addStateMessage('Page state info', undefined, 1); const messages = mm.getMessages(); expect(messages).toHaveLength(1); expect(messages[0].role).toBe('user'); }); test('includes screenshot when provided and vision enabled', () => { mm.addStateMessage('State', 'base64screenshot', 1); const messages = mm.getMessages(); const content = messages[0].content; expect(Array.isArray(content)).toBe(true); if (Array.isArray(content)) { expect(content).toHaveLength(2); expect(content[0]).toEqual({ type: 'text', text: 'State' }); expect(content[1]).toHaveProperty('type', 'image'); } }); test('excludes screenshot when vision disabled', () => { const noVision = createManager({ includeLastScreenshot: false }); noVision.addStateMessage('State', 'base64screenshot', 1); const messages = noVision.getMessages(); const content = messages[0].content; // Content should be text-only array expect(Array.isArray(content)).toBe(true); if (Array.isArray(content)) { expect(content).toHaveLength(1); 
describe('compaction - screenshot removal', () => {
  /**
   * Reports whether a message's content is a multi-part array that still
   * carries an image part. String content (or anything that is not an array)
   * can never hold an image, so it yields `false`.
   */
  const hasImagePart = (content: unknown): boolean =>
    Array.isArray(content) &&
    content.some(
      (part: unknown) =>
        typeof part === 'object' &&
        part !== null &&
        (part as { type?: unknown }).type === 'image',
    );

  test('removes old screenshots when over token budget, keeps last', () => {
    // 3 screenshots: each ~1000 tokens for image + ~2 for text = ~3006 total.
    // Budget of 1500: after removing 2 old screenshots (saving 2000),
    // total becomes ~1006 < 1500, so compact exits successfully.
    const small = createManager({ contextWindowSize: 1500 });
    small.addStateMessage('State 1', 'screenshot1', 1);
    small.addStateMessage('State 2', 'screenshot2', 2);
    small.addStateMessage('State 3', 'screenshot3', 3);

    const messages = small.getMessages();

    // The most recent state message must keep its screenshot.
    const latest = messages[messages.length - 1];
    expect(hasImagePart(latest.content)).toBe(true);

    // Every older message must have lost its screenshot. These assertions run
    // unconditionally so the test cannot pass by skipping them.
    for (const older of messages.slice(0, -1)) {
      expect(hasImagePart(older.content)).toBe(false);
    }
  });
});
expect(total).toBeGreaterThanOrEqual(250); expect(total).toBeLessThanOrEqual(400); }); }); describe('token estimation', () => { test('estimateTotalTokens includes system prompt', () => { mm.setInstructionBuilder('System prompt text'); const tokensWithSystem = mm.estimateTotalTokens(); expect(tokensWithSystem).toBeGreaterThan(0); }); test('estimateTotalTokens grows with messages', () => { const before = mm.estimateTotalTokens(); mm.addStateMessage('Some state text', undefined, 1); const after = mm.estimateTotalTokens(); expect(after).toBeGreaterThan(before); }); test('estimateTotalTokens counts images as ~1000 tokens', () => { mm.addStateMessage('Text', 'screenshot', 1); const tokens = mm.estimateTotalTokens(); // Text ~4 chars = 1 token, plus ~1000 for image expect(tokens).toBeGreaterThanOrEqual(1000); }); }); describe('history items', () => { test('records history for each added message', () => { mm.addStateMessage('State text', undefined, 1); mm.addAssistantMessage('Agent response', 1); mm.addCommandResultMessage('Result text', 1); const items = mm.getConversationEntrys(); expect(items).toHaveLength(3); expect(items[0].category).toBe('state'); expect(items[1].category).toBe('assistant'); expect(items[2].category).toBe('action_result'); }); test('history items include step number', () => { mm.addStateMessage('State', undefined, 5); const items = mm.getConversationEntrys(); expect(items[0].step).toBe(5); }); test('history items include truncated summary', () => { const longText = 'a'.repeat(200); mm.addStateMessage(longText, undefined, 1); const items = mm.getConversationEntrys(); // Summary should be truncated to 120 chars expect(items[0].summary.length).toBeLessThanOrEqual(123); // 120 + '...' 
describe('ephemeral messages', () => {
  /** True when any message's content is exactly the given string. */
  const containsText = (
    messages: ReadonlyArray<{ content: unknown }>,
    text: string,
  ): boolean => messages.some((m) => m.content === text);

  test('ephemeral message appears in first getMessages call', () => {
    mm.addEphemeralMessage('Temporary instruction');

    expect(containsText(mm.getMessages(), 'Temporary instruction')).toBe(true);
  });

  test('ephemeral message is removed after being consumed', () => {
    mm.addEphemeralMessage('Temp');

    // First call: message is present and gets marked as read
    expect(containsText(mm.getMessages(), 'Temp')).toBe(true);

    // Second call: message is still in result (removal happens after building result),
    // then gets removed during consumeEphemeralMessages. The return value is
    // intentionally discarded; only the side effect matters here.
    mm.getMessages();

    // Third call: message is now actually gone from this.messages
    expect(containsText(mm.getMessages(), 'Temp')).toBe(false);
  });

  test('ephemeral message with assistant role', () => {
    mm.addEphemeralMessage('Agent thought', 'assistant');

    const found = mm
      .getMessages()
      .find((m) => m.role === 'assistant' && m.content === 'Agent thought');
    expect(found).toBeDefined();
  });

  test('multiple ephemeral messages all appear then get cleaned up', () => {
    mm.addEphemeralMessage('Temp 1');
    mm.addEphemeralMessage('Temp 2');

    // First call: both present, marked as read
    expect(mm.getMessages()).toHaveLength(2);

    // Second call: still in result (removal after build), then removed
    mm.getMessages();

    // Third call: messages have been removed
    expect(mm.getMessages()).toHaveLength(0);
  });
});
describe('sensitive data filtering', () => {
  /**
   * Flattens message content to plain text regardless of its shape:
   * a string is returned as-is, and a multi-part array contributes the `text`
   * of each part that has one. Any other shape yields an empty string.
   * Using this keeps every assertion below unconditional.
   */
  const messageText = (content: unknown): string => {
    if (typeof content === 'string') return content;
    if (!Array.isArray(content)) return '';
    return content
      .map((part: unknown) => {
        if (typeof part !== 'object' || part === null) return '';
        const text = (part as { text?: unknown }).text;
        return typeof text === 'string' ? text : '';
      })
      .join('\n');
  };

  test('masks sensitive values in outgoing messages', () => {
    const sensitive = createManager({
      maskedValues: { password: 'secret123', apiKey: 'key-abc' },
    });

    sensitive.addStateMessage('Login with password secret123', undefined, 1);
    sensitive.addAssistantMessage('Using key-abc to authenticate', 1);

    const messages = sensitive.getMessages();

    // Text should have been masked. Asserting on the mask marker as well
    // guards against passing only because no text was found.
    const stateText = messageText(messages[0].content);
    expect(stateText).not.toContain('secret123');
    expect(stateText).toContain('***');

    const assistantText = messageText(messages[1].content);
    expect(assistantText).not.toContain('key-abc');
    expect(assistantText).toContain('***');
  });

  test('no filtering when maskedValues is empty', () => {
    const noSensitive = createManager({ maskedValues: {} });
    noSensitive.addStateMessage('Plain text with secret123', undefined, 1);

    const messages = noSensitive.getMessages();
    expect(messageText(messages[0].content)).toContain('secret123');
  });

  test('no filtering when maskedValues is not set', () => {
    mm.addStateMessage('Text with sensitive data', undefined, 1);

    const messages = mm.getMessages();
    expect(messageText(messages[0].content)).toContain('sensitive data');
  });
});
undefined, i); } const result = await withCompaction.compactWithLlm(); expect(result).toBe(false); }); test('compactWithLlm performs compaction with model', async () => { const model = createMockModel('Summarized: visited pages and clicked buttons'); // Use large contextWindowSize so getMessages() doesn't trigger basic compact(), // but low targetTokens so the LLM compaction decides to run. const longText = 'A'.repeat(500); const withCompaction = createManager({ contextWindowSize: 100000, includeLastScreenshot: false, compaction: { interval: 1, maxTokens: 500, targetTokens: 500 }, }); // Add lots of messages to exceed targetTokens (500). // Each 500-char message = ~125 tokens. 10 messages = ~1250 tokens > 500. for (let i = 1; i <= 10; i++) { withCompaction.addStateMessage(`${longText} step ${i}`, undefined, i); withCompaction.addAssistantMessage(`${longText} response ${i}`, i); } const result = await withCompaction.compactWithLlm(model); expect(result).toBe(true); // After compaction, message count should be reduced const messages = withCompaction.getMessages(); expect(messages.length).toBeLessThan(20); // First message should be the summary const firstContent = messages[0].content; expect(typeof firstContent).toBe('string'); expect(firstContent as string).toContain('Conversation summary'); }); }); describe('clear and resetMessages', () => { test('clear removes all messages and history', () => { mm.setInstructionBuilder('System'); mm.addStateMessage('State', undefined, 1); mm.addAssistantMessage('Response', 1); mm.clear(); expect(mm.messageCount).toBe(1); // system prompt still present via setInstructionBuilder expect(mm.getConversationEntrys()).toHaveLength(0); expect(mm.step).toBe(0); }); test('resetMessages removes messages but preserves history', () => { mm.addStateMessage('State', undefined, 1); mm.addAssistantMessage('Response', 1); const historyBefore = mm.getConversationEntrys().length; mm.resetMessages(); // Messages cleared const messages = 
mm.getMessages(); expect(messages).toHaveLength(0); // History preserved expect(mm.getConversationEntrys()).toHaveLength(historyBefore); }); }); describe('messageCount', () => { test('includes system prompt in count', () => { mm.setInstructionBuilder('System'); expect(mm.messageCount).toBe(1); mm.addStateMessage('State', undefined, 1); expect(mm.messageCount).toBe(2); }); test('does not count system prompt when not set', () => { expect(mm.messageCount).toBe(0); mm.addStateMessage('State', undefined, 1); expect(mm.messageCount).toBe(1); }); }); describe('step tracking', () => { test('step reflects the most recent step from added messages', () => { mm.addStateMessage('State 1', undefined, 1); expect(mm.step).toBe(1); mm.addStateMessage('State 5', undefined, 5); expect(mm.step).toBe(5); }); }); }); ================================================ FILE: packages/core/src/agent/evaluator.ts ================================================ import type { LanguageModel } from '../model/interface.js'; import type { Message, ContentPart } from '../model/messages.js'; import { systemMessage, userMessage, imageContent, textContent } from '../model/messages.js'; import { EvaluationResultSchema, QuickCheckResultSchema, type EvaluationResult, type QuickCheckResult, type StepRecord, } from './types.js'; import { createLogger } from '../logging.js'; const logger = createLogger('judge'); // ── Judge System Prompts ── const JUDGE_SYSTEM_PROMPT = `You are an expert task completion judge. Your job is to evaluate whether a web browser automation agent completed its assigned task successfully. You will be provided with: 1. The task description 2. A history of steps the agent took (including actions and their results) 3. Screenshots from during execution (if available) 4. Optionally, ground truth information about the expected result Evaluate thoroughly: - Did the agent actually complete the task, or just claim to? - Is the extracted information correct and complete? 
/** Max characters quoted from a step's self-evaluation or from final "done" content. */
const LONG_EXCERPT_CHARS = 200;
/** Max characters quoted from an action error or an intermediate extraction. */
const SHORT_EXCERPT_CHARS = 150;
/** How many of the most recent screenshots are attached to the judge prompt. */
const MAX_JUDGE_SCREENSHOTS = 3;

/** Render a caught value as text; anything that is not an `Error` is stringified. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Asks a language model to judge whether an agent run accomplished its task.
 *
 * Both entry points catch model failures and return a fallback verdict
 * instead of throwing, so callers never need their own try/catch.
 */
export class ResultEvaluator {
  private readonly model: LanguageModel;

  /** @param model Model used for every judge invocation. */
  constructor(model: LanguageModel) {
    this.model = model;
  }

  /**
   * Full evaluation with step history, screenshots, and optional ground truth.
   * Provides detailed verdict with failure analysis.
   *
   * @param task Original task description given to the agent.
   * @param result Final result text reported by the agent.
   * @param history Recorded steps of the run.
   * @param options `expectedOutcome` adds a ground-truth section;
   *   `includeScreenshots` attaches the most recent screenshots.
   * @returns The parsed model verdict, or an incomplete / zero-confidence /
   *   `'unknown'` verdict describing the error when the invocation fails.
   */
  async evaluate(
    task: string,
    result: string,
    history: StepRecord[],
    options?: {
      expectedOutcome?: string;
      includeScreenshots?: boolean;
    },
  ): Promise<EvaluationResult> {
    const messages = constructEvaluatorMessages(task, result, history, options);

    try {
      const completion = await this.model.invoke({
        messages,
        responseSchema: EvaluationResultSchema,
        schemaName: 'EvaluationResult',
        temperature: 0,
      });

      logger.info(
        `Judge verdict: complete=${completion.parsed.isComplete}, ` +
          `confidence=${completion.parsed.confidence}, ` +
          `verdict=${completion.parsed.verdict ?? 'n/a'}`,
      );

      return completion.parsed;
    } catch (error) {
      logger.error('Judge evaluation failed', error);
      return {
        isComplete: false,
        reason: `Judge evaluation failed: ${describeError(error)}`,
        confidence: 0,
        verdict: 'unknown',
      };
    }
  }

  /**
   * Lightweight always-on validation.
   * Quick pass/fail check without detailed history analysis.
   * Useful for running after every "done" action to catch obvious errors.
   *
   * @param task Original task description given to the agent.
   * @param result Final result text reported by the agent.
   * @returns The parsed quick-check result. On invocation failure this
   *   deliberately reports `passed: true` so a broken judge never blocks a run.
   */
  async simpleEvaluate(
    task: string,
    result: string,
  ): Promise<QuickCheckResult> {
    const messages = constructQuickCheckMessages(task, result);

    try {
      const completion = await this.model.invoke({
        messages,
        responseSchema: QuickCheckResultSchema,
        schemaName: 'QuickCheckResult',
        temperature: 0,
      });

      logger.debug(
        `Simple judge: passed=${completion.parsed.passed}, reason=${completion.parsed.reason}`,
      );

      return completion.parsed;
    } catch (error) {
      logger.error('Simple judge evaluation failed', error);
      return {
        passed: true, // Default to pass on error to avoid blocking
        reason: `Simple judge failed: ${describeError(error)}`,
        shouldRetry: false,
      };
    }
  }
}

// ── Message Construction ──

/** Summarise one recorded step (URL, self-evaluation, action names, outcomes) as text. */
function summarizeStep(entry: StepRecord): string {
  const actions = entry.agentOutput.actions
    .map((a) => {
      const actionObj = a as Record<string, unknown>;
      return String(actionObj.action ?? 'unknown');
    })
    .join(', ');

  const results = entry.actionResults
    .map((r) => {
      if (r.isDone) return `DONE: ${r.extractedContent?.slice(0, LONG_EXCERPT_CHARS) ?? ''}`;
      if (r.error) return `ERROR: ${r.error.slice(0, SHORT_EXCERPT_CHARS)}`;
      if (r.extractedContent) return `OK: ${r.extractedContent.slice(0, SHORT_EXCERPT_CHARS)}`;
      return r.success ? 'OK' : 'FAILED';
    })
    .join('; ');

  const evaluation = entry.agentOutput.currentState?.evaluation ?? '';

  return (
    `Step ${entry.step} [${entry.browserState.url}]:\n` +
    ` Eval: ${evaluation.slice(0, LONG_EXCERPT_CHARS)}\n` +
    ` Actions: ${actions}\n` +
    ` Results: ${results}`
  );
}

/**
 * Build the full message array for detailed judge evaluation.
 * Includes step-by-step history, screenshots (if enabled), and ground truth.
 *
 * @param task Original task description given to the agent.
 * @param result Final result text reported by the agent.
 * @param history Recorded steps; the history section is omitted when empty.
 * @param options `expectedOutcome` adds a ground-truth section;
 *   `includeScreenshots` switches the user message to multi-part content
 *   when at least one step carries a screenshot.
 * @returns The judge system message followed by a single user message.
 */
export function constructEvaluatorMessages(
  task: string,
  result: string,
  history: StepRecord[],
  options?: {
    expectedOutcome?: string;
    includeScreenshots?: boolean;
  },
): Message[] {
  const messages: Message[] = [systemMessage(JUDGE_SYSTEM_PROMPT)];

  // Build the evaluation prompt
  const parts: string[] = [
    `## Task\n${task}`,
    `## Agent's Final Result\n${result}`,
  ];

  // Step history summary
  if (history.length > 0) {
    const stepSummaries = history.map(summarizeStep);
    parts.push(`## Step History (${history.length} steps)\n${stepSummaries.join('\n\n')}`);
  }

  // Ground truth
  if (options?.expectedOutcome) {
    parts.push(
      `## Ground Truth (Expected Result)\n${options.expectedOutcome}\n\n` +
        'Compare the agent\'s result against this ground truth carefully.',
    );
  }

  parts.push(
    '## Instructions\n' +
      'Evaluate the task completion. Provide:\n' +
      '- isComplete: whether the task was fully completed\n' +
      '- reason: detailed explanation\n' +
      '- confidence: 0-1 score\n' +
      '- verdict: "success", "partial", "failed", or "unknown"\n' +
      '- failureReason: if failed, explain why\n' +
      '- impossibleTask: true if the task appears impossible\n' +
      '- reachedCaptcha: true if a CAPTCHA blocked progress',
  );

  // If screenshots are requested and available, include the last few
  if (options?.includeScreenshots) {
    const screenshotEntries = history
      .filter((e) => e.browserState.screenshot)
      .slice(-MAX_JUDGE_SCREENSHOTS);

    if (screenshotEntries.length > 0) {
      const content: ContentPart[] = [
        textContent(`${parts.join('\n\n')}\n\nBelow are screenshots from the agent's execution:`),
      ];

      for (const entry of screenshotEntries) {
        // Re-checked so the compiler narrows `screenshot` to a defined value.
        if (entry.browserState.screenshot) {
          content.push(
            textContent(`Screenshot from step ${entry.step} (${entry.browserState.url}):`),
          );
          content.push(imageContent(entry.browserState.screenshot));
        }
      }

      messages.push(userMessage(content));
      return messages;
    }
  }

  messages.push(userMessage(parts.join('\n\n')));
  return messages;
}
/**
 * Assemble the two-message prompt for the lightweight quick check.
 * The prompt carries only the task and the agent's final result; step
 * history and screenshots are intentionally left out.
 *
 * @param task Original task description given to the agent.
 * @param result Final result text reported by the agent.
 * @returns The quick-check system message followed by one user message.
 */
export function constructQuickCheckMessages(
  task: string,
  result: string,
): Message[] {
  const question =
    'Does this result correctly complete the task? ' +
    'If not, should the agent retry with a different approach?';

  // Sections are separated by a blank line, exactly as the judge expects.
  const sections = [`Task: ${task}`, `Agent's Result: ${result}`, question];

  const system = systemMessage(SIMPLE_JUDGE_SYSTEM_PROMPT);
  const user = userMessage(sections.join('\n\n'));
  return [system, user];
}
ConversationManagerOptions, TrackedMessage, ConversationManagerState, ConversationEntry, SerializedTrackedMessage, MessageCategory, } from './conversation/types.js'; export { estimateTokens, estimateMessageTokens, redactSensitiveValues, redactMessage, redactMessages, extractTextContent, truncate, } from './conversation/utils.js'; ================================================ FILE: packages/core/src/agent/instructions/instructions-compact.md ================================================ You are an AI agent that controls a web browser to complete tasks. You operate in an iterative loop: observe, decide, act, repeat. Your task: {{task}} Default: English. Match the task's language. Elements: `[index]text`. Only `[indexed]` elements are interactive. Indentation = child. `*[` = new element. - Only interact with elements that have a numeric [index] - If research is needed, open a **new tab** instead of reusing the current one - If the page changes after an input action, analyze new elements (e.g., suggestions) before proceeding - If an action sequence was interrupted, complete remaining actions in the next step - For autocomplete fields: type text, WAIT for suggestions, click the correct one or press Enter - Handle popups/modals/cookie banners immediately before other actions - If blocked by captcha/login/403, try alternative approaches rather than retrying - ALWAYS look for filter/sort options FIRST when the task specifies criteria - Detect unproductive loops: if same URL for 3+ steps without progress, change approach Maximum {{maxActionsPerStep}} actions per step. If the page changes after an action, remaining actions are skipped. Check browser state each step to verify your previous action succeeded. When chaining actions, never take consequential actions (form submissions, critical button clicks) without confirming changes occurred. {{actionDescriptions}} Combine actions when sensible. Do not predict actions that do not apply to the current page. 
**Recommended combinations:** - `input_text` + `click` -> Fill field and submit - `input_text` + `input_text` -> Fill multiple fields - `click` + `click` -> Multi-step flows (when page does not navigate between clicks) Do not chain actions that change browser state multiple times (e.g., click then navigate). Always have one clear goal per step. Respond with valid JSON: ```json { "currentState": { "evaluation": "One-sentence analysis of last action. State success, failure, or uncertain.", "memory": "1-3 sentences: progress tracking, data found, approaches tried.", "nextGoal": "Next immediate goal in one clear sentence." }, "actions": [{"action_name": {"param": "value"}}] } ``` Action list should NEVER be empty. Call `done` when: - Task is fully completed - Reached max steps (even if incomplete) - Absolutely impossible to continue Set `success=true` ONLY if the full task is completed. Put ALL findings in the `text` field. Before calling done with success=true: re-read the task, verify every requirement is met, confirm actions completed via page state, ensure no data was fabricated. 1. Verify state using screenshot as ground truth 2. Handle blocking popups/overlays first 3. If element not found, scroll to reveal more content 4. If action fails 2-3 times, try alternative approach 5. If blocked by login/captcha/403, try alternative sites 6. If stuck in a loop, acknowledge and change strategy ================================================ FILE: packages/core/src/agent/instructions/instructions-direct.md ================================================ You are an AI agent that controls a web browser to complete tasks. You operate in an iterative loop: observe the current page state, decide on actions, execute them, and repeat until the task is done. Your task: {{task}} You excel at: 1. Navigating complex websites and extracting precise information 2. Automating form submissions and interactive web actions 3. Gathering and organizing information across multiple pages 4. 
Operating effectively in an iterative agent loop 5. Adapting strategies when encountering obstacles - Default working language: **English** - Always respond in the same language as the task description At every step, your input will consist of: 1. **Agent history**: A chronological event stream including your previous actions and their results. 2. **Browser state**: Current URL, open tabs, interactive elements indexed for actions, and visible page content. 3. **Screenshot** (when vision is enabled): A screenshot of the current page with bounding boxes around interactive elements. Browser state is given as: - **Current URL**: The URL of the page you are currently viewing. - **Open Tabs**: Open tabs with their IDs. - **Interactive Elements**: All interactive elements in the format `[index]<type>text</type>` where: - `index`: Numeric identifier for interaction - `type`: HTML element type (button, input, etc.) - `text`: Element description Important notes: - Only elements with numeric indexes in `[]` are interactive - Indentation (with tab) means the element is a child of the element above - Elements tagged with `*[` are **new** interactive elements that appeared since the last step - Pure text elements without `[]` are not interactive If vision is enabled, you will receive a screenshot of the current page with bounding boxes around interactive elements.
- This is your **ground truth**: use it to evaluate your progress - If an interactive element has no text in browser_state, its index is at the top center of its bounding box Strictly follow these rules while using the browser: - Only interact with elements that have a numeric `[index]` - Only use indexes that are explicitly provided - If research is needed, open a **new tab** instead of reusing the current one - If the page changes after an action, analyze new elements before proceeding - By default, only elements in the visible viewport are listed - If the page is not fully loaded, use the wait action - Use extract_content only if information is NOT visible in browser_state - extract_content is expensive - do NOT call it multiple times on the same page - If you fill an input field and your action sequence is interrupted, something changed (e.g., suggestions appeared) - Complete any remaining actions from interrupted sequences in the next step - For autocomplete fields: type text, WAIT for suggestions, click the correct one or press Enter - If the task specifies criteria (price, rating, location, etc.), look for filter/sort options FIRST - Handle popups, modals, cookie banners immediately before other actions - If blocked by captcha/login/403, try alternative approaches - Detect loops: if same URL for 3+ steps without progress, change approach - Do not log in unless the task requires it and you have credentials ## Output Format Respond with: 1. **currentState**: Your assessment including: - `evaluation`: Assessment of how the last action went - `memory`: Important information to remember - `nextGoal`: The next immediate goal 2. **actions**: A list of actions to execute (max {{maxActionsPerStep}} per step) Maximum {{maxActionsPerStep}} actions per step, executed sequentially. - If the page changes after an action, remaining actions are skipped and you get the new state. - Check browser state each step to verify your previous action achieved its goal. 
- When chaining actions, never take consequential actions without confirming changes occurred. {{actionDescriptions}} Combine actions when sensible. Do not predict actions that do not apply to the current page. **Recommended combinations:** - `input_text` + `input_text` + `click` -> Fill multiple fields then submit - `input_text` + `send_keys` -> Fill a field and press Enter - `scroll` + `scroll` -> Scroll further down Do not try multiple paths in one step. Have one clear goal per step. Place page-changing actions **last** in your action list. Be clear and concise in your decision-making: 1. Analyze the last action result - state success, failure, or uncertain 2. Analyze browser state and screenshot to understand current position 3. If stuck, consider alternative approaches 4. Store concise, actionable context in memory 5. State your next immediate goal clearly Call `done` when: - Task is fully completed - Reached max steps (even if incomplete) - Absolutely impossible to continue Rules: - Set `success=true` ONLY if the full task is completed - Put ALL relevant findings in the `text` field - Call `done` as a single action - never combine with other actions **Before calling done with success=true, verify:** 1. Re-read the original task and check every requirement 2. Verify correct count, filters, format 3. Confirm actions completed via page state/screenshot 4. Ensure no fabricated data 5. If anything is unmet or uncertain, set success to false When encountering errors: 1. Verify state using screenshot as ground truth 2. Check for blocking popups/overlays 3. If element not found, scroll to reveal content 4. If action fails 2-3 times, try alternative approach 5. If blocked by login/captcha/403, try alternative sites 6. If page structure differs from expected, re-analyze and adapt 7. If stuck in loop, acknowledge in memory and change strategy 8. 
If max_steps approaching, prioritize most important parts **Good evaluation examples:** - "Successfully navigated to the product page and found the target information. Verdict: Success" - "Failed to input text into the search bar - element not visible. Verdict: Failure" **Good memory examples:** - "Visited 2 of 5 target websites. Collected pricing from Amazon ($39.99) and eBay ($42.00). Still need Walmart, Target, Best Buy." - "Search returned results but no filter applied. User wants items under $50 with 4+ stars. Will apply price filter first." **Good next goal examples:** - "Click 'Add to Cart' to proceed with purchase flow." - "Apply price filter to narrow results to items under $50." 1. ALWAYS verify action success using screenshot/browser state 2. ALWAYS handle popups/modals before other actions 3. ALWAYS apply filters when task specifies criteria 4. NEVER repeat failing actions more than 2-3 times 5. NEVER assume success without verification 6. Track progress in memory to avoid loops 7. Match requested output format exactly 8. Be efficient - combine actions when possible ================================================ FILE: packages/core/src/agent/instructions/instructions.md ================================================ You are an AI agent that controls a web browser to complete tasks. You operate in an iterative loop: observe the current page state, decide on actions, execute them, and repeat until the task is done. Your task: {{task}} You excel at: 1. Navigating complex websites and extracting precise information 2. Automating form submissions and interactive web actions 3. Gathering and organizing information across multiple pages 4. Operating effectively in an iterative agent loop 5. Adapting strategies when encountering obstacles - Default working language: **English** - Always respond in the same language as the task description At every step, your input will consist of: 1. 
**Agent history**: A chronological event stream including your previous actions and their results. 2. **Browser state**: Current URL, open tabs, interactive elements indexed for actions, and visible page content. 3. **Screenshot** (when vision is enabled): A screenshot of the current page with bounding boxes around interactive elements. Browser state is given as: - **Current URL**: The URL of the page you are currently viewing. - **Open Tabs**: Open tabs with their IDs. - **Interactive Elements**: All interactive elements in the format `[index]<type>text</type>` where: - `index`: Numeric identifier for interaction - `type`: HTML element type (button, input, etc.) - `text`: Element description Examples: ```
[33]<div>User form</div>
	*[35]<button aria-label='Submit form' />
``` Important notes: - Only elements with numeric indexes in `[]` are interactive - Indentation (with tab) means the element is a child of the element above - Elements tagged with `*[` are **new** interactive elements that appeared since the last step. Your previous actions caused that change. Consider if you need to interact with them. - Pure text elements without `[]` are not interactive
If vision is enabled, you will receive a screenshot of the current page with bounding boxes around interactive elements. - This is your **ground truth**: use it to evaluate your progress - If an interactive element has no text in browser_state, its index is written at the top center of its bounding box in the screenshot - Use the screenshot action if you need more visual information Strictly follow these rules while using the browser: **Element Interaction:** - Only interact with elements that have a numeric `[index]` assigned - Only use indexes that are explicitly provided in the current browser state - If a page changes after an action (e.g., input text triggers suggestions), analyze new elements before proceeding **Navigation:** - If research is needed, open a **new tab** instead of reusing the current one - By default, only elements in the visible viewport are listed - If the page is not fully loaded, use the wait action **Content Extraction:** - Use extract_content on specific pages to gather structured information from the entire page, including parts not currently visible - Only call extract_content if the information is NOT already visible in browser_state - prefer using text directly from browser_state - extract_content is expensive - do NOT call it multiple times with the same query on the same page **Input Handling:** - If you fill an input field and your action sequence is interrupted, something likely changed (e.g., suggestions appeared) - If the action sequence was interrupted in a previous step, complete any remaining actions that were not executed - For autocomplete/combobox fields: type your text, then WAIT for suggestions in the next step. If suggestions appear (marked with `*[`), click the correct one. If none appear, press Enter. 
- After input, you may need to press Enter, click a search button, or select from a dropdown **Filters and Criteria:** - If the task includes specific criteria (product type, rating, price, location, etc.), ALWAYS look for filter/sort options FIRST before browsing results **Error Recovery:** - If a captcha appears, attempt solving it. If blocked after 3-4 steps, try alternative approaches or report the limitation - Handle popups, modals, cookie banners, and overlays immediately before other actions - If you encounter access denied (403), bot detection, or rate limiting, do NOT retry the same URL repeatedly - try alternatives - Detect and break out of unproductive loops: if you are on the same URL for 3+ steps without progress, or the same action fails 2-3 times, try a different approach **Authentication:** - Do not log into a page unless required by the task and you have credentials ## Output Format Respond with: 1. **currentState**: Your assessment of the current state including: - `evaluation`: Assessment of how the last action went - `memory`: Important information to remember (progress, data found, approaches tried) - `nextGoal`: The next immediate goal to pursue 2. **actions**: A list of actions to execute (max {{maxActionsPerStep}} per step) You are allowed to use a maximum of {{maxActionsPerStep}} actions per step. Multiple actions execute sequentially (one after another). - If the page changes after an action, remaining actions are automatically skipped and you get the new state. - Check the browser state each step to verify your previous action achieved its goal. {{actionDescriptions}} You can output multiple actions in one step. Be efficient where it makes sense, but do not predict actions that do not make sense for the current page. **Action categories:** - **Page-changing (always last):** navigate, search_google, go_back, switch_tab - these always change the page. Remaining actions after them are skipped automatically. 
- **Potentially page-changing:** click (on links/buttons that navigate) - monitored at runtime; if the page changes, remaining actions are skipped. - **Safe to chain:** input_text, scroll, extract_content, find_elements - these do not change the page and can be freely combined. **Recommended combinations:** - `input_text` + `input_text` + `click` -> Fill multiple form fields then submit - `input_text` + `send_keys` -> Fill a field and press Enter - `scroll` + `scroll` -> Scroll further down the page Do not try multiple different paths in one step. Always have one clear goal per step. Place any page-changing action **last** in your action list. You must reason systematically at every step: 1. Analyze the most recent action result - clearly state success, failure, or uncertainty. Never assume success without verification. 2. Analyze browser state, screenshot, and history to understand current position relative to the task. 3. If stuck (same actions repeated without progress), consider alternative approaches. 4. Decide what concise, actionable context should be stored in memory. 5. State your next immediate goal clearly. You must use the `done` action when: - You have fully completed the task - You reach the final allowed step, even if the task is incomplete - It is absolutely impossible to continue Rules for `done`: - Set `success` to `true` only if the FULL task has been completed - If any part is missing, incomplete, or uncertain, set `success` to `false` - Put ALL relevant findings in the `text` field - You are ONLY allowed to call `done` as a single action - never combine it with other actions **Before calling done with success=true, verify:** 1. Re-read the original task and list every concrete requirement 2. Check each requirement against your results (correct count, filters applied, format matched) 3. Verify actions actually completed (check page state/screenshot) 4. Ensure no data was fabricated - every fact must come from pages you visited 5. 
If ANY requirement is unmet or uncertain, set success to false - When you reach 75% of your step budget, critically evaluate whether you can complete the full task in remaining steps - If completion is unlikely, shift strategy: focus on highest-value remaining items and consolidate results - For large multi-item tasks, estimate per-item cost from the first few items and prioritize if the task will exceed your budget When encountering errors or unexpected states: 1. Verify the current state using screenshot as ground truth 2. Check if a popup, modal, or overlay is blocking interaction 3. If an element is not found, scroll to reveal more content 4. If an action fails repeatedly (2-3 times), try an alternative approach 5. If blocked by login/captcha/403, consider alternative sites or search engines 6. If the page structure is different than expected, re-analyze and adapt 7. If stuck in a loop, explicitly acknowledge it in memory and change strategy 8. If max_steps is approaching, prioritize completing the most important parts **Good evaluation examples:** - "Successfully navigated to the product page and found the target information. Verdict: Success" - "Failed to input text into the search bar - element not visible. Verdict: Failure" **Good memory examples:** - "Visited 2 of 5 target websites. Collected pricing data from Amazon ($39.99) and eBay ($42.00). Still need Walmart, Target, Best Buy." - "Search returned results but no filter applied yet. User wants items under $50 with 4+ stars. Will apply price filter first." - "Captcha appeared twice on this site. Will try alternative approach via search engine." **Good next goal examples:** - "Click the 'Add to Cart' button to proceed with the purchase flow." - "Apply price filter to narrow results to items under $50." - "Close the popup blocking the main content." 1. ALWAYS verify action success using screenshot/browser state before proceeding 2. ALWAYS handle popups/modals/cookie banners before other actions 3. 
ALWAYS apply filters when the task specifies criteria
4. NEVER repeat the same failing action more than 2-3 times
5. NEVER assume success without verification
6. Track progress in memory to avoid loops
7. Match the task's requested output format exactly
8. Be efficient - combine actions when possible but verify between major steps

================================================
FILE: packages/core/src/agent/instructions.ts
================================================
import { readFileSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import type { AgentConfig } from './types.js';
import type { ViewportSnapshot, TabDescriptor } from '../viewport/types.js';
import type { CommandCatalog } from '../commands/catalog/catalog.js';
import type { ContentPart } from '../model/messages.js';
import { textContent, imageContent } from '../model/messages.js';
import { isNewTabPage, sanitizeSurrogates, dedent } from '../utils.js';

// ── Template types ──

/** Names of the system-prompt variants; each maps to one `.md` file in `TEMPLATE_FILES`. */
export type PromptTemplate = 'default' | 'flash' | 'no-thinking';

/** Settings consumed by `InstructionBuilder` when producing the system prompt. */
export interface InstructionBuilderOptions {
  /** Maximum actions the agent can take per step. */
  commandsPerStep: number;
  /** Override the entire system prompt with a custom string. */
  overrideInstructionBuilder?: string;
  /** Append additional instructions to the system prompt. */
  extendInstructionBuilder?: string;
  /** Which template variant to use. Defaults to 'default'. */
  template?: PromptTemplate;
  /** Whether to include sensitive-data warnings. */
  hasSensitiveData?: boolean;
}

/** Position of the current step within the step budget (`step` is zero-based). */
export interface StepInfo {
  step: number;
  stepLimit: number;
}

/** Inputs for `StepPromptBuilder`; only `browserState` and `task` are required. */
export interface StepPromptBuilderOptions {
  browserState: ViewportSnapshot;
  task: string;
  stepInfo?: StepInfo;
  actionDescriptions?: string;
  pageFilteredActions?: string;
  agentHistoryDescription?: string;
  maskedValues?: string;
  planDescription?: string;
  screenshots?: string[];
  enableScreenshots?: boolean;
  maxElementsLength?: number;
}

// ── Template loading ──

/**
 * Directory containing the .md system prompt templates.
 * Resolved relative to this file's location so it works regardless of
 * the current working directory or whether the package is installed.
 */
const TEMPLATES_DIR = resolve(dirname(fileURLToPath(import.meta.url)), 'instructions');

/** Cache loaded templates so we only hit the filesystem once per variant. */
const templateCache = new Map<PromptTemplate, string>();

/**
 * Map from PromptTemplate variant to the corresponding filename.
 */
const TEMPLATE_FILES: Record<PromptTemplate, string> = {
  default: 'instructions.md',
  flash: 'instructions-compact.md',
  'no-thinking': 'instructions-direct.md',
};

/**
 * Load a system-prompt template from disk. Results are cached.
 *
 * @param variant - Which prompt template to load.
 * @returns The raw template string with `{{variable}}` placeholders.
 * @throws If the template file cannot be read.
 */
function loadTemplate(variant: PromptTemplate): string {
  const cached = templateCache.get(variant);
  if (cached !== undefined) return cached;

  const filename = TEMPLATE_FILES[variant];
  const filepath = resolve(TEMPLATES_DIR, filename);
  try {
    const content = readFileSync(filepath, 'utf-8');
    templateCache.set(variant, content);
    return content;
  } catch (error) {
    // Re-throw with the template filename so the failing variant is obvious.
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to load system prompt template "${filename}": ${message}`);
  }
}

/**
 * Interpolate `{{key}}` placeholders in a template string.
 * Unmatched placeholders are left as-is so downstream code can detect them.
*/
function interpolate(template: string, variables: Record<string, string>): string {
  return template.replace(/\{\{(\w+)\}\}/g, (match, key: string) => {
    // Own-property check: `key in variables` would also accept inherited names
    // such as `constructor` or `toString` and splice a function into the prompt.
    return Object.prototype.hasOwnProperty.call(variables, key) ? variables[key] : match;
  });
}

/**
 * Clear the template cache. Useful for testing or hot-reloading.
 */
export function clearTemplateCache(): void {
  templateCache.clear();
}

// ── InstructionBuilder ──

/**
 * Builds the system prompt for the browser automation agent.
 *
 * In the simplest case it loads a `.md` template from the `instructions/`
 * directory and interpolates variables like `{{task}}`,
 * `{{maxActionsPerStep}}`, and `{{actionDescriptions}}`.
 *
 * The class also exposes static helpers for building per-step state messages,
 * action results, and other ancillary prompt fragments that are injected as
 * user messages during the agent loop.
 */
export class InstructionBuilder {
  private options: InstructionBuilderOptions;
  private actionDescriptions: string;

  /**
   * @param options - Prompt settings (step limit, override/extension text, template variant).
   * @param actionDescriptions - Text substituted for `{{actionDescriptions}}`.
   */
  constructor(options: InstructionBuilderOptions, actionDescriptions: string) {
    this.options = options;
    this.actionDescriptions = actionDescriptions;
  }

  /**
   * Build and return the complete system prompt string.
   *
   * If `overrideInstructionBuilder` is set, it is returned verbatim (after
   * optional extension). Otherwise, the appropriate `.md` template is
   * loaded and interpolated with the current settings.
   *
   * @throws If the template file for the selected variant cannot be read.
   */
  build(): string {
    const { overrideInstructionBuilder, extendInstructionBuilder } = this.options;

    let prompt: string;
    if (overrideInstructionBuilder) {
      prompt = overrideInstructionBuilder;
    } else {
      const template = loadTemplate(this.options.template ?? 'default');
      const commandsPerStep = String(this.options.commandsPerStep);
      const variables: Record<string, string> = {
        task: '(set per-step in user messages)',
        commandsPerStep,
        // The bundled templates spell the per-step limit `{{maxActionsPerStep}}`;
        // without this entry the placeholder is left in the prompt un-substituted.
        maxActionsPerStep: commandsPerStep,
        actionDescriptions: this.actionDescriptions,
      };
      prompt = interpolate(template, variables);
    }

    if (extendInstructionBuilder) {
      prompt += `\n${extendInstructionBuilder}`;
    }
    return prompt;
  }

  /**
   * Convenience: create a InstructionBuilder from AgentConfig + a CommandCatalog.
   * Pulls action descriptions directly from the registry, optionally
   * filtered by the current page URL.
   */
  static fromSettings(settings: AgentConfig, registry: CommandCatalog, pageUrl?: string): InstructionBuilder {
    const descriptions = registry.getPromptDescription(pageUrl);
    return new InstructionBuilder(
      {
        commandsPerStep: settings.commandsPerStep,
        overrideInstructionBuilder: settings.overrideInstructionBuilder,
        extendInstructionBuilder: settings.extendInstructionBuilder,
        hasSensitiveData: settings.maskedValues !== undefined,
      },
      descriptions,
    );
  }

  // ── Static prompt fragment builders ──

  /** Prefix the task text with a fixed "Your current task:" label. */
  static buildTaskPrompt(task: string): string {
    return `Your current task: ${task}`;
  }

  /**
   * Render a plain-text snapshot of the page: step counter, URL, title,
   * the tab list (only when more than one tab is open), scroll hints
   * (only when positive), and the DOM tree text.
   */
  static buildStatePrompt(
    url: string,
    title: string,
    tabs: Array<{ url: string; title: string; isActive: boolean }>,
    domTree: string,
    step: number,
    stepLimit: number,
    pixelsAbove?: number,
    pixelsBelow?: number,
  ): string {
    const parts: string[] = [];
    parts.push(`[Step ${step}/${stepLimit}]`);
    parts.push(`Current URL: ${url}`);
    parts.push(`Page Title: ${title}`);

    if (tabs.length > 1) {
      const tabList = tabs
        .map((t, i) => ` [${i}] ${t.isActive ? '(active) ' : ''}${t.title} - ${t.url}`)
        .join('\n');
      parts.push(`Open Tabs:\n${tabList}`);
    }

    if (pixelsAbove !== undefined && pixelsAbove > 0) {
      parts.push(`Scroll position: ${pixelsAbove}px from top`);
    }
    if (pixelsBelow !== undefined && pixelsBelow > 0) {
      parts.push(`${pixelsBelow}px of content below the visible area`);
    }

    parts.push(`\nPage content:\n${domTree}`);
    return parts.join('\n');
  }

  /** Format action/result pairs separated by `---`; returns '' for an empty list. */
  static buildCommandResultPrompt(results: Array<{ action: string; result: string }>): string {
    if (results.length === 0) return '';
    const formatted = results
      .map((r) => `Action: ${r.action}\nResult: ${r.result}`)
      .join('\n---\n');
    return `Previous action results:\n${formatted}`;
  }

  /** Wrap a message as an "IMPORTANT:" line. */
  static buildLoopNudge(message: string): string {
    return `\nIMPORTANT: ${message}`;
  }

  /** Wrap plan text under a "Current plan:" heading. */
  static buildPlanPrompt(currentPlan: string): string {
    return `\nCurrent plan:\n${currentPlan}`;
  }
}

// ── StepPromptBuilder ──

/**
 * Constructs the per-step user message for the agent.
 *
 * Each step of the agent loop sends a user message containing:
 * - The current browser state (URL, tabs, interactive elements)
 * - Scroll position and page boundaries
 * - Agent history summary
 * - Step information (step N of M)
 * - Optionally: screenshots, sensitive data warnings, plan description
 * - Optionally: page-specific action descriptions
 *
 * The message can be returned as a plain string or as a multipart content
 * array (text + images) when vision is enabled.
*/
export class StepPromptBuilder {
  // NOTE(review): several section strings below are bare `\n…\n` wrappers.
  // If they were meant to carry delimiting tags, confirm against the upstream file.
  private browserState: ViewportSnapshot;
  private task: string;
  private stepInfo?: StepInfo;
  private actionDescriptions?: string;
  private pageFilteredActions?: string;
  private agentHistoryDescription?: string;
  private maskedValues?: string;
  private planDescription?: string;
  private screenshots: string[];
  private enableScreenshots: boolean;
  private maxElementsLength: number;

  /**
   * @param options - Step inputs. Defaults: no screenshots, vision disabled,
   *   element text capped at 40,000 characters.
   */
  constructor(options: StepPromptBuilderOptions) {
    this.browserState = options.browserState;
    this.task = options.task;
    this.stepInfo = options.stepInfo;
    this.actionDescriptions = options.actionDescriptions;
    this.pageFilteredActions = options.pageFilteredActions;
    this.agentHistoryDescription = options.agentHistoryDescription;
    this.maskedValues = options.maskedValues;
    this.planDescription = options.planDescription;
    this.screenshots = options.screenshots ?? [];
    this.enableScreenshots = options.enableScreenshots ?? false;
    this.maxElementsLength = options.maxElementsLength ?? 40_000;
  }

  /**
   * Build the user message content.
   *
   * When vision is disabled (or no screenshots are available), returns a
   * single string. When vision is enabled and screenshots exist, returns
   * a `ContentPart[]` array: the state text followed by a label + image
   * pair for each screenshot (the last one is labelled "Current").
   */
  getUserMessage(): string | ContentPart[] {
    // Skip screenshots on step 0 for new-tab pages with a single tab
    let effectiveVision = this.enableScreenshots;
    if (
      isNewTabPage(this.browserState.url) &&
      this.stepInfo?.step === 0 &&
      this.browserState.tabs.length <= 1
    ) {
      effectiveVision = false;
    }

    const stateDescription = this.buildStateDescription();

    if (effectiveVision && this.screenshots.length > 0) {
      const parts: ContentPart[] = [textContent(stateDescription)];
      for (let i = 0; i < this.screenshots.length; i++) {
        const label = i === this.screenshots.length - 1 ? 'Current screenshot:' : 'Previous screenshot:';
        parts.push(textContent(label));
        parts.push(imageContent(this.screenshots[i], 'image/png'));
      }
      return parts;
    }

    return stateDescription;
  }

  /**
   * Build the complete text description of the current state.
   * This includes agent history, agent state (task, step info, plan),
   * and browser state (URL, tabs, elements, scroll position).
   */
  private buildStateDescription(): string {
    const sections: string[] = [];

    // Agent history
    sections.push(this.buildAgentHistorySection());

    // Agent state (task, step info, plan, sensitive data)
    sections.push(this.buildAgentStateSection());

    // Browser state (URL, tabs, elements)
    sections.push(this.buildBrowserStateSection());

    // Page-specific actions (if any domain-filtered actions apply)
    if (this.pageFilteredActions) {
      sections.push(
        `\n${this.pageFilteredActions}\n`,
      );
    }

    // Sanitize surrogates to prevent JSON serialization issues
    return sanitizeSurrogates(sections.join('\n\n'));
  }

  /** Trimmed history text (or '') surrounded by newlines. */
  private buildAgentHistorySection(): string {
    const history = this.agentHistoryDescription?.trim() ?? '';
    return `\n${history}\n`;
  }

  /** Task, optional plan, optional masked-values text, and a step/date line. */
  private buildAgentStateSection(): string {
    const parts: string[] = [];
    parts.push(`\n${this.task}\n`);

    if (this.planDescription) {
      parts.push(`\n${this.planDescription}\n`);
    }

    if (this.maskedValues) {
      parts.push(`${this.maskedValues}`);
    }

    if (this.stepInfo) {
      // ISO date (YYYY-MM-DD) taken from the UTC timestamp.
      const today = new Date().toISOString().slice(0, 10);
      parts.push(
        `Step ${this.stepInfo.step + 1} of ${this.stepInfo.stepLimit} | Today: ${today}`,
      );
    }

    return `\n${parts.join('\n')}\n`;
  }

  /** Tabs, scroll info, and the interactive-element listing, in that order. */
  private buildBrowserStateSection(): string {
    const parts: string[] = [];

    // Tabs
    const tabsText = this.buildTabsText();
    if (tabsText) {
      parts.push(tabsText);
    }

    // Scroll / page info
    const pageInfo = this.buildPageInfoText();
    if (pageInfo) {
      parts.push(pageInfo);
    }

    // Interactive elements
    parts.push(this.buildElementsText());

    return `\n${parts.join('\n')}\n`;
  }

  /**
   * List the open tabs, showing only the last four characters of each tab id.
   * A "Current tab" line is added only when exactly one tab matches the
   * snapshot's URL and title. Returns '' when there are no tabs.
   */
  private buildTabsText(): string {
    const { tabs, url, title } = this.browserState;
    if (tabs.length === 0) return '';

    // Try to identify the current tab
    const currentCandidates = tabs.filter((t) => t.url === url && t.title === title);
    const currentTabId = currentCandidates.length === 1 ? currentCandidates[0].tabId : undefined;

    const lines: string[] = [];
    // Compare against undefined: a truthiness test would hide a valid id of 0 or ''.
    if (currentTabId !== undefined) {
      lines.push(`Current tab: ${String(currentTabId).slice(-4)}`);
    }
    lines.push('Available tabs:');
    for (const tab of tabs) {
      lines.push(`Tab ${String(tab.tabId).slice(-4)}: ${tab.url} - ${tab.title.slice(0, 30)}`);
    }
    return lines.join('\n');
  }

  /** Describe off-screen content as "N pages above/below"; '' when nothing is off-screen. */
  private buildPageInfoText(): string {
    const { pixelsAbove, pixelsBelow } = this.browserState;
    const parts: string[] = [];

    if (pixelsAbove !== undefined && pixelsAbove > 0) {
      // Estimate "pages above" assuming ~900px viewport height
      const pagesAbove = (pixelsAbove / 900).toFixed(1);
      parts.push(`${pagesAbove} pages above`);
    }
    if (pixelsBelow !== undefined && pixelsBelow > 0) {
      const pagesBelow = (pixelsBelow / 900).toFixed(1);
      parts.push(`${pagesBelow} pages below`);
    }

    if (parts.length === 0) return '';
    return `${parts.join(', ')}`;
  }

  /**
   * Render the DOM tree text, truncated to `maxElementsLength` characters,
   * with `[Start of page]` / `[End of page]` markers when nothing lies
   * above / below the viewport.
   */
  private buildElementsText(): string {
    let elementsText = this.browserState.domTree ?? '';
    if (!elementsText) {
      return 'Interactive elements:\nempty page';
    }

    // Truncate if too long
    let truncatedNote = '';
    if (elementsText.length > this.maxElementsLength) {
      elementsText = elementsText.slice(0, this.maxElementsLength);
      truncatedNote = ` (truncated to ${this.maxElementsLength} characters)`;
    }

    // Add start/end of page markers based on scroll position
    const hasContentAbove = this.browserState.pixelsAbove !== undefined && this.browserState.pixelsAbove > 0;
    const hasContentBelow = this.browserState.pixelsBelow !== undefined && this.browserState.pixelsBelow > 0;

    if (!hasContentAbove) {
      elementsText = `[Start of page]\n${elementsText}`;
    }
    if (!hasContentBelow) {
      elementsText = `${elementsText}\n[End of page]`;
    }

    return `Interactive elements${truncatedNote}:\n${elementsText}`;
  }
}

// ── Dynamic action descriptions ──

/**
 * Build action descriptions from a registry, optionally filtered by
 * the current page URL. Returns a formatted string suitable for
 * injection into the system prompt's `{{actionDescriptions}}` slot.
*/
export function buildCommandDescriptions(registry: CommandCatalog, pageUrl?: string): string {
  return registry.getPromptDescription(pageUrl);
}

/**
 * Build a description of actions that are specific to the current page's domain.
 * Returns `undefined` if there are no domain-specific actions beyond the
 * universal set.
 *
 * This is injected as its own section in the per-step user message when
 * the page URL triggers extra actions.
 *
 * @param registry - Catalog queried for all actions and for the domain's actions.
 * @param pageUrl - URL of the current page; its hostname (minus `www.`) is the lookup key.
 * @returns A bulleted `- name: description` list under a heading, or `undefined`.
 */
export function buildContextualCommands(registry: CommandCatalog, pageUrl: string): string | undefined {
  const allActions = registry.getAll();
  const domainActions = registry.getActionsForDomain(extractDomain(pageUrl));

  // If all actions are already shown (no domain filtering), nothing extra to show
  if (domainActions.length === allActions.length) return undefined;

  // Find domain-specific actions (ones that have a domainFilter)
  const extraActions = domainActions.filter(
    (a) => a.domainFilter && a.domainFilter.length > 0,
  );

  if (extraActions.length === 0) return undefined;

  const lines = extraActions.map(
    (a) => `- ${a.name}: ${a.description}`,
  );

  return `The following actions are available on this page:\n${lines.join('\n')}`;
}

// ── Rerun / extraction prompt helpers ──

/**
 * Build a system prompt for the extraction/AI-step action used during reruns.
 *
 * @returns The fixed extraction instructions, passed through `dedent`.
 */
export function buildExtractionInstructionBuilder(): string {
  // NOTE(review): the line layout of this literal was lost when the file was
  // flattened; restore the original line breaks from the upstream file.
  return dedent(` You are an expert at extracting data from webpages. You will be given: 1. A query describing what to extract 2. The markdown of the webpage (filtered to remove noise) 3. 
Optionally, a screenshot of the current page state Instructions: - Extract information from the webpage that is relevant to the query - ONLY use the information available in the webpage - do not make up information - If the information is not available, mention that clearly - If the query asks for all items, list all of them Output: - Present ALL relevant information in a concise way - Do not use conversational format - directly output the relevant information - If information is unavailable, state that clearly `);
}

/**
 * Build a user prompt for the extraction/AI-step action.
 *
 * @param query - What to extract.
 * @param statsSummary - Summary text placed between the query and the content.
 * @param content - Page content to extract from.
 * @returns The three parts, each surrounded by newlines, joined by blank lines.
 */
export function buildExtractionUserPrompt(
  query: string,
  statsSummary: string,
  content: string,
): string {
  return [
    `\n${query}\n`,
    `\n${statsSummary}\n`,
    `\n${content}\n`,
  ].join('\n\n');
}

// ── Helpers ──

/**
 * Extract the lowercased hostname of a URL with a leading `www.` removed.
 * Returns '' when the string cannot be parsed as a URL.
 */
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname.replace(/^www\./, '').toLowerCase();
  } catch {
    return '';
  }
}

================================================
FILE: packages/core/src/agent/replay-recorder.ts
================================================
import * as fs from 'node:fs';
import * as path from 'node:path';
import { createLogger } from '../logging.js';

const logger = createLogger('gif-recorder');

/** Construction options for `ReplayRecorder`. */
export interface ReplayRecorderOptions {
  /** Output file path. Extension determines format (.gif or .png for fallback). */
  outputPath: string;
  /** Delay between frames in milliseconds */
  frameDelay?: number;
  /** Resize frames to this width (maintains aspect ratio). 0 = no resize. */
  resizeWidth?: number;
  /**
   * Quality (1-30, lower = better quality). Only used for GIF encoding.
   * NOTE(review): the recorder stores this value, but the visible encoder
   * does not appear to pass it on - confirm.
   */
  quality?: number;
}

/** One recorded screenshot: decoded image bytes plus its overlay annotation. */
interface FrameData {
  buffer: Buffer;
  stepNumber: number;
  label?: string;
}

/**
 * Records agent screenshots and encodes them into an animated GIF.
 *
 * Uses the `sharp` library (optional dependency) for image processing
 * and compositing step-number overlays. If sharp is not available,
 * falls back to saving individual PNG frames.
*
 * Usage:
 *   const recorder = new ReplayRecorder({ outputPath: './recording.gif' });
 *   recorder.addFrame(screenshotBase64, 1);
 *   // ... more frames ...
 *   await recorder.save(); // -> path to GIF or frames directory
 */
export class ReplayRecorder {
  private frames: FrameData[] = [];
  private outputPath: string;
  private frameDelay: number;
  private resizeWidth: number;
  private quality: number;

  /**
   * @param options - Output path plus optional frame delay (default 500 ms),
   *   resize width (default 800 px) and quality (default 10).
   */
  constructor(options: ReplayRecorderOptions) {
    this.outputPath = options.outputPath;
    this.frameDelay = options.frameDelay ?? 500;
    this.resizeWidth = options.resizeWidth ?? 800;
    this.quality = options.quality ?? 10;
  }

  /**
   * Add a screenshot frame to the recording.
   * @param screenshotBase64 - PNG screenshot as base64 string
   * @param stepNumber - Step number for the overlay annotation
   *   (defaults to the frame's 1-based position)
   * @param label - Optional label text (e.g., the action taken)
   */
  addFrame(screenshotBase64: string, stepNumber?: number, label?: string): void {
    const buffer = Buffer.from(screenshotBase64, 'base64');
    this.frames.push({
      buffer,
      stepNumber: stepNumber ?? this.frames.length + 1,
      label,
    });
  }

  /**
   * Save the recording. Attempts GIF encoding with sharp, falls back
   * to individual PNG frames if sharp is not available.
   *
   * @param generateGif - true to generate a GIF, 'path' to override output path,
   *   false to only save individual frames
   * @returns The path where the recording was saved
   */
  async save(generateGif: string | boolean = true): Promise<string> {
    if (this.frames.length === 0) {
      logger.debug('No frames to save');
      return this.outputPath;
    }

    const effectivePath = typeof generateGif === 'string' ? generateGif : this.outputPath;
    const dir = path.dirname(effectivePath);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }

    // Always save individual frames as fallback / debug
    await this.saveFrames(effectivePath);

    if (generateGif === false) {
      return effectivePath;
    }

    // Try to generate actual GIF using sharp
    try {
      const gifPath = await this.encodeGif(effectivePath);
      logger.info(`GIF saved: ${gifPath} (${this.frames.length} frames)`);
      return gifPath;
    } catch (error) {
      logger.warn(
        `GIF encoding failed, falling back to individual frames: ${
          error instanceof Error ? error.message : String(error)
        }`,
      );
      return effectivePath;
    }
  }

  /**
   * Encode frames into an animated GIF using sharp.
   * Sharp must be installed as a peer dependency.
   *
   * @returns Path of the written `.gif`, or of a static `.png` holding the
   *   last frame when animated assembly fails.
   * @throws If sharp cannot be imported or a frame cannot be processed.
   */
  private async encodeGif(outputPath: string): Promise<string> {
    // Dynamic import -- sharp is an optional dependency.
    // Use indirect require to avoid TS module resolution error.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let sharpModule: any;
    try {
      // Indirect dynamic import avoids TS2307 for optional peer deps
      const moduleName = 'sharp';
      sharpModule = await import(/* webpackIgnore: true */ moduleName);
    } catch {
      throw new Error(
        'sharp is not installed. Install it with: npm install sharp',
      );
    }

    // Resolve the default export (handles both ESM and CJS)
    const sharp = sharpModule.default ?? sharpModule;

    const gifPath = this.siblingPath(outputPath, '.gif');
    const processedFrames: Buffer[] = [];

    for (const frame of this.frames) {
      let img = sharp(frame.buffer);

      // Resize if configured
      if (this.resizeWidth > 0) {
        img = img.resize(this.resizeWidth, undefined, {
          fit: 'inside',
          withoutEnlargement: true,
        });
      }

      // Composite a step number overlay onto the frame
      const overlayBuffer = this.createStepOverlaySvg(
        frame.stepNumber,
        frame.label,
      );
      img = img.composite([
        {
          input: Buffer.from(overlayBuffer),
          gravity: 'northwest',
        },
      ]);

      // Convert to PNG for further processing
      const processed = await img
        .flatten({ background: { r: 255, g: 255, b: 255 } })
        .png()
        .toBuffer();
      processedFrames.push(processed);
    }

    // Attempt to assemble an animated GIF from the processed frames
    try {
      const firstFrame = sharp(processedFrames[0]);
      const metadata = await firstFrame.metadata();
      const width = metadata.width ?? this.resizeWidth;
      const height = metadata.height ?? 600;

      // Convert each frame to raw RGBA
      const rawFrames: Buffer[] = [];
      for (const frameBuffer of processedFrames) {
        const raw = await sharp(frameBuffer)
          .resize(width, height, {
            fit: 'contain',
            background: { r: 255, g: 255, b: 255 },
          })
          .raw()
          .ensureAlpha()
          .toBuffer();
        rawFrames.push(raw);
      }

      // Concatenate all raw frames and encode as animated GIF
      const combinedRaw = Buffer.concat(rawFrames);
      await sharp(combinedRaw, {
        raw: {
          width,
          height,
          channels: 4,
          pages: rawFrames.length,
        },
      })
        .gif({
          delay: Array(rawFrames.length).fill(this.frameDelay),
          loop: 0,
        })
        .toFile(gifPath);

      return gifPath;
    } catch (animatedError) {
      // If animated GIF creation fails, save the last frame as a static image
      logger.debug(
        `Animated GIF assembly failed, saving static image: ${
          animatedError instanceof Error ? animatedError.message : String(animatedError)
        }`,
      );

      const lastFrame = processedFrames[processedFrames.length - 1];
      const staticPath = this.siblingPath(outputPath, '.png');
      await sharp(lastFrame).png().toFile(staticPath);
      return staticPath;
    }
  }

  /**
   * Create an SVG overlay with the step number and optional label.
   * Returns an SVG string that can be composited onto the frame.
   *
   * The label is cut to 40 characters; the badge is at least 200px wide and
   * grows by roughly 10px per character of text.
   */
  private createStepOverlaySvg(stepNumber: number, label?: string): string {
    const labelText = label ? ` - ${label.slice(0, 40)}` : '';
    const text = `Step ${stepNumber}${labelText}`;
    const width = Math.max(200, text.length * 10 + 20);
    const height = 36;

    // A complete <svg> document is required: sharp cannot decode bare text as
    // a composite input. The text is escaped because labels are free-form.
    return `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">
  <rect x="0" y="0" width="${width}" height="${height}" rx="4" fill="rgba(0, 0, 0, 0.7)"/>
  <text x="10" y="24" font-family="monospace" font-size="16" fill="white">${this.escapeXml(text)}</text>
</svg>`;
  }

  /**
   * Save individual PNG frames to a directory alongside the output path.
   * Also writes the last frame as `<name>_preview.png`.
   *
   * @returns The frames directory (`<name>_frames`).
   */
  private async saveFrames(outputPath: string): Promise<string> {
    const framesDir = this.siblingPath(outputPath, '_frames');
    if (!fs.existsSync(framesDir)) {
      fs.mkdirSync(framesDir, { recursive: true });
    }

    for (let i = 0; i < this.frames.length; i++) {
      const frame = this.frames[i];
      const framePath = path.join(
        framesDir,
        `frame_${frame.stepNumber.toString().padStart(4, '0')}.png`,
      );
      fs.writeFileSync(framePath, frame.buffer);
    }

    // Also save the last frame as the preview image
    if (this.frames.length > 0) {
      const lastFrame = this.frames[this.frames.length - 1];
      const previewPath = this.siblingPath(outputPath, '_preview.png');
      fs.writeFileSync(previewPath, lastFrame.buffer);
    }

    logger.debug(`Saved ${this.frames.length} frames to ${framesDir}`);
    return framesDir;
  }

  /**
   * Replace the extension of `filePath`'s final segment with `suffix`.
   *
   * Works on the parsed base name only, so a dot in a parent directory is
   * never treated as the extension, and an extension-less path still gets
   * the suffix appended instead of being returned unchanged.
   */
  private siblingPath(filePath: string, suffix: string): string {
    const { root, dir, name } = path.parse(filePath);
    return path.format({ root, dir, base: `${name}${suffix}` });
  }

  /** Escape XML special characters for SVG text content */
  private escapeXml(text: string): string {
    // `&` must be replaced first so the entities added below are not re-escaped.
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  }

  /** Number of frames recorded so far. */
  get frameCount(): number {
    return this.frames.length;
  }

  /** Discard all recorded frames. */
  clear(): void {
    this.frames = [];
  }
}

================================================
FILE: packages/core/src/agent/stall-detector.test.ts
================================================
import { test, expect, describe, beforeEach }
// ── Helpers ──

/** Build a single-click `tap` command on the element at `index`. */
function clickAction(index: number): Command {
  return { action: 'tap', index, clickCount: 1 };
}

/** Build a `type_text` command that clears the field before typing. */
function inputAction(index: number, text: string): Command {
  return { action: 'type_text', index, text, clearFirst: true };
}

/** Build a `navigate` command. */
function navigateAction(url: string): Command {
  return { action: 'navigate', url };
}

/** Build a `scroll` command; omit `index` to scroll the page itself. */
function scrollAction(direction: 'up' | 'down', index?: number): Command {
  return { action: 'scroll', direction, index };
}

/** Build a successful `finish` command. */
function doneAction(text: string): Command {
  return { action: 'finish', text, success: true };
}

/** Build a `web_search` command. */
function searchGoogleAction(query: string): Command {
  return { action: 'web_search', query };
}

/** Build a page signature with stable defaults; `overrides` win field by field. */
function makeFingerprint(overrides: Partial<PageSignature> = {}): PageSignature {
  return {
    url: 'https://example.com',
    domHash: 'abc123',
    scrollY: 0,
    elementCount: 50,
    textHash: 'texthash1',
    ...overrides,
  };
}

// ── Tests ──

describe('StallDetector', () => {
  let detector: StallDetector;

  beforeEach(() => {
    detector = new StallDetector();
  });

  describe('initial state', () => {
    test('isStuck returns not stuck when no actions recorded', () => {
      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
      expect(result.severity).toBe(0);
    });

    test('getTotalRepetitions returns 0 initially', () => {
      expect(detector.getTotalRepetitions()).toBe(0);
    });

    test('getLoopNudgeMessage returns empty string when not stuck', () => {
      expect(detector.getLoopNudgeMessage()).toBe('');
    });
  });

  describe('recordAction and repeated action detection', () => {
    test('does not flag non-repeated actions', () => {
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(3)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });

    test('flags the same action repeated maxRepeatedActions times (default 3)', () => {
      detector.recordAction([clickAction(5)]);
      detector.recordAction([clickAction(5)]);
      detector.recordAction([clickAction(5)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.reason).toContain('repeated');
      expect(result.reason).toContain('3');
    });

    test('flags repeated multi-action steps', () => {
      const actions: Command[] = [clickAction(1), inputAction(2, 'hello')];
      detector.recordAction(actions);
      detector.recordAction(actions);
      detector.recordAction(actions);

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
    });

    test('does not flag when only two repeated actions (below threshold)', () => {
      detector.recordAction([clickAction(5)]);
      detector.recordAction([clickAction(5)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });

    test('custom maxRepeatedActions threshold', () => {
      // With maxRepeatedActions=5, only 5+ trailing repeats should trigger.
      // Note: cycle detection (A->B->A->B) fires with 4 identical actions
      // because all 4 being the same matches the pattern. So we can only test
      // that at exactly 3 trailing repeats (below our custom threshold of 5,
      // and below the cycle check threshold of 4 identical entries), it's not stuck.
      const custom = new StallDetector({ maxRepeatedActions: 5 });
      custom.recordAction([clickAction(10)]); // prefix to avoid cycle match
      custom.recordAction([clickAction(1)]);
      custom.recordAction([clickAction(1)]);
      custom.recordAction([clickAction(1)]);
      // 3 trailing repeats < 5 threshold, and cycle check sees [10,1,1,1] which is not A->B->A->B
      expect(custom.isStuck().stuck).toBe(false);

      // Add two more to reach 5 trailing repeats
      custom.recordAction([clickAction(1)]);
      custom.recordAction([clickAction(1)]);
      expect(custom.isStuck().stuck).toBe(true);
    });
  });

  describe('action cycle detection (A -> B -> A -> B)', () => {
    test('detects alternating two-action cycle', () => {
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.reason).toContain('cycle');
    });

    test('does not falsely detect A -> B -> A -> C as a cycle', () => {
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(3)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });
  });

  describe('triple cycle detection (A -> B -> C -> A -> B -> C)', () => {
    test('detects 3-step cycle', () => {
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(3)]);
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(3)]);

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.reason).toContain('3-step');
    });

    test('does not detect partial triple cycle', () => {
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      detector.recordAction([clickAction(3)]);
      detector.recordAction([clickAction(1)]);
      detector.recordAction([clickAction(2)]);
      // Only 5 entries, needs 6 for triple check

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });
  });

  describe('fingerprint-based stuck detection', () => {
    test('detects repeated page fingerprints', () => {
      const fp = makeFingerprint();
      detector.recordFingerprint(fp);
      detector.recordFingerprint(fp);
      detector.recordFingerprint(fp);

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.reason).toContain('Page state unchanged');
    });

    test('different fingerprints do not trigger stuck', () => {
      detector.recordFingerprint(makeFingerprint({ domHash: 'hash1' }));
      detector.recordFingerprint(makeFingerprint({ domHash: 'hash2' }));
      detector.recordFingerprint(makeFingerprint({ domHash: 'hash3' }));

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });

    test('scroll position bucketed (200px buckets) - same bucket triggers stuck', () => {
      // scrollY 0 and 100 are in the same bucket (both floor to 0)
      detector.recordFingerprint(makeFingerprint({ scrollY: 0 }));
      detector.recordFingerprint(makeFingerprint({ scrollY: 50 }));
      detector.recordFingerprint(makeFingerprint({ scrollY: 100 }));

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
    });

    test('different scroll buckets not considered stuck', () => {
      detector.recordFingerprint(makeFingerprint({ scrollY: 0 }));
      detector.recordFingerprint(makeFingerprint({ scrollY: 200 }));
      detector.recordFingerprint(makeFingerprint({ scrollY: 400 }));

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });

    test('custom maxRepeatedFingerprints threshold', () => {
      const custom = new StallDetector({ maxRepeatedFingerprints: 5 });
      const fp = makeFingerprint();

      for (let i = 0; i < 4; i++) {
        custom.recordFingerprint(fp);
      }
      expect(custom.isStuck().stuck).toBe(false);

      custom.recordFingerprint(fp);
      expect(custom.isStuck().stuck).toBe(true);
    });
  });

  describe('consecutive stagnant pages detection', () => {
    test('detects stagnant pages with same URL and similar element count', () => {
      const detector5 = new StallDetector({ maxStagnantPages: 5 });

      for (let i = 0; i < 5; i++) {
        // Different domHash/scrollY so fingerprint hashing is distinct,
        // but same URL and elementCount triggers stagnant detection.
        detector5.recordFingerprint(
          makeFingerprint({
            domHash: `hash_${i}`,
            scrollY: i * 200,
            elementCount: 50,
          }),
        );
      }

      const result = detector5.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.reason).toContain('stagnant');
    });

    test('different URLs do not trigger stagnant detection', () => {
      for (let i = 0; i < 5; i++) {
        detector.recordFingerprint(
          makeFingerprint({
            url: `https://example.com/page${i}`,
            domHash: `hash_${i}`,
            scrollY: i * 200,
            elementCount: 50,
          }),
        );
      }

      const result = detector.isStuck();
      expect(result.stuck).toBe(false);
    });
  });

  describe('escalating nudge messages', () => {
    test('severity 0 for repetitions below 5', () => {
      // 3 repetitions -> gets flagged as stuck but severity 0
      for (let i = 0; i < 3; i++) {
        detector.recordAction([clickAction(1)]);
      }

      const result = detector.isStuck();
      expect(result.stuck).toBe(true);
      expect(result.severity).toBe(0);
    });

    test('severity 1 at 5+ total repetitions via cycle detection', () => {
      // Cycle detection path uses getSeverity(this.totalRepetitions)
      // so accumulating enough totalRepetitions can reach severity 1.
      const det = new StallDetector({ maxRepeatedActions: 3 });

      // First: accumulate 3 via repeated actions
      for (let i = 0; i < 3; i++) {
        det.recordAction([clickAction(1)]);
      }
      det.isStuck(); // totalRepetitions += 3

      // Break the trailing sequence, then trigger a 2-cycle
      det.recordAction([clickAction(10)]);
      // A->B->A->B cycle adds 2 to totalRepetitions -> total 5
      det.recordAction([clickAction(20)]);
      det.recordAction([clickAction(10)]);
      det.recordAction([clickAction(20)]);

      const result = det.isStuck();
      expect(result.stuck).toBe(true);
      // totalRepetitions = 3 + 2 = 5, getSeverity(5) = 1
      expect(result.severity).toBe(1);
    });

    test('nudge message contains appropriate text', () => {
      for (let i = 0; i < 3; i++) {
        detector.recordAction([clickAction(1)]);
      }

      const msg = detector.getLoopNudgeMessage();
      expect(msg).toContain('Warning:');
      expect(msg.length).toBeGreaterThan(0);
    });
  });

  describe('action hash normalization', () => {
    test('click actions normalized by index only', () => {
      // Two click actions with same index but different click counts
      // should both normalize to "click:5"
      const d1 = new StallDetector();
      const d2 = new StallDetector();

      const act1: Command = { action: 'tap', index: 5, clickCount: 1 };
      const act2: Command = { action: 'tap', index: 5, clickCount: 2 };

      // Record 3 of each in separate detectors
      for (let i = 0; i < 3; i++) {
        d1.recordAction([act1]);
        d2.recordAction([act2]);
      }

      // Both should detect as stuck since click is normalized by index
      expect(d1.isStuck().stuck).toBe(true);
      expect(d2.isStuck().stuck).toBe(true);
    });

    test('search queries normalized for order independence', () => {
      // "best pizza NYC" and "NYC best pizza" should produce same hash
      const d = new StallDetector();
      d.recordAction([searchGoogleAction('best pizza NYC')]);
      d.recordAction([searchGoogleAction('NYC best pizza')]);
      d.recordAction([searchGoogleAction('pizza best NYC')]);

      expect(d.isStuck().stuck).toBe(true);
    });

    test('different navigate URLs not considered same action', () => {
      detector.recordAction([navigateAction('https://a.com')]);
      detector.recordAction([navigateAction('https://b.com')]);
      detector.recordAction([navigateAction('https://c.com')]);

      expect(detector.isStuck().stuck).toBe(false);
    });

    test('scroll actions include direction and index', () => {
      // Same direction, same index -> stuck
      for (let i = 0; i < 3; i++) {
        detector.recordAction([scrollAction('down', 1)]);
      }
      expect(detector.isStuck().stuck).toBe(true);
    });

    test('done actions include text prefix', () => {
      detector.recordAction([doneAction('Task completed successfully')]);
      detector.recordAction([doneAction('Task completed successfully')]);
      detector.recordAction([doneAction('Task completed successfully')]);

      expect(detector.isStuck().stuck).toBe(true);
    });
  });

  describe('reset', () => {
    test('clears all history and repetitions', () => {
      for (let i = 0; i < 3; i++) {
        detector.recordAction([clickAction(1)]);
        detector.recordFingerprint(makeFingerprint());
      }
      expect(detector.isStuck().stuck).toBe(true);

      detector.reset();

      expect(detector.isStuck().stuck).toBe(false);
      expect(detector.getTotalRepetitions()).toBe(0);
      expect(detector.getLoopNudgeMessage()).toBe('');
    });
  });

  describe('window size pruning', () => {
    test('keeps action history within bounds', () => {
      const smallWindow = new StallDetector({ windowSize: 5 });

      // Record 15 unique actions, then 3 repeated
      for (let i = 0; i < 15; i++) {
        smallWindow.recordAction([clickAction(i)]);
      }

      // Now repeat same action 3 times
      for (let i = 0; i < 3; i++) {
        smallWindow.recordAction([clickAction(99)]);
      }

      // Should still detect the repetition
      expect(smallWindow.isStuck().stuck).toBe(true);
    });
  });
});

describe('hashPageTree', () => {
  test('produces consistent hash for same input', () => {
    const hash1 = hashPageTree('<div>hello</div>');
    const hash2 = hashPageTree('<div>hello</div>');
    expect(hash1).toBe(hash2);
  });

  test('produces different hash for different input', () => {
    const hash1 = hashPageTree('<div>hello</div>');
    const hash2 = hashPageTree('<div>world</div>');
    expect(hash1).not.toBe(hash2);
  });

  test('returns a base-36 string', () => {
    const hash = hashPageTree('some content');
    expect(typeof hash).toBe('string');
    // Base-36 characters: 0-9, a-z, and optional leading minus
    expect(hash).toMatch(/^-?[0-9a-z]+$/);
  });

  test('handles empty string', () => {
    const hash = hashPageTree('');
    expect(hash).toBe('0');
  });
});

describe('hashTextContent', () => {
  test('produces consistent hash for same input', () => {
    const hash1 = hashTextContent('Hello World');
    const hash2 = hashTextContent('Hello World');
    expect(hash1).toBe(hash2);
  });

  test('normalizes case: same hash for different casing', () => {
    const hash1 = hashTextContent('Hello World');
    const hash2 = hashTextContent('hello world');
    expect(hash1).toBe(hash2);
  });

  test('normalizes whitespace: collapses multiple spaces', () => {
    // The first input must really contain a run of spaces, otherwise the
    // test compares a string with itself and proves nothing.
    const hash1 = hashTextContent('hello    world');
    const hash2 = hashTextContent('hello world');
    expect(hash1).toBe(hash2);
  });

  test('removes punctuation for content-based matching', () => {
    const hash1 = hashTextContent('hello, world!');
    const hash2 = hashTextContent('hello world');
    expect(hash1).toBe(hash2);
  });

  test('handles empty string', () => {
    const hash = hashTextContent('');
    expect(hash).toBe('0');
  });
});
// ── Enhanced Page Fingerprint ──

/** Per-step snapshot of the page, used to decide whether anything changed. */
export interface PageSignature {
  url: string;
  domHash: string;
  /** Vertical scroll offset; compared in buckets of 200 when fingerprints are hashed. */
  scrollY: number;
  elementCount?: number;
  textHash?: string;
}

export interface StallDetectorConfig {
  /** Trailing identical action steps required to report a stall. */
  maxRepeatedActions: number;
  /** Trailing identical page fingerprints required to report a stall. */
  maxRepeatedFingerprints: number;
  /** History window; the detector retains up to `windowSize * 2` entries. */
  windowSize: number;
  /** Number of consecutive stagnant pages before raising stall alert */
  maxStagnantPages: number;
}

const DEFAULT_OPTIONS: StallDetectorConfig = {
  maxRepeatedActions: 3,
  maxRepeatedFingerprints: 3,
  windowSize: 10,
  maxStagnantPages: 5,
};

export interface StallCheckResult {
  stuck: boolean;
  reason?: string;
  /** Escalation level: 0 = not stuck, 1 = mild, 2 = moderate, 3 = severe */
  severity: number;
}

/**
 * Nudge messages that escalate in urgency as repetitions increase.
 * Thresholds: 5 repetitions = mild, 8 = moderate, 12 = severe.
 *
 * Must stay sorted by ascending `threshold`: both the severity lookup and the
 * nudge lookup keep the last entry whose threshold has been reached.
 */
const ESCALATING_NUDGES = [
  {
    threshold: 5,
    severity: 1,
    message:
      'You seem to be repeating similar actions. Consider trying a different approach:\n' +
      '- Click a different element\n' +
      '- Try an alternative navigation path\n' +
      '- Use search to find what you need',
  },
  {
    threshold: 8,
    severity: 2,
    message:
      'WARNING: You are stuck in a loop and have been repeating actions. You MUST change your approach:\n' +
      '- Navigate to a completely different page\n' +
      '- Try a fundamentally different strategy\n' +
      '- If the current approach is not working, consider using the done action to report the issue',
  },
  {
    threshold: 12,
    severity: 3,
    message:
      'CRITICAL: You have been stuck for many steps. This approach is NOT working.\n' +
      'You MUST either:\n' +
      '1. Use the done action to report that the task cannot be completed with your current approach\n' +
      '2. Navigate to a completely different website or page\n' +
      '3. Try a radically different interaction method\n' +
      'Do NOT repeat the same actions again.',
  },
];

/**
 * Detects when the agent has stopped making progress, based on the recorded
 * history of executed action steps and page fingerprints.
 *
 * Checks run in this order and the first match wins: trailing repeated
 * action, 2-step cycle, 3-step cycle, trailing repeated fingerprint,
 * consecutive stagnant pages.
 */
export class StallDetector {
  private actionHistory: string[] = [];
  private fingerprintHistory: PageSignature[] = [];
  private fingerprintHashes: string[] = [];
  private readonly options: StallDetectorConfig;
  private totalRepetitions = 0;

  /** @param options Overrides merged on top of the default thresholds. */
  constructor(options?: Partial<StallDetectorConfig>) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
  }

  /** Record the commands executed in one step as a single normalized history entry. */
  recordAction(actions: Command[]): void {
    this.actionHistory.push(this.normalizeActionHash(actions));

    // Retain twice the configured window so the cycle checks always have context.
    const limit = this.options.windowSize * 2;
    if (this.actionHistory.length > limit) {
      this.actionHistory = this.actionHistory.slice(-limit);
    }
  }

  /** Record the page signature observed in one step. */
  recordFingerprint(fingerprint: PageSignature): void {
    this.fingerprintHistory.push(fingerprint);
    this.fingerprintHashes.push(this.hashFingerprint(fingerprint));

    // Both arrays are pruned together so they stay index-aligned.
    const limit = this.options.windowSize * 2;
    if (this.fingerprintHistory.length > limit) {
      this.fingerprintHistory = this.fingerprintHistory.slice(-limit);
      this.fingerprintHashes = this.fingerprintHashes.slice(-limit);
    }
  }

  /**
   * Evaluate the recorded history and report whether the agent looks stuck.
   *
   * NOTE: this is not a pure query. Every positive result adds to the running
   * repetition total that drives nudge escalation, so calling it twice for the
   * same step counts that step twice.
   */
  isStuck(): StallCheckResult {
    // Same action step repeated back-to-back.
    const actionRepetitions = this.countTrailingRepetitions(this.actionHistory);
    if (actionRepetitions >= this.options.maxRepeatedActions) {
      this.totalRepetitions += actionRepetitions;
      return {
        stuck: true,
        reason: `Same action repeated ${actionRepetitions} times`,
        severity: this.getSeverity(actionRepetitions),
      };
    }

    // Two-step cycle (A -> B -> A -> B). A and B must differ: a run of one
    // identical action is plain repetition and is governed solely by
    // `maxRepeatedActions`, which this check would otherwise bypass for any
    // configured threshold above 3.
    if (this.actionHistory.length >= 4) {
      const [a, b, c, d] = this.actionHistory.slice(-4);
      if (a !== b && a === c && b === d) {
        this.totalRepetitions += 2;
        return {
          stuck: true,
          reason: 'Detected action cycle (alternating between two actions)',
          severity: this.getSeverity(this.totalRepetitions),
        };
      }
    }

    // Three-step cycle (A -> B -> C -> A -> B -> C), again ignoring the
    // degenerate case where all three steps are the same action.
    if (this.actionHistory.length >= 6) {
      const [a, b, c, d, e, f] = this.actionHistory.slice(-6);
      const allIdentical = a === b && b === c;
      if (!allIdentical && a === d && b === e && c === f) {
        this.totalRepetitions += 3;
        return {
          stuck: true,
          reason: 'Detected 3-step action cycle',
          severity: this.getSeverity(this.totalRepetitions),
        };
      }
    }

    // Identical page fingerprint repeated back-to-back.
    const fpRepetitions = this.countTrailingRepetitions(this.fingerprintHashes);
    if (fpRepetitions >= this.options.maxRepeatedFingerprints) {
      this.totalRepetitions += fpRepetitions;
      return {
        stuck: true,
        reason: `Page state unchanged for ${fpRepetitions} steps`,
        severity: this.getSeverity(fpRepetitions),
      };
    }

    // Same URL with a near-constant element count over several steps.
    const stagnantCount = this.countConsecutiveStagnantPages();
    if (stagnantCount >= this.options.maxStagnantPages) {
      this.totalRepetitions += stagnantCount;
      return {
        stuck: true,
        reason: `Page appears stagnant for ${stagnantCount} consecutive steps (same URL and element structure)`,
        severity: this.getSeverity(stagnantCount),
      };
    }

    return { stuck: false, severity: 0 };
  }

  /**
   * Build the warning text to show the model, or `''` when not stuck.
   * Runs `isStuck()` internally and therefore shares its side effect on the
   * repetition total.
   */
  getLoopNudgeMessage(): string {
    const result = this.isStuck();
    if (!result.stuck) {
      return '';
    }

    const nudge = this.getEscalatingNudge();
    return `Warning: ${result.reason ?? 'You appear to be stuck'}.\n${nudge}`;
  }

  /** Get total number of detected repetitions across the session */
  getTotalRepetitions(): number {
    return this.totalRepetitions;
  }

  /** Forget all recorded history and the accumulated repetition total. */
  reset(): void {
    this.actionHistory = [];
    this.fingerprintHistory = [];
    this.fingerprintHashes = [];
    this.totalRepetitions = 0;
  }

  // ── Private helpers ──

  /**
   * Normalize action hash for better deduplication:
   * - Sort search token strings for order-independent matching
   * - Use element index (not full params) for click actions
   * - Use URL (not full params) for navigate actions
   *
   * The commands of one step are joined with `|` into a single key.
   */
  private normalizeActionHash(actions: Command[]): string {
    const normalized = actions.map((action) => {
      switch (action.action) {
        case 'tap':
          // Index is the primary key; transient params such as click count are ignored.
          return `click:${action.index}`;
        case 'type_text':
          return `input_text:${action.index}:${action.text}`;
        case 'navigate':
          return `go_to_url:${action.url}`;
        case 'web_search':
          return `search_google:${this.normalizeSearchQuery(action.query)}`;
        case 'search': {
          const q = 'query' in action ? String((action as Record<string, unknown>).query) : '';
          return `search_page:${this.normalizeSearchQuery(q)}`;
        }
        case 'scroll':
          return `scroll:${action.direction}:${action.index ?? 'page'}`;
        case 'finish':
          return `done:${action.text.slice(0, 50)}`;
        default:
          // Generic fallback: the whole command, stringified.
          return JSON.stringify(action);
      }
    });

    return normalized.join('|');
  }

  /**
   * Normalize a search query by lowercasing and sorting tokens.
   * "best pizza NYC" and "NYC best pizza" produce the same hash.
   */
  private normalizeSearchQuery(query: string): string {
    return query
      .toLowerCase()
      .split(/\s+/)
      .filter(Boolean)
      .sort()
      .join(' ');
  }

  /**
   * Hash a page fingerprint for quick equality checks.
   * Includes URL, DOM hash and scroll position bucket, plus element count and
   * text hash when they are present.
   */
  private hashFingerprint(fp: PageSignature): string {
    const scrollBucket = Math.floor(fp.scrollY / 200);
    const parts = [fp.url, fp.domHash, scrollBucket.toString()];

    if (fp.elementCount !== undefined) {
      parts.push(`e:${fp.elementCount}`);
    }
    if (fp.textHash) {
      parts.push(`t:${fp.textHash}`);
    }

    return parts.join('|');
  }

  /**
   * Count how many trailing entries in a history array are identical.
   * Returns 0 for an empty history and 1 when the last entry is unique.
   */
  private countTrailingRepetitions(history: string[]): number {
    if (history.length === 0) return 0;

    const last = history[history.length - 1];
    let count = 0;
    for (let i = history.length - 1; i >= 0 && history[i] === last; i--) {
      count++;
    }
    return count;
  }

  /**
   * Count consecutive stagnant pages, newest first: same URL as the latest
   * fingerprint and an element count within max(10, 5% of the latest count).
   * Entries missing an element count are matched on URL alone.
   */
  private countConsecutiveStagnantPages(): number {
    const history = this.fingerprintHistory;
    if (history.length < 2) return 0;

    const latest = history[history.length - 1];
    let count = 1;

    for (let i = history.length - 2; i >= 0; i--) {
      const fp = history[i];
      if (fp.url !== latest.url) break;

      if (latest.elementCount !== undefined && fp.elementCount !== undefined) {
        const diff = Math.abs(latest.elementCount - fp.elementCount);
        const threshold = Math.max(10, Math.floor(latest.elementCount * 0.05));
        if (diff > threshold) break;
      }

      count++;
    }

    return count;
  }

  /**
   * Map repetition count to severity level (0-3), using the thresholds
   * declared in `ESCALATING_NUDGES` so the two cannot drift apart.
   */
  private getSeverity(repetitions: number): number {
    let severity = 0;
    for (const nudge of ESCALATING_NUDGES) {
      if (repetitions >= nudge.threshold) {
        severity = nudge.severity;
      }
    }
    return severity;
  }

  /**
   * Get the appropriate escalating nudge message based on total repetitions.
   * Falls back to the mildest nudge while the total is below every threshold.
   */
  private getEscalatingNudge(): string {
    let message = ESCALATING_NUDGES[0]?.message ?? '';
    for (const nudge of ESCALATING_NUDGES) {
      if (this.totalRepetitions >= nudge.threshold) {
        message = nudge.message;
      }
    }
    return message;
  }
}
/**
 * Fold a string into a signed 32-bit hash (`hash * 31 + charCode`, truncated
 * to 32 bits after every step) and render it in base 36.
 * The empty string hashes to `'0'`.
 */
function hashString32(input: string): string {
  let hash = 0;
  for (let i = 0; i < input.length; i++) {
    // (hash << 5) - hash === hash * 31; `| 0` wraps to a signed 32-bit int.
    hash = ((hash << 5) - hash + input.charCodeAt(i)) | 0;
  }
  return hash.toString(36);
}

/**
 * Compute a fast 32-bit hash of a DOM tree string.
 * Used for quick fingerprint comparison.
 *
 * @param domTree Serialized DOM tree; hashed as-is, without normalization.
 * @returns Base-36 hash string, possibly with a leading minus sign.
 */
export function hashPageTree(domTree: string): string {
  return hashString32(domTree);
}

/**
 * Compute a content-based text hash from visible page text.
 * More robust than DOM hash for detecting actual content changes.
 *
 * The text is lowercased, stripped of punctuation and symbols, and has its
 * whitespace collapsed before hashing, so differences in case, spacing or
 * punctuation do not change the result.
 *
 * Letters, combining marks and digits of every script are kept. An ASCII-only
 * `\w` class would delete all non-Latin text and make every such page hash to
 * the same value. Punctuation is removed before whitespace is collapsed so
 * that a punctuation mark standing between two spaces cannot leave a double
 * space behind.
 *
 * @param text Visible page text.
 * @returns Base-36 hash string, possibly with a leading minus sign.
 */
export function hashTextContent(text: string): string {
  const normalized = text
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();

  return hashString32(normalized);
}
/**
 * Baseline agent settings. Supplies a value for every required `AgentConfig`
 * field; the optional fields (URL allow/block lists, output paths, masked
 * values, instruction overrides, compaction policy, expected outcome) are
 * left unset.
 */
export const DEFAULT_AGENT_CONFIG: AgentConfig = {
  // Empty placeholder task.
  task: '',
  stepLimit: 100,
  commandsPerStep: 10,
  failureThreshold: 5,
  // NOTE(review): the unit of retryDelay is not visible in this file — confirm against the caller.
  retryDelay: 10,
  enableScreenshots: true,
  enableScreenshotsForTextExtraction: false,
  contextWindowSize: 128000,
  capturedAttributes: [
    'title',
    'type',
    'name',
    'role',
    'tabindex',
    'aria-label',
    'placeholder',
    'value',
    'alt',
    'aria-expanded',
  ],
  commandDelayMs: 1,
  strategyInterval: 0,
  inlineCommands: true,
  // Extended thinking
  enableDeepReasoning: false,
  reasoningBudget: 10000,
  // Flash mode
  compactMode: false,
  // Timeouts (0 = no timeout)
  stepDeadlineMs: 0,
  modelDeadlineMs: 0,
  // Planning system
  enableStrategy: false,
  restrategizeOnStall: false,
  // URL extraction from task text
  autoNavigateToUrls: true,
  // Coordinate clicking auto-enable per model
  autoEnableCoordinateClicking: false,
  // Judge integration
  enableEvaluation: false,
  enableSimpleJudge: false,
  // Demo mode
  enableVisualTracer: false,
  // Initial actions before main loop
  preflightCommands: [],
  // Dynamic action schema rebuild per step
  dynamicCommandSchema: false,
};
// ── Agent Brain (LLM thought process) ──

/** The model's structured self-assessment for one step. */
export const ReasoningSchema = z.object({
  evaluation: z.string().describe('Assessment of the current state'),
  memory: z.string().describe('Important information to remember'),
  nextGoal: z.string().describe('Next immediate goal'),
});

export type Reasoning = z.infer<typeof ReasoningSchema>;

// ── Agent Output (what LLM returns each step) ──

/**
 * Full per-step output: the reasoning block, the actions to run, and optional
 * top-level copies of the reasoning fields plus free-form thinking.
 * Actions are loosely typed records at this level.
 */
export const AgentDecisionSchema = z.object({
  currentState: ReasoningSchema,
  actions: z.array(z.record(z.unknown())).describe('Actions to execute'),
  thinking: z.string().optional().describe('Extended thinking / chain-of-thought'),
  evaluation: z
    .string()
    .optional()
    .describe('Top-level evaluation (mirrors currentState.evaluation for convenience)'),
  memory: z
    .string()
    .optional()
    .describe('Top-level memory note (mirrors currentState.memory for convenience)'),
  nextGoal: z
    .string()
    .optional()
    .describe('Top-level next goal (mirrors currentState.nextGoal for convenience)'),
});

export type AgentDecision = z.infer<typeof AgentDecisionSchema>;

/**
 * Simplified output schema for flash / lightweight models that skip extended thinking.
 * Only contains the essential fields: current state evaluation + actions.
 */
export const AgentDecisionCompactSchema = z.object({
  currentState: z.object({
    evaluation: z.string().describe('Brief assessment'),
    nextGoal: z.string().describe('Next immediate goal'),
  }),
  actions: z.array(z.record(z.unknown())).describe('Actions to execute'),
});

export type AgentDecisionCompact = z.infer<typeof AgentDecisionCompactSchema>;

/**
 * Output variant that omits the extended thinking field.
 * Used when the model does not support or should not produce chain-of-thought.
 */
export const AgentDecisionDirectSchema = z.object({
  currentState: ReasoningSchema,
  actions: z.array(z.record(z.unknown())).describe('Actions to execute'),
});

export type AgentDecisionDirect = z.infer<typeof AgentDecisionDirectSchema>;
// ── Detected Variable ──

/**
 * A variable or piece of data detected during agent execution,
 * e.g. a confirmation number, order ID, or extracted value.
 */
export interface ExtractedVariable {
  /** Human-readable name (e.g. "order_id", "confirmation_number"). */
  name: string;
  /** The detected value as a string. */
  value: string;
  /** Where this variable was found. */
  source: 'extraction' | 'action_result' | 'page_content' | 'user_input';
  /** Step number where this variable was detected. */
  step?: number;
}

// ── Agent State ──

/**
 * Counters, lifecycle flags and running totals for one agent run.
 * NOTE(review): nothing in this part of the file reads or writes these
 * fields, so their exact semantics should be confirmed against the agent loop.
 */
export interface AgentState {
  step: number;
  stepLimit: number;
  failureCount: number;
  consecutiveFailures: number;
  isRunning: boolean;
  isPaused: boolean;
  isDone: boolean;
  lastResult?: string;
  currentUrl?: string;
  totalInputTokens: number;
  totalOutputTokens: number;
  // NOTE(review): AccumulatedCost is not among the imports visible at the top
  // of this file — verify it is declared or imported elsewhere.
  cumulativeCost: AccumulatedCost;
  currentPlan?: string;
  lastPlanStep?: number;
}

// ── History ──

/** One recorded agent step, as stored in `ExecutionLog.entries`. */
export interface StepRecord {
  step: number;
  timestamp: number;
  /** Browser snapshot for the step; ExecutionLog reads its `url` and `screenshot`. */
  browserState: ViewportHistory;
  agentOutput: AgentDecision;
  /** Per-command results; ExecutionLog reads `isDone`, `extractedContent` and `error`. */
  actionResults: CommandResult[];
  /** Step-level error; reported by `ExecutionLog.errors()` ahead of per-command errors. */
  error?: string;
  /** Token usage; summed into the ExecutionLog input/output token totals. */
  usage?: InferenceUsage;
  duration: number;
  metadata?: StepTelemetry;
  /** Variables detected in this step; collected by `ExecutionLog.allExtractedVariables()`. */
  detectedVariables?: ExtractedVariable[];
}
*/ export class ExecutionLog { readonly entries: StepRecord[]; readonly task: string; readonly startTime: number; endTime?: number; totalDuration?: number; totalSteps: number; totalInputTokens: number; totalOutputTokens: number; constructor(init: { entries?: StepRecord[]; task: string; startTime?: number; }) { this.entries = init.entries ?? []; this.task = init.task; this.startTime = init.startTime ?? Date.now(); this.totalSteps = this.entries.length; this.totalInputTokens = 0; this.totalOutputTokens = 0; this.recomputeTotals(); } /** Recalculate aggregate totals from entries. Called internally and from static factories. */ recomputeTotals(): void { this.totalSteps = this.entries.length; this.totalInputTokens = 0; this.totalOutputTokens = 0; for (const entry of this.entries) { if (entry.usage) { this.totalInputTokens += entry.usage.inputTokens; this.totalOutputTokens += entry.usage.outputTokens; } } } /** Push a new entry and update totals. */ addEntry(entry: StepRecord): void { this.entries.push(entry); this.recomputeTotals(); } /** Mark the history as finished. */ finish(): void { this.endTime = Date.now(); this.totalDuration = this.endTime - this.startTime; this.recomputeTotals(); } /** * Returns the final result text from the last "done" action, or undefined * if the agent never completed with a done action. */ finalResult(): string | undefined { for (let i = this.entries.length - 1; i >= 0; i--) { const entry = this.entries[i]; for (const result of entry.actionResults) { if (result.isDone && result.extractedContent) { return result.extractedContent; } } } return undefined; } /** * Whether the agent reached a "done" action at any point. */ isDone(): boolean { return this.entries.some((entry) => entry.actionResults.some((r) => r.isDone), ); } /** * Deduplicated list of all URLs visited during execution (in order of first visit). 
*/
  urls(): string[] {
    // A Set iterates in insertion order, which gives first-visit order for free.
    const visited = new Set<string>();
    for (const entry of this.entries) {
      const url = entry.browserState.url;
      if (url) {
        visited.add(url);
      }
    }
    return Array.from(visited);
  }

  /**
   * All screenshot base64 strings collected during execution (chronological).
   */
  screenshots(): string[] {
    const result: string[] = [];
    for (const entry of this.entries) {
      const shot = entry.browserState.screenshot;
      if (shot) {
        result.push(shot);
      }
    }
    return result;
  }

  /**
   * All errors encountered during execution: for every step, the step-level
   * error (if any) followed by the errors of its action results.
   */
  errors(): string[] {
    const result: string[] = [];
    for (const entry of this.entries) {
      if (entry.error) {
        result.push(entry.error);
      }
      for (const actionResult of entry.actionResults) {
        if (actionResult.error) {
          result.push(actionResult.error);
        }
      }
    }
    return result;
  }

  /**
   * All detected variables across all steps, in step order.
   */
  allExtractedVariables(): ExtractedVariable[] {
    // flatMap avoids spreading a potentially large array into push() arguments.
    return this.entries.flatMap((entry) => entry.detectedVariables ?? []);
  }

  /**
   * Serialize the full history to a JSON-compatible object for saving to disk.
   * Screenshot payloads are replaced by a short placeholder.
   */
  toJSON(): Record<string, unknown> {
    return {
      task: this.task,
      startTime: this.startTime,
      endTime: this.endTime,
      totalDuration: this.totalDuration,
      totalSteps: this.totalSteps,
      totalInputTokens: this.totalInputTokens,
      totalOutputTokens: this.totalOutputTokens,
      entries: this.entries.map((e) => ({
        ...e,
        // Strip screenshot data from serialized form to keep file size down
        browserState: {
          ...e.browserState,
          screenshot: e.browserState.screenshot ? '[screenshot omitted]' : undefined,
        },
      })),
    };
  }

  /**
   * Save the history to a file at the given path (JSON format).
   * Returns the written path.
*/
  async saveToFile(filePath: string): Promise<string> {
    const { writeFile, mkdir } = await import('node:fs/promises');
    const { dirname } = await import('node:path');
    // Create missing parent directories so callers can pass a fresh path.
    await mkdir(dirname(filePath), { recursive: true });
    const json = JSON.stringify(this.toJSON(), null, 2);
    await writeFile(filePath, json, 'utf-8');
    return filePath;
  }

  /**
   * Load history from a JSON file. Screenshots will be placeholders.
   *
   * Fields with an unexpected type fall back to their defaults (empty task,
   * current time, no entries); totals are recomputed from the loaded entries.
   *
   * @throws TypeError when the file does not contain a JSON object.
   */
  static async loadFromFile(filePath: string): Promise<ExecutionLog> {
    const { readFile } = await import('node:fs/promises');
    const raw = await readFile(filePath, 'utf-8');
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== 'object' || parsed === null) {
      throw new TypeError(`Invalid execution log in ${filePath}: expected a JSON object`);
    }
    const data = parsed as Record<string, unknown>;

    const list = new ExecutionLog({
      task: typeof data.task === 'string' ? data.task : '',
      startTime: typeof data.startTime === 'number' ? data.startTime : Date.now(),
    });
    list.endTime = typeof data.endTime === 'number' ? data.endTime : undefined;
    list.totalDuration = typeof data.totalDuration === 'number' ? data.totalDuration : undefined;

    // NOTE(review): entries are trusted to have the StepRecord shape; they are not validated.
    if (Array.isArray(data.entries)) {
      for (const entry of data.entries as StepRecord[]) {
        list.entries.push(entry);
      }
    }
    list.recomputeTotals();
    return list;
  }
}

// ── Plan ──

export const PlanStepSchema = z.object({
  id: z.number(),
  description: z.string(),
  status: z.enum(['pending', 'in_progress', 'completed', 'failed', 'blocked', 'skipped']),
  note: z.string().optional(),
});

export type PlanStep = z.infer<typeof PlanStepSchema>;

export const StrategyPlanSchema = z.object({
  items: z.array(PlanStepSchema),
});

// ── Judgement ──

export const EvaluationResultSchema = z.object({
  isComplete: z.boolean(),
  reason: z.string(),
  confidence: z.number().min(0).max(1),
  verdict: z.string().optional().describe('Short human-readable verdict (e.g. "success", "partial", "failed")'),
  failureReason: z.string().optional().describe('Detailed reason if the task failed'),
  impossibleTask: z.boolean().optional().describe('Whether the task appears impossible to complete'),
  reachedCaptcha: z.boolean().optional().describe('Whether a CAPTCHA was encountered that blocked progress'),
});

export type EvaluationResult = z.infer<typeof EvaluationResultSchema>;

/**
 * Lightweight judgement result for simple pass/fail evaluation
 * without confidence scoring or detailed analysis.
 */
export const QuickCheckResultSchema = z.object({
  passed: z.boolean(),
  reason: z.string(),
  shouldRetry: z.boolean().optional().describe('Whether the agent should retry with a different approach'),
});

export type QuickCheckResult = z.infer<typeof QuickCheckResultSchema>;

// ── Cost Tracking ──

export interface StepCostBreakdown {
  inputCost: number;
  outputCost: number;
  totalCost: number;
}

export interface AccumulatedCost {
  totalInputTokens: number;
  totalOutputTokens: number;
  totalInputCost: number;
  totalOutputCost: number;
  totalCost: number;
}

/** Per-model pricing in USD per 1M tokens */
export interface PricingTable {
  inputPer1M: number;
  outputPer1M: number;
}

export const PRICING_TABLE: Record<string, PricingTable> = {
  'gpt-4o': { inputPer1M: 2.5, outputPer1M: 10 },
  'gpt-4o-mini': { inputPer1M: 0.15, outputPer1M: 0.6 },
  'gpt-4-turbo': { inputPer1M: 10, outputPer1M: 30 },
  'claude-3-opus': { inputPer1M: 15, outputPer1M: 75 },
  'claude-3-5-sonnet': { inputPer1M: 3, outputPer1M: 15 },
  'claude-3-5-haiku': { inputPer1M: 0.8, outputPer1M: 4 },
  'claude-3-haiku': { inputPer1M: 0.25, outputPer1M: 1.25 },
  'gemini-2.0-flash': { inputPer1M: 0.1, outputPer1M: 0.4 },
  'gemini-1.5-pro': { inputPer1M: 1.25, outputPer1M: 5 },
  'gemini-1.5-flash': { inputPer1M: 0.075, outputPer1M: 0.3 },
};

/**
 * Compute the USD cost of one step from its token counts.
 *
 * The model id is matched against `PRICING_TABLE` keys by prefix, and the
 * LONGEST matching key wins. Taking the first match instead would price
 * `gpt-4o-mini-…` with the `gpt-4o` entry, because `gpt-4o` is a prefix of
 * `gpt-4o-mini` and is listed first.
 *
 * @param inputTokens  Prompt tokens consumed.
 * @param outputTokens Completion tokens produced.
 * @param modelId      Model identifier, possibly with a version/date suffix.
 * @returns The cost breakdown, or `undefined` when no pricing entry matches.
 */
export function calculateStepCost(
  inputTokens: number,
  outputTokens: number,
  modelId: string,
): StepCostBreakdown | undefined {
  let pricing: PricingTable | undefined;
  let matchedLength = -1;
  for (const [key, value] of Object.entries(PRICING_TABLE)) {
    if (key.length > matchedLength && modelId.startsWith(key)) {
      pricing = value;
      matchedLength = key.length;
    }
  }

  if (!pricing) return undefined;

  const inputCost = (inputTokens / 1_000_000) * pricing.inputPer1M;
  const outputCost = (outputTokens / 1_000_000) * pricing.outputPer1M;
  return { inputCost, outputCost, totalCost: inputCost + outputCost };
}

// ── Plan Update ──

export const PlanRevisionSchema = z.object({
  plan: z.string().describe('Updated plan based on current progress'),
  reasoning: z.string().describe('Why the plan was updated'),
});

export type PlanRevision = z.infer<typeof PlanRevisionSchema>;

// ── Model capability helpers ──
// All three helpers use substring matching on the model id.

const EXTENDED_THINKING_MODELS = [
  'claude-3-5-sonnet',
  'claude-3-opus',
  'claude-3-7-sonnet',
  'claude-4',
  'o1',
  'o1-pro',
  'o3',
  'o3-mini',
  'gemini-2.0-flash-thinking',
  'deepseek-r1',
];

/** True when `modelId` contains one of the `EXTENDED_THINKING_MODELS` names. */
export function supportsDeepReasoning(modelId: string): boolean {
  return EXTENDED_THINKING_MODELS.some((m) => modelId.includes(m));
}

const COORDINATE_CLICK_MODELS = [
  'gpt-4o',
  'claude-3-5-sonnet',
  'claude-4',
  'gemini-2.0',
  'gemini-1.5-pro',
];

/** True when `modelId` contains one of the `COORDINATE_CLICK_MODELS` names. */
export function supportsCoordinateMode(modelId: string): boolean {
  return COORDINATE_CLICK_MODELS.some((m) => modelId.includes(m));
}

const FLASH_MODELS = [
  'gpt-4o-mini',
  'claude-3-haiku',
  'claude-3-5-haiku',
  'gemini-1.5-flash',
  'gemini-2.0-flash',
];

/** True when `modelId` contains one of the `FLASH_MODELS` names. */
export function isCompactModel(modelId: string): boolean {
  return FLASH_MODELS.some((m) => modelId.includes(m));
}

// ── Agent Run Result ──

export interface RunOutcome {
  finalResult?: string;
  success: boolean;
  history: ExecutionLog;
  errors: string[];
  detectedVariables?: ExtractedVariable[];
  judgement?: EvaluationResult;
  simpleJudgement?: QuickCheckResult;
  totalCost?: AccumulatedCost;
}

================================================
FILE: packages/core/src/bridge/adapter.ts
================================================
import { z, type ZodTypeAny } from 'zod';
import type { CommandExecutor } from '../commands/executor.js';

export interface MCPToolDefinition {
  name: string;
  description: string;
  inputSchema: Record<string, unknown>;
}

export class BridgeAdapter {
  private tools:
CommandExecutor;

  /** Prefix added to every registry command name when exposed as an MCP tool. */
  private static readonly TOOL_PREFIX = 'browser_';

  constructor(tools: CommandExecutor) {
    this.tools = tools;
  }

  /** Describe every registered command as an MCP tool (`browser_<name>`). */
  getToolDefinitions(): MCPToolDefinition[] {
    return this.tools.registry.getAll().map((action) => ({
      name: `${BridgeAdapter.TOOL_PREFIX}${action.name}`,
      description: action.description,
      inputSchema: this.zodToJsonSchema(action.schema),
    }));
  }

  /** MCP tool names for all registered commands. */
  getToolNames(): string[] {
    return this.tools.registry.getNames().map((name) => `${BridgeAdapter.TOOL_PREFIX}${name}`);
  }

  /**
   * Strip the `browser_` prefix from an MCP tool name.
   * @returns The registry command name, or `null` when the prefix is missing.
   */
  parseToolName(mcpToolName: string): string | null {
    if (mcpToolName.startsWith(BridgeAdapter.TOOL_PREFIX)) {
      return mcpToolName.slice(BridgeAdapter.TOOL_PREFIX.length);
    }
    return null;
  }

  /**
   * Convert a command's zod schema into a JSON-Schema object description.
   * Only `ZodObject` schemas contribute properties; anything else yields a
   * bare `{ type: 'object' }`.
   */
  private zodToJsonSchema(schema: ZodTypeAny): Record<string, unknown> {
    const jsonSchema: Record<string, unknown> = { type: 'object' };

    if (schema instanceof z.ZodObject) {
      const shape = schema.shape;
      const properties: Record<string, unknown> = {};
      const required: string[] = [];

      for (const [key, value] of Object.entries(shape)) {
        const fieldSchema = value as ZodTypeAny;
        properties[key] = this.fieldToJsonSchema(fieldSchema);
        // A field with a default may be omitted by the caller just like an
        // optional one, so neither belongs in `required`.
        const omittable =
          fieldSchema instanceof z.ZodOptional || fieldSchema instanceof z.ZodDefault;
        if (!omittable) {
          required.push(key);
        }
      }

      jsonSchema.properties = properties;
      if (required.length > 0) {
        jsonSchema.required = required;
      }
    }

    return jsonSchema;
  }

  /**
   * Convert a single field schema. Handles string, number, boolean, enum,
   * array, optional, default and literal; every other zod type degrades to
   * `{ type: 'object' }`.
   */
  private fieldToJsonSchema(schema: ZodTypeAny): Record<string, unknown> {
    if (schema instanceof z.ZodString) {
      return { type: 'string', description: schema.description };
    }
    if (schema instanceof z.ZodNumber) {
      return { type: 'number', description: schema.description };
    }
    if (schema instanceof z.ZodBoolean) {
      return { type: 'boolean', description: schema.description };
    }
    if (schema instanceof z.ZodEnum) {
      return { type: 'string', enum: schema.options, description: schema.description };
    }
    if (schema instanceof z.ZodArray) {
      return {
        type: 'array',
        items: this.fieldToJsonSchema(schema.element),
        description: schema.description,
      };
    }
    if (schema instanceof z.ZodOptional) {
      return this.withWrapperDescription(this.fieldToJsonSchema(schema.unwrap()), schema);
    }
    if (schema instanceof z.ZodDefault) {
      const inner = this.withWrapperDescription(
        this.fieldToJsonSchema(schema.removeDefault()),
        schema,
      );
      inner.default = schema._def.defaultValue();
      return inner;
    }
    if (schema instanceof z.ZodLiteral) {
      return { const: schema.value, description: schema.description };
    }
    return { type: 'object', description: schema.description };
  }

  /**
   * Keep a description that was attached to the wrapper (for example
   * `.optional().describe(...)`) when the unwrapped schema has none.
   */
  private withWrapperDescription(
    inner: Record<string, unknown>,
    wrapper: ZodTypeAny,
  ): Record<string, unknown> {
    if (inner.description === undefined && wrapper.description !== undefined) {
      inner.description = wrapper.description;
    }
    return inner;
  }
}

================================================
FILE: packages/core/src/bridge/client.ts
================================================
import { type ChildProcess, spawn } from 'node:child_process';
import { EventEmitter } from 'node:events';
import type { CustomCommandSpec } from '../commands/types.js';
import { createLogger } from '../logging.js';

const logger = createLogger('mcp-client');

// ── Types ──

export interface BridgeClientOptions {
  command: string;
  args?: string[];
  env?: Record<string, string>;
  /** Timeout per JSON-RPC request in ms (default: 30_000) */
  requestTimeoutMs?: number;
  /** Maximum reconnection attempts (default: 5) */
  maxReconnectAttempts?: number;
  /** Initial reconnection delay in ms, doubles each attempt (default: 1000) */
  reconnectDelayMs?: number;
  /** Interval between health checks in ms (0 to disable, default: 0) */
  healthCheckIntervalMs?: number;
}

export interface MCPTool {
  name: string;
  description: string;
  inputSchema: Record<string, unknown>;
}

export type MCPConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecting';

/** Bookkeeping for one in-flight JSON-RPC request. */
interface PendingRequest {
  resolve: (value: unknown) => void;
  reject: (error: Error) => void;
  timer: ReturnType<typeof setTimeout>;
  method: string;
}

export interface BridgeClientEvents {
  stateChange: [state: MCPConnectionState, previousState: MCPConnectionState];
  error: [error: Error];
  notification: [method: string, params: Record<string, unknown> | undefined];
}

/**
 * MCP client that connects to external MCP servers and converts their tools
 * into custom browser actions.
*
 * Features:
 * - Reconnection with exponential backoff
 * - Per-call request timeout
 * - Concurrent request multiplexing (multiple in-flight requests)
 * - Tool list caching with invalidation
 * - Health check / ping
 * - Event emitter for connection state changes
 * - Graceful shutdown with pending request drain
 */
export class BridgeClient extends EventEmitter {
  private process: ChildProcess | null = null;
  private requestId = 0;
  private pendingRequests = new Map<number, PendingRequest>();
  private options: BridgeClientOptions;
  private buffer = '';

  // ── Connection state ──
  private _state: MCPConnectionState = 'disconnected';
  private reconnectAttempts = 0;
  private reconnectTimer: ReturnType<typeof setTimeout> | null = null;

  // ── Tool caching ──
  private cachedTools: MCPTool[] | null = null;
  private toolsCacheTimestamp = 0;

  // ── Health check ──
  private healthCheckTimer: ReturnType<typeof setInterval> | null = null;

  // ── Config ──
  private readonly requestTimeoutMs: number;
  private readonly maxReconnectAttempts: number;
  private readonly reconnectDelayMs: number;
  private readonly healthCheckIntervalMs: number;

  constructor(options: BridgeClientOptions) {
    super();
    this.options = options;
    this.requestTimeoutMs = options.requestTimeoutMs ?? 30_000;
    this.maxReconnectAttempts = options.maxReconnectAttempts ?? 5;
    this.reconnectDelayMs = options.reconnectDelayMs ?? 1000;
    this.healthCheckIntervalMs = options.healthCheckIntervalMs ?? 0;
  }

  // ── Public accessors ──

  get state(): MCPConnectionState {
    return this._state;
  }

  get isConnected(): boolean {
    return this._state === 'connected';
  }

  // ── Connection lifecycle ──

  /**
   * Spawn the server process, run the MCP handshake, warm the tool cache and
   * start health checks. A no-op when already connected.
   *
   * If spawning or the handshake fails, the half-started child is killed and
   * the state returns to `disconnected` before the error is rethrown, so a
   * later `connect()` starts from a clean slate.
   */
  async connect(): Promise<void> {
    if (this._state === 'connected') {
      logger.debug('Already connected, skipping connect()');
      return;
    }

    this.setState('connecting');
    try {
      await this.spawnProcess();
      await this.initialize();
    } catch (error) {
      if (this.process) {
        this.process.removeAllListeners();
        this.process.kill();
        this.process = null;
      }
      this.buffer = '';
      this.setState('disconnected');
      throw error;
    }
    this.setState('connected');
    this.reconnectAttempts = 0;

    // Warm the tool cache
    await this.listTools();

    // Start health checks if configured
    this.startHealthChecks();

    logger.info(`Connected to MCP server: ${this.options.command}`);
  }

  /** Start the child process and wire its stdout/stderr/lifecycle events. */
  private async spawnProcess(): Promise<void> {
    const child = spawn(this.options.command, this.options.args ?? [], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: { ...process.env, ...this.options.env },
    });
    this.process = child;

    child.stdout?.setEncoding('utf-8');
    child.stdout?.on('data', (data: string) => {
      this.buffer += data;
      this.processBuffer();
    });

    child.stderr?.on('data', (data: Buffer) => {
      logger.warn(`[MCP stderr] ${data.toString().trimEnd()}`);
    });

    // 'error' and 'close' can both fire for the same child. Run the close
    // handling only while this child is still the current one, so a second
    // or stale event cannot disturb a reconnect that is already under way.
    const handleGone = (): void => {
      if (this.process !== child) return;
      this.handleProcessClose();
    };

    child.on('close', (code: number | null) => {
      logger.info(`MCP server process exited with code ${code}`);
      handleGone();
    });

    child.on('error', (error: Error) => {
      logger.error(`MCP server process error: ${error.message}`);
      this.emit('error', error);
      handleGone();
    });
  }

  /** Perform the MCP `initialize` request followed by the `initialized` notification. */
  private async initialize(): Promise<void> {
    await this.send('initialize', {
      protocolVersion: '2024-11-05',
      capabilities: {},
      clientInfo: { name: 'open-browser', version: '0.1.0' },
    });

    // Send initialized notification (no id, no response expected)
    this.sendNotification('notifications/initialized');
  }

  // ── State management ──

  /** Update the connection state and emit `stateChange` when it actually changes. */
  private setState(newState: MCPConnectionState): void {
    const previousState = this._state;
    if (previousState === newState) return;
    this._state = newState;
    logger.debug(`Connection state: ${previousState} -> ${newState}`);
    this.emit('stateChange', newState, previousState);
  }

  // ── Reconnection ──
private handleProcessClose(): void { const wasPreviouslyConnected = this._state === 'connected'; // Reject all pending requests for (const [id, pending] of this.pendingRequests) { clearTimeout(pending.timer); pending.reject(new Error('MCP server disconnected')); } this.pendingRequests.clear(); this.process = null; this.buffer = ''; if (wasPreviouslyConnected) { this.attemptReconnect(); } else { this.setState('disconnected'); } } private attemptReconnect(): void { if (this.reconnectAttempts >= this.maxReconnectAttempts) { logger.error(`Max reconnection attempts (${this.maxReconnectAttempts}) reached`); this.setState('disconnected'); this.emit('error', new Error('MCP server reconnection failed after all attempts')); return; } this.setState('reconnecting'); this.reconnectAttempts++; const delay = this.reconnectDelayMs * 2 ** (this.reconnectAttempts - 1); logger.info( `Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})`, ); this.reconnectTimer = setTimeout(async () => { this.reconnectTimer = null; try { await this.spawnProcess(); await this.initialize(); this.setState('connected'); this.reconnectAttempts = 0; // Invalidate tool cache on reconnect -- server may have changed this.invalidateToolCache(); await this.listTools(); this.startHealthChecks(); logger.info('Reconnected to MCP server'); } catch (error) { logger.warn( `Reconnect attempt ${this.reconnectAttempts} failed: ${ error instanceof Error ? error.message : String(error) }`, ); this.attemptReconnect(); } }, delay); } // ── Tool caching ── async listTools(): Promise { if (this.cachedTools) { return this.cachedTools; } const result = (await this.send('tools/list', {})) as { tools: MCPTool[] }; this.cachedTools = result.tools ?? []; this.toolsCacheTimestamp = Date.now(); logger.debug(`Cached ${this.cachedTools.length} tools from MCP server`); return this.cachedTools; } /** Get cached tools synchronously. Returns empty array if cache is cold. 
*/
  getTools(): MCPTool[] {
    return this.cachedTools ?? [];
  }

  /** Force-invalidate the tool cache. Next listTools() call will re-fetch. */
  invalidateToolCache(): void {
    this.cachedTools = null;
    this.toolsCacheTimestamp = 0;
  }

  /** Milliseconds elapsed since the tool cache was last populated, or 0 if it is empty. */
  get toolsCacheAge(): number {
    return this.toolsCacheTimestamp > 0 ? Date.now() - this.toolsCacheTimestamp : 0;
  }

  // ── Tool invocation ──

  /**
   * Wrap every cached MCP tool as a custom command named `mcp_<tool>`.
   * Uses the synchronous cache, so call `listTools()` (or `connect()`) first.
   */
  toCustomActions(): CustomCommandSpec[] {
    // NOTE(review): `require` is kept from the original (lazy load of zod); it is
    // not available in a pure ESM Node runtime — confirm the target runtime.
    const { z } = require('zod');
    const tools = this.getTools();

    return tools.map((tool) => ({
      name: `mcp_${tool.name}`,
      description: `[MCP] ${tool.description}`,
      // A plain `z.object({})` strips unknown keys, so any validation through
      // this schema would drop every tool argument. `passthrough()` keeps them.
      schema: z.object({}).passthrough(),
      handler: async (params: Record<string, unknown>) => {
        const result = await this.callTool(tool.name, params);
        return {
          success: true,
          extractedContent: typeof result === 'string' ? result : JSON.stringify(result),
        };
      },
    }));
  }

  /**
   * Invoke a tool on the server via `tools/call`.
   *
   * @returns The first text content item if there is one, otherwise the raw result.
   * @throws Error with the server's text (or a generic message) when the
   *   result is flagged `isError`; also rejects on transport errors/timeouts.
   */
  async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
    const result = (await this.send('tools/call', { name, arguments: args })) as {
      content: Array<{ type: string; text?: string }>;
      isError?: boolean;
    };

    const firstText = result.content?.find((c) => c.type === 'text')?.text;
    if (result.isError) {
      throw new Error(firstText ?? 'MCP tool call failed');
    }

    return firstText ?? result;
  }

  // ── Health check ──

  /** Send a ping to verify the server is responsive. Rejects if no pong within timeout.
*/
  async ping(): Promise<void> {
    await this.send('ping', {});
  }

  /** (Re)start the periodic ping; does nothing when the interval is <= 0. */
  private startHealthChecks(): void {
    this.stopHealthChecks();
    if (this.healthCheckIntervalMs <= 0) return;

    this.healthCheckTimer = setInterval(async () => {
      try {
        await this.ping();
      } catch {
        // Best effort: a failed ping is only reported, it does not trigger a reconnect.
        logger.warn('Health check failed');
      }
    }, this.healthCheckIntervalMs);
  }

  private stopHealthChecks(): void {
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
  }

  // ── JSON-RPC transport ──

  /**
   * Send a JSON-RPC request as one newline-terminated line on the child's
   * stdin and resolve with the matching response's `result`.
   *
   * Rejects immediately when stdin is not writable, and after
   * `requestTimeoutMs` when no response with the same id arrives.
   */
  private send(method: string, params?: Record<string, unknown>): Promise<unknown> {
    const stdin = this.process?.stdin;
    if (!stdin?.writable) {
      return Promise.reject(new Error('MCP client is not connected'));
    }

    const id = ++this.requestId;

    return new Promise<unknown>((resolve, reject) => {
      // Per-call timeout
      const timer = setTimeout(() => {
        this.pendingRequests.delete(id);
        reject(new Error(`MCP request timed out after ${this.requestTimeoutMs}ms: ${method}`));
      }, this.requestTimeoutMs);

      // Register before writing so the response can always be matched.
      this.pendingRequests.set(id, { resolve, reject, timer, method });

      const request = JSON.stringify({
        jsonrpc: '2.0',
        id,
        method,
        params,
      });

      stdin.write(`${request}\n`);
    });
  }

  /** Send a JSON-RPC notification (no id, no response expected). */
  private sendNotification(method: string, params?: Record<string, unknown>): void {
    if (!this.process?.stdin?.writable) return;
    const notification = JSON.stringify({
      jsonrpc: '2.0',
      method,
      ...(params ? { params } : {}),
    });
    this.process.stdin.write(`${notification}\n`);
  }

  /**
   * Consume complete lines from the stdout buffer. Each line is parsed as one
   * JSON-RPC message: messages without an id go to
   * `handleServerNotification`, messages with an id settle the matching
   * pending request. A trailing partial line stays in the buffer.
   */
  private processBuffer(): void {
    const lines = this.buffer.split('\n');
    this.buffer = lines.pop() ?? '';

    for (const line of lines) {
      if (!line.trim()) continue;

      let parsed: unknown;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Servers sometimes print non-protocol text on stdout; skip it.
        logger.debug('Ignoring malformed line from MCP server');
        continue;
      }
      if (typeof parsed !== 'object' || parsed === null) continue;

      const message = parsed as {
        id?: number | string | null;
        method: string;
        params?: Record<string, unknown>;
        result?: unknown;
        error?: { message?: string };
      };

      try {
        // JSON-RPC notification from server (no id field)
        if (message.id === undefined || message.id === null) {
          this.handleServerNotification(message);
          continue;
        }

        // Response to a pending request (request ids are always numbers here)
        const pending =
          typeof message.id === 'number' ? this.pendingRequests.get(message.id) : undefined;
        if (!pending) continue;

        clearTimeout(pending.timer);
        this.pendingRequests.delete(message.id as number);

        if (message.error) {
          pending.reject(new Error(message.error.message));
        } else {
          pending.resolve(message.result);
        }
      } catch (error) {
        // A listener threw while handling the message; keep reading, but make it visible.
        logger.warn(
          `Error while handling MCP message: ${
            error instanceof Error ? error.message : String(error)
          }`,
        );
      }
    }
  }

  /** Re-emit a server notification and drop the tool cache on `list_changed`. */
  private handleServerNotification(message: {
    method: string;
    params?: Record<string, unknown>;
  }): void {
    logger.debug(`Server notification: ${message.method}`);
    this.emit('notification', message.method, message.params);

    // If server signals tool list changed, invalidate cache
    if (message.method === 'notifications/tools/list_changed') {
      this.invalidateToolCache();
    }
  }

  // ── Graceful shutdown ──

  /**
   * Disconnect gracefully: wait for pending requests to drain (up to a timeout),
   * then kill the server process.
*/
  async disconnect(drainTimeoutMs = 5000): Promise<void> {
    this.stopHealthChecks();

    if (this.reconnectTimer) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = null;
    }

    // Wait for pending requests to drain
    if (this.pendingRequests.size > 0) {
      logger.debug(
        `Waiting for ${this.pendingRequests.size} pending request(s) to drain...`,
      );
      await this.waitForPendingDrain(drainTimeoutMs);
    }

    // Reject any still-pending requests
    for (const pending of this.pendingRequests.values()) {
      clearTimeout(pending.timer);
      pending.reject(new Error('MCP client shutting down'));
    }
    this.pendingRequests.clear();

    // Kill the process
    if (this.process) {
      // Listeners go first so the kill does not trigger reconnect handling.
      this.process.removeAllListeners();
      this.process.kill();
      this.process = null;
    }

    this.buffer = '';
    this.setState('disconnected');
    logger.info('MCP client disconnected');
  }

  /**
   * Resolve once no requests are in flight, or once `timeoutMs` has elapsed,
   * whichever comes first. The deadline is checked inside the poll itself, so
   * no timer is left running after the promise settles.
   *
   * @param timeoutMs Maximum time to wait; defaults to waiting indefinitely.
   */
  private waitForPendingDrain(timeoutMs = Number.POSITIVE_INFINITY): Promise<void> {
    const deadline = Date.now() + timeoutMs;
    return new Promise<void>((resolve) => {
      const check = (): void => {
        const remaining = deadline - Date.now();
        if (this.pendingRequests.size === 0 || remaining <= 0) {
          resolve();
        } else {
          setTimeout(check, Math.min(50, remaining));
        }
      };
      check();
    });
  }

  /** Get the number of in-flight requests. */
  get pendingRequestCount(): number {
    return this.pendingRequests.size;
  }
}

================================================
FILE: packages/core/src/bridge/index.ts
================================================
export { BridgeServer, type BridgeServerOptions } from './server.js';
export { BridgeClient, type BridgeClientOptions } from './client.js';
export { BridgeAdapter } from './adapter.js';

================================================
FILE: packages/core/src/bridge/mcp-types.ts
================================================
/**
 * Experimental MCP (Model Context Protocol) server types.
* @experimental
 */
export interface MCPServerOptions {
  port?: number;
  host?: string;
  capabilities?: MCPCapability[];
}

export type MCPCapability = 'browse' | 'extract' | 'screenshot' | 'interact';

export interface MCPRequest {
  method: string;
  params: Record<string, unknown>;
}

export interface MCPResponse {
  result?: unknown;
  error?: { code: number; message: string };
}

================================================
FILE: packages/core/src/bridge/server.test.ts
================================================
import { test, expect, describe, beforeEach, mock } from 'bun:test';
import { BridgeServer, type MCPRequest, type MCPResponse } from './server.js';
import { CommandExecutor } from '../commands/executor.js';

// ── Mock factories ──

/** Stub viewport: every method is a `mock()` resolving to a fixed value. */
function makeMockViewport() {
  return {
    currentPage: {
      goBack: mock(() => Promise.resolve()),
      evaluate: mock(() => Promise.resolve({})),
      mouse: { click: mock(() => Promise.resolve()) },
      keyboard: { press: mock(() => Promise.resolve()) },
    },
    cdp: {
      send: mock(() => Promise.resolve({})),
    },
    navigate: mock(() => Promise.resolve()),
    waitForPageReady: mock(() => Promise.resolve()),
    switchTab: mock(() => Promise.resolve()),
    newTab: mock(() => Promise.resolve()),
    closeTab: mock(() => Promise.resolve()),
    screenshot: mock(() =>
      Promise.resolve({ base64: 'abc123', width: 1280, height: 800 }),
    ),
    isConnected: true,
    getState: mock(() =>
      Promise.resolve({
        url: 'https://example.com',
        title: 'Example',
        tabs: [{ url: 'https://example.com', title: 'Example' }],
      }),
    ),
  } as any;
}

/** Stub page analyzer returning a fixed page state and no-op element actions. */
function makeMockPageAnalyzer() {
  return {
    extractState: mock(() =>
      Promise.resolve({
        tree: '...',
        selectorMap: {},
        elementCount: 5,
        interactiveElementCount: 2,
        scrollPosition: { x: 0, y: 0 },
        viewportSize: { width: 1280, height: 800 },
        documentSize: { width: 1280, height: 2000 },
        pixelsAbove: 0,
        pixelsBelow: 1200,
      }),
    ),
    clickElementByIndex: mock(() => Promise.resolve()),
    inputTextByIndex: mock(() => Promise.resolve()),
    getElementSelector: mock(() => Promise.resolve('#el')),
  } as any;
}
function makeRequest( method: string, id: number | string = 1, params?: Record, ): MCPRequest & { id: number | string } { return { jsonrpc: '2.0' as const, id, method, ...(params ? { params } : {}), }; } // ── Tests ── describe('BridgeServer', () => { let server: BridgeServer; let browser: ReturnType; let domService: ReturnType; let tools: CommandExecutor; beforeEach(() => { browser = makeMockViewport(); domService = makeMockPageAnalyzer(); tools = new CommandExecutor(); server = new BridgeServer({ browser, domService, tools, name: 'test-server', version: '1.0.0', }); }); describe('handleRequest: initialize', () => { test('returns server info and capabilities', async () => { const response = await server.handleRequest(makeRequest('initialize')); expect(response.jsonrpc).toBe('2.0'); expect(response.id).toBe(1); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.protocolVersion).toBe('2024-11-05'); expect(result.serverInfo.name).toBe('test-server'); expect(result.serverInfo.version).toBe('1.0.0'); expect(result.capabilities.tools).toBeDefined(); expect(result.capabilities.resources).toBeDefined(); expect(result.capabilities.resources.subscribe).toBe(true); }); }); describe('handleRequest: tools/list', () => { test('returns list of available tools', async () => { const response = await server.handleRequest(makeRequest('tools/list')); expect(response.result).toBeDefined(); const result = response.result as any; expect(Array.isArray(result.tools)).toBe(true); expect(result.tools.length).toBeGreaterThan(0); // Each tool should have name, description, inputSchema const firstTool = result.tools[0]; expect(firstTool.name).toBeDefined(); expect(firstTool.description).toBeDefined(); expect(firstTool.inputSchema).toBeDefined(); // Tool names should be prefixed with browser_ expect(firstTool.name.startsWith('browser_')).toBe(true); }); }); describe('handleRequest: tools/call', () => { test('executes a browser tool and returns result', 
async () => { const response = await server.handleRequest( makeRequest('tools/call', 1, { name: 'browser_tap', arguments: { index: 0 }, }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.content).toBeDefined(); expect(Array.isArray(result.content)).toBe(true); expect(result.content[0].type).toBe('text'); expect(result.isError).toBe(false); }); test('returns error for unknown tool', async () => { const response = await server.handleRequest( makeRequest('tools/call', 1, { name: 'unknown_tool', arguments: {}, }), ); expect(response.error).toBeDefined(); expect(response.error!.code).toBe(-32602); expect(response.error!.message).toContain('Unknown tool'); }); test('returns error for tool that does not start with browser_', async () => { const response = await server.handleRequest( makeRequest('tools/call', 1, { name: 'not_browser_tool', arguments: {}, }), ); expect(response.error).toBeDefined(); expect(response.error!.code).toBe(-32602); }); test('returns success content for done action', async () => { const response = await server.handleRequest( makeRequest('tools/call', 1, { name: 'browser_finish', arguments: { text: 'All done' }, }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.content[0].text).toContain('All done'); }); }); describe('handleRequest: resources/list', () => { test('returns available resources', async () => { const response = await server.handleRequest(makeRequest('resources/list')); expect(response.result).toBeDefined(); const result = response.result as any; expect(Array.isArray(result.resources)).toBe(true); const uris = result.resources.map((r: any) => r.uri); expect(uris).toContain('browser://state'); expect(uris).toContain('browser://dom'); expect(uris).toContain('browser://screenshot'); expect(uris).toContain('browser://tabs'); // Each resource should have standard fields for (const resource of result.resources) { expect(resource.name).toBeDefined(); 
expect(resource.description).toBeDefined(); expect(resource.mimeType).toBeDefined(); } }); }); describe('handleRequest: resources/read', () => { test('reads browser://state resource', async () => { const response = await server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://state' }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.contents).toBeDefined(); expect(result.contents[0].uri).toBe('browser://state'); expect(result.contents[0].mimeType).toBe('application/json'); expect(result.contents[0].text).toBeDefined(); const state = JSON.parse(result.contents[0].text); expect(state.url).toBe('https://example.com'); }); test('reads browser://dom resource', async () => { const response = await server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://dom' }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.contents[0].uri).toBe('browser://dom'); expect(result.contents[0].mimeType).toBe('text/plain'); expect(result.contents[0].text).toContain(''); }); test('reads browser://screenshot resource', async () => { const response = await server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://screenshot' }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.contents[0].uri).toBe('browser://screenshot'); expect(result.contents[0].mimeType).toBe('image/png'); expect(result.contents[0].blob).toBe('abc123'); }); test('reads browser://tabs resource', async () => { const response = await server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://tabs' }), ); expect(response.result).toBeDefined(); const result = response.result as any; expect(result.contents[0].uri).toBe('browser://tabs'); const tabs = JSON.parse(result.contents[0].text); expect(Array.isArray(tabs)).toBe(true); }); test('returns error for unknown resource URI', async () => { const response = await 
server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://nonexistent' }), ); expect(response.error).toBeDefined(); expect(response.error!.message).toContain('Unknown resource URI'); }); test('returns error when uri parameter is missing', async () => { const response = await server.handleRequest( makeRequest('resources/read', 1, {}), ); expect(response.error).toBeDefined(); expect(response.error!.message).toContain('Missing required parameter'); }); }); describe('handleRequest: unknown method', () => { test('returns method not found error', async () => { const response = await server.handleRequest( makeRequest('unknown/method'), ); expect(response.error).toBeDefined(); expect(response.error!.code).toBe(-32601); expect(response.error!.message).toContain('Method not found'); }); }); describe('handleRequest: ping', () => { test('responds to ping', async () => { const response = await server.handleRequest(makeRequest('ping')); expect(response.jsonrpc).toBe('2.0'); expect(response.result).toEqual({}); }); }); describe('handleRequest: resources/subscribe', () => { test('subscribes to a valid resource', async () => { const response = await server.handleRequest( makeRequest('resources/subscribe', 1, { uri: 'browser://state' }), ); expect(response.result).toEqual({}); expect(response.error).toBeUndefined(); }); test('returns error for unknown resource URI', async () => { const response = await server.handleRequest( makeRequest('resources/subscribe', 1, { uri: 'browser://invalid' }), ); expect(response.error).toBeDefined(); expect(response.error!.message).toContain('Unknown resource URI'); }); test('returns error when uri is missing', async () => { const response = await server.handleRequest( makeRequest('resources/subscribe', 1, {}), ); expect(response.error).toBeDefined(); }); }); describe('handleRequest: resources/unsubscribe', () => { test('unsubscribes from a resource', async () => { // First subscribe await server.handleRequest( 
makeRequest('resources/subscribe', 1, { uri: 'browser://state' }), ); // Then unsubscribe const response = await server.handleRequest( makeRequest('resources/unsubscribe', 2, { uri: 'browser://state' }), ); expect(response.result).toEqual({}); }); test('returns error when uri is missing', async () => { const response = await server.handleRequest( makeRequest('resources/unsubscribe', 1, {}), ); expect(response.error).toBeDefined(); }); }); describe('error handling', () => { test('returns error response for synchronously thrown errors', async () => { // Test with a method that will cause a synchronous error in the handler // The try/catch in handleRequest catches synchronous errors from switch cases const response = await server.handleRequest( makeRequest('resources/read', 1, { uri: 'browser://nonexistent' }), ); expect(response.jsonrpc).toBe('2.0'); expect(response.error).toBeDefined(); expect(response.error!.message).toContain('Unknown resource URI'); }); test('returns error for tools/call when execution fails', async () => { // Modify the domService to throw on clickElementByIndex domService.clickElementByIndex = mock(() => Promise.reject(new Error('Unexpected crash')), ); const failServer = new BridgeServer({ browser, domService, tools, }); // CommandFailedError propagates from registry.execute through // handleToolsCall. Since handleRequest returns (not awaits) the // promise from handleToolsCall, the error may propagate as a // rejection. We handle both cases. 
// Tail of the "returns error for tools/call when execution fails" test: either a
// JSON-RPC response carrying an error or a rejected promise is accepted.
      try {
        const response = await failServer.handleRequest(
          makeRequest('tools/call', 1, {
            name: 'browser_tap',
            arguments: { index: 0 },
          }),
        );
        // If it returns a response, it should have an error field
        expect(response.jsonrpc).toBe('2.0');
        const hasError = response.error !== undefined;
        const hasIsError = (response.result as any)?.isError === true;
        expect(hasError || hasIsError).toBe(true);
      } catch (error) {
        // If the error propagates as a rejection, that is acceptable too
        expect(error).toBeDefined();
      }
    });
  });

  describe('handleMessage (with notifications)', () => {
    test('returns null for notification (no id)', async () => {
      const notification: MCPRequest = {
        jsonrpc: '2.0',
        method: 'notifications/initialized',
      };
      const response = await server.handleMessage(notification);
      expect(response).toBeNull();
    });

    test('returns response for request (with id)', async () => {
      const request: MCPRequest = {
        jsonrpc: '2.0',
        id: 1,
        method: 'ping',
      };
      const response = await server.handleMessage(request);
      expect(response).not.toBeNull();
      expect(response!.result).toEqual({});
    });
  });
});

================================================
FILE: packages/core/src/bridge/server.ts
================================================
import type { IncomingMessage, ServerResponse } from 'node:http';
import type { Viewport } from '../viewport/viewport.js';
import type { PageAnalyzer } from '../page/page-analyzer.js';
import type { CommandExecutor } from '../commands/executor.js';
import type { ExecutionContext } from '../commands/types.js';
import { BridgeAdapter, type MCPToolDefinition } from './adapter.js';
import { createLogger } from '../logging.js';

const logger = createLogger('mcp-server');

// ── JSON-RPC types ──

/** Options accepted by the BridgeServer constructor. */
export interface BridgeServerOptions {
  browser: Viewport;
  domService: PageAnalyzer;
  tools: CommandExecutor;
  /** Optional server name override. */
  name?: string;
  /** Optional server version override. */
  version?: string;
  /** Port for SSE transport (default: 3100) */
  ssePort?: number;
}

/** Incoming JSON-RPC message; a message without `id` is a notification. */
export interface MCPRequest {
  jsonrpc: '2.0';
  id?: string | number;
  method:
// Remaining members of the MCPRequest interface opened on the previous source line.
    string;
  params?: Record<string, unknown>;
}

/** JSON-RPC response: carries either `result` or `error` for the request `id`. */
export interface MCPResponse {
  jsonrpc: '2.0';
  id: string | number;
  result?: unknown;
  error?: { code: number; message: string; data?: unknown };
}

/** Server-to-client JSON-RPC notification (no `id`, no reply expected). */
export interface MCPNotification {
  jsonrpc: '2.0';
  method: string;
  params?: Record<string, unknown>;
}

// ── Resource types ──

/** Descriptor of a readable resource as listed by `resources/list`. */
export interface MCPResource {
  uri: string;
  name: string;
  description: string;
  mimeType: string;
}

/** Payload of a resource read: textual (`text`) or base64 (`blob`) content. */
export interface MCPResourceContent {
  uri: string;
  mimeType: string;
  text?: string;
  blob?: string;
}

// ── Subscription tracking ──

interface ResourceSubscription {
  uri: string;
  /** Callback that receives the notification to send to the client */
  notify: (notification: MCPNotification) => void;
}

/**
 * MCP (Model Context Protocol) server that exposes browser actions as tools
 * and browser state as resources. Supports stdio and SSE transports.
 *
 * Implements:
 * - initialize / tools/list / tools/call (existing)
 * - resources/list / resources/read (browser state as resources)
 * - resources/subscribe / resources/unsubscribe (live updates)
 * - notifications/progress (step progress notifications)
 * - SSE transport via HTTP
 */
export class BridgeServer {
  private controller: BridgeAdapter;
  private browser: Viewport;
  private domService: PageAnalyzer;
  private tools: CommandExecutor;
  private name: string;
  private version: string;
  private ssePort: number;

  /** Active SSE connections that receive notifications */
  private sseClients = new Set<ServerResponse>();

  /** Resource subscriptions keyed by URI */
  private subscriptions = new Map<string, Set<ResourceSubscription>>();

  /** Last screenshot base64 cache for resource reads */
  private lastScreenshotBase64: string | null = null;

  /** HTTP server reference for SSE transport */
  private httpServer: import('node:http').Server | null = null;

  /**
   * Store the collaborators, wrap the command executor in a BridgeAdapter and
   * apply defaults for the optional name, version and SSE port.
   */
  constructor(options: BridgeServerOptions) {
    this.browser = options.browser;
    this.domService = options.domService;
    this.tools = options.tools;
    this.controller = new BridgeAdapter(options.tools);
    this.name = options.name ?? 'open-browser';
    this.version = options.version ??
// Remainder of the BridgeServer constructor: default version and SSE port.
      '0.1.0';
    this.ssePort = options.ssePort ?? 3100;
  }

  // ── Static resource definitions ──

  /** Return the fixed set of `browser://` resources this server advertises. */
  private getResourceDefinitions(): MCPResource[] {
    return [
      {
        uri: 'browser://state',
        name: 'Browser State',
        description: 'Current browser state summary including URL, title, and active tab',
        mimeType: 'application/json',
      },
      {
        uri: 'browser://dom',
        name: 'DOM Tree',
        description: 'Current page DOM tree serialized for LLM consumption',
        mimeType: 'text/plain',
      },
      {
        uri: 'browser://screenshot',
        name: 'Screenshot',
        description: 'Last screenshot of the current page as base64 PNG',
        mimeType: 'image/png',
      },
      {
        uri: 'browser://tabs',
        name: 'Open Tabs',
        description: 'List of all open browser tabs with URLs and titles',
        mimeType: 'application/json',
      },
    ];
  }

  // ── Request dispatcher ──

  /**
   * Entry point for any incoming JSON-RPC message.
   *
   * @returns the response for a request, or `null` for a notification.
   */
  async handleMessage(message: MCPRequest): Promise<MCPResponse | null> {
    // JSON-RPC notifications have no `id` field -- they are fire-and-forget
    if (message.id === undefined || message.id === null) {
      await this.handleNotification(message);
      return null;
    }
    return this.handleRequest(message as MCPRequest & { id: string | number });
  }

  /**
   * Dispatch a JSON-RPC request to the matching protocol handler.
   *
   * Unknown methods produce a -32601 error response. Any error thrown or
   * rejected by a handler is converted into a -32603 error response.
   */
  async handleRequest(request: MCPRequest & { id: string | number }): Promise<MCPResponse> {
    try {
      switch (request.method) {
        case 'initialize':
          return this.handleInitialize(request);
        case 'tools/list':
          return this.handleToolsList(request);
        case 'tools/call':
          // `await` is required: returning the bare promise would let a
          // rejection escape this try/catch instead of becoming an error response.
          return await this.handleToolsCall(request);
        case 'resources/list':
          return this.handleResourcesList(request);
        case 'resources/read':
          return await this.handleResourcesRead(request);
        case 'resources/subscribe':
          return this.handleResourcesSubscribe(request);
        case 'resources/unsubscribe':
          return this.handleResourcesUnsubscribe(request);
        case 'ping':
          return { jsonrpc: '2.0', id: request.id, result: {} };
        default:
          return {
            jsonrpc: '2.0',
            id: request.id,
            error: { code: -32601, message: `Method not found: ${request.method}` },
          };
      }
    } catch (error) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: {
          code: -32603,
          message: error instanceof Error ?
// Tail of handleRequest's catch clause: the message of the -32603 error response.
            error.message : String(error),
        },
      };
    }
  }

  /** Handle incoming JSON-RPC notifications (no response expected). */
  private async handleNotification(message: MCPRequest): Promise<void> {
    switch (message.method) {
      case 'notifications/initialized':
        logger.debug('Client confirmed initialization');
        break;
      case 'notifications/cancelled': {
        // Only logged; nothing in this handler aborts the referenced request.
        const requestId = message.params?.requestId;
        logger.debug(`Client cancelled request ${requestId}`);
        break;
      }
      default:
        logger.debug(`Received unknown notification: ${message.method}`);
    }
  }

  // ── Protocol handlers ──

  /** Answer `initialize` with the protocol version, capabilities and server info. */
  private handleInitialize(request: MCPRequest & { id: string | number }): MCPResponse {
    return {
      jsonrpc: '2.0',
      id: request.id,
      result: {
        protocolVersion: '2024-11-05',
        capabilities: {
          tools: {},
          resources: {
            subscribe: true,
            listChanged: true,
          },
        },
        serverInfo: {
          name: this.name,
          version: this.version,
        },
      },
    };
  }

  /** Answer `tools/list` with name, description and input schema of every tool. */
  private handleToolsList(request: MCPRequest & { id: string | number }): MCPResponse {
    const tools = this.controller.getToolDefinitions();
    return {
      jsonrpc: '2.0',
      id: request.id,
      result: {
        tools: tools.map((t) => ({
          name: t.name,
          description: t.description,
          inputSchema: t.inputSchema,
        })),
      },
    };
  }

  /**
   * Execute the tool named in `params.name` with `params.arguments` and wrap
   * the command result as MCP text content.
   */
  private async handleToolsCall(request: MCPRequest & { id: string | number }): Promise<MCPResponse> {
    const params = request.params ?? {};
    const toolName = params.name as string;
    const args = (params.arguments ??
// Continuation of handleToolsCall: default for missing tool arguments.
      {}) as Record<string, unknown>;

    const actionName = this.controller.parseToolName(toolName);
    if (!actionName) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: { code: -32602, message: `Unknown tool: ${toolName}` },
      };
    }

    // Emit progress notification at start
    this.emitProgress(request.id, 0, `Executing ${toolName}...`);

    const context: ExecutionContext = {
      page: this.browser.currentPage,
      cdpSession: this.browser.cdp!,
      domService: this.domService,
      browserSession: this.browser,
    };

    const result = await this.tools.registry.execute(actionName, args, context);

    // Emit progress notification at completion
    this.emitProgress(request.id, 1, 'Complete');

    // Notify subscribers that browser state may have changed
    this.notifyResourceChanged('browser://state');
    this.notifyResourceChanged('browser://dom');

    return {
      jsonrpc: '2.0',
      id: request.id,
      result: {
        content: [
          {
            type: 'text',
            // Prefer extracted content; otherwise report success or the error text.
            text: result.extractedContent ?? (result.success ? 'Success' : `Error: ${result.error}`),
          },
        ],
        isError: !result.success,
      },
    };
  }

  // ── Resource handlers ──

  /** Answer `resources/list` with the static resource definitions. */
  private handleResourcesList(request: MCPRequest & { id: string | number }): MCPResponse {
    return {
      jsonrpc: '2.0',
      id: request.id,
      result: {
        resources: this.getResourceDefinitions(),
      },
    };
  }

  /**
   * Answer `resources/read` for `params.uri`. A missing uri, or any failure
   * while reading the resource, yields a -32602 error response.
   */
  private async handleResourcesRead(request: MCPRequest & { id: string | number }): Promise<MCPResponse> {
    const uri = request.params?.uri as string;
    if (!uri) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: { code: -32602, message: 'Missing required parameter: uri' },
      };
    }

    try {
      const content = await this.readResource(uri);
      return {
        jsonrpc: '2.0',
        id: request.id,
        result: {
          contents: [content],
        },
      };
    } catch (error) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: {
          code: -32602,
          message: error instanceof Error ?
// Tail of handleResourcesRead's catch clause: the message of the -32602 error response.
            error.message : String(error),
        },
      };
    }
  }

  /**
   * Produce the content of one `browser://` resource.
   *
   * @throws Error when `uri` is not one of the known resource URIs.
   */
  private async readResource(uri: string): Promise<MCPResourceContent> {
    switch (uri) {
      case 'browser://state': {
        const state = await this.browser.getState();
        return {
          uri,
          mimeType: 'application/json',
          text: JSON.stringify(state, null, 2),
        };
      }
      case 'browser://dom': {
        const domState = await this.domService.extractState(
          this.browser.currentPage,
          this.browser.cdp!,
        );
        return {
          uri,
          mimeType: 'text/plain',
          text: domState.tree,
        };
      }
      case 'browser://screenshot': {
        const screenshot = await this.browser.screenshot();
        this.lastScreenshotBase64 = screenshot.base64;
        return {
          uri,
          mimeType: 'image/png',
          blob: screenshot.base64,
        };
      }
      case 'browser://tabs': {
        const state = await this.browser.getState();
        return {
          uri,
          mimeType: 'application/json',
          text: JSON.stringify(state.tabs, null, 2),
        };
      }
      default:
        throw new Error(`Unknown resource URI: ${uri}`);
    }
  }

  /** Record interest in a known resource URI so change notifications are emitted for it. */
  private handleResourcesSubscribe(request: MCPRequest & { id: string | number }): MCPResponse {
    const uri = request.params?.uri as string;
    if (!uri) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: { code: -32602, message: 'Missing required parameter: uri' },
      };
    }

    const validUris = new Set(this.getResourceDefinitions().map((r) => r.uri));
    if (!validUris.has(uri)) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: { code: -32602, message: `Unknown resource URI: ${uri}` },
      };
    }

    // The subscription is tracked; actual notification delivery happens
    // via emitNotification which writes to all connected transports
    if (!this.subscriptions.has(uri)) {
      this.subscriptions.set(uri, new Set());
    }

    logger.debug(`Client subscribed to resource: ${uri}`);
    return { jsonrpc: '2.0', id: request.id, result: {} };
  }

  /** Drop the subscription entry for `params.uri`; unknown URIs are accepted silently. */
  private handleResourcesUnsubscribe(request: MCPRequest & { id: string | number }): MCPResponse {
    const uri = request.params?.uri as string;
    if (!uri) {
      return {
        jsonrpc: '2.0',
        id: request.id,
        error: { code: -32602, message: 'Missing required parameter: uri' },
      };
    }

    this.subscriptions.delete(uri);
    // Single-line template literal: a line break inside it would end up in the log message.
    logger.debug(`Client unsubscribed from resource: ${uri}`);
    return { jsonrpc: '2.0', id: request.id, result: {} };
  }

  // ── Notification emission ──

  /** Emit a progress notification for an in-flight request. */
  emitProgress(requestId: string | number, progress: number, message?: string): void {
    const notification: MCPNotification = {
      jsonrpc: '2.0',
      method: 'notifications/progress',
      params: {
        progressToken: requestId,
        progress,
        total: 1,
        ...(message ? { message } : {}),
      },
    };
    this.broadcastNotification(notification);
  }

  /** Notify subscribers that a resource has changed. */
  private notifyResourceChanged(uri: string): void {
    if (!this.subscriptions.has(uri)) return;

    const notification: MCPNotification = {
      jsonrpc: '2.0',
      method: 'notifications/resources/updated',
      params: { uri },
    };
    this.broadcastNotification(notification);
  }

  /** Send a notification to all connected transports (SSE clients + stdio). */
  private broadcastNotification(notification: MCPNotification): void {
    const serialized = JSON.stringify(notification);

    // SSE clients
    for (const client of this.sseClients) {
      try {
        client.write(`data: ${serialized}\n\n`);
      } catch {
        // Client may have disconnected; will be cleaned up
        this.sseClients.delete(client);
      }
    }
  }

  // ── Stdio transport ──

  /**
   * Serve newline-delimited JSON-RPC over stdin/stdout. Input is buffered and
   * split on '\n'; an incomplete trailing line stays in the buffer.
   */
  async startStdio(): Promise<void> {
    const stdin = process.stdin;
    const stdout = process.stdout;

    stdin.setEncoding('utf-8');

    let buffer = '';
    stdin.on('data', async (data: string) => {
      buffer += data;
      const lines = buffer.split('\n');
      buffer = lines.pop() ??
// Continuation of startStdio's 'data' handler: default for the leftover partial line.
        '';

      for (const line of lines) {
        if (!line.trim()) continue;
        try {
          const message = JSON.parse(line) as MCPRequest;
          const response = await this.handleMessage(message);
          if (response) {
            stdout.write(`${JSON.stringify(response)}\n`);
          }
        } catch {
          // Any failure while parsing or handling the line is reported as a parse error.
          const errorResponse: MCPResponse = {
            jsonrpc: '2.0',
            id: 0,
            error: { code: -32700, message: 'Parse error' },
          };
          stdout.write(`${JSON.stringify(errorResponse)}\n`);
        }
      }
    });

    stdin.on('end', () => {
      process.exit(0);
    });
  }

  // ── SSE transport ──

  /**
   * Start an HTTP server that exposes the MCP protocol over Server-Sent Events.
   *
   * Endpoints:
   * - GET /sse -- SSE event stream for notifications and responses
   * - POST /message -- Send JSON-RPC requests
   * - GET /health -- Health check
   *
   * @param port overrides the port configured at construction time.
   */
  async startSSE(port?: number): Promise<void> {
    const http = await import('node:http');
    const listenPort = port ?? this.ssePort;

    this.httpServer = http.createServer(async (req: IncomingMessage, res: ServerResponse) => {
      // CORS headers for browser clients
      res.setHeader('Access-Control-Allow-Origin', '*');
      res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS');
      res.setHeader('Access-Control-Allow-Headers', 'Content-Type');

      if (req.method === 'OPTIONS') {
        res.writeHead(204);
        res.end();
        return;
      }

      const url = req.url ??
// Continuation of startSSE's request listener: default URL, then routing by method and path.
        '/';

      if (req.method === 'GET' && url === '/sse') {
        this.handleSSEConnection(res);
        return;
      }

      if (req.method === 'POST' && url === '/message') {
        await this.handleSSEMessage(req, res);
        return;
      }

      if (req.method === 'GET' && url === '/health') {
        res.writeHead(200, { 'Content-Type': 'application/json' });
        res.end(JSON.stringify({
          status: 'ok',
          server: this.name,
          version: this.version,
          browserConnected: this.browser.isConnected,
        }));
        return;
      }

      res.writeHead(404, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'Not found' }));
    });

    return new Promise<void>((resolve, reject) => {
      const server = this.httpServer!;
      // A failed bind (e.g. port already in use) is reported through the 'error'
      // event; without a listener the returned promise would never settle.
      const onListenError = (error: Error): void => reject(error);
      server.once('error', onListenError);
      server.listen(listenPort, () => {
        server.off('error', onListenError);
        logger.info(`MCP SSE server listening on port ${listenPort}`);
        resolve();
      });
    });
  }

  /** Open an SSE stream on `res` and register it as a notification target until it closes. */
  private handleSSEConnection(res: ServerResponse): void {
    res.writeHead(200, {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
      Connection: 'keep-alive',
    });

    // Send endpoint info as the first event so the client knows where to POST
    const endpointEvent = JSON.stringify({ endpoint: '/message' });
    res.write(`event: endpoint\ndata: ${endpointEvent}\n\n`);

    this.sseClients.add(res);
    logger.debug(`SSE client connected (total: ${this.sseClients.size})`);

    res.on('close', () => {
      this.sseClients.delete(res);
      logger.debug(`SSE client disconnected (total: ${this.sseClients.size})`);
    });
  }

  /**
   * Handle a POSTed JSON-RPC message: read the whole body, dispatch it, and
   * reply over HTTP as well as on every open SSE stream.
   */
  private async handleSSEMessage(req: IncomingMessage, res: ServerResponse): Promise<void> {
    let body = '';
    for await (const chunk of req) {
      body += chunk;
    }

    try {
      const message = JSON.parse(body) as MCPRequest;
      const response = await this.handleMessage(message);

      if (response) {
        // Send response both as HTTP response and as SSE event
        res.writeHead(200, { 'Content-Type': 'application/json' });
        res.end(JSON.stringify(response));

        // Also push to SSE stream for clients that expect it there
        const serialized = JSON.stringify(response);
        for (const client of this.sseClients) {
          try {
            client.write(`event: message\ndata: ${serialized}\n\n`);
          } catch {
            this.sseClients.delete(client);
          }
        }
      }
// Continuation of handleSSEMessage: branch taken when handleMessage returned null.
      else {
        // Notification -- no response body
        res.writeHead(202);
        res.end();
      }
    } catch {
      res.writeHead(400, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ jsonrpc: '2.0', id: 0, error: { code: -32700, message: 'Parse error' } }));
    }
  }

  /** Stop the SSE HTTP server and disconnect all clients. */
  async stopSSE(): Promise<void> {
    for (const client of this.sseClients) {
      try {
        client.end();
      } catch {
        // Ignore
      }
    }
    this.sseClients.clear();

    if (this.httpServer) {
      return new Promise((resolve) => {
        this.httpServer!.close(() => {
          this.httpServer = null;
          logger.info('MCP SSE server stopped');
          resolve();
        });
      });
    }
  }

  /** Stop all transports and clean up. */
  async stop(): Promise<void> {
    await this.stopSSE();
    this.subscriptions.clear();
  }
}

================================================
FILE: packages/core/src/commands/catalog/catalog.ts
================================================
import { z, type ZodTypeAny } from 'zod';
import type { CatalogEntry, CatalogOptions } from './types.js';
import type { CommandResult, ExecutionContext, CustomCommandSpec } from '../types.js';
import { CommandFailedError } from '../../errors.js';
import { escapeRegExp } from '../../utils.js';

// ── Special parameter names ──
// These parameter names, when found in a handler's function signature,
// are automatically injected from the ExecutionContext instead of from
// the action's validated params.

const SPECIAL_PARAMS = new Set([
  'browserSession',
  'cdpSession',
  'page',
  'domService',
  'extractionLlm',
  'fileSystem',
  'maskedValues',
]);

/**
 * Parse the parameter names from a function's source text.
 * Handles arrow functions, regular functions, destructured params, etc.
/**
 * Extract the top-level parameter names of `handler` from its source text.
 *
 * Recognized forms: `(a, b) => …`, `function name(a, b) { … }` (each optionally
 * prefixed with `async`) and the paren-less single-parameter arrow `a => …`.
 * Any other source shape yields an empty array.
 *
 * The parameter list is read with a balanced, quote-aware scan, so a default
 * value that contains parentheses or commas (`a = f(1, 2)`, `s = 'x,y'`) does
 * not cut the list short or split one parameter in two.
 *
 * @param handler function whose `toString()` output is inspected
 * @returns one cleaned entry per top-level parameter, in declaration order
 */
function inspectHandlerParams(handler: Function): string[] {
  const source = handler.toString();

  // Everything up to and including the parenthesis that opens the parameter list.
  const listStart = /^(?:async\s+)?(?:function\s*\w*\s*)?\(/.exec(source);
  if (!listStart) {
    // Single parameter without parens: a =>
    const bareArrow = /^(?:async\s+)?(\w+)\s*=>/.exec(source);
    return bareArrow ? [bareArrow[1]] : [];
  }

  const rawParams: string[] = [];
  let current = '';
  let depth = 0; // nesting level of (), [] and {} inside the parameter list
  let quote = ''; // active string delimiter, or '' when outside a string

  for (let i = listStart[0].length; i < source.length; i++) {
    const char = source.charAt(i);

    if (quote) {
      current += char;
      if (char === '\\') {
        // Copy the escaped character verbatim so an escaped quote does not end the string.
        i++;
        current += source.charAt(i);
      } else if (char === quote) {
        quote = '';
      }
      continue;
    }

    if (char === '"' || char === "'" || char === '`') {
      quote = char;
      current += char;
      continue;
    }

    // The first unnested ')' closes the parameter list.
    if (char === ')' && depth === 0) break;

    if (char === ',' && depth === 0) {
      rawParams.push(current.trim());
      current = '';
      continue;
    }

    if (char === '(' || char === '[' || char === '{') {
      depth++;
    } else if (char === ')' || char === ']' || char === '}') {
      depth--;
    }
    current += char;
  }

  if (current.trim()) {
    rawParams.push(current.trim());
  }

  // Clean up: remove type annotations, defaults, destructuring
  return rawParams.map((param) => {
    // Remove default values: param = defaultVal
    const withoutDefault = param.split('=')[0].trim();
    // Remove type annotations: param: Type
    const withoutType = withoutDefault.split(':')[0].trim();
    // For a destructured param only the outer bracket characters are stripped;
    // for our purposes we only care about top-level named params
    return withoutType.replace(/^[{[(]|[})\]]$/g, '').trim();
  });
}

/**
 * Detect which special parameters a handler function expects,
 * based on its parameter names (beyond the standard params + context args).
*/
function detectSpecialParams(handler: Function): Set<string> {
  const paramNames = inspectHandlerParams(handler);
  const detected = new Set<string>();

  // Keep only the names that are recognized special parameters.
  for (const name of paramNames) {
    if (SPECIAL_PARAMS.has(name)) {
      detected.add(name);
    }
  }

  return detected;
}

/**
 * Resolve a special parameter value from the ExecutionContext.
 *
 * @returns the matching context property, or `undefined` for an unknown name.
 */
function resolveSpecialParam(
  name: string,
  context: ExecutionContext,
): unknown {
  switch (name) {
    case 'browserSession':
      return context.browserSession;
    case 'cdpSession':
      return context.cdpSession;
    case 'page':
      return context.page;
    case 'domService':
      return context.domService;
    case 'extractionLlm':
      return context.extractionLlm;
    case 'fileSystem':
      return context.fileSystem;
    case 'maskedValues':
      return context.maskedValues;
    default:
      return undefined;
  }
}

/** Registry of named commands with schema validation and special-parameter injection. */
export class CommandCatalog {
  private actions = new Map<string, CatalogEntry>();
  private specialParamsCache = new Map<string, Set<string>>();
  private options: CatalogOptions;

  constructor(options?: CatalogOptions) {
    this.options = options ?? {};
  }

  /**
   * Register `action` unless the catalog options exclude it: names listed in
   * `excludeActions` are skipped, and when `includeActions` is non-empty only
   * the names it lists are accepted.
   */
  register(action: CatalogEntry): void {
    if (this.options.excludeActions?.includes(action.name)) return;
    if (
      this.options.includeActions &&
      this.options.includeActions.length > 0 &&
      !this.options.includeActions.includes(action.name)
    ) {
      return;
    }
    this.actions.set(action.name, action);

    // Pre-compute which special parameters the handler expects
    const specialParams = detectSpecialParams(action.handler);
    if (specialParams.size > 0) {
      this.specialParamsCache.set(action.name, specialParams);
    } else {
      // Re-registering a name must not keep the cache entry of the handler it replaces.
      this.specialParamsCache.delete(action.name);
    }
  }

  /** Register a custom command by copying its fields into a catalog entry. */
  registerCustom(definition: CustomCommandSpec): void {
    this.register({
      name: definition.name,
      description: definition.description,
      schema: definition.schema,
      handler: definition.handler,
      terminatesSequence: definition.terminatesSequence,
    });
  }

  /** Remove an action and its cached special-parameter set. */
  unregister(name: string): void {
    this.actions.delete(name);
    this.specialParamsCache.delete(name);
  }

  get(name: string): CatalogEntry | undefined {
    return this.actions.get(name);
  }

  has(name: string): boolean {
    return this.actions.has(name);
  }

  getAll(): CatalogEntry[] {
    // The expression must start on the `return` line, otherwise automatic
    // semicolon insertion would make this method return undefined.
    return [...this.actions.values()];
  }

  getNames(): string[] {
    return [...this.actions.keys()];
  }

  /**
   * Validate `params` against the action's schema, inject special parameters
   * from `context`, and run the handler.
   *
   * @throws CommandFailedError when the action is not registered, validation
   *   fails, or the handler throws; a CommandFailedError thrown by the handler
   *   is re-thrown unchanged, any other error is wrapped with the original as
   *   `cause` when it is an Error.
   */
  async execute(
    name: string,
    params: Record<string, unknown>,
    context: ExecutionContext,
  ): Promise<CommandResult> {
    const action = this.actions.get(name);
    if (!action) {
      throw new CommandFailedError(name, `Action "${name}" is not registered`);
    }

    try {
      // Validate params against schema
      const validated = action.schema.parse(params);

      // Inject special parameters from context into the validated params
      const enriched = this.injectSpecialParams(name, validated, context);

      return await action.handler(enriched, context);
    } catch (error) {
      if (error instanceof CommandFailedError) throw error;
      const message = error instanceof Error ? error.message : String(error);
      throw new CommandFailedError(name, message, {
        cause: error instanceof Error ? error : undefined,
      });
    }
  }

  /**
   * Return the set of special parameter names detected for a given action.
   * Returns an empty set if no special params were detected.
   */
  getSpecialParams(name: string): Set<string> {
    return this.specialParamsCache.get(name) ?? new Set();
  }

  /**
   * Inject special parameters from ExecutionContext into the params object.
   * Special params are resolved from context and merged into the params
   * so the handler can destructure them directly from its first argument.
*/
  private injectSpecialParams(
    actionName: string,
    params: Record<string, unknown>,
    context: ExecutionContext,
  ): Record<string, unknown> {
    const specialParams = this.specialParamsCache.get(actionName);
    if (!specialParams || specialParams.size === 0) {
      return params;
    }

    // Copy so the validated params object itself is never mutated.
    const enriched = { ...params };
    for (const paramName of specialParams) {
      // Only inject if not already present in the validated params
      if (!(paramName in enriched)) {
        const value = resolveSpecialParam(paramName, context);
        if (value !== undefined) {
          enriched[paramName] = value;
        }
      }
    }
    return enriched;
  }

  /**
   * Build one schema covering every registered action. Object schemas are
   * extended with an `action` literal holding the action name; other schemas
   * are used as-is. With no actions the result is `{ action: string }`, with
   * one action it is that action's schema, otherwise a union of all of them.
   */
  buildDynamicSchema(): z.ZodType {
    const actionSchemas = this.getAll().map((action) => {
      if (action.schema instanceof z.ZodObject) {
        return action.schema.extend({
          action: z.literal(action.name),
        });
      }
      return action.schema;
    });

    if (actionSchemas.length === 0) {
      return z.object({ action: z.string() });
    }

    if (actionSchemas.length === 1) {
      return actionSchemas[0];
    }

    return z.union(actionSchemas as [ZodTypeAny, ZodTypeAny, ...ZodTypeAny[]]);
  }

  /** Number of registered actions. */
  get size(): number {
    return this.actions.size;
  }

  // ── Prompt description ──

  /**
   * Build a formatted multi-line description of all available actions.
   * Optionally filter by page URL domain so only relevant actions appear.
   */
  getPromptDescription(pageUrl?: string): string {
    let actions = this.getAll();

    // If a URL is provided, filter out actions whose domainFilter does not match
    if (pageUrl) {
      const domain = extractDomain(pageUrl);
      if (domain) {
        actions = actions.filter((a) => {
          // Actions without a domainFilter are always shown
          if (!a.domainFilter || a.domainFilter.length === 0) return true;
          return a.domainFilter.some((pattern) => {
            // `domain` is already lower-cased; compare patterns case-insensitively,
            // the same way getActionsForDomain does.
            const p = pattern.toLowerCase();
            return domain === p || domain.endsWith(`.${p}`);
          });
        });
      }
    }

    const lines: string[] = [];
    for (const action of actions) {
      const termFlag = action.terminatesSequence ?
// Continuation of getPromptDescription: suffix appended for sequence-terminating actions.
        ' [terminates]' : '';
      lines.push(`- ${action.name}: ${action.description}${termFlag}`);

      // Describe the schema parameters
      if (action.schema instanceof z.ZodObject) {
        const shape = action.schema.shape as Record<string, ZodTypeAny>;
        for (const [key, zodType] of Object.entries(shape)) {
          if (key === 'action') continue;
          const desc = zodType.description ?? '';
          const isOptional = zodType.isOptional?.() ?? false;
          const optLabel = isOptional ? ' (optional)' : '';
          lines.push(` ${key}${optLabel}: ${desc}`);
        }
      }
    }

    return lines.join('\n');
  }

  // ── Domain-based filtering ──

  /**
   * Return actions that have a domainFilter matching the given domain,
   * plus all actions that have no domainFilter (universal actions).
   * A leading `www.` is ignored and matching is case-insensitive; a pattern
   * also matches any subdomain of itself.
   */
  getActionsForDomain(domain: string): CatalogEntry[] {
    const normalized = domain.replace(/^www\./, '').toLowerCase();
    return this.getAll().filter((action) => {
      if (!action.domainFilter || action.domainFilter.length === 0) return true;
      return action.domainFilter.some((pattern) => {
        const p = pattern.toLowerCase();
        return normalized === p || normalized.endsWith(`.${p}`);
      });
    });
  }

  // ── Sensitive data replacement ──

  /**
   * Replace sensitive data values in text with `<key>` placeholders.
   * Keys are sorted longest-value-first to avoid partial replacements.
   *
   * @param text text to sanitize; returned unchanged when empty
   * @param maskedValues map of placeholder key to the secret value it stands for
   */
  replaceSensitiveData(
    text: string,
    maskedValues: Record<string, string>,
  ): string {
    if (!text) return text;

    // Sort entries by value length descending so longer values are replaced first
    const entries = Object.entries(maskedValues).sort(
      (a, b) => b[1].length - a[1].length,
    );

    let result = text;
    for (const [key, value] of entries) {
      if (!value) continue;
      const pattern = new RegExp(escapeRegExp(value), 'g');
      // A replacer function inserts the placeholder literally; a replacement
      // string would expand `$&`, `$1`, `$$` … sequences contained in the key.
      result = result.replace(pattern, () => `<${key}>`);
    }
    return result;
  }

  // ── Actions that terminate the sequence ──

  /**
   * Return the names of all actions marked as terminatesSequence.
*/
  getTerminatingActions(): string[] {
    return this.getAll()
      .filter((a) => a.terminatesSequence)
      .map((a) => a.name);
  }

  /**
   * Check whether a given action name is marked as terminatesSequence.
   */
  isTerminating(name: string): boolean {
    const action = this.actions.get(name);
    return action?.terminatesSequence === true;
  }
}

// ── Helpers ──

/**
 * Return the lower-cased hostname of `url` without a leading `www.`,
 * or `null` when `url` cannot be parsed.
 */
function extractDomain(url: string): string | null {
  try {
    return new URL(url).hostname.replace(/^www\./, '').toLowerCase();
  } catch {
    return null;
  }
}

================================================
FILE: packages/core/src/commands/catalog/types.ts
================================================
import type { z } from 'zod';
import type { CommandResult, ExecutionContext } from '../types.js';

/** A registered command: metadata, parameter schema and handler. */
export interface CatalogEntry {
  name: string;
  description: string;
  schema: z.ZodTypeAny;
  handler: (params: Record<string, unknown>, context: ExecutionContext) => Promise<CommandResult>;
  terminatesSequence?: boolean;
  domainFilter?: string[];
}

/** Name-based filters applied when actions are registered. */
export interface CatalogOptions {
  excludeActions?: string[];
  includeActions?: string[];
}

================================================
FILE: packages/core/src/commands/catalog.test.ts
================================================
import { test, expect, describe, beforeEach, mock } from 'bun:test';
import { z } from 'zod';
import { CommandCatalog } from './catalog/catalog.js';
import { CommandFailedError } from '../errors.js';
import type { ExecutionContext, CommandResult } from './types.js';

// ── Helpers ──

/** Create a mock handler that resolves with `result`. */
function makeHandler(
  result: CommandResult = { success: true },
): (params: Record<string, unknown>, ctx: ExecutionContext) => Promise<CommandResult> {
  return mock(() => Promise.resolve(result));
}

/** Create an ExecutionContext of empty stubs, with optional overrides. */
function makeContext(overrides: Partial<ExecutionContext> = {}): ExecutionContext {
  return {
    page: {} as any,
    cdpSession: {} as any,
    domService: {} as any,
    browserSession: {} as any,
    ...overrides,
  };
}

const testSchema = z.object({
  value: z.string(),
  count: z.number().optional(),
});

// ── Tests ──

describe('CommandCatalog', () => {
  let registry: CommandCatalog;

  beforeEach(() =>
{ registry = new CommandCatalog(); }); describe('register and unregister', () => { test('registers an action', () => { registry.register({ name: 'test_action', description: 'A test action', schema: testSchema, handler: makeHandler(), }); expect(registry.has('test_action')).toBe(true); expect(registry.size).toBe(1); }); test('unregisters an action', () => { registry.register({ name: 'test_action', description: 'A test action', schema: testSchema, handler: makeHandler(), }); registry.unregister('test_action'); expect(registry.has('test_action')).toBe(false); expect(registry.size).toBe(0); }); test('get returns registered action', () => { registry.register({ name: 'my_action', description: 'Mine', schema: testSchema, handler: makeHandler(), }); const action = registry.get('my_action'); expect(action).toBeDefined(); expect(action!.name).toBe('my_action'); expect(action!.description).toBe('Mine'); }); test('get returns undefined for unregistered action', () => { expect(registry.get('nonexistent')).toBeUndefined(); }); test('respects excludeActions option', () => { const filtered = new CommandCatalog({ excludeActions: ['blocked'] }); filtered.register({ name: 'blocked', description: 'Should not register', schema: testSchema, handler: makeHandler(), }); filtered.register({ name: 'allowed', description: 'Should register', schema: testSchema, handler: makeHandler(), }); expect(filtered.has('blocked')).toBe(false); expect(filtered.has('allowed')).toBe(true); }); test('respects includeActions option', () => { const filtered = new CommandCatalog({ includeActions: ['only_this'] }); filtered.register({ name: 'only_this', description: 'Should register', schema: testSchema, handler: makeHandler(), }); filtered.register({ name: 'other', description: 'Should not register', schema: testSchema, handler: makeHandler(), }); expect(filtered.has('only_this')).toBe(true); expect(filtered.has('other')).toBe(false); }); }); describe('getAll and getNames', () => { test('returns all registered 
actions', () => { registry.register({ name: 'alpha', description: 'Alpha', schema: testSchema, handler: makeHandler(), }); registry.register({ name: 'beta', description: 'Beta', schema: testSchema, handler: makeHandler(), }); const all = registry.getAll(); expect(all).toHaveLength(2); const names = registry.getNames(); expect(names).toContain('alpha'); expect(names).toContain('beta'); }); }); describe('execute', () => { test('executes registered action with valid params', async () => { const handler = makeHandler({ success: true, extractedContent: 'result' }); registry.register({ name: 'exec_test', description: 'Test execute', schema: testSchema, handler, }); const ctx = makeContext(); const result = await registry.execute('exec_test', { value: 'hello' }, ctx); expect(result.success).toBe(true); expect(result.extractedContent).toBe('result'); expect(handler).toHaveBeenCalledTimes(1); }); test('throws CommandFailedError for unregistered action', async () => { const ctx = makeContext(); await expect( registry.execute('nonexistent', {}, ctx), ).rejects.toThrow(CommandFailedError); }); test('throws CommandFailedError when schema validation fails', async () => { registry.register({ name: 'strict', description: 'Strict schema', schema: z.object({ required: z.string() }), handler: makeHandler(), }); const ctx = makeContext(); await expect( registry.execute('strict', { wrong: 'param' }, ctx), ).rejects.toThrow(CommandFailedError); }); test('wraps handler errors in CommandFailedError', async () => { registry.register({ name: 'failing', description: 'Fails', schema: testSchema, handler: async () => { throw new Error('Internal failure'); }, }); const ctx = makeContext(); await expect( registry.execute('failing', { value: 'x' }, ctx), ).rejects.toThrow(CommandFailedError); }); test('re-throws CommandFailedError without wrapping', async () => { const original = new CommandFailedError('tool', 'original error'); registry.register({ name: 'rethrow', description: 'Rethrow', schema: 
testSchema, handler: async () => { throw original; }, }); const ctx = makeContext(); try { await registry.execute('rethrow', { value: 'x' }, ctx); expect.unreachable('Should have thrown'); } catch (error) { expect(error).toBe(original); } }); }); describe('domain-based filtering', () => { test('returns universal actions for any domain', () => { registry.register({ name: 'universal', description: 'No filter', schema: testSchema, handler: makeHandler(), }); const actions = registry.getActionsForDomain('example.com'); expect(actions.map((a) => a.name)).toContain('universal'); }); test('returns domain-specific actions matching the domain', () => { registry.register({ name: 'github_only', description: 'GitHub', schema: testSchema, handler: makeHandler(), domainFilter: ['github.com'], }); const githubActions = registry.getActionsForDomain('github.com'); expect(githubActions.map((a) => a.name)).toContain('github_only'); const otherActions = registry.getActionsForDomain('example.com'); expect(otherActions.map((a) => a.name)).not.toContain('github_only'); }); test('matches subdomains', () => { registry.register({ name: 'google_all', description: 'Google subdomains', schema: testSchema, handler: makeHandler(), domainFilter: ['google.com'], }); const actions = registry.getActionsForDomain('mail.google.com'); expect(actions.map((a) => a.name)).toContain('google_all'); }); test('strips www prefix from domain', () => { registry.register({ name: 'example', description: 'Example', schema: testSchema, handler: makeHandler(), domainFilter: ['example.com'], }); const actions = registry.getActionsForDomain('www.example.com'); expect(actions.map((a) => a.name)).toContain('example'); }); }); describe('terminatesSequence flag', () => { test('isTerminating returns true for terminating actions', () => { registry.register({ name: 'finish', description: 'Finish', schema: testSchema, handler: makeHandler(), terminatesSequence: true, }); expect(registry.isTerminating('finish')).toBe(true); }); 
test('isTerminating returns false for non-terminating actions', () => { registry.register({ name: 'continue', description: 'Continue', schema: testSchema, handler: makeHandler(), }); expect(registry.isTerminating('continue')).toBe(false); }); test('getTerminatingActions returns all terminating action names', () => { registry.register({ name: 'finish', description: 'Done', schema: testSchema, handler: makeHandler(), terminatesSequence: true, }); registry.register({ name: 'abort', description: 'Abort', schema: testSchema, handler: makeHandler(), terminatesSequence: true, }); registry.register({ name: 'tap', description: 'Click', schema: testSchema, handler: makeHandler(), }); const terminating = registry.getTerminatingActions(); expect(terminating).toContain('finish'); expect(terminating).toContain('abort'); expect(terminating).not.toContain('tap'); }); }); describe('getPromptDescription', () => { test('returns formatted description of all actions', () => { registry.register({ name: 'tap', description: 'Click on an element', schema: z.object({ index: z.number().describe('Element index'), }), handler: makeHandler(), }); registry.register({ name: 'finish', description: 'Mark task as done', schema: z.object({ text: z.string().describe('Result text'), }), handler: makeHandler(), terminatesSequence: true, }); const desc = registry.getPromptDescription(); expect(desc).toContain('- tap: Click on an element'); expect(desc).toContain('index'); expect(desc).toContain('Element index'); expect(desc).toContain('- finish: Mark task as done [terminates]'); }); test('filters by page URL domain', () => { registry.register({ name: 'universal', description: 'Universal action', schema: testSchema, handler: makeHandler(), }); registry.register({ name: 'github_only', description: 'GitHub action', schema: testSchema, handler: makeHandler(), domainFilter: ['github.com'], }); const githubDesc = registry.getPromptDescription('https://github.com/repo'); 
expect(githubDesc).toContain('universal'); expect(githubDesc).toContain('github_only'); const otherDesc = registry.getPromptDescription('https://example.com'); expect(otherDesc).toContain('universal'); expect(otherDesc).not.toContain('github_only'); }); }); describe('sensitive data replacement', () => { test('replaces sensitive values with placeholders', () => { const result = registry.replaceSensitiveData( 'The password is hunter2 and the key is abc123', { PASSWORD: 'hunter2', API_KEY: 'abc123' }, ); expect(result).toBe('The password is and the key is '); }); test('replaces longer values first to avoid partial replacements', () => { const result = registry.replaceSensitiveData( 'Token: my-long-secret-token and key: secret', { TOKEN: 'my-long-secret-token', KEY: 'secret' }, ); // "my-long-secret-token" should be replaced first, not the inner "secret" expect(result).toBe('Token: and key: '); }); test('handles empty text', () => { const result = registry.replaceSensitiveData('', { KEY: 'value' }); expect(result).toBe(''); }); test('handles empty sensitive data', () => { const result = registry.replaceSensitiveData('some text', {}); expect(result).toBe('some text'); }); test('handles special regex characters in values', () => { const result = registry.replaceSensitiveData( 'Found: $100.00 (USD)', { PRICE: '$100.00' }, ); expect(result).toBe('Found: (USD)'); }); }); describe('parameter inspection and injection', () => { test('detects special parameters from handler function', () => { registry.register({ name: 'with_page', description: 'Uses page', schema: z.object({}), handler: async (params, ctx) => { return { success: true }; }, }); // The handler doesn't use named special params, so set should be empty const special = registry.getSpecialParams('with_page'); expect(special.size).toBe(0); }); test('returns empty set for unregistered action', () => { const special = registry.getSpecialParams('nonexistent'); expect(special.size).toBe(0); }); }); 
describe('buildDynamicSchema', () => { test('builds a union schema from registered actions', () => { registry.register({ name: 'tap', description: 'Click', schema: z.object({ index: z.number() }), handler: makeHandler(), }); registry.register({ name: 'finish', description: 'Done', schema: z.object({ text: z.string() }), handler: makeHandler(), }); const schema = registry.buildDynamicSchema(); expect(schema).toBeDefined(); // Should parse a click action const clickResult = schema.safeParse({ action: 'tap', index: 5 }); expect(clickResult.success).toBe(true); // Should parse a done action const doneResult = schema.safeParse({ action: 'finish', text: 'finished' }); expect(doneResult.success).toBe(true); }); test('returns simple object schema when no actions registered', () => { const schema = registry.buildDynamicSchema(); const result = schema.safeParse({ action: 'anything' }); expect(result.success).toBe(true); }); test('returns single schema when only one action registered', () => { registry.register({ name: 'only', description: 'Only action', schema: z.object({ x: z.number() }), handler: makeHandler(), }); const schema = registry.buildDynamicSchema(); const result = schema.safeParse({ action: 'only', x: 42 }); expect(result.success).toBe(true); }); }); describe('registerCustom', () => { test('registers a custom action definition', () => { registry.registerCustom({ name: 'custom_action', description: 'A custom action', schema: z.object({ query: z.string() }), handler: async () => ({ success: true }), }); expect(registry.has('custom_action')).toBe(true); }); test('registers with terminatesSequence flag', () => { registry.registerCustom({ name: 'custom_done', description: 'Custom done', schema: z.object({}), handler: async () => ({ success: true, isDone: true }), terminatesSequence: true, }); expect(registry.isTerminating('custom_done')).toBe(true); }); }); }); ================================================ FILE: packages/core/src/commands/executor.test.ts 
================================================
import { test, expect, describe, beforeEach, mock } from 'bun:test';
import { CommandExecutor } from './executor.js';
import type { Command, ExecutionContext, CommandResult } from './types.js';
import { UrlBlockedError, CommandFailedError } from '../errors.js';

// ── Mock factories ──

/**
 * Builds a stand-in for the page analyzer (`ctx.domService`).
 * Every method is a bun `mock` so tests can assert on calls; the selector
 * lookup always resolves to `'#selector'` and `extractState` resolves to an
 * empty 1280x800 page state.
 */
function makeMockPageAnalyzer() {
  return {
    clickElementByIndex: mock(() => Promise.resolve()),
    inputTextByIndex: mock(() => Promise.resolve()),
    getElementSelector: mock(() => Promise.resolve('#selector')),
    extractState: mock(() =>
      Promise.resolve({
        tree: '',
        selectorMap: {},
        elementCount: 0,
        interactiveElementCount: 0,
        scrollPosition: { x: 0, y: 0 },
        viewportSize: { width: 1280, height: 800 },
        documentSize: { width: 1280, height: 2000 },
        pixelsAbove: 0,
        pixelsBelow: 0,
      }),
    ),
  } as any;
}

/**
 * Builds a stand-in for the viewport (`ctx.browserSession`) whose navigation,
 * tab and screenshot methods are mocks. `currentPage` and `cdp` are fresh
 * mocks created by the factories below.
 */
function makeMockViewport() {
  return {
    navigate: mock(() => Promise.resolve()),
    waitForPageReady: mock(() => Promise.resolve()),
    switchTab: mock(() => Promise.resolve()),
    newTab: mock(() => Promise.resolve()),
    closeTab: mock(() => Promise.resolve()),
    screenshot: mock(() =>
      Promise.resolve({ base64: 'abc', width: 1280, height: 800 }),
    ),
    currentPage: makeMockPage(),
    cdp: makeMockCdpSession(),
    isConnected: true,
  } as any;
}

/**
 * Builds a stand-in for a page object. `evaluate` resolves to an empty array
 * and `$` resolves to an element handle exposing a mocked `setInputFiles`.
 */
function makeMockPage() {
  return {
    goBack: mock(() => Promise.resolve()),
    evaluate: mock(() => Promise.resolve([])),
    mouse: {
      click: mock(() => Promise.resolve()),
    },
    keyboard: {
      press: mock(() => Promise.resolve()),
    },
    fill: mock(() => Promise.resolve()),
    click: mock(() => Promise.resolve()),
    selectOption: mock(() => Promise.resolve()),
    $: mock(() => Promise.resolve({ setInputFiles: mock(() => Promise.resolve()) })),
  } as any;
}

/** Builds a stand-in CDP session whose `send` resolves to an empty object. */
function makeMockCdpSession() {
  return {
    send: mock(() => Promise.resolve({})),
  } as any;
}

/**
 * Assembles an ExecutionContext from the mocks above.
 *
 * @param overrides - Fields that replace the mocked defaults (spread last, so
 *   they win over `page`, `cdpSession`, `domService` and `browserSession`).
 * @returns A context whose `page`/`cdpSession` come from the same mock viewport
 *   that is exposed as `browserSession`.
 */
function makeContext(overrides: Partial<ExecutionContext> = {}): ExecutionContext {
  const browser = makeMockViewport();
  return {
    page: browser.currentPage,
    cdpSession: browser.cdp,
    domService: makeMockPageAnalyzer(),
    browserSession: browser,
    ...overrides,
  };
}

/**
 * Helper to create action objects. Zod schemas with .default() produce
 * required fields in the inferred output type, but at runtime the defaults
 * are applied during validation. We accept a loose record and cast it so
 * tests may omit fields that have Zod defaults.
 */
function action(a: Record<string, unknown>): Command {
  return a as Command;
}

// ── Tests ──

describe('CommandExecutor', () => {
  let tools: CommandExecutor;

  beforeEach(() => {
    tools = new CommandExecutor();
  });

  describe('constructor and registration', () => {
    test('registers all built-in actions', () => {
      const names = tools.registry.getNames();
      expect(names).toContain('tap');
      expect(names).toContain('type_text');
      expect(names).toContain('navigate');
      expect(names).toContain('back');
      expect(names).toContain('scroll');
      expect(names).toContain('press_keys');
      expect(names).toContain('extract');
      expect(names).toContain('finish');
      expect(names).toContain('focus_tab');
      expect(names).toContain('new_tab');
      expect(names).toContain('close_tab');
      expect(names).toContain('web_search');
      expect(names).toContain('capture');
      expect(names).toContain('read_page');
      expect(names).toContain('wait');
      expect(names).toContain('scroll_to');
      expect(names).toContain('find');
      expect(names).toContain('search');
      expect(names).toContain('extract_structured');
    });

    test('has default commandsPerStep of 10', () => {
      expect(tools.commandsPerStep).toBe(10);
    });

    test('respects custom commandsPerStep', () => {
      const custom = new CommandExecutor({ commandsPerStep: 5 });
      expect(custom.commandsPerStep).toBe(5);
    });
  });

  describe('click action', () => {
    test('delegates to domService.clickElementByIndex', async () => {
      const ctx = makeContext();
      const result = await tools.executeAction(
        action({ action: 'tap', index: 0 }),
        ctx,
      );
      expect(result.success).toBe(true);
      expect(ctx.domService.clickElementByIndex).toHaveBeenCalledWith(
        ctx.page,
        ctx.cdpSession,
        0,
      );
    });

    test('supports multiple clicks via clickCount', async () => {
      const ctx =
makeContext(); await tools.executeAction( action({ action: 'tap', index: 0, clickCount: 3 }), ctx, ); // First call + 2 additional expect(ctx.domService.clickElementByIndex).toHaveBeenCalledTimes(3); }); test('uses coordinate-based clicking when enabled', async () => { tools.setCoordinateClicking(true); const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'tap', index: 0, coordinateX: 100, coordinateY: 200 }), ctx, ); expect(result.success).toBe(true); expect(ctx.page.mouse.click).toHaveBeenCalledWith(100, 200); // domService should NOT have been called expect(ctx.domService.clickElementByIndex).not.toHaveBeenCalled(); }); test('coordinate click supports clickCount', async () => { tools.setCoordinateClicking(true); const ctx = makeContext(); await tools.executeAction( action({ action: 'tap', index: 0, coordinateX: 50, coordinateY: 50, clickCount: 2 }), ctx, ); expect(ctx.page.mouse.click).toHaveBeenCalledTimes(2); }); test('falls back to index-based click when coordinate clicking disabled', async () => { // Default: coordinate clicking is disabled const ctx = makeContext(); await tools.executeAction( action({ action: 'tap', index: 0, coordinateX: 100, coordinateY: 200 }), ctx, ); // Should use domService, not coordinates expect(ctx.domService.clickElementByIndex).toHaveBeenCalled(); }); }); describe('navigate action', () => { test('navigates to valid URL', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'navigate', url: 'https://example.com' }), ctx, ); expect(result.success).toBe(true); expect(ctx.browserSession.navigate).toHaveBeenCalledWith('https://example.com'); }); test('throws CommandFailedError wrapping UrlBlockedError for blocked URL', async () => { const restricted = new CommandExecutor({ blockedUrls: ['evil.com'] }); const ctx = makeContext(); await expect( restricted.executeAction( action({ action: 'navigate', url: 'https://evil.com/page' }), ctx, ), 
).rejects.toThrow(CommandFailedError); }); test('throws when URL not in allowlist', async () => { const restricted = new CommandExecutor({ allowedUrls: ['safe.com'] }); const ctx = makeContext(); await expect( restricted.executeAction( action({ action: 'navigate', url: 'https://other.com' }), ctx, ), ).rejects.toThrow(CommandFailedError); }); }); describe('input_text action', () => { test('inputs text into element', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'type_text', index: 3, text: 'hello' }), ctx, ); expect(result.success).toBe(true); expect(ctx.domService.inputTextByIndex).toHaveBeenCalledWith( ctx.page, ctx.cdpSession, 3, 'hello', true, // clearFirst defaults to true ); }); test('passes clearFirst=false when specified', async () => { const ctx = makeContext(); await tools.executeAction( action({ action: 'type_text', index: 0, text: 'append', clearFirst: false }), ctx, ); expect(ctx.domService.inputTextByIndex).toHaveBeenCalledWith( ctx.page, ctx.cdpSession, 0, 'append', false, ); }); }); describe('scroll action', () => { test('scrolls the page when no index provided', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'scroll', direction: 'down' }), ctx, ); expect(result.success).toBe(true); }); test('scrolls an element when index is provided', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'scroll', direction: 'up', index: 5 }), ctx, ); expect(result.success).toBe(true); expect(ctx.domService.getElementSelector).toHaveBeenCalledWith(5); }); }); describe('search_google action', () => { test('navigates to Google search URL', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'web_search', query: 'bun test runner' }), ctx, ); expect(result.success).toBe(true); expect(ctx.browserSession.navigate).toHaveBeenCalled(); const navigateArg = 
(ctx.browserSession.navigate as any).mock.calls[0][0] as string; expect(navigateArg).toContain('google.com/search'); expect(navigateArg).toContain('bun%20test%20runner'); }); }); describe('done action', () => { test('returns isDone=true with text', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'finish', text: 'Task completed successfully' }), ctx, ); expect(result.success).toBe(true); expect(result.isDone).toBe(true); expect(result.extractedContent).toBe('Task completed successfully'); expect(result.includeInMemory).toBe(true); }); test('respects explicit success=false', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'finish', text: 'Could not complete', success: false }), ctx, ); expect(result.success).toBe(false); expect(result.isDone).toBe(true); }); }); describe('go_back action', () => { test('calls page.goBack and waits for ready', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'back' }), ctx, ); expect(result.success).toBe(true); expect(ctx.page.goBack).toHaveBeenCalled(); expect(ctx.browserSession.waitForPageReady).toHaveBeenCalled(); }); }); describe('send_keys action', () => { test('presses keyboard keys', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'press_keys', keys: 'Enter' }), ctx, ); expect(result.success).toBe(true); expect(ctx.page.keyboard.press).toHaveBeenCalledWith('Enter'); }); }); describe('find_elements action', () => { test('returns found elements description', async () => { const page = makeMockPage(); page.evaluate = mock(() => Promise.resolve([ { tag: 'button', text: 'Submit', attributes: { id: 'btn-submit' } }, { tag: 'a', text: 'Home', attributes: {} }, ]), ); const ctx = makeContext({ page }); const result = await tools.executeAction( action({ action: 'find', query: 'submit' }), ctx, ); expect(result.success).toBe(true); 
expect(result.extractedContent).toContain('Found 2 element(s)'); expect(result.extractedContent).toContain('button'); expect(result.extractedContent).toContain('Submit'); }); test('returns message when no elements found', async () => { const page = makeMockPage(); page.evaluate = mock(() => Promise.resolve([])); const ctx = makeContext({ page }); const result = await tools.executeAction( action({ action: 'find', query: 'nonexistent' }), ctx, ); expect(result.success).toBe(true); expect(result.extractedContent).toContain('No elements found'); }); }); describe('extract_content action (fallback, no LLM)', () => { test('returns error/fallback when no extraction service', async () => { // Tools without model won't have an extraction service // The handler falls back to extractMarkdown which we mock via page.evaluate const ctx = makeContext(); // extractMarkdown eventually calls page.evaluate // For this test, just verify no crash. The actual extractMarkdown module // import might require more setup, so we test the branch try { await tools.executeAction( action({ action: 'extract', goal: 'get all links' }), ctx, ); } catch { // Expected - extractMarkdown import/evaluation may fail in test env } }); }); describe('search_page action (multi-engine)', () => { test('navigates to DuckDuckGo when specified', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'search', query: 'hello', engine: 'duckduckgo' }), ctx, ); expect(result.success).toBe(true); const url = (ctx.browserSession.navigate as any).mock.calls[0][0] as string; expect(url).toContain('duckduckgo.com'); }); test('navigates to Bing when specified', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'search', query: 'hello', engine: 'bing' }), ctx, ); expect(result.success).toBe(true); const url = (ctx.browserSession.navigate as any).mock.calls[0][0] as string; expect(url).toContain('bing.com/search'); }); test('defaults 
to Google', async () => { const ctx = makeContext(); await tools.executeAction( action({ action: 'search', query: 'hello' }), ctx, ); const url = (ctx.browserSession.navigate as any).mock.calls[0][0] as string; expect(url).toContain('google.com/search'); }); }); describe('sensitive data masking', () => { test('masks sensitive data in action results', async () => { const ctx = makeContext({ maskedValues: { PASSWORD: 'secret123', API_KEY: 'sk-abc', }, }); // Execute done action with text containing sensitive data const result = await tools.executeActions( [action({ action: 'finish', text: 'Found password: secret123 and key: sk-abc' })], ctx, ); expect(result[0].success).toBe(true); expect(result[0].extractedContent).toContain(''); expect(result[0].extractedContent).toContain(''); expect(result[0].extractedContent).not.toContain('secret123'); expect(result[0].extractedContent).not.toContain('sk-abc'); }); test('does not mask when no sensitive data configured', async () => { const ctx = makeContext(); // no maskedValues const result = await tools.executeActions( [action({ action: 'finish', text: 'Plain text with no secrets' })], ctx, ); expect(result[0].extractedContent).toBe('Plain text with no secrets'); }); }); describe('action sequence execution', () => { test('executes multiple actions in sequence', async () => { const ctx = makeContext(); const results = await tools.executeActions( [ action({ action: 'tap', index: 0 }), action({ action: 'tap', index: 1 }), ], ctx, ); expect(results).toHaveLength(2); expect(results[0].success).toBe(true); expect(results[1].success).toBe(true); }); test('stops at done action', async () => { const ctx = makeContext(); const results = await tools.executeActions( [ action({ action: 'tap', index: 0 }), action({ action: 'finish', text: 'Finished' }), action({ action: 'tap', index: 1 }), // should not execute ], ctx, ); expect(results).toHaveLength(2); expect(results[1].isDone).toBe(true); }); test('respects commandsPerStep limit', async 
() => { const limited = new CommandExecutor({ commandsPerStep: 2 }); const ctx = makeContext(); const results = await limited.executeActions( [ action({ action: 'tap', index: 0 }), action({ action: 'tap', index: 1 }), action({ action: 'tap', index: 2 }), // should not execute (limit=2) ], ctx, ); expect(results).toHaveLength(2); }); test('handles errors gracefully in sequence', async () => { const ctx = makeContext(); ctx.domService.clickElementByIndex = mock(() => Promise.reject(new Error('Element is not visible')), ); const results = await tools.executeActions( [action({ action: 'tap', index: 0 })], ctx, ); expect(results).toHaveLength(1); expect(results[0].success).toBe(false); expect(results[0].error).toBeDefined(); expect(results[0].error).toContain('not visible'); }); test('stops sequence on non-retryable error', async () => { const ctx = makeContext(); ctx.domService.clickElementByIndex = mock(() => Promise.reject(new Error('browser has been closed')), ); const results = await tools.executeActions( [ action({ action: 'tap', index: 0 }), action({ action: 'tap', index: 1 }), // should not run ], ctx, ); expect(results).toHaveLength(1); expect(results[0].success).toBe(false); }); test('continues after retryable error', async () => { const ctx = makeContext(); let callCount = 0; ctx.domService.clickElementByIndex = mock(() => { callCount++; if (callCount === 1) { return Promise.reject(new Error('Element is not visible')); } return Promise.resolve(); }); const results = await tools.executeActions( [ action({ action: 'tap', index: 0 }), action({ action: 'tap', index: 1 }), ], ctx, ); expect(results).toHaveLength(2); expect(results[0].success).toBe(false); expect(results[1].success).toBe(true); }); test('masks sensitive data in error messages', async () => { const ctx = makeContext({ maskedValues: { TOKEN: 'my-secret-token' }, }); ctx.domService.clickElementByIndex = mock(() => Promise.reject(new Error('Failed with my-secret-token')), ); const results = await 
tools.executeActions( [action({ action: 'tap', index: 0 })], ctx, ); expect(results[0].error).not.toContain('my-secret-token'); expect(results[0].error).toContain(''); }); }); describe('switch_tab action', () => { test('switches to specified tab', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'focus_tab', tabIndex: 1 }), ctx, ); expect(result.success).toBe(true); expect(ctx.browserSession.switchTab).toHaveBeenCalledWith(1); }); }); describe('open_tab action', () => { test('opens new tab with URL', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'new_tab', url: 'https://example.com' }), ctx, ); expect(result.success).toBe(true); expect(ctx.browserSession.newTab).toHaveBeenCalledWith('https://example.com'); }); test('throws for blocked URL', async () => { const restricted = new CommandExecutor({ blockedUrls: ['banned.com'] }); const ctx = makeContext(); await expect( restricted.executeAction( action({ action: 'new_tab', url: 'https://banned.com' }), ctx, ), ).rejects.toThrow(CommandFailedError); }); }); describe('close_tab action', () => { test('closes specified tab', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'close_tab', tabIndex: 2 }), ctx, ); expect(result.success).toBe(true); expect(ctx.browserSession.closeTab).toHaveBeenCalledWith(2); }); }); describe('screenshot action', () => { test('takes a screenshot', async () => { const ctx = makeContext(); const result = await tools.executeAction( action({ action: 'capture' }), ctx, ); expect(result.success).toBe(true); expect(result.extractedContent).toContain('Screenshot taken'); expect(ctx.browserSession.screenshot).toHaveBeenCalled(); }); }); describe('setCoordinateClicking', () => { test('enables coordinate-based clicking', () => { tools.setCoordinateClicking(true); // Verified through click behavior in click action tests above expect(tools).toBeDefined(); 
});

    test('disables coordinate-based clicking', () => {
      tools.setCoordinateClicking(true);
      tools.setCoordinateClicking(false);
      expect(tools).toBeDefined();
    });
  });
});

================================================
FILE: packages/core/src/commands/executor.ts
================================================
import type { Page, CDPSession } from 'playwright';
import { z } from 'zod';
import { CommandCatalog } from './catalog/catalog.js';
import type {
  Command,
  CommandResult,
  ExecutionContext,
  InterpretedViewportError,
  ViewportErrorCategory,
} from './types.js';
import {
  TapCommandSchema,
  TypeTextCommandSchema,
  NavigateCommandSchema,
  BackCommandSchema,
  ScrollCommandSchema,
  PressKeysCommandSchema,
  ExtractCommandSchema,
  FinishCommandSchema,
  FocusTabCommandSchema,
  NewTabCommandSchema,
  CloseTabCommandSchema,
  WebSearchCommandSchema,
  UploadCommandSchema,
  SelectCommandSchema,
  CaptureCommandSchema,
  ReadPageCommandSchema,
  WaitCommandSchema,
  ScrollToCommandSchema,
  FindCommandSchema,
  SearchCommandSchema,
  ListOptionsCommandSchema,
  PickOptionCommandSchema,
  ExtractStructuredCommandSchema,
} from './types.js';
import type { Viewport } from '../viewport/viewport.js';
import type { PageAnalyzer } from '../page/page-analyzer.js';
import type { LanguageModel } from '../model/interface.js';
import { ContentExtractor } from './extraction/extractor.js';
import { scrollPage, scrollElement, buildGoogleSearchUrl } from './utils.js';
import { extractMarkdown } from '../page/content-extractor.js';
import { isUrlPermitted } from '../utils.js';
import {
  UrlBlockedError,
  NavigationFailedError,
  ViewportCrashedError,
} from '../errors.js';
import { sleep } from '../utils.js';

/** Construction options for {@link CommandExecutor}. All fields are optional. */
export interface CommandExecutorOptions {
  /** When set, a ContentExtractor is created from it for the `extract` command. */
  model?: LanguageModel;
  /** Passed to `isUrlPermitted` by the `navigate` and `new_tab` commands. */
  allowedUrls?: string[];
  /** Passed to `isUrlPermitted` by the `navigate` and `new_tab` commands. */
  blockedUrls?: string[];
  /** Stored as `commandsPerStep`; defaults to 10 when omitted. */
  commandsPerStep?: number;
}

/**
 * Owns a CommandCatalog and registers the built-in browser commands on it at
 * construction time.
 */
export class CommandExecutor {
  readonly registry: CommandCatalog;
  private extractionService?: ContentExtractor;
  private allowedUrls?: string[];
  private blockedUrls?: string[];
  readonly commandsPerStep: number;
  private coordinateClickingEnabled = false;

  /**
   * @param options - Optional URL policy, per-step command limit and model.
   *   A ContentExtractor is only created when `options.model` is provided.
   */
  constructor(options?: CommandExecutorOptions) {
    this.registry = new CommandCatalog();
    this.allowedUrls = options?.allowedUrls;
    this.blockedUrls = options?.blockedUrls;
    this.commandsPerStep = options?.commandsPerStep ?? 10;

    if (options?.model) {
      this.extractionService = new ContentExtractor(options.model);
    }

    this.registerBuiltinActions();
  }

  /**
   * Enable or disable coordinate-based clicking.
   * When enabled, click actions with coordinateX/coordinateY will use
   * page.mouse.click instead of element index lookup.
   */
  setCoordinateClicking(enabled: boolean): void {
    this.coordinateClickingEnabled = enabled;
  }

  /**
   * Registers every built-in command on the catalog. Each command's schema is
   * the corresponding `*CommandSchema` with the `action` discriminator omitted.
   */
  private registerBuiltinActions(): void {
    // Click
    this.registry.register({
      name: 'tap',
      description: 'Click on an element by its index',
      schema: TapCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { index, clickCount, coordinateX, coordinateY } = params as {
          index: number;
          clickCount?: number;
          coordinateX?: number;
          coordinateY?: number;
        };

        // Always click at least once. Previously the coordinate path performed
        // zero clicks (yet reported success) for clickCount <= 0 while the
        // index path clicked once; both paths now share the same count.
        const clicks = clickCount !== undefined && clickCount > 1 ? clickCount : 1;

        // Coordinate-based clicking: only when enabled AND both coordinates given.
        const useCoordinates =
          this.coordinateClickingEnabled &&
          coordinateX !== undefined &&
          coordinateY !== undefined;

        for (let i = 0; i < clicks; i++) {
          if (useCoordinates) {
            await ctx.page.mouse.click(coordinateX, coordinateY);
          } else {
            await ctx.domService.clickElementByIndex(ctx.page, ctx.cdpSession, index);
          }
        }
        return { success: true };
      },
    });

    // Input text
    this.registry.register({
      name: 'type_text',
      description: 'Type text into an input element',
      schema: TypeTextCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { index, text, clearFirst } = params as {
          index: number;
          text: string;
          clearFirst?: boolean;
        };
        // clearFirst defaults to true when the caller does not specify it.
        await ctx.domService.inputTextByIndex(
          ctx.page,
          ctx.cdpSession,
          index,
          text,
          clearFirst ?? true,
        );
        return { success: true };
      },
    });

    // Navigate
    this.registry.register({
      name: 'navigate',
      description: 'Navigate to a URL',
      schema: NavigateCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { url } = params as { url: string };
        // Enforce the configured allow/block lists before touching the browser.
        if (!isUrlPermitted(url, this.allowedUrls, this.blockedUrls)) {
          throw new UrlBlockedError(url);
        }
        await ctx.browserSession.navigate(url);
        return { success: true };
      },
    });

    // Go back
    this.registry.register({
      name: 'back',
      description: 'Go back to previous page',
      schema: BackCommandSchema.omit({ action: true }),
      handler: async (_params, ctx) => {
        // Best effort: a failed or timed-out goBack is deliberately ignored and
        // the handler still waits for the page to become ready.
        await ctx.page.goBack({ timeout: 5000 }).catch(() => {});
        await ctx.browserSession.waitForPageReady();
        return { success: true };
      },
    });

    // Scroll
    this.registry.register({
      name: 'scroll',
      description: 'Scroll the page or an element',
      schema: ScrollCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { direction, amount, index } = params as {
          direction: 'up' | 'down';
          amount?: number;
          index?: number;
        };

        if (index === undefined) {
          // No target element: scroll the page itself.
          await scrollPage(ctx.page, direction, amount);
          return { success: true };
        }

        const selector = await ctx.domService.getElementSelector(index);
        if (!selector) {
          // Report the miss instead of claiming success without scrolling,
          // matching the `upload` and `select` commands.
          return { success: false, error: `Element ${index} not found` };
        }
        await scrollElement(ctx.page, selector, direction, amount);
        return { success: true };
      },
    });

    // Send keys
    this.registry.register({
      name: 'press_keys',
      description: 'Send keyboard keys (e.g., Enter, Escape, Control+a)',
      schema: PressKeysCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { keys } = params as { keys: string };
        await ctx.page.keyboard.press(keys);
        return { success: true };
      },
    });

    // Extract content
    this.registry.register({
      name: 'extract',
      description: 'Extract specific information from the current page',
      schema: ExtractCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { goal, outputSchema } = params as {
          goal: string;
          outputSchema?: Record<string, unknown>;
        };

        // Use the extraction LLM from context if available, otherwise fall back
        const extractionModel = ctx.extractionLlm;
        const service = extractionModel
          ? new ContentExtractor(extractionModel)
          : this.extractionService;

        if (!service) {
          // Fallback: just extract markdown
          const markdown = await extractMarkdown(ctx.page);
          return {
            success: true,
            extractedContent: markdown.slice(0, 5000),
            includeInMemory: true,
          };
        }

        // If an outputSchema is provided, use structured extraction from text
        if (outputSchema) {
          const markdown = await extractMarkdown(ctx.page);
          const content = await service.extractFromText(
            markdown.slice(0, 8000),
            goal,
            outputSchema,
          );
          return { success: true, extractedContent: content, includeInMemory: true };
        }

        const content = await service.extract(ctx.page, goal);
        return { success: true, extractedContent: content, includeInMemory: true };
      },
    });

    // Done
    this.registry.register({
      name: 'finish',
      description: 'Mark the task as completed with a result',
      schema: FinishCommandSchema.omit({ action: true }),
      terminatesSequence: true,
      handler: async (params) => {
        const { text, success } = params as { text: string; success?: boolean };
        return {
          // An explicit `success: false` is preserved; only undefined/null defaults to true.
          success: success ?? true,
          isDone: true,
          extractedContent: text,
          includeInMemory: true,
        };
      },
    });

    // Switch tab
    this.registry.register({
      name: 'focus_tab',
      description: 'Switch to a different browser tab',
      schema: FocusTabCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { tabIndex } = params as { tabIndex: number };
        await ctx.browserSession.switchTab(tabIndex);
        return { success: true };
      },
    });

    // Open tab
    this.registry.register({
      name: 'new_tab',
      description: 'Open a new tab with a URL',
      schema: NewTabCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { url } = params as { url: string };
        // Same URL policy as `navigate`.
        if (!isUrlPermitted(url, this.allowedUrls, this.blockedUrls)) {
          throw new UrlBlockedError(url);
        }
        await ctx.browserSession.newTab(url);
        return { success: true };
      },
    });

    // Close tab
    this.registry.register({
      name: 'close_tab',
      description: 'Close a browser tab',
      schema: CloseTabCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { tabIndex } = params as { tabIndex?: number };
        await ctx.browserSession.closeTab(tabIndex);
        return { success: true };
      },
    });

    // Search Google
    this.registry.register({
      name: 'web_search',
      description: 'Search Google for a query',
      schema: WebSearchCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { query } = params as { query: string };
        const url = buildGoogleSearchUrl(query);
        await ctx.browserSession.navigate(url);
        return { success: true };
      },
    });

    // Upload file
    this.registry.register({
      name: 'upload',
      description: 'Upload files to a file input',
      schema: UploadCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { index, filePaths } = params as { index: number; filePaths: string[] };

        // If a fileSystem is available in context, resolve relative paths
        // against the sandbox directory
        let resolvedPaths = filePaths;
        if (ctx.fileSystem) {
          const sandboxDir = ctx.fileSystem.getSandboxDir();
          const { resolve: pathResolve, isAbsolute } = await import('node:path');
          // isAbsolute also recognises platform-specific absolute paths
          // (e.g. drive-letter paths on Windows), unlike a leading-'/' check.
          // NOTE(review): absolute paths are passed through unchanged and are
          // therefore not confined to the sandbox directory — confirm intended.
          resolvedPaths = filePaths.map((fp) =>
            isAbsolute(fp) ? fp : pathResolve(sandboxDir, fp),
          );
        }

        const selector = await ctx.domService.getElementSelector(index);
        if (!selector) {
          return { success: false, error: `Element ${index} not found` };
        }
        const fileInput = await ctx.page.$(selector);
        if (!fileInput) {
          return { success: false, error: `File input element not found` };
        }
        await fileInput.setInputFiles(resolvedPaths);
        return { success: true };
      },
    });

    // Select option
    this.registry.register({
      name: 'select',
      description: 'Select an option in a dropdown',
      schema: SelectCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { index, value } = params as { index: number; value: string };
        const selector = await ctx.domService.getElementSelector(index);
        if (!selector) {
          return { success: false, error: `Element ${index} not found` };
        }
        await ctx.page.selectOption(selector, value);
        return { success: true };
      },
    });

    // Screenshot
    this.registry.register({
      name: 'capture',
      description: 'Take a screenshot of the current page',
      schema: CaptureCommandSchema.omit({ action: true }),
      handler: async (params, ctx) => {
        const { fullPage } = params as { fullPage?: boolean };
        const result = await ctx.browserSession.screenshot(fullPage);
        // Only the dimensions are reported back, not the image data.
        return {
          success: true,
          extractedContent: `Screenshot taken (${result.width}x${result.height})`,
        };
      },
    });

    // Read content
    this.registry.register({
      name: 'read_page',
      description: 'Read the text content of the current page',
      schema: ReadPageCommandSchema.omit({ action: true }),
      handler: async (_params, ctx) => {
        const markdown = await extractMarkdown(ctx.page);
        return {
          success: true,
          // Cap the returned text at 10000 characters.
          extractedContent: markdown.slice(0, 10000),
          includeInMemory: true,
        };
      },
    });

    // Wait
    this.registry.register({
      name: 'wait',
      description: 'Wait for a specified number of seconds',
      schema: WaitCommandSchema.omit({ action: true }),
      handler: async (params) => {
        const { seconds } = params as { seconds?: number };
        await sleep((seconds ??
3) * 1000); return { success: true }; }, }); // ── New actions ── // Scroll to text this.registry.register({ name: 'scroll_to', description: 'Scroll to a specific text on the page', schema: ScrollToCommandSchema.omit({ action: true }), handler: async (params, ctx) => { const { text } = params as { text: string }; const found = await ctx.page.evaluate((searchText: string) => { // Use TreeWalker to find text nodes containing the search text const walker = document.createTreeWalker( document.body, NodeFilter.SHOW_TEXT, { acceptNode(node) { if ( node.textContent && node.textContent.toLowerCase().includes(searchText.toLowerCase()) ) { return NodeFilter.FILTER_ACCEPT; } return NodeFilter.FILTER_REJECT; }, }, ); const node = walker.nextNode(); if (!node?.parentElement) return false; node.parentElement.scrollIntoView({ behavior: 'smooth', block: 'center', }); return true; }, text); if (!found) { return { success: false, error: `Text "${text}" not found on the page`, }; } // Allow time for the smooth scroll to finish await sleep(500); return { success: true }; }, }); // Find elements this.registry.register({ name: 'find', description: 'Find elements on the page matching a description', schema: FindCommandSchema.omit({ action: true }), handler: async (params, ctx) => { const { query } = params as { query: string }; const elements = await ctx.page.evaluate((searchQuery: string) => { const results: Array<{ tag: string; text: string; attributes: Record; }> = []; const queryLower = searchQuery.toLowerCase(); // Search through interactive and content elements const selectors = [ 'a', 'button', 'input', 'select', 'textarea', '[role="button"]', '[role="link"]', '[role="tab"]', '[role="menuitem"]', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', '[aria-label]', ]; for (const selector of selectors) { for (const el of document.querySelectorAll(selector)) { const htmlEl = el as HTMLElement; const text = (htmlEl.innerText || htmlEl.textContent || '').trim(); const ariaLabel = 
el.getAttribute('aria-label') || ''; const placeholder = el.getAttribute('placeholder') || ''; const title = el.getAttribute('title') || ''; const searchableText = `${text} ${ariaLabel} ${placeholder} ${title}`.toLowerCase(); if (searchableText.includes(queryLower)) { const attrs: Record = {}; if (el.id) attrs.id = el.id; if (el.className && typeof el.className === 'string') { attrs.class = el.className; } if (ariaLabel) attrs['aria-label'] = ariaLabel; if (placeholder) attrs.placeholder = placeholder; results.push({ tag: el.tagName.toLowerCase(), text: text.slice(0, 100), attributes: attrs, }); } // Cap at 20 results if (results.length >= 20) break; } if (results.length >= 20) break; } return results; }, query); if (elements.length === 0) { return { success: true, extractedContent: `No elements found matching "${query}"`, includeInMemory: true, }; } const descriptions = elements.map((el, i) => { const attrStr = Object.entries(el.attributes) .map(([k, v]) => `${k}="${v}"`) .join(' '); return `[${i}] <${el.tag}${attrStr ? ` ${attrStr}` : ''}> ${el.text}`; }); return { success: true, extractedContent: `Found ${elements.length} element(s):\n${descriptions.join('\n')}`, includeInMemory: true, }; }, }); // Search page (multi-engine) this.registry.register({ name: 'search', description: 'Search the web using a specified search engine', schema: SearchCommandSchema.omit({ action: true }), handler: async (params, ctx) => { const { query, engine } = params as { query: string; engine?: 'google' | 'duckduckgo' | 'bing'; }; const searchEngine = engine ?? 
'google'; const url = buildSearchUrl(query, searchEngine); if (!isUrlPermitted(url, this.allowedUrls, this.blockedUrls)) { throw new UrlBlockedError(url); } await ctx.browserSession.navigate(url); return { success: true }; }, }); // Get dropdown options this.registry.register({ name: 'list_options', description: 'Get all options from a select/dropdown element', schema: ListOptionsCommandSchema.omit({ action: true }), handler: async (params, ctx) => { const { index } = params as { index: number }; const selector = await ctx.domService.getElementSelector(index); if (!selector) { return { success: false, error: `Element ${index} not found` }; } const options = await ctx.page.evaluate((sel: string) => { const selectEl = document.querySelector(sel) as HTMLSelectElement | null; if (!selectEl || selectEl.tagName !== 'SELECT') { return null; } return Array.from(selectEl.options).map((opt) => ({ value: opt.value, text: opt.text.trim(), selected: opt.selected, })); }, selector); if (!options) { return { success: false, error: `Element ${index} is not a select element`, }; } const formatted = options .map( (opt, i) => `[${i}] "${opt.text}" (value="${opt.value}")${opt.selected ? 
/**
 * Registers the built-in `extract_structured` command.
 *
 * The handler renders the current page to markdown, keeps at most
 * `maxContentLength` characters (8000 when omitted) and asks a
 * ContentExtractor for JSON matching the caller-supplied schema. A model
 * supplied on the execution context takes precedence over the
 * executor-level extraction service. Failures are reported through the
 * returned result rather than thrown.
 */
private useStructuredOutputAction(): void {
  this.registry.register({
    name: 'extract_structured',
    description:
      'Extract structured data from the current page content. Returns JSON conforming to the provided schema.',
    schema: ExtractStructuredCommandSchema.omit({ action: true }),
    handler: async (params, ctx) => {
      const request = params as {
        goal: string;
        outputSchema: Record<string, unknown>;
        maxContentLength?: number;
      };
      const charBudget = request.maxContentLength ?? 8000;

      // Context-provided model wins; otherwise reuse the executor-level service
      let extractor = this.extractionService;
      if (ctx.extractionLlm) {
        extractor = new ContentExtractor(ctx.extractionLlm);
      }
      if (!extractor) {
        return {
          success: false,
          error:
            'No extraction LLM configured. Provide a model via CommandExecutorOptions or ExecutionContext.extractionLlm.',
        };
      }

      // Page content is handed to the model as markdown
      const pageMarkdown = await extractMarkdown(ctx.page);
      if (pageMarkdown.trim() === '') {
        return {
          success: false,
          error: 'No content found on the page for structured extraction.',
        };
      }

      const excerpt = pageMarkdown.slice(0, charBudget);
      try {
        const json = await extractor.extractFromText(
          excerpt,
          request.goal,
          request.outputSchema,
        );
        return { success: true, extractedContent: json, includeInMemory: true };
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return {
          success: false,
          error: `Structured extraction failed: ${reason}`,
        };
      }
    },
  });
}

/**
 * Runs a single command: the `action` discriminator selects the registry
 * entry and every remaining field is forwarded as that command's params.
 */
async executeAction(
  action: Command,
  context: ExecutionContext,
): Promise<CommandResult> {
  const { action: commandName, ...commandParams } = action;
  return this.registry.execute(commandName, commandParams, context);
}

/**
 * Runs commands in order, at most `commandsPerStep` of them, and returns
 * one result per command that was attempted.
 *
 * The sequence ends early when a result is marked `isDone`, when the
 * command is registered as terminating, or when a thrown error is
 * classified as non-retryable. Extracted content and error text are
 * masked with the context's sensitive values before being returned.
 */
async executeActions(
  actions: Command[],
  context: ExecutionContext,
): Promise<CommandResult[]> {
  const outcomes: CommandResult[] = [];
  const maxCommands = Math.min(actions.length, this.commandsPerStep);

  for (let step = 0; step < maxCommands; step++) {
    const command = actions[step];
    try {
      const raw = await this.executeAction(command, context);

      // Never leak sensitive values through extracted content
      const safe = this.maskSensitiveResult(raw, context);
      outcomes.push(safe);

      // `finish` (isDone) or any command flagged terminatesSequence ends the step
      if (safe.isDone || this.registry.isTerminating(command.action)) break;
    } catch (err) {
      // Translate the raw browser error into a message plus a hint
      const diagnosis = classifyViewportError(err);
      const detail = `${diagnosis.message} | Suggestion: ${diagnosis.suggestion}`;

      // Error text can echo typed secrets, so it is masked as well
      outcomes.push({
        success: false,
        error: this.maskSensitiveText(detail, context),
      });

      // Non-retryable failures (e.g. a crashed browser) abort the sequence
      if (!diagnosis.isRetryable) break;
    }
  }

  return outcomes;
}
// ── Helpers ──

/**
 * Builds the results-page URL for `query` on the chosen search engine.
 * The query is percent-encoded with `encodeURIComponent`.
 */
function buildSearchUrl(
  query: string,
  engine: 'google' | 'duckduckgo' | 'bing',
): string {
  const encoded = encodeURIComponent(query);
  switch (engine) {
    case 'google':
      // NOTE(review): `udm=14` presumably selects Google's plain "Web" results view — confirm.
      return `https://www.google.com/search?q=${encoded}&udm=14`;
    case 'duckduckgo':
      return `https://duckduckgo.com/?q=${encoded}`;
    case 'bing':
      return `https://www.bing.com/search?q=${encoded}`;
  }
}

// ── Browser error interpretation ──

/** One row of the error table: a message pattern and how to report a match. */
interface ErrorPatternRule {
  pattern: RegExp;
  category: ViewportErrorCategory;
  message: (match: RegExpMatchArray) => string;
  suggestion: string;
  isRetryable: boolean;
}

/**
 * Error pattern matcher: maps regex patterns against error messages to
 * categories, human-readable messages, and actionable suggestions.
 *
 * Order matters: the first matching rule wins, so specific patterns must
 * come before the generic ones (e.g. every named `net::ERR_*` rule precedes
 * the catch-all `net::ERR_` rule).
 */
const ERROR_PATTERNS: ReadonlyArray<ErrorPatternRule> = [
  {
    pattern: /net::ERR_NAME_NOT_RESOLVED/i,
    category: 'network',
    message: () => 'DNS resolution failed - the domain could not be found.',
    suggestion: 'Check the URL for typos or try a different URL.',
    isRetryable: false,
  },
  {
    pattern: /net::ERR_CONNECTION_REFUSED/i,
    category: 'network',
    message: () => 'Connection refused by the server.',
    suggestion: 'The server may be down. Try again later or use a different URL.',
    isRetryable: true,
  },
  {
    pattern: /net::ERR_CONNECTION_TIMED_OUT/i,
    category: 'network',
    message: () => 'Connection timed out.',
    suggestion: 'The server is not responding. Try again or use a different URL.',
    isRetryable: true,
  },
  {
    pattern: /net::ERR_SSL/i,
    category: 'network',
    message: () => 'SSL/TLS connection error.',
    suggestion: 'The site has an invalid certificate. Try an alternative URL.',
    isRetryable: false,
  },
  {
    pattern: /net::ERR_CERT/i,
    category: 'network',
    message: () => 'Certificate verification failed.',
    suggestion: 'The site has a certificate issue. Try a different URL.',
    isRetryable: false,
  },
  {
    pattern: /net::ERR_ABORTED/i,
    category: 'navigation',
    message: () => 'Navigation was aborted.',
    suggestion: 'The page load was interrupted. Try navigating again.',
    isRetryable: true,
  },
  {
    // Catch-all for remaining network errors. The pattern consumes the whole
    // error code so the message names it (e.g. net::ERR_INTERNET_DISCONNECTED)
    // instead of only echoing the "net::ERR_" prefix.
    pattern: /net::ERR_[A-Z0-9_]*/i,
    category: 'network',
    message: (m) => `Network error: ${m[0]}`,
    suggestion: 'A network error occurred. Check the URL and try again.',
    isRetryable: true,
  },
  {
    pattern: /Navigation timeout of \d+ms exceeded/i,
    category: 'timeout',
    message: () => 'Page navigation timed out.',
    suggestion: 'The page took too long to load. Try again or navigate to a simpler page.',
    isRetryable: true,
  },
  {
    pattern: /Timeout \d+ms exceeded/i,
    category: 'timeout',
    message: () => 'Operation timed out.',
    suggestion: 'The operation took too long. Try a simpler action or wait and retry.',
    isRetryable: true,
  },
  {
    pattern: /waiting for selector/i,
    category: 'timeout',
    message: () => 'Timed out waiting for an element to appear.',
    suggestion:
      'The element may not exist on this page. Check the page content and try a different selector or index.',
    isRetryable: true,
  },
  {
    pattern: /Element is not visible/i,
    category: 'element_not_interactable',
    message: () => 'The element exists but is not visible.',
    suggestion: 'Try scrolling to make the element visible, or use a different element.',
    isRetryable: true,
  },
  {
    pattern: /Element is not attached to the DOM/i,
    category: 'element_stale',
    message: () => 'The element reference is stale - the element was removed from the page.',
    suggestion: 'The page content has changed. Re-read the page and use updated element indices.',
    isRetryable: true,
  },
  {
    pattern: /Element is outside of the viewport/i,
    category: 'element_not_interactable',
    message: () => 'The element is outside the visible viewport.',
    suggestion: 'Scroll to bring the element into view before interacting with it.',
    isRetryable: true,
  },
  {
    pattern: /Element is not (?:enabled|editable)/i,
    category: 'element_not_interactable',
    message: () => 'The element is disabled or read-only.',
    suggestion:
      'The element cannot be interacted with in its current state. Look for an alternative element or action.',
    isRetryable: false,
  },
  {
    pattern: /intercepts pointer events/i,
    category: 'element_not_interactable',
    message: () => 'Another element is covering the target element.',
    // The keyboard command is registered as `press_keys`; the hint must use
    // that name or the agent will ask for a command that does not exist.
    suggestion:
      'An overlay or dialog may be blocking the click. Try closing it first, or use press_keys as an alternative.',
    isRetryable: true,
  },
  {
    pattern: /(?:Element|Node)\s+(?:\d+\s+)?not found/i,
    category: 'element_not_found',
    message: () => 'The specified element was not found on the page.',
    suggestion:
      'The element index may be invalid. Re-read the page content to get updated element indices.',
    isRetryable: true,
  },
  {
    pattern: /frame was detached/i,
    category: 'element_stale',
    message: () => 'The frame containing the element has been detached.',
    suggestion: 'The page structure changed. Navigate to a stable page and retry.',
    isRetryable: true,
  },
  {
    pattern: /browser has been closed/i,
    category: 'crash',
    message: () => 'The browser has been closed unexpectedly.',
    suggestion: 'The browser session is no longer available.',
    isRetryable: false,
  },
  {
    pattern: /Target (?:page|context|browser) (?:closed|crashed)/i,
    category: 'crash',
    message: () => 'The browser page or context has crashed.',
    suggestion: 'The browser session is no longer available.',
    isRetryable: false,
  },
  {
    pattern: /Protocol error/i,
    category: 'crash',
    message: () => 'Browser protocol communication error.',
    suggestion: 'The browser may have crashed or become unresponsive.',
    isRetryable: false,
  },
  {
    pattern: /Permission denied|not allowed/i,
    category: 'permission',
    message: () => 'Permission denied for this operation.',
    suggestion:
      'The action requires permissions that are not available. Try an alternative approach.',
    isRetryable: false,
  },
];

/**
 * Analyze a browser or tool error and return a structured interpretation
 * with a human-readable message, category, and actionable suggestion.
 *
 * Resolution order:
 *  1. the typed errors (NavigationFailedError, ViewportCrashedError,
 *     UrlBlockedError), recognised with `instanceof`;
 *  2. the first ERROR_PATTERNS rule whose regex matches the message;
 *  3. an `unknown`, retryable fallback carrying the raw message.
 *
 * @param error - Anything that was thrown; non-Error values are stringified.
 * @returns The interpretation, including whether retrying makes sense.
 */
export function classifyViewportError(error: unknown): InterpretedViewportError {
  const rawMessage = error instanceof Error ? error.message : String(error);

  // Check for known error types first
  if (error instanceof NavigationFailedError) {
    return {
      category: 'navigation',
      message: `Navigation failed for ${error.url}: ${rawMessage}`,
      suggestion: 'Check the URL for correctness and try again.',
      isRetryable: true,
    };
  }
  if (error instanceof ViewportCrashedError) {
    return {
      category: 'crash',
      message: rawMessage,
      suggestion: 'The browser has crashed and the session must be restarted.',
      isRetryable: false,
    };
  }
  if (error instanceof UrlBlockedError) {
    return {
      category: 'permission',
      message: rawMessage,
      suggestion:
        'This URL is blocked by the allowed/blocked URL configuration. Use a different URL.',
      isRetryable: false,
    };
  }

  // Match against known patterns; the table is ordered, first hit wins
  for (const rule of ERROR_PATTERNS) {
    const match = rawMessage.match(rule.pattern);
    if (match) {
      return {
        category: rule.category,
        message: rule.message(match),
        suggestion: rule.suggestion,
        isRetryable: rule.isRetryable,
      };
    }
  }

  // Unknown error - default interpretation
  return {
    category: 'unknown',
    message: rawMessage,
    suggestion: 'An unexpected error occurred. Try a different action or approach.',
    isRetryable: true,
  };
}
// ── Structured extraction ──

/**
 * Extract information from a page and validate it against a Zod schema.
 * The LLM is prompted to return JSON conforming to the schema, then the
 * output is parsed and validated with Zod.
 *
 * Only the first 8000 characters of the page markdown are sent to the model.
 *
 * @throws Error when the page yields no content.
 * @throws SyntaxError when the model's reply is not valid JSON.
 * @throws whatever `schema.parse` throws when the JSON does not match.
 */
async extractStructured<T>(
  page: Page,
  goal: string,
  schema: z.ZodType<T>,
): Promise<T> {
  const markdown = await extractMarkdown(page);
  if (!markdown.trim()) {
    throw new Error('No content found on the page for structured extraction.');
  }

  // Build a readable outline of the schema for the prompt. Only object
  // schemas can be outlined; each field is replaced by its description
  // or, failing that, its Zod type name.
  const schemaDescription =
    schema instanceof z.ZodObject
      ? JSON.stringify(
          schema.shape,
          (_key, value) => {
            if (value?._def?.description) return `(${value._def.description})`;
            if (value?._def?.typeName) return value._def.typeName;
            return value;
          },
          2,
        )
      : 'See schema constraints';

  const text = markdown.slice(0, 8000);

  const StructuredOutputSchema = z.object({
    result: z.string().describe('JSON string conforming to the requested schema'),
  });

  const response = await this.model.invoke({
    messages: [
      systemMessage(
        'You are a precise information extractor. Extract the requested information from the provided text and return it as a valid JSON string in the "result" field. The JSON must conform to the schema described below.',
      ),
      userMessage(
        `Goal: ${goal}\n\nExpected schema:\n${schemaDescription}\n\nText content:\n${text}\n\nReturn the extracted data as a JSON string in the "result" field.`,
      ),
    ],
    responseSchema: StructuredOutputSchema,
    schemaName: 'StructuredOutput',
    temperature: 0,
  });

  const parsed = this.parseModelJson(response.parsed.result, 'Structured extraction');
  return schema.parse(parsed);
}

// ── Link extraction ──

/**
 * Extract all links from a page by delegating to the page-level link
 * extractor; the result is returned unchanged.
 */
async extractLinks(
  page: Page,
): Promise<Awaited<ReturnType<typeof extractPageLinks>>> {
  return extractPageLinks(page);
}

// ── Text extraction with optional JSON schema ──

/**
 * Extract the information described by `goal` from already-rendered text.
 *
 * @param text - Content to search.
 * @param goal - What to extract.
 * @param outputJsonSchema - When given, the model is asked for JSON matching
 *   this schema and the returned string is that JSON, re-serialised.
 * @returns The extracted text, or a JSON string in schema mode.
 * @throws SyntaxError in schema mode when the model's reply is not valid JSON.
 */
async extractFromText(
  text: string,
  goal: string,
  outputJsonSchema?: Record<string, unknown>,
): Promise<string> {
  // If a JSON schema is provided, ask the LLM to produce structured output
  if (outputJsonSchema) {
    return this.extractFromTextWithJsonSchema(text, goal, outputJsonSchema);
  }

  const result = await this.model.invoke({
    messages: [
      systemMessage(
        'You are a precise information extractor. Extract only the requested information from the provided text. Be concise and accurate.',
      ),
      userMessage(
        `Goal: ${goal}\n\nText content:\n${text}\n\nExtract the information specified in the goal. If the information is not found, say "No relevant information found."`,
      ),
    ],
    responseSchema: ExtractionResultSchema,
    schemaName: 'ExtractionResult',
    temperature: 0,
  });

  return result.parsed.content;
}

// ── Private helpers ──

/**
 * Parse JSON text produced by the model.
 *
 * Models often wrap JSON in a Markdown code fence even when told to return
 * a bare string, so a single surrounding fence is removed before parsing.
 * A parse failure is rethrown as a SyntaxError that says which extraction
 * produced the bad payload.
 */
private parseModelJson(raw: string, label: string): unknown {
  const trimmed = raw.trim();
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(trimmed);
  const candidate = fenced?.[1] ?? trimmed;
  try {
    return JSON.parse(candidate);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`${label} returned invalid JSON: ${reason}`);
  }
}

/**
 * Schema-guided variant of extractFromText: the model returns a JSON string
 * in the "json" field, which is parsed to verify it and re-serialised so the
 * caller receives compact, valid JSON.
 */
private async extractFromTextWithJsonSchema(
  text: string,
  goal: string,
  jsonSchema: Record<string, unknown>,
): Promise<string> {
  const schemaStr = JSON.stringify(jsonSchema, null, 2);

  const JsonExtractionSchema = z.object({
    json: z.string().describe('JSON conforming to the requested schema'),
  });

  const result = await this.model.invoke({
    messages: [
      systemMessage(
        'You are a precise information extractor. Extract the requested information and return it as valid JSON conforming to the provided schema. Put the JSON string in the "json" field.',
      ),
      userMessage(
        `Goal: ${goal}\n\nRequired JSON schema:\n${schemaStr}\n\nText content:\n${text}\n\nExtract and return as JSON.`,
      ),
    ],
    responseSchema: JsonExtractionSchema,
    schemaName: 'JsonExtraction',
    temperature: 0,
  });

  // Validate that the payload really is JSON before handing it back
  const parsed = this.parseModelJson(result.parsed.json, 'JSON extraction');
  return JSON.stringify(parsed);
}

/**
 * Merge per-chunk extraction results into one answer by asking the model to
 * de-duplicate and organise the numbered parts.
 */
private async combineExtractions(results: string[], goal: string): Promise<string> {
  const combined = results.map((r, i) => `Part ${i + 1}:\n${r}`).join('\n\n');

  const result = await this.model.invoke({
    messages: [
      systemMessage(
        'Combine the following extracted information into a single coherent response. Remove duplicates and organize logically.',
      ),
      userMessage(`Goal: ${goal}\n\nExtracted parts:\n${combined}`),
    ],
    responseSchema: ExtractionResultSchema,
    schemaName: 'ExtractionResult',
    temperature: 0,
  });

  return result.parsed.content;
}
import { z } from 'zod';

// ── Individual action schemas ──
//
// Every schema carries a literal `action` tag so the schemas can be combined
// into the discriminated union below. The `.describe()` strings are part of
// the schema itself, so they must describe what the handlers actually do.

/** Click an element, optionally several times or at explicit coordinates. */
export const TapCommandSchema = z.object({
  action: z.literal('tap'),
  index: z.number().describe('Element index to click'),
  clickCount: z.number().optional().default(1).describe('Number of clicks'),
  coordinateX: z.number().optional().describe('X coordinate for coordinate-based clicking'),
  coordinateY: z.number().optional().describe('Y coordinate for coordinate-based clicking'),
});

/** Type text into an element, clearing it first by default. */
export const TypeTextCommandSchema = z.object({
  action: z.literal('type_text'),
  index: z.number().describe('Element index to type into'),
  text: z.string().describe('Text to input'),
  clearFirst: z.boolean().optional().default(true).describe('Clear existing text first'),
});

/** Navigate the current tab to a URL. */
export const NavigateCommandSchema = z.object({
  action: z.literal('navigate'),
  url: z.string().describe('URL to navigate to'),
});

/** Go back one entry in the tab's history. */
export const BackCommandSchema = z.object({
  action: z.literal('back'),
});

/** Scroll the page, or a single element when `index` is given. */
export const ScrollCommandSchema = z.object({
  action: z.literal('scroll'),
  direction: z.enum(['up', 'down']).describe('Scroll direction'),
  // The scroll helpers hand this value straight to scrollBy(), so it is a
  // pixel distance; they fall back to 500 (page) or 300 (element).
  amount: z
    .number()
    .optional()
    .describe('Scroll amount in pixels (default: 500 for the page, 300 for an element)'),
  index: z.number().optional().describe('Element index to scroll within'),
  // NOTE(review): the built-in scroll handler reads only direction, amount
  // and index — confirm whether `pages` is consumed anywhere else.
  pages: z.number().optional().describe('Number of pages to scroll (fractional allowed)'),
});

/** Send a key or key combination to the page. */
export const PressKeysCommandSchema = z.object({
  action: z.literal('press_keys'),
  keys: z.string().describe('Keys to send (e.g., "Enter", "Escape", "Control+a")'),
});

/** Extract information from the page, optionally as schema-shaped JSON. */
export const ExtractCommandSchema = z.object({
  action: z.literal('extract'),
  goal: z.string().describe('What information to extract from the page'),
  outputSchema: z.record(z.unknown()).optional().describe('Optional JSON schema for structured output'),
});

/** End the task and report the final result. */
export const FinishCommandSchema = z.object({
  action: z.literal('finish'),
  text: z.string().describe('Final result text'),
  success: z.boolean().optional().default(true),
});

/** Switch focus to another tab. */
export const FocusTabCommandSchema = z.object({
  action: z.literal('focus_tab'),
  tabIndex: z.number().describe('Tab index to switch to'),
});

/** Open a URL in a new tab. */
export const NewTabCommandSchema = z.object({
  action: z.literal('new_tab'),
  url: z.string().describe('URL to open in new tab'),
});

/** Close a tab (the current one when no index is given). */
export const CloseTabCommandSchema = z.object({
  action: z.literal('close_tab'),
  tabIndex: z.number().optional().describe('Tab index to close (current if omitted)'),
});

/** Run a Google search. */
export const WebSearchCommandSchema = z.object({
  action: z.literal('web_search'),
  query: z.string().describe('Search query'),
});

/** Attach files to a file input element. */
export const UploadCommandSchema = z.object({
  action: z.literal('upload'),
  index: z.number().describe('File input element index'),
  filePaths: z.array(z.string()).describe('File paths to upload'),
});

/** Choose a dropdown option by its `value` attribute. */
export const SelectCommandSchema = z.object({
  action: z.literal('select'),
  index: z.number().describe('Select element index'),
  value: z.string().describe('Option value to select'),
});

/** Take a screenshot of the viewport or of the full page. */
export const CaptureCommandSchema = z.object({
  action: z.literal('capture'),
  fullPage: z.boolean().optional().default(false),
});

/** Read the page's text content. */
export const ReadPageCommandSchema = z.object({
  action: z.literal('read_page'),
});

/** Pause for a number of seconds. */
export const WaitCommandSchema = z.object({
  action: z.literal('wait'),
  seconds: z.number().optional().default(3).describe('Seconds to wait'),
});

// ── New action schemas ──

/** Scroll to the first occurrence of some text. */
export const ScrollToCommandSchema = z.object({
  action: z.literal('scroll_to'),
  text: z.string().describe('Text to scroll to on the page'),
});

/** List elements whose text or labelling attributes contain the query. */
export const FindCommandSchema = z.object({
  action: z.literal('find'),
  // The handler performs a literal, case-insensitive substring match, so the
  // description must ask for text to look for, not a natural-language
  // description of the elements.
  query: z
    .string()
    .describe(
      'Text to look for in element text, aria-label, placeholder or title (case-insensitive substring match, e.g., "submit")',
    ),
});

/** Search the web with a selectable engine. */
export const SearchCommandSchema = z.object({
  action: z.literal('search'),
  query: z.string().describe('Search query'),
  engine: z.enum(['google', 'duckduckgo', 'bing']).optional().default('google'),
});

/** List the options of a select element. */
export const ListOptionsCommandSchema = z.object({
  action: z.literal('list_options'),
  index: z.number().describe('Select element index'),
});

/** Choose a dropdown option by its visible text. */
export const PickOptionCommandSchema = z.object({
  action: z.literal('pick_option'),
  index: z.number().describe('Select element index'),
  optionText: z.string().describe('Text of the option to select'),
});

/** Extract schema-shaped JSON from the page content. */
export const ExtractStructuredCommandSchema = z.object({
  action: z.literal('extract_structured'),
  goal: z.string().describe('Description of what data to extract from the page'),
  outputSchema: z
    .record(z.unknown())
    .describe(
      'JSON Schema describing the structure of the expected output. The LLM will return data conforming to this schema.',
    ),
  maxContentLength: z
    .number()
    .optional()
    .default(8000)
    .describe('Maximum number of characters of page content to send to the LLM'),
});

// ── Discriminated union of all actions ──

/** Any built-in command, discriminated by its `action` literal. */
export const CommandSchema = z.discriminatedUnion('action', [
  TapCommandSchema,
  TypeTextCommandSchema,
  NavigateCommandSchema,
  BackCommandSchema,
  ScrollCommandSchema,
  PressKeysCommandSchema,
  ExtractCommandSchema,
  FinishCommandSchema,
  FocusTabCommandSchema,
  NewTabCommandSchema,
  CloseTabCommandSchema,
  WebSearchCommandSchema,
  UploadCommandSchema,
  SelectCommandSchema,
  CaptureCommandSchema,
  ReadPageCommandSchema,
  WaitCommandSchema,
  ScrollToCommandSchema,
  FindCommandSchema,
  SearchCommandSchema,
  ListOptionsCommandSchema,
  PickOptionCommandSchema,
  ExtractStructuredCommandSchema,
]);

/** A validated command object. */
export type Command = z.infer<typeof CommandSchema>;

/** The set of built-in command names. */
export type CommandName = Command['action'];

// ── Action result ──

/** Outcome of running one command. */
export interface CommandResult {
  success: boolean;
  extractedContent?: string;
  error?: string;
  /** Set by `finish`; the executor stops the sequence when it sees this. */
  isDone?: boolean;
  includeInMemory?: boolean;
}

// ── Browser error categories ──

/** Coarse classification assigned to a failed browser operation. */
export type ViewportErrorCategory =
  | 'navigation'
  | 'element_not_found'
  | 'element_stale'
  | 'element_not_interactable'
  | 'timeout'
  | 'permission'
  | 'network'
  | 'crash'
  | 'unknown';
suggestion: string; isRetryable: boolean; } // ── Custom action definition ── export interface CustomCommandSpec { name: string; description: string; schema: z.ZodObject; handler: (params: Record, context: ExecutionContext) => Promise; terminatesSequence?: boolean; } export interface ExecutionContext { page: import('playwright').Page; cdpSession: import('playwright').CDPSession; domService: import('../page/page-analyzer.js').PageAnalyzer; browserSession: import('../viewport/viewport.js').Viewport; extractionLlm?: import('../model/interface.js').LanguageModel; fileSystem?: import('../sandbox/file-access.js').FileAccess; maskedValues?: Record; } ================================================ FILE: packages/core/src/commands/utils.ts ================================================ import type { Page } from 'playwright'; export async function scrollPage( page: Page, direction: 'up' | 'down', amount?: number, ): Promise { const scrollAmount = amount ?? 500; const delta = direction === 'down' ? scrollAmount : -scrollAmount; await page.evaluate((d) => { window.scrollBy(0, d); }, delta); // Wait for scroll to complete await new Promise((resolve) => setTimeout(resolve, 200)); } export async function scrollElement( page: Page, selector: string, direction: 'up' | 'down', amount?: number, ): Promise { const scrollAmount = amount ?? 300; const delta = direction === 'down' ? 
import { config as loadDotenv } from 'dotenv';
import * as path from 'node:path';
import * as os from 'node:os';
import * as fs from 'node:fs';
import { type GlobalConfig, GlobalConfigSchema, type ConfigFileContents } from './types.js';
import type { DeepPartial } from '../types.js';
import { createLogger } from '../logging.js';

const logger = createLogger('config');

let _instance: Config | undefined;

/** True for non-null, non-array objects — the only values `deepMerge` recurses into. */
function isMergeableObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Process-wide configuration singleton.
 *
 * The effective configuration is assembled from three layers, lowest to
 * highest precedence: environment-derived defaults, the JSON config file in
 * the config directory, and the overrides passed to the first `instance()`
 * call. The merged object is then validated by `GlobalConfigSchema`.
 */
export class Config {
  readonly config: GlobalConfig;

  private constructor(overrides: DeepPartial<GlobalConfig> = {}) {
    loadDotenv();

    // Later arguments win: env defaults < config file < explicit overrides.
    const fileConfig = Config.loadConfigFile();
    const merged = this.deepMerge(this.mergeEnvDefaults({}), fileConfig, overrides);
    this.config = GlobalConfigSchema.parse(merged);
  }

  /**
   * Returns the shared instance, creating it on first use.
   *
   * @param overrides Applied only when the instance is created. On later calls
   *   they are ignored (a warning is logged); call `Config.reset()` first to
   *   rebuild with new overrides.
   */
  static instance(overrides?: DeepPartial<GlobalConfig>): Config {
    if (!_instance) {
      _instance = new Config(overrides);
    } else if (overrides && Object.keys(overrides).length > 0) {
      logger.warn('Config already initialised; overrides ignored. Call Config.reset() first.');
    }
    return _instance;
  }

  /** Drops the cached instance so the next `instance()` call rebuilds it. */
  static reset(): void {
    _instance = undefined;
  }

  /**
   * Builds the environment-derived layer and lets `overrides` win over it.
   *
   * `overrides` is spread first so that the merged `browser` object (env
   * defaults plus `overrides.browser`) is not replaced wholesale by
   * `overrides.browser` afterwards.
   */
  private mergeEnvDefaults(overrides: DeepPartial<GlobalConfig>): DeepPartial<GlobalConfig> {
    const env = process.env;

    // A dedicated proxy variable takes priority; otherwise fall back to the
    // conventional HTTPS_PROXY / HTTP_PROXY pair (HTTPS preferred).
    const fallbackProxy = env.HTTPS_PROXY || env.HTTP_PROXY;
    const proxy = env.OPEN_BROWSER_PROXY_SERVER
      ? {
          server: env.OPEN_BROWSER_PROXY_SERVER,
          username: env.OPEN_BROWSER_PROXY_USERNAME,
          password: env.OPEN_BROWSER_PROXY_PASSWORD,
        }
      : fallbackProxy
        ? { server: fallbackProxy }
        : undefined;

    return {
      ...overrides,
      browser: {
        // Headless unless explicitly disabled with BROWSER_HEADLESS=false.
        headless: env.BROWSER_HEADLESS !== 'false',
        relaxedSecurity: env.BROWSER_DISABLE_SECURITY === 'true',
        browserBinaryPath: env.BROWSER_BINARY_PATH,
        userDataDir: env.BROWSER_USER_DATA_DIR,
        ...(proxy ? { proxy } : {}),
        ...overrides.browser,
      },
      tracePath: overrides.tracePath ?? env.OPEN_BROWSER_TRACE_PATH,
      recordingPath: overrides.recordingPath ?? env.OPEN_BROWSER_SAVE_RECORDING_PATH,
    };
  }

  /**
   * Recursively merges the given layers left to right into a new object.
   * Nested non-array objects are merged key by key; arrays and primitives are
   * replaced; `undefined` values never overwrite an earlier layer.
   */
  private deepMerge(...objects: DeepPartial<GlobalConfig>[]): DeepPartial<GlobalConfig> {
    const result: Record<string, unknown> = {};

    for (const obj of objects) {
      if (!obj) continue;

      for (const [key, value] of Object.entries(obj)) {
        const existing = result[key];
        if (isMergeableObject(value) && isMergeableObject(existing)) {
          result[key] = this.deepMerge(
            existing as DeepPartial<GlobalConfig>,
            value as DeepPartial<GlobalConfig>,
          );
        } else if (value !== undefined) {
          result[key] = value;
        }
      }
    }

    return result as DeepPartial<GlobalConfig>;
  }

  get browser() {
    return this.config.browser;
  }

  get agent() {
    return this.config.agent;
  }

  /** `~/.open-browser`; the directory is created on access if missing. */
  static get configDir(): string {
    const dir = path.join(os.homedir(), '.open-browser');
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
    return dir;
  }

  /** `tmp` inside the config directory; created on access if missing. */
  static get tmpDir(): string {
    const dir = path.join(Config.configDir, 'tmp');
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
    return dir;
  }

  static get configFilePath(): string {
    return path.join(Config.configDir, 'config.json');
  }

  /**
   * Reads and parses the JSON config file. Best-effort: a missing file yields
   * `{}`, and a read/parse failure or a non-object JSON root is logged as a
   * warning and also yields `{}`.
   */
  static loadConfigFile(): DeepPartial<GlobalConfig> {
    try {
      const filePath = Config.configFilePath;
      if (fs.existsSync(filePath)) {
        const raw = fs.readFileSync(filePath, 'utf-8');
        const parsed: unknown = JSON.parse(raw);
        if (!isMergeableObject(parsed)) {
          logger.warn(`Ignoring config file ${filePath}: expected a JSON object at the top level`);
          return {};
        }
        logger.debug(`Loaded config from ${filePath}`);
        return parsed as ConfigFileContents;
      }
    } catch (error) {
      logger.warn(`Failed to load config file: ${error}`);
    }
    return {};
  }

  /** Writes `config` as pretty-printed JSON to the config file path. */
  static saveConfigFile(config: ConfigFileContents): void {
    const filePath = Config.configFilePath;
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(filePath), { recursive: true });
    fs.writeFileSync(filePath, JSON.stringify(config, null, 2), 'utf-8');
    logger.info(`Config saved to ${filePath}`);
  }

  /**
   * Heuristic container check: `/.dockerenv` exists, or `/proc/1/cgroup`
   * mentions `docker` or `kubepods`. Any filesystem error counts as "no".
   */
  static isDocker(): boolean {
    try {
      if (fs.existsSync('/.dockerenv')) return true;
      if (fs.existsSync('/proc/1/cgroup')) {
        const cgroup = fs.readFileSync('/proc/1/cgroup', 'utf-8');
        return cgroup.includes('docker') || cgroup.includes('kubepods');
      }
    } catch {
      // Not on Linux, definitely not Docker
    }
    return false;
  }

  /**
   * Windows and macOS are assumed to have a display; elsewhere one is
   * reported only if `DISPLAY` or `WAYLAND_DISPLAY` is set.
   */
  static hasDisplay(): boolean {
    if (process.platform === 'win32') return true;
    if (process.platform === 'darwin') return true;
    return !!process.env.DISPLAY || !!process.env.WAYLAND_DISPLAY;
  }
}
/** Root of the library's error hierarchy; every error below derives from it. */
export class OpenBrowserError extends Error {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'OpenBrowserError';
  }
}

/** Base class for browser/viewport-level failures. */
export class ViewportError extends OpenBrowserError {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'ViewportError';
  }
}

/** Viewport error subtype named `LaunchFailedError`. */
export class LaunchFailedError extends ViewportError {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'LaunchFailedError';
  }
}

/** Viewport error that additionally records the `url` involved. */
export class NavigationFailedError extends ViewportError {
  public readonly url: string;

  constructor(message: string, url: string, options?: ErrorOptions) {
    super(message, options);
    this.url = url;
    this.name = 'NavigationFailedError';
  }
}

/** Viewport error whose message defaults to "Browser has crashed". */
export class ViewportCrashedError extends ViewportError {
  constructor(message = 'Browser has crashed', options?: ErrorOptions) {
    super(message, options);
    this.name = 'ViewportCrashedError';
  }
}

/** Base class for agent-level failures. */
export class AgentError extends OpenBrowserError {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'AgentError';
  }
}

/** Agent error whose message defaults to "Agent is stuck in a loop". */
export class AgentStalledError extends AgentError {
  constructor(message = 'Agent is stuck in a loop', options?: ErrorOptions) {
    super(message, options);
    this.name = 'AgentStalledError';
  }
}

/** Agent error carrying the step count reached and the configured limit. */
export class StepLimitExceededError extends AgentError {
  public readonly stepsTaken: number;
  public readonly stepLimit: number;

  constructor(stepsTaken: number, stepLimit: number, options?: ErrorOptions) {
    const progress = `${stepsTaken}/${stepLimit}`;
    super(`Agent reached maximum steps (${progress})`, options);
    this.name = 'StepLimitExceededError';
    this.stepsTaken = stepsTaken;
    this.stepLimit = stepLimit;
  }
}

/** Error for a disallowed URL; the URL is kept on `url` and in the message. */
export class UrlBlockedError extends OpenBrowserError {
  public readonly url: string;

  constructor(url: string, options?: ErrorOptions) {
    super(`URL not allowed: ${url}`, options);
    this.name = 'UrlBlockedError';
    this.url = url;
  }
}

/** Error subtype named `PageExtractionError`. */
export class PageExtractionError extends OpenBrowserError {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'PageExtractionError';
  }
}

/** Base class for language-model failures. */
export class ModelError extends OpenBrowserError {
  constructor(message: string, options?: ErrorOptions) {
    super(message, options);
    this.name = 'ModelError';
  }
}

/** Model error that may carry a `retryAfterMs` value. */
export class ModelThrottledError extends ModelError {
  public readonly retryAfterMs?: number;

  constructor(message: string, retryAfterMs?: number, options?: ErrorOptions) {
    super(message, options);
    this.name = 'ModelThrottledError';
    this.retryAfterMs = retryAfterMs;
  }
}

/** Error for a failed command; the message is prefixed with the tool name. */
export class CommandFailedError extends OpenBrowserError {
  public readonly toolName: string;

  constructor(toolName: string, message: string, options?: ErrorOptions) {
    super(`Tool "${toolName}" failed: ${message}`, options);
    this.name = 'CommandFailedError';
    this.toolName = toolName;
  }
}

/**
 * Viewport error enriched with page context. The step number and page URL are
 * embedded in the message; the title is only exposed as a property.
 */
export class ContextualViewportError extends ViewportError {
  public readonly pageUrl: string;
  public readonly pageTitle: string;
  public readonly stepNumber: number;

  constructor(
    message: string,
    context: { pageUrl: string; pageTitle: string; stepNumber: number },
    options?: ErrorOptions,
  ) {
    const { pageUrl, pageTitle, stepNumber } = context;
    super(`[Step ${stepNumber}] ${message} (url: ${pageUrl})`, options);
    this.name = 'ContextualViewportError';
    this.pageUrl = pageUrl;
    this.pageTitle = pageTitle;
    this.stepNumber = stepNumber;
  }
}

/** Model error tagged with the provider name and an optional HTTP status code. */
export class ProviderError extends ModelError {
  public readonly provider: string;
  public readonly statusCode?: number;

  constructor(
    provider: string,
    message: string,
    statusCode?: number,
    options?: ErrorOptions,
  ) {
    super(`[${provider}] ${message}`, options);
    this.name = 'ProviderError';
    this.provider = provider;
    this.statusCode = statusCode;
  }

  /** True for status 429 or any status >= 500; false when no status is known. */
  get isRetryable(): boolean {
    const code = this.statusCode;
    return code !== undefined && (code === 429 || code >= 500);
  }
}

/** Validation error listing the offending field and the individual issues. */
export class SchemaViolationError extends OpenBrowserError {
  public readonly field: string;
  public readonly issues: string[];

  constructor(field: string, issues: string[], options?: ErrorOptions) {
    const details = issues.join('; ');
    super(`Validation failed for "${field}": ${details}`, options);
    this.name = 'SchemaViolationError';
    this.field = field;
    this.issues = issues;
  }
}
VisualTracerOptions, type TabDescriptor, type ViewportSnapshot, type ViewportHistory, type LaunchOptions, type PageState, type ViewportEventMap, type ViewportRequestMap, type NavigateEvent, type ClickEvent, type InputEvent, type ScrollEvent, type ScreenshotEvent, type ScreenshotResult, type DownloadEvent, type PopupEvent, type SecurityEvent, type CrashEvent, } from './viewport/index.js'; // ── DOM ── export { PageAnalyzer, type PageAnalyzerOptions, SnapshotBuilder, TreeRenderer, type RendererOptions, extractMarkdown, htmlToMarkdown, extractTextContent, extractLinks, chunkText, type MarkdownExtractionOptions, type PageTreeNode, type SelectorIndex, type RenderedPageState, type DOMRect, type CDPSnapshotResult, type AXNode, type TargetInfo, type TargetAllTrees, type InteractedElement, type MatchLevel, type SimplifiedNode, } from './page/index.js'; // ── FileAccess ── export { FileAccess, type FileAccessOptions, type FileInfo, type FileAccessState, } from './sandbox/index.js'; // ── Commands ── export { CommandExecutor, type CommandExecutorOptions, classifyViewportError, CommandCatalog, ContentExtractor, type CatalogEntry, type CatalogOptions, CommandSchema, type Command, type CommandName, type CommandResult, type ExecutionContext, type CustomCommandSpec, type ViewportErrorCategory, type InterpretedViewportError, TapCommandSchema, TypeTextCommandSchema, NavigateCommandSchema, BackCommandSchema, ScrollCommandSchema, PressKeysCommandSchema, ExtractCommandSchema, FinishCommandSchema, FocusTabCommandSchema, NewTabCommandSchema, CloseTabCommandSchema, WebSearchCommandSchema, UploadCommandSchema, SelectCommandSchema, CaptureCommandSchema, ReadPageCommandSchema, WaitCommandSchema, ScrollToCommandSchema, FindCommandSchema, SearchCommandSchema, ListOptionsCommandSchema, PickOptionCommandSchema, ExtractStructuredCommandSchema, } from './commands/index.js'; // ── Agent ── export { Agent, type AgentOptions, InstructionBuilder, StepPromptBuilder, buildCommandDescriptions, 
buildContextualCommands, buildExtractionInstructionBuilder, buildExtractionUserPrompt, clearTemplateCache, type PromptTemplate, type InstructionBuilderOptions, type StepInfo, type StepPromptBuilderOptions, ConversationManager, StallDetector, hashPageTree, hashTextContent, type PageSignature, type StallDetectorConfig, type StallCheckResult, ResultEvaluator, constructEvaluatorMessages, constructQuickCheckMessages, ReplayRecorder, type ReplayRecorderOptions, type AgentConfig, type AgentState, type AgentDecision, type AgentDecisionCompact, type AgentDecisionDirect, type StepRecord, ExecutionLog, type RunOutcome, type Reasoning, type PlanStep, type EvaluationResult, type QuickCheckResult, type CompactionPolicy, type StepTelemetry, type ExtractedVariable, type AccumulatedCost, type StepCostBreakdown, type PricingTable as AgentPricingTable, type PlanRevision, AgentDecisionSchema, AgentDecisionCompactSchema, AgentDecisionDirectSchema, ReasoningSchema, EvaluationResultSchema, QuickCheckResultSchema, PlanStepSchema, StrategyPlanSchema, PlanRevisionSchema, PRICING_TABLE, calculateStepCost, supportsDeepReasoning, supportsCoordinateMode, isCompactModel, DEFAULT_AGENT_CONFIG, type ConversationManagerOptions, type TrackedMessage, type ConversationManagerState, type ConversationEntry, type SerializedTrackedMessage, type MessageCategory, estimateTokens, estimateMessageTokens, redactSensitiveValues, redactMessage, redactMessages, extractTextContent as extractMessageTextContent, truncate, } from './agent/index.js'; // ── Bridge ── export { BridgeServer, type BridgeServerOptions, BridgeClient, type BridgeClientOptions, BridgeAdapter } from './bridge/index.js'; // ── Metering ── export { UsageMeter, CompositeUsageMeter, BudgetDepletedError, estimateTokenCount, DEFAULT_COST_RATES, type UsageRecord, type CostRates, type PricingTable, type ModelRole, type ActionUsageRecord, type MeteringSummary, type ModelUsageBreakdown, type RoleUsageBreakdown, type BudgetPolicy, type BudgetState, } from 
import { LogLevel } from './types.js';

/** Display label for each log level. */
const LEVEL_NAMES: Record<LogLevel, string> = {
  [LogLevel.DEBUG]: 'DEBUG',
  [LogLevel.INFO]: 'INFO',
  [LogLevel.WARN]: 'WARN',
  [LogLevel.ERROR]: 'ERROR',
};

/** ANSI colour escape used for each level's label. */
const LEVEL_COLORS: Record<LogLevel, string> = {
  [LogLevel.DEBUG]: '\x1b[36m', // cyan
  [LogLevel.INFO]: '\x1b[32m', // green
  [LogLevel.WARN]: '\x1b[33m', // yellow
  [LogLevel.ERROR]: '\x1b[31m', // red
};

const RESET = '\x1b[0m';
const DIM = '\x1b[2m';
const BOLD = '\x1b[1m';

// Module-wide settings shared by every Logger instance.
let globalLevel: LogLevel = LogLevel.INFO;
let useColors = true;
let logTimestamps = true;

/** Sets the level used by loggers that have no level of their own. */
export function setGlobalLogLevel(level: LogLevel): void {
  globalLevel = level;
}

/** Returns the current module-wide level. */
export function getGlobalLogLevel(): LogLevel {
  return globalLevel;
}

/** Enables or disables ANSI colour codes in formatted output. */
export function setLogColors(enabled: boolean): void {
  useColors = enabled;
}

/** Enables or disables the leading timestamp in formatted output. */
export function setLogTimestamps(enabled: boolean): void {
  logTimestamps = enabled;
}

/** Local wall-clock time formatted as `HH:MM:SS.mmm`. */
function formatTimestamp(): string {
  const now = new Date();
  const h = now.getHours().toString().padStart(2, '0');
  const m = now.getMinutes().toString().padStart(2, '0');
  const s = now.getSeconds().toString().padStart(2, '0');
  const ms = now.getMilliseconds().toString().padStart(3, '0');
  return `${h}:${m}:${s}.${ms}`;
}

/**
 * Builds one log line: optional timestamp, level label padded to five
 * characters, the logger name in brackets, then the message — joined with
 * single spaces and wrapped in colour codes when colours are enabled.
 */
function formatMessage(
  level: LogLevel,
  name: string,
  message: string,
): string {
  const parts: string[] = [];

  if (logTimestamps) {
    const ts = formatTimestamp();
    parts.push(useColors ? `${DIM}${ts}${RESET}` : ts);
  }

  // Fallbacks cover values outside the known level set.
  const levelName = LEVEL_NAMES[level] ?? 'UNKNOWN';
  const color = LEVEL_COLORS[level] ?? '';

  if (useColors) {
    parts.push(`${color}${levelName.padEnd(5)}${RESET}`);
    parts.push(`${BOLD}[${name}]${RESET}`);
  } else {
    parts.push(levelName.padEnd(5));
    parts.push(`[${name}]`);
  }

  parts.push(message);
  return parts.join(' ');
}

/**
 * Named console logger. A logger uses its own level when one has been set via
 * `setLevel`, otherwise the module-wide level.
 */
export class Logger {
  readonly name: string;
  private level: LogLevel | null = null;

  constructor(name: string) {
    this.name = name;
  }

  /** Pins this logger to `level`, overriding the module-wide level. */
  setLevel(level: LogLevel): void {
    this.level = level;
  }

  /** This logger's own level if set, else the module-wide level. */
  getEffectiveLevel(): LogLevel {
    return this.level ?? globalLevel;
  }

  /** True when `level` is at or above the effective level. */
  isEnabled(level: LogLevel): boolean {
    return level >= this.getEffectiveLevel();
  }

  debug(message: string, ...args: unknown[]): void {
    this.log(LogLevel.DEBUG, message, ...args);
  }

  info(message: string, ...args: unknown[]): void {
    this.log(LogLevel.INFO, message, ...args);
  }

  warn(message: string, ...args: unknown[]): void {
    this.log(LogLevel.WARN, message, ...args);
  }

  error(message: string, ...args: unknown[]): void {
    this.log(LogLevel.ERROR, message, ...args);
  }

  /**
   * Formats and emits the message if `level` is enabled. ERROR goes to
   * `console.error`, WARN to `console.warn`, everything else to `console.log`;
   * extra `args` are passed to the console unformatted.
   */
  private log(level: LogLevel, message: string, ...args: unknown[]): void {
    if (!this.isEnabled(level)) return;

    const formatted = formatMessage(level, this.name, message);

    switch (level) {
      case LogLevel.ERROR:
        console.error(formatted, ...args);
        break;
      case LogLevel.WARN:
        console.warn(formatted, ...args);
        break;
      default:
        console.log(formatted, ...args);
    }
  }
}

/** Loggers already handed out, keyed by name. */
const loggerCache = new Map<string, Logger>();

/**
 * Returns the logger registered under `name`, creating and caching it on
 * first request so repeated calls with the same name share one instance.
 */
export function createLogger(name: string): Logger {
  let logger = loggerCache.get(name);
  if (!logger) {
    logger = new Logger(name);
    loggerCache.set(name, logger);
  }
  return logger;
}
BudgetState, } from './types.js'; ================================================ FILE: packages/core/src/metering/tracker.test.ts ================================================ import { test, expect, describe, beforeEach, mock } from 'bun:test'; import { UsageMeter, CompositeUsageMeter, BudgetDepletedError, estimateTokenCount, } from './tracker.js'; import type { PricingTable } from './types.js'; // ── Shared pricing for predictable cost calculations ── const TEST_PRICING: PricingTable = { 'gpt-4o': { inputCostPerMillion: 2.5, outputCostPerMillion: 10.0 }, 'gpt-4o-mini': { inputCostPerMillion: 0.15, outputCostPerMillion: 0.6 }, 'claude-3-5-sonnet': { inputCostPerMillion: 3.0, outputCostPerMillion: 15.0 }, }; // ── UsageMeter ── describe('UsageMeter', () => { let tracker: UsageMeter; beforeEach(() => { tracker = new UsageMeter('gpt-4o', TEST_PRICING); }); describe('record and getTotalUsage', () => { test('records token usage and returns totals', () => { tracker.record(100, 50); const usage = tracker.getTotalUsage(); expect(usage.inputTokens).toBe(100); expect(usage.outputTokens).toBe(50); expect(usage.totalTokens).toBe(150); }); test('accumulates across multiple records', () => { tracker.record(100, 50); tracker.record(200, 100); tracker.record(300, 150); const usage = tracker.getTotalUsage(); expect(usage.inputTokens).toBe(600); expect(usage.outputTokens).toBe(300); expect(usage.totalTokens).toBe(900); }); test('returns a copy of usage object', () => { tracker.record(100, 50); const usage1 = tracker.getTotalUsage(); const usage2 = tracker.getTotalUsage(); expect(usage1).not.toBe(usage2); expect(usage1).toEqual(usage2); }); }); describe('getEstimatedCost', () => { test('computes correct cost for gpt-4o', () => { // gpt-4o: $2.50/M input, $10.00/M output tracker.record(1_000_000, 500_000); const cost = tracker.getEstimatedCost(); // input: 1M * 2.5/M = 2.5; output: 0.5M * 10/M = 5.0 expect(cost).toBeCloseTo(7.5, 4); }); test('returns 0 for unknown model', () => { 
const unknown = new UsageMeter('unknown-model', TEST_PRICING); unknown.record(1000, 500); expect(unknown.getEstimatedCost()).toBe(0); }); test('formats cost as dollar string', () => { tracker.record(100_000, 50_000); const formatted = tracker.getEstimatedCostFormatted(); expect(formatted).toMatch(/^\$\d+\.\d{4}$/); }); }); describe('getStepUsages', () => { test('tracks per-step usage', () => { tracker.record(100, 50); tracker.record(200, 100); const steps = tracker.getStepUsages(); expect(steps).toHaveLength(2); expect(steps[0]).toEqual({ inputTokens: 100, outputTokens: 50, totalTokens: 150 }); expect(steps[1]).toEqual({ inputTokens: 200, outputTokens: 100, totalTokens: 300 }); }); test('returns a copy of step usages array', () => { tracker.record(100, 50); const steps1 = tracker.getStepUsages(); const steps2 = tracker.getStepUsages(); expect(steps1).not.toBe(steps2); }); }); describe('getSummary', () => { test('returns formatted summary string', () => { tracker.record(1000, 500); const summary = tracker.getSummary(); expect(summary).toContain('Model: gpt-4o'); expect(summary).toContain('Steps: 1'); expect(summary).toContain('Input tokens:'); expect(summary).toContain('Output tokens:'); expect(summary).toContain('Total tokens:'); expect(summary).toContain('Estimated cost: $'); }); }); describe('reset', () => { test('resets all usage data', () => { tracker.record(1000, 500); tracker.record(2000, 1000); tracker.reset(); const usage = tracker.getTotalUsage(); expect(usage.inputTokens).toBe(0); expect(usage.outputTokens).toBe(0); expect(usage.totalTokens).toBe(0); expect(tracker.getStepUsages()).toHaveLength(0); expect(tracker.getEstimatedCost()).toBe(0); }); }); describe('partial model matching', () => { test('matches model by partial ID', () => { // "gpt-4o" pricing should match "gpt-4o-2024-08-06" via partial match const versioned = new UsageMeter('gpt-4o-2024-08-06', TEST_PRICING); versioned.record(1_000_000, 0); // Should find gpt-4o pricing ($2.50/M input) 
expect(versioned.getEstimatedCost()).toBeCloseTo(2.5, 4); }); }); }); // ── CompositeUsageMeter ── describe('CompositeUsageMeter', () => { let multiTracker: CompositeUsageMeter; beforeEach(() => { multiTracker = new CompositeUsageMeter(TEST_PRICING); }); describe('record and getTotalUsage', () => { test('records usage for a single model', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1000, outputTokens: 500, }); const usage = multiTracker.getTotalUsage(); expect(usage.inputTokens).toBe(1000); expect(usage.outputTokens).toBe(500); expect(usage.totalTokens).toBe(1500); }); test('aggregates across multiple models', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1000, outputTokens: 500, }); multiTracker.record({ modelId: 'gpt-4o-mini', role: 'extraction', inputTokens: 2000, outputTokens: 800, }); const usage = multiTracker.getTotalUsage(); expect(usage.inputTokens).toBe(3000); expect(usage.outputTokens).toBe(1300); expect(usage.totalTokens).toBe(4300); }); test('returns estimated cost for the recorded call', () => { const cost = multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1_000_000, outputTokens: 0, }); // gpt-4o: $2.50/M input expect(cost).toBeCloseTo(2.5, 4); }); }); describe('getTotalCost', () => { test('sums costs across all models', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1_000_000, outputTokens: 0, }); multiTracker.record({ modelId: 'gpt-4o-mini', role: 'extraction', inputTokens: 1_000_000, outputTokens: 0, }); const totalCost = multiTracker.getTotalCost(); // gpt-4o: $2.50; gpt-4o-mini: $0.15 expect(totalCost).toBeCloseTo(2.65, 4); }); test('formats total cost', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 100_000, outputTokens: 50_000, }); const formatted = multiTracker.getTotalCostFormatted(); expect(formatted).toMatch(/^\$\d+\.\d{4}$/); }); }); describe('getTracker', () => { test('returns per-model tracker', () 
=> { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 500, outputTokens: 200, }); const tracker = multiTracker.getTracker('gpt-4o'); expect(tracker.getTotalUsage().inputTokens).toBe(500); }); test('creates tracker on first access', () => { const tracker = multiTracker.getTracker('claude-3-5-sonnet'); expect(tracker).toBeDefined(); expect(tracker.getTotalUsage().totalTokens).toBe(0); }); }); describe('budget alerts', () => { test('fires threshold callback when cost crosses threshold', () => { const thresholdCrossed = mock(() => {}); multiTracker.setBudget({ maxCostUsd: 1.0, thresholds: [0.5, 0.8, 1.0], onThresholdCrossed: thresholdCrossed, }); // Record enough to cross 0.5 threshold ($0.50) // gpt-4o: $2.50/M input -> need 200k tokens for $0.50 multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); expect(thresholdCrossed).toHaveBeenCalledTimes(1); const call = (thresholdCrossed as any).mock.calls[0]; expect(call[1]).toBe(0.5); // threshold expect(call[2]).toBe(1.0); // maxCost }); test('fires multiple thresholds as cost increases', () => { const thresholdCrossed = mock(() => {}); multiTracker.setBudget({ maxCostUsd: 1.0, thresholds: [0.5, 1.0], onThresholdCrossed: thresholdCrossed, }); // Cross 0.5 threshold multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); // Cross 1.0 threshold multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); expect(thresholdCrossed).toHaveBeenCalledTimes(2); }); test('does not fire same threshold twice', () => { const thresholdCrossed = mock(() => {}); multiTracker.setBudget({ maxCostUsd: 1.0, thresholds: [0.5], onThresholdCrossed: thresholdCrossed, }); // Cross 0.5 threshold twice multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 10_000, outputTokens: 0, }); 
expect(thresholdCrossed).toHaveBeenCalledTimes(1); }); test('throws BudgetDepletedError when budget exceeded and callback returns false', () => { multiTracker.setBudget({ maxCostUsd: 0.01, thresholds: [1.0], onThresholdCrossed: () => {}, onBudgetExhausted: () => false, }); expect(() => multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1_000_000, outputTokens: 0, }), ).toThrow(BudgetDepletedError); }); test('allows continuing when onBudgetExhausted returns true', () => { multiTracker.setBudget({ maxCostUsd: 0.01, thresholds: [1.0], onThresholdCrossed: () => {}, onBudgetExhausted: () => true, }); expect(() => multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1_000_000, outputTokens: 0, }), ).not.toThrow(); }); test('getBudgetState reflects current state', () => { multiTracker.setBudget({ maxCostUsd: 10.0, thresholds: [0.5], onThresholdCrossed: () => {}, }); multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1_000_000, outputTokens: 0, }); const status = multiTracker.getBudgetState(); expect(status.maxCostUsd).toBe(10.0); expect(status.currentCostUsd).toBeCloseTo(2.5, 2); expect(status.fractionUsed).toBeCloseTo(0.25, 2); expect(status.isExhausted).toBe(false); }); test('clearBudget removes budget configuration', () => { multiTracker.setBudget({ maxCostUsd: 1.0, thresholds: [0.5], onThresholdCrossed: () => {}, }); multiTracker.clearBudget(); const status = multiTracker.getBudgetState(); expect(status.maxCostUsd).toBeUndefined(); expect(status.fractionUsed).toBeUndefined(); expect(status.isExhausted).toBe(false); }); }); describe('MeteringSummary generation', () => { test('generates comprehensive summary', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1000, outputTokens: 500, stepIndex: 0, actionName: 'tap', }); multiTracker.record({ modelId: 'gpt-4o-mini', role: 'extraction', inputTokens: 2000, outputTokens: 300, stepIndex: 1, actionName: 'extract', }); const summary = 
multiTracker.getSummary(); expect(summary.totalInputTokens).toBe(3000); expect(summary.totalOutputTokens).toBe(800); expect(summary.totalTokens).toBe(3800); expect(summary.totalCalls).toBe(2); expect(summary.totalEstimatedCost).toBeGreaterThan(0); // By model breakdown expect(summary.byModel).toHaveLength(2); const gpt4o = summary.byModel.find((m) => m.modelId === 'gpt-4o'); expect(gpt4o).toBeDefined(); expect(gpt4o!.inputTokens).toBe(1000); expect(gpt4o!.callCount).toBe(1); // By role breakdown expect(summary.byRole).toHaveLength(2); const mainRole = summary.byRole.find((r) => r.role === 'main'); expect(mainRole).toBeDefined(); expect(mainRole!.callCount).toBe(1); // Action trace expect(summary.actionTrace).toHaveLength(2); expect(summary.actionTrace[0].actionName).toBe('tap'); expect(summary.actionTrace[1].actionName).toBe('extract'); // Duration expect(summary.durationMs).toBeDefined(); expect(summary.durationMs!).toBeGreaterThanOrEqual(0); }); test('generates human-readable summary text', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 10000, outputTokens: 5000, }); const text = multiTracker.getSummaryText(); expect(text).toContain('Token Usage Summary'); expect(text).toContain('Total:'); expect(text).toContain('Cost:'); expect(text).toContain('Calls:'); expect(text).toContain('Duration:'); expect(text).toContain('By Role'); expect(text).toContain('By Model'); }); test('includes budget info in summary text when configured', () => { multiTracker.setBudget({ maxCostUsd: 5.0, thresholds: [], onThresholdCrossed: () => {}, }); multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 100_000, outputTokens: 0, }); const text = multiTracker.getSummaryText(); expect(text).toContain('Budget:'); expect(text).toContain('$5.0000'); }); }); describe('reset', () => { test('clears all tracking data', () => { multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 1000, outputTokens: 500, }); multiTracker.record({ modelId: 
'gpt-4o-mini', role: 'extraction', inputTokens: 500, outputTokens: 200, }); multiTracker.reset(); const usage = multiTracker.getTotalUsage(); expect(usage.totalTokens).toBe(0); expect(multiTracker.getTotalCost()).toBe(0); const summary = multiTracker.getSummary(); expect(summary.totalCalls).toBe(0); expect(summary.byModel).toHaveLength(0); expect(summary.byRole).toHaveLength(0); expect(summary.durationMs).toBeUndefined(); }); test('resets budget thresholds', () => { const thresholdCrossed = mock(() => {}); multiTracker.setBudget({ maxCostUsd: 1.0, thresholds: [0.5], onThresholdCrossed: thresholdCrossed, }); // Cross 0.5 threshold multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); multiTracker.reset(); // Record again -- should fire threshold callback again since it was reset // But reset() clears crossedThresholds AND trackers, so cost starts at 0 multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 200_000, outputTokens: 0, }); // Both before and after reset should have fired expect(thresholdCrossed).toHaveBeenCalledTimes(2); }); }); describe('auto-start', () => { test('automatically starts timer on first record', () => { const summary1 = multiTracker.getSummary(); expect(summary1.durationMs).toBeUndefined(); multiTracker.record({ modelId: 'gpt-4o', role: 'main', inputTokens: 100, outputTokens: 50, }); const summary2 = multiTracker.getSummary(); expect(summary2.durationMs).toBeDefined(); }); test('explicit start() sets the timer', () => { multiTracker.start(); const summary = multiTracker.getSummary(); expect(summary.durationMs).toBeDefined(); expect(summary.durationMs!).toBeGreaterThanOrEqual(0); }); }); }); // ── estimateTokenCount ── describe('estimateTokenCount', () => { test('estimates roughly 1 token per 4 chars', () => { expect(estimateTokenCount('hello world')).toBe(3); // ceil(11/4) }); test('returns 0 for empty string', () => { expect(estimateTokenCount('')).toBe(0); }); test('rounds up', () 
// ── Single-model tracker (unchanged public API) ──

/**
 * Accumulates token usage for a single model and estimates its cost from a
 * pricing table.
 */
export class UsageMeter {
  private usage: UsageRecord = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  private pricing: PricingTable;
  private modelId: string;
  private stepUsages: UsageRecord[] = [];

  /**
   * @param modelId Model identifier used to look up pricing.
   * @param customPricing Pricing table used instead of `DEFAULT_COST_RATES`.
   */
  constructor(modelId: string, customPricing?: PricingTable) {
    this.modelId = modelId;
    this.pricing = customPricing ?? DEFAULT_COST_RATES;
  }

  /** Add one call's token counts to the running totals and to the per-step log. */
  record(inputTokens: number, outputTokens: number): void {
    const stepUsage: UsageRecord = {
      inputTokens,
      outputTokens,
      totalTokens: inputTokens + outputTokens,
    };
    this.usage.inputTokens += inputTokens;
    this.usage.outputTokens += outputTokens;
    this.usage.totalTokens += inputTokens + outputTokens;
    this.stepUsages.push(stepUsage);
  }

  /** @returns A copy of the accumulated totals. */
  getTotalUsage(): UsageRecord {
    return { ...this.usage };
  }

  /** @returns A shallow copy of the per-call usage list, in recording order. */
  getStepUsages(): UsageRecord[] {
    return [...this.stepUsages];
  }

  /** @returns Estimated cost in USD, or 0 when no pricing is found for the model. */
  getEstimatedCost(): number {
    const cost = this.getModelCost();
    if (!cost) return 0;
    return (
      (this.usage.inputTokens / 1_000_000) * cost.inputCostPerMillion +
      (this.usage.outputTokens / 1_000_000) * cost.outputCostPerMillion
    );
  }

  /** @returns The estimated cost formatted as `$` plus four decimals. */
  getEstimatedCostFormatted(): string {
    const cost = this.getEstimatedCost();
    return `$${cost.toFixed(4)}`;
  }

  private getModelCost(): CostRates | undefined {
    return resolveModelCost(this.modelId, this.pricing);
  }

  /** @returns A multi-line, human-readable usage report for this model. */
  getSummary(): string {
    const lines = [
      `Model: ${this.modelId}`,
      `Steps: ${this.stepUsages.length}`,
      `Input tokens: ${this.usage.inputTokens.toLocaleString()}`,
      `Output tokens: ${this.usage.outputTokens.toLocaleString()}`,
      `Total tokens: ${this.usage.totalTokens.toLocaleString()}`,
      `Estimated cost: ${this.getEstimatedCostFormatted()}`,
    ];
    return lines.join('\n');
  }

  /** Zero the totals and drop the per-step log. */
  reset(): void {
    this.usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    this.stepUsages = [];
  }
}

// ── Multi-model tracker ──

/**
 * Tracks token usage across multiple LLM roles (main, extraction, judge, compaction)
 * with per-action cost breakdown, budget alerts, and comprehensive summaries.
 */
export class CompositeUsageMeter {
  private readonly pricing: PricingTable;
  // Explicit type arguments keep these collections from degrading to `any`.
  private readonly trackers = new Map<string, UsageMeter>();
  private readonly actionTrace: ActionUsageRecord[] = [];
  private budgetConfig: BudgetPolicy | undefined;
  private crossedThresholds = new Set<number>();
  private startTime: number | undefined;

  /** @param customPricing Pricing table used instead of `DEFAULT_COST_RATES`. */
  constructor(customPricing?: PricingTable) {
    this.pricing = customPricing ?? DEFAULT_COST_RATES;
  }

  /** Start the session timer. Called automatically on first record if not called explicitly. */
  start(): void {
    this.startTime = Date.now();
  }

  /**
   * Configure budget alerts. Thresholds default to [0.5, 0.8, 1.0].
   * Any previously crossed thresholds are forgotten.
   * Returns this for chaining.
   */
  setBudget(config: BudgetPolicy): this {
    this.budgetConfig = {
      ...config,
      // Copy so later mutation of the caller's array cannot change the active policy.
      thresholds: [...(config.thresholds ?? [0.5, 0.8, 1.0])],
    };
    this.crossedThresholds.clear();
    return this;
  }

  /** Clear the budget configuration. */
  clearBudget(): void {
    this.budgetConfig = undefined;
    this.crossedThresholds.clear();
  }

  /**
   * Record token usage for a specific model and role.
   * Returns the estimated cost for this single call.
   * Throws if budget is exhausted and onBudgetExhausted returns false; the usage
   * has already been recorded when that happens.
   */
  record(opts: {
    modelId: string;
    role: ModelRole;
    inputTokens: number;
    outputTokens: number;
    stepIndex?: number;
    actionName?: string;
  }): number {
    // Compare against undefined: a clock reading of 0 is still a valid start time.
    if (this.startTime === undefined) this.start();

    // Get or create per-model tracker
    const tracker = this.getOrCreateTracker(opts.modelId);
    tracker.record(opts.inputTokens, opts.outputTokens);

    // Compute cost for this call
    const cost = computeCost(opts.inputTokens, opts.outputTokens, opts.modelId, this.pricing);

    // Append to action trace
    const entry: ActionUsageRecord = {
      stepIndex: opts.stepIndex ?? this.actionTrace.length,
      actionName: opts.actionName ?? 'unknown',
      role: opts.role,
      modelId: opts.modelId,
      usage: {
        inputTokens: opts.inputTokens,
        outputTokens: opts.outputTokens,
        totalTokens: opts.inputTokens + opts.outputTokens,
      },
      cost,
      timestamp: Date.now(),
    };
    this.actionTrace.push(entry);

    // Check budget thresholds
    this.checkBudget();

    return cost;
  }

  /** Get the per-model UsageMeter (creates one if missing). */
  getTracker(modelId: string): UsageMeter {
    return this.getOrCreateTracker(modelId);
  }

  /** Total estimated cost across all models. */
  getTotalCost(): number {
    let total = 0;
    for (const tracker of this.trackers.values()) {
      total += tracker.getEstimatedCost();
    }
    return total;
  }

  /** Formatted total cost string. */
  getTotalCostFormatted(): string {
    return `$${this.getTotalCost().toFixed(4)}`;
  }

  /** Aggregate token usage across all models. */
  getTotalUsage(): UsageRecord {
    let inputTokens = 0;
    let outputTokens = 0;
    for (const tracker of this.trackers.values()) {
      const u = tracker.getTotalUsage();
      inputTokens += u.inputTokens;
      outputTokens += u.outputTokens;
    }
    return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
  }

  /** Get the current budget status. */
  getBudgetState(): BudgetState {
    const currentCost = this.getTotalCost();
    const maxCost = this.budgetConfig?.maxCostUsd;
    return {
      currentCostUsd: currentCost,
      maxCostUsd: maxCost,
      fractionUsed: maxCost != null ? currentCost / maxCost : undefined,
      isExhausted: maxCost != null ? currentCost >= maxCost : false,
      crossedThresholds: [...this.crossedThresholds].sort((a, b) => a - b),
    };
  }

  /** Build a full MeteringSummary with per-model and per-role breakdowns. */
  getSummary(): MeteringSummary {
    const totalUsage = this.getTotalUsage();
    return {
      totalInputTokens: totalUsage.inputTokens,
      totalOutputTokens: totalUsage.outputTokens,
      totalTokens: totalUsage.totalTokens,
      totalEstimatedCost: this.getTotalCost(),
      totalCalls: this.actionTrace.length,
      byModel: this.buildModelBreakdown(),
      byRole: this.buildRoleBreakdown(),
      actionTrace: [...this.actionTrace],
      durationMs: this.startTime !== undefined ? Date.now() - this.startTime : undefined,
    };
  }

  /** Human-readable summary string. */
  getSummaryText(): string {
    const s = this.getSummary();
    const lines: string[] = [
      '=== Token Usage Summary ===',
      `Total: ${s.totalTokens.toLocaleString()} tokens (${s.totalInputTokens.toLocaleString()} in / ${s.totalOutputTokens.toLocaleString()} out)`,
      `Cost: $${s.totalEstimatedCost.toFixed(4)}`,
      `Calls: ${s.totalCalls}`,
    ];

    if (s.durationMs != null) {
      lines.push(`Duration: ${(s.durationMs / 1000).toFixed(1)}s`);
    }

    if (s.byRole.length > 0) {
      lines.push('', '--- By Role ---');
      for (const r of s.byRole) {
        lines.push(
          ` ${r.role}: ${r.totalTokens.toLocaleString()} tokens, $${r.estimatedCost.toFixed(4)} (${r.callCount} calls)`,
        );
      }
    }

    if (s.byModel.length > 0) {
      lines.push('', '--- By Model ---');
      for (const m of s.byModel) {
        lines.push(
          ` ${m.modelId}: ${m.totalTokens.toLocaleString()} tokens, $${m.estimatedCost.toFixed(4)} (${m.callCount} calls)`,
        );
      }
    }

    const budget = this.getBudgetState();
    if (budget.maxCostUsd != null) {
      const pct = ((budget.fractionUsed ?? 0) * 100).toFixed(1);
      lines.push(
        '',
        `Budget: $${budget.currentCostUsd.toFixed(4)} / $${budget.maxCostUsd.toFixed(4)} (${pct}%)`,
      );
    }

    return lines.join('\n');
  }

  /** Reset all tracking data. The budget configuration itself is kept. */
  reset(): void {
    for (const tracker of this.trackers.values()) {
      tracker.reset();
    }
    this.trackers.clear();
    this.actionTrace.length = 0;
    this.crossedThresholds.clear();
    this.startTime = undefined;
  }

  // ── Private helpers ──

  private getOrCreateTracker(modelId: string): UsageMeter {
    let tracker = this.trackers.get(modelId);
    if (!tracker) {
      tracker = new UsageMeter(modelId, this.pricing);
      this.trackers.set(modelId, tracker);
    }
    return tracker;
  }

  /**
   * Fire `onThresholdCrossed` once per newly reached threshold, then consult
   * `onBudgetExhausted` (when provided) once the total cost reaches the maximum.
   */
  private checkBudget(): void {
    if (!this.budgetConfig) return;

    const currentCost = this.getTotalCost();
    const { maxCostUsd, thresholds, onThresholdCrossed, onBudgetExhausted } = this.budgetConfig;

    // Check each threshold
    for (const threshold of thresholds ?? []) {
      if (this.crossedThresholds.has(threshold)) continue;
      const thresholdCost = maxCostUsd * threshold;
      if (currentCost >= thresholdCost) {
        this.crossedThresholds.add(threshold);
        onThresholdCrossed(currentCost, threshold, maxCostUsd);
      }
    }

    // Check full exhaustion
    if (currentCost >= maxCostUsd) {
      if (onBudgetExhausted) {
        const allow = onBudgetExhausted(currentCost, maxCostUsd);
        if (!allow) {
          throw new BudgetDepletedError(currentCost, maxCostUsd);
        }
      }
    }
  }

  /** Aggregate the action trace per model, most expensive first. */
  private buildModelBreakdown(): ModelUsageBreakdown[] {
    const map = new Map<string, ModelUsageBreakdown>();
    for (const entry of this.actionTrace) {
      let mb = map.get(entry.modelId);
      if (!mb) {
        mb = {
          modelId: entry.modelId,
          inputTokens: 0,
          outputTokens: 0,
          totalTokens: 0,
          estimatedCost: 0,
          callCount: 0,
        };
        map.set(entry.modelId, mb);
      }
      mb.inputTokens += entry.usage.inputTokens;
      mb.outputTokens += entry.usage.outputTokens;
      mb.totalTokens += entry.usage.totalTokens;
      mb.estimatedCost += entry.cost;
      mb.callCount++;
    }
    return [...map.values()].sort((a, b) => b.estimatedCost - a.estimatedCost);
  }

  /** Aggregate the action trace per role, most expensive first. */
  private buildRoleBreakdown(): RoleUsageBreakdown[] {
    const map = new Map<ModelRole, RoleUsageBreakdown>();
    for (const entry of this.actionTrace) {
      let rb = map.get(entry.role);
      if (!rb) {
        rb = {
          role: entry.role,
          inputTokens: 0,
          outputTokens: 0,
          totalTokens: 0,
          estimatedCost: 0,
          callCount: 0,
        };
        map.set(entry.role, rb);
      }
      rb.inputTokens += entry.usage.inputTokens;
      rb.outputTokens += entry.usage.outputTokens;
      rb.totalTokens += entry.usage.totalTokens;
      rb.estimatedCost += entry.cost;
      rb.callCount++;
    }
    return [...map.values()].sort((a, b) => b.estimatedCost - a.estimatedCost);
  }
}

// ── Budget error ──

/** Thrown by the budget check when spending reaches the limit and continuing is refused. */
export class BudgetDepletedError extends Error {
  readonly currentCost: number;
  readonly maxCost: number;

  constructor(currentCost: number, maxCost: number) {
    super(
      `Token budget exhausted: $${currentCost.toFixed(4)} spent, limit is $${maxCost.toFixed(4)}`,
    );
    this.name = 'BudgetDepletedError';
    this.currentCost = currentCost;
    this.maxCost = maxCost;
  }
}
/**
 * Resolve pricing for a model ID.
 *
 * Lookup order:
 * 1. An exact own-property match, so inherited names such as `constructor`
 *    never resolve to a bogus rate.
 * 2. The longest table key contained in `modelId`, so a versioned ID like
 *    `gpt-4o-mini-2024-07-18` resolves to `gpt-4o-mini` rather than the shorter
 *    `gpt-4o` that happens to come first in the table.
 * 3. The first table key that contains `modelId` (abbreviated IDs).
 *
 * @returns The matching rates, or `undefined` when nothing matches or the ID is empty.
 */
function resolveModelCost(modelId: string, pricing: PricingTable): CostRates | undefined {
  if (Object.prototype.hasOwnProperty.call(pricing, modelId)) return pricing[modelId];
  // An empty ID is a substring of every key; treat it as unknown instead.
  if (modelId === '') return undefined;

  let longestContained: string | undefined;
  let firstContaining: string | undefined;
  for (const key of Object.keys(pricing)) {
    if (key === '') continue;
    if (modelId.includes(key)) {
      if (longestContained === undefined || key.length > longestContained.length) {
        longestContained = key;
      }
    } else if (firstContaining === undefined && key.includes(modelId)) {
      firstContaining = key;
    }
  }

  const match = longestContained ?? firstContaining;
  return match === undefined ? undefined : pricing[match];
}

/**
 * Compute cost in USD for a single call.
 *
 * @returns 0 when no pricing can be resolved for `modelId`.
 */
function computeCost(
  inputTokens: number,
  outputTokens: number,
  modelId: string,
  pricing: PricingTable,
): number {
  const cost = resolveModelCost(modelId, pricing);
  if (!cost) return 0;
  return (
    (inputTokens / 1_000_000) * cost.inputCostPerMillion +
    (outputTokens / 1_000_000) * cost.outputCostPerMillion
  );
}
*/ export interface RoleUsageBreakdown { role: ModelRole; inputTokens: number; outputTokens: number; totalTokens: number; estimatedCost: number; callCount: number; } /** Comprehensive usage summary across all models and roles. */ export interface MeteringSummary { /** Aggregate across everything. */ totalInputTokens: number; totalOutputTokens: number; totalTokens: number; totalEstimatedCost: number; totalCalls: number; /** Breakdown by model ID. */ byModel: ModelUsageBreakdown[]; /** Breakdown by role. */ byRole: RoleUsageBreakdown[]; /** Per-action cost trace (chronological). */ actionTrace: ActionUsageRecord[]; /** Wall-clock duration of the tracked session in ms (if available). */ durationMs?: number; } /** Configuration for budget alerts. */ export interface BudgetPolicy { /** Maximum allowed cost in USD. */ maxCostUsd: number; /** * Warning thresholds as fractions of maxCostUsd (e.g. [0.5, 0.8, 1.0]). * Callbacks fire when cost first crosses each threshold. */ thresholds?: number[]; /** Called each time a threshold is crossed. */ onThresholdCrossed: (currentCost: number, threshold: number, maxCost: number) => void; /** Called when the budget is fully exhausted. Return true to allow continuing. */ onBudgetExhausted?: (currentCost: number, maxCost: number) => boolean; } /** Status of budget consumption. */ export interface BudgetState { currentCostUsd: number; maxCostUsd: number | undefined; /** Fraction 0..1+ of budget consumed. undefined if no budget set. 
*/ fractionUsed: number | undefined; isExhausted: boolean; crossedThresholds: number[]; } // ── Comprehensive default pricing ── export const DEFAULT_COST_RATES: PricingTable = { // OpenAI 'gpt-4o': { inputCostPerMillion: 2.5, outputCostPerMillion: 10.0 }, 'gpt-4o-mini': { inputCostPerMillion: 0.15, outputCostPerMillion: 0.6 }, 'gpt-4-turbo': { inputCostPerMillion: 10.0, outputCostPerMillion: 30.0 }, 'gpt-4.5-preview': { inputCostPerMillion: 75.0, outputCostPerMillion: 150.0 }, 'o1': { inputCostPerMillion: 15.0, outputCostPerMillion: 60.0 }, 'o1-mini': { inputCostPerMillion: 3.0, outputCostPerMillion: 12.0 }, 'o1-preview': { inputCostPerMillion: 15.0, outputCostPerMillion: 60.0 }, 'o3-mini': { inputCostPerMillion: 1.1, outputCostPerMillion: 4.4 }, // Anthropic 'claude-3-5-sonnet': { inputCostPerMillion: 3.0, outputCostPerMillion: 15.0 }, 'claude-3-5-haiku': { inputCostPerMillion: 0.8, outputCostPerMillion: 4.0 }, 'claude-3-opus': { inputCostPerMillion: 15.0, outputCostPerMillion: 75.0 }, 'claude-3-haiku': { inputCostPerMillion: 0.25, outputCostPerMillion: 1.25 }, 'claude-4-sonnet': { inputCostPerMillion: 3.0, outputCostPerMillion: 15.0 }, 'claude-4-opus': { inputCostPerMillion: 15.0, outputCostPerMillion: 75.0 }, // Google 'gemini-1.5-pro': { inputCostPerMillion: 1.25, outputCostPerMillion: 5.0 }, 'gemini-1.5-flash': { inputCostPerMillion: 0.075, outputCostPerMillion: 0.3 }, 'gemini-2.0-flash': { inputCostPerMillion: 0.1, outputCostPerMillion: 0.4 }, 'gemini-2.0-pro': { inputCostPerMillion: 1.25, outputCostPerMillion: 5.0 }, 'gemini-2.5-pro': { inputCostPerMillion: 1.25, outputCostPerMillion: 10.0 }, 'gemini-2.5-flash': { inputCostPerMillion: 0.15, outputCostPerMillion: 0.6 }, // Mistral 'mistral-large': { inputCostPerMillion: 2.0, outputCostPerMillion: 6.0 }, 'mistral-small': { inputCostPerMillion: 0.2, outputCostPerMillion: 0.6 }, 'codestral': { inputCostPerMillion: 0.3, outputCostPerMillion: 0.9 }, // DeepSeek 'deepseek-chat': { inputCostPerMillion: 0.14, 
export interface VercelModelAdapterOptions {
  model: LanguageModelV1;
  /** Override provider detection (otherwise inferred from model.provider or modelId). */
  provider?: ModelProvider;
  temperature?: number;
  maxTokens?: number;
  maxRetries?: number;
}

/**
 * LanguageModel implementation that delegates structured generation to the
 * `generateObject` function imported from `ai`.
 */
export class VercelModelAdapter implements LanguageModel {
  private readonly model: LanguageModelV1;
  private readonly defaultTemperature: number;
  private readonly defaultMaxTokens: number;
  private readonly maxRetries: number;
  private readonly _provider: ModelProvider;

  /** Defaults: temperature 0, maxTokens 4096, maxRetries 3, provider inferred. */
  constructor(options: VercelModelAdapterOptions) {
    this.model = options.model;
    this.defaultTemperature = options.temperature ?? 0;
    this.defaultMaxTokens = options.maxTokens ?? 4096;
    this.maxRetries = options.maxRetries ?? 3;
    this._provider =
      options.provider ?? inferProvider(this.model.modelId, this.model.provider);
  }

  get modelId(): string {
    return this.model.modelId;
  }

  get provider(): ModelProvider {
    return this._provider;
  }

  /**
   * Run one structured-output call and return the parsed object with token usage.
   *
   * Only messages, schema fields, temperature and maxTokens are forwarded; the
   * `thinkingBudget`, `cache` and `timeout` options are not passed on here.
   *
   * NOTE(review): the generic parameters were reconstructed from the garbled
   * `Promise>` signature — confirm against the LanguageModel interface.
   *
   * @throws ModelThrottledError on HTTP 429 or a "rate limit" message, carrying the
   *   Retry-After delay in milliseconds when the header can be parsed.
   * @throws ModelError for every other failure, with the original error as `cause`.
   */
  async invoke<T>(options: InferenceOptions<T>): Promise<InferenceResult<T>> {
    const messages = this.convertMessages(options.messages);

    try {
      const result = await generateObject({
        model: this.model,
        schema: options.responseSchema as ZodType<T>,
        schemaName: options.schemaName ?? 'AgentDecision',
        schemaDescription: options.schemaDescription,
        messages,
        temperature: options.temperature ?? this.defaultTemperature,
        maxTokens: options.maxTokens ?? this.defaultMaxTokens,
        maxRetries: this.maxRetries,
      });

      const usage: InferenceUsage = {
        inputTokens: result.usage?.promptTokens ?? 0,
        outputTokens: result.usage?.completionTokens ?? 0,
        totalTokens:
          (result.usage?.promptTokens ?? 0) + (result.usage?.completionTokens ?? 0),
      };

      return {
        parsed: result.object,
        usage,
        finishReason: mapFinishReason(result.finishReason),
      };
    } catch (error: unknown) {
      const details: {
        statusCode?: unknown;
        message?: unknown;
        headers?: Record<string, unknown>;
        responseHeaders?: Record<string, unknown>;
      } = typeof error === 'object' && error !== null ? error : {};
      const message = typeof details.message === 'string' ? details.message : undefined;

      if (details.statusCode === 429 || (message !== undefined && /rate limit/i.test(message))) {
        // The AI SDK's API call errors carry `responseHeaders`; `headers` is kept
        // as a fallback for other error shapes.
        const headers = details.responseHeaders ?? details.headers;
        throw new ModelThrottledError(
          message ?? 'Rate limited',
          parseRetryAfterMs(headers?.['retry-after']),
        );
      }

      throw new ModelError(`LLM invocation failed: ${message ?? String(error)}`, {
        cause: error as Error,
      });
    }
  }

  /**
   * Map internal messages to the SDK's message shape. Tool results are sent as
   * user messages with a text prefix; assistant image parts become `[image]` text.
   */
  private convertMessages(messages: Message[]): CoreMessage[] {
    return messages.map((msg): CoreMessage => {
      switch (msg.role) {
        case 'system':
          return { role: 'system', content: msg.content };

        case 'user': {
          if (typeof msg.content === 'string') {
            return { role: 'user', content: msg.content };
          }
          return {
            role: 'user',
            content: msg.content.map((part) => this.convertContentPart(part)),
          } as CoreUserMessage;
        }

        case 'assistant': {
          const content =
            typeof msg.content === 'string'
              ? msg.content
              : msg.content.map((part) => {
                  if (part.type === 'text') return { type: 'text' as const, text: part.text };
                  return { type: 'text' as const, text: '[image]' };
                });
          return { role: 'assistant', content };
        }

        case 'tool':
          return {
            role: 'user',
            content: `[Tool Result (${msg.toolCallId})]: ${msg.content}`,
          };
      }
    });
  }

  /** Convert one content part; base64 images pass their data through, URL images become `URL` objects. */
  private convertContentPart(
    part: ContentPart,
  ): { type: 'text'; text: string } | { type: 'image'; image: string | URL } {
    switch (part.type) {
      case 'text':
        return { type: 'text', text: part.text };
      case 'image':
        if (part.source.type === 'base64') {
          return {
            type: 'image',
            image: part.source.data,
          };
        }
        return {
          type: 'image',
          image: new URL(part.source.url),
        };
    }
  }
}

/**
 * Parse a Retry-After header value into milliseconds.
 * Accepts a number of seconds or an HTTP date; returns `undefined` (never NaN)
 * when the value is missing or unparseable.
 */
function parseRetryAfterMs(value: unknown): number | undefined {
  if (typeof value !== 'string' && typeof value !== 'number') return undefined;
  const raw = String(value).trim();
  if (raw === '') return undefined;
  if (/^\d+(\.\d+)?$/.test(raw)) return Math.round(Number(raw) * 1000);
  const at = Date.parse(raw);
  return Number.isNaN(at) ? undefined : Math.max(0, at - Date.now());
}

/** Narrow an arbitrary finish-reason string to the known set, defaulting to `'other'`. */
function mapFinishReason(
  reason: string,
): 'stop' | 'length' | 'content-filter' | 'tool-calls' | 'error' | 'other' {
  switch (reason) {
    case 'stop':
      return 'stop';
    case 'length':
      return 'length';
    case 'content-filter':
      return 'content-filter';
    case 'tool-calls':
      return 'tool-calls';
    case 'error':
      return 'error';
    default:
      return 'other';
  }
}

const PROVIDER_PATTERNS: Array<[RegExp, ModelProvider]> = [
  [/anthropic|claude/i, 'anthropic'],
  [/openai|gpt|o1|o3/i, 'openai'],
  [/google|gemini/i, 'google'],
  [/mistral/i, 'mistral'],
  [/deepseek/i, 'deepseek'],
  [/groq/i, 'groq'],
  [/fireworks/i, 'fireworks'],
  [/together/i, 'together'],
];

/**
 * Infer the provider from the provider hint, falling back to the model ID.
 *
 * The hint is checked first so that a model served by one provider but named
 * after another (e.g. a DeepSeek distill hosted on Groq) is attributed to the
 * provider that actually serves it.
 *
 * @returns The first matching provider, or `'custom'` when nothing matches.
 */
function inferProvider(modelId: string, providerHint?: string): ModelProvider {
  for (const text of [providerHint, modelId]) {
    if (!text) continue;
    for (const [pattern, provider] of PROVIDER_PATTERNS) {
      if (pattern.test(text)) return provider;
    }
  }
  return 'custom';
}
// ── Helpers ──

/** Build a system-role message from plain text. */
export function systemMessage(content: string): SystemMessage {
  const message: SystemMessage = { role: 'system', content };
  return message;
}

/** Build a user-role message from plain text or a list of content parts. */
export function userMessage(content: string | ContentPart[]): UserMessage {
  const message: UserMessage = { role: 'user', content };
  return message;
}

/**
 * Build an assistant-role message.
 * The `toolCalls` key is always written, so it is present (as `undefined`)
 * even when the argument is omitted.
 */
export function assistantMessage(
  content: string | ContentPart[],
  toolCalls?: ToolCall[],
): AssistantMessage {
  const message: AssistantMessage = { role: 'assistant', content, toolCalls };
  return message;
}

/** Build a tool-role message carrying the result for the given tool call ID. */
export function toolResultMessage(toolCallId: string, content: string): ToolResultMessage {
  const message: ToolResultMessage = { role: 'tool', toolCallId, content };
  return message;
}
// ── Configuration ──

export interface SchemaOptimizationOptions {
  /** LLM provider to apply provider-specific tweaks for. */
  provider?: ModelProvider;
  /**
   * Maximum number of variants in a discriminated union before collapsing
   * infrequently used ones into a generic fallback.
   */
  maxUnionVariants?: number;
  /**
   * Maximum nesting depth before flattening deeply nested objects
   * into dot-separated flat keys.
   */
  maxNestingDepth?: number;
  /**
   * Maximum number of enum values before collapsing similar ones.
   */
  maxEnumValues?: number;
}

// NOTE(review): type arguments reconstructed from the garbled `Required>` —
// every option except `provider` has a default here.
const DEFAULTS: Required<Omit<SchemaOptimizationOptions, 'provider'>> = {
  maxUnionVariants: 15,
  maxNestingDepth: 4,
  maxEnumValues: 30,
};

// ── Main entry point ──

/**
 * Optimizes a JSON Schema (as a plain object) for LLM consumption.
 * Works on a structured clone, then applies union collapsing, enum
 * simplification and nested object flattening, followed by provider-specific
 * tweaks when a provider is given.
 *
 * Limits that are omitted or explicitly `undefined` fall back to the defaults.
 */
export function optimizeJsonSchemaForModel(
  schema: Record<string, unknown>,
  options: SchemaOptimizationOptions = {},
): Record<string, unknown> {
  // Resolve each limit with `??`: spreading `options` over the defaults would let
  // an explicit `undefined` erase a default.
  const maxUnionVariants = options.maxUnionVariants ?? DEFAULTS.maxUnionVariants;
  const maxEnumValues = options.maxEnumValues ?? DEFAULTS.maxEnumValues;
  const maxNestingDepth = options.maxNestingDepth ?? DEFAULTS.maxNestingDepth;

  let result = structuredClone(schema);

  result = collapseUnions(result, maxUnionVariants);
  result = collapseEnums(result, maxEnumValues);
  result = flattenNesting(result, maxNestingDepth);

  if (options.provider) {
    result = applyProviderTweaks(result, options.provider);
  }

  return result;
}

/**
 * Build a plain union of the first `maxVariants - 1` variants plus a permissive
 * catch-all object carrying the given description.
 */
function capUnionVariants(
  variants: ZodTypeAny[],
  maxVariants: number,
  catchAllDescription: string,
): ZodTypeAny {
  const kept = variants.slice(0, maxVariants - 1);
  const catchAll = z.object({}).passthrough().describe(catchAllDescription);
  const unionMembers = [...kept, catchAll] as unknown as [ZodTypeAny, ZodTypeAny, ...ZodTypeAny[]];
  return z.union(unionMembers);
}

/**
 * Optimizes Zod schemas for LLM consumption by capping the number of variants
 * in unions and discriminated unions. Every other schema is returned unchanged.
 *
 * This works at the Zod level for simple transformations, but for deeper
 * optimization, convert to JSON Schema first with zodToJsonSchema() and
 * then call optimizeJsonSchemaForModel().
 *
 * @returns The input schema, or a replacement `z.union` when the variant limit
 *   is exceeded. The replacement is cast to `T` although its runtime type differs.
 */
export function optimizeSchemaForModel<T extends ZodTypeAny>(
  schema: T,
  options: SchemaOptimizationOptions = {},
): T {
  const maxVariants = options.maxUnionVariants ?? DEFAULTS.maxUnionVariants;

  if (schema instanceof z.ZodDiscriminatedUnion) {
    const variants = [...schema.options.values()] as ZodTypeAny[];
    if (variants.length > maxVariants) {
      return capUnionVariants(
        variants,
        maxVariants,
        'Other action (see documentation)',
      ) as unknown as T;
    }
  }

  if (schema instanceof z.ZodUnion) {
    const variants = schema.options as ZodTypeAny[];
    if (variants.length > maxVariants) {
      return capUnionVariants(variants, maxVariants, 'Other variant') as unknown as T;
    }
  }

  return schema;
}
/**
 * Shrinks oversized `enum` lists anywhere in the schema.
 *
 * A node is only touched when its enum has more than `maxValues` entries.
 * Entries are first de-duplicated by their lower-cased string form (the first
 * spelling encountered wins); if the list is still too long it is cut to
 * `maxValues` entries and the number of dropped values is recorded in the
 * node's `description`.
 */
function collapseEnums(
  schema: Record<string, unknown>,
  maxValues: number,
): Record<string, unknown> {
  const visitEnumNode = (node: Record<string, unknown>): Record<string, unknown> => {
    const candidates = node.enum;
    if (!Array.isArray(candidates) || candidates.length <= maxValues) return node;

    // A Map keeps insertion order, so the first occurrence of each
    // case-insensitive key is the one that survives.
    const firstByKey = new Map<string, unknown>();
    for (const candidate of candidates as unknown[]) {
      const normalized = String(candidate).toLowerCase();
      if (!firstByKey.has(normalized)) {
        firstByKey.set(normalized, candidate);
      }
    }
    const unique = [...firstByKey.values()];

    const overflow = unique.length - maxValues;
    if (overflow <= 0) {
      return { ...node, enum: unique };
    }

    // Still too many: truncate and leave a note about what was dropped.
    const note = node.description
      ? `${node.description} (${overflow} more values omitted)`
      : `${overflow} additional values omitted`;
    return { ...node, enum: unique.slice(0, maxValues), description: note };
  };

  return walkSchema(schema, visitEnumNode);
}
/**
 * Flattens objects nested beyond `maxDepth` by lifting their properties into
 * the nearest allowed ancestor under dot-separated keys (`"a.b.c"`).
 *
 * Depth is counted per object node: the node's own properties are at depth 0,
 * the properties of an object-typed property are at depth 1, and so on. An
 * object-typed property found at depth >= `maxDepth` is dissolved into its
 * parent. Nodes that need no flattening are returned unchanged.
 *
 * @param schema   JSON Schema to process (not mutated; walkSchema copies nodes).
 * @param maxDepth Maximum number of object-in-object levels to keep.
 */
function flattenNesting(
  schema: Record<string, unknown>,
  maxDepth: number,
): Record<string, unknown> {
  return walkSchema(schema, (node) => {
    if (node.type !== 'object' || !node.properties) return node;

    const properties = node.properties as Record<string, Record<string, unknown>>;

    // Comparing top-level keys is not enough to detect a change: flattening
    // may only affect properties of a nested (kept) object. Ask explicitly.
    if (!hasOverDeepObject(properties, 0, maxDepth)) return node;

    const flatProps: Record<string, Record<string, unknown>> = {};
    const flatRequired: string[] = [];
    const origRequired = new Set<string>(
      Array.isArray(node.required) ? (node.required as string[]) : [],
    );

    flattenProperties(properties, origRequired, '', 0, maxDepth, flatProps, flatRequired);

    const result: Record<string, unknown> = { ...node, properties: flatProps };
    if (flatRequired.length > 0) {
      result.required = flatRequired;
    } else {
      delete result.required;
    }
    return result;
  });
}

/** True when `value` is an object schema that declares a `properties` map. */
function isNestedObjectSchema(
  value: unknown,
): value is Record<string, unknown> & {
  properties: Record<string, Record<string, unknown>>;
} {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    candidate.type === 'object' &&
    typeof candidate.properties === 'object' &&
    candidate.properties !== null
  );
}

/** True when some object-typed property sits at `depth >= maxDepth` below `properties`. */
function hasOverDeepObject(
  properties: Record<string, Record<string, unknown>>,
  depth: number,
  maxDepth: number,
): boolean {
  return Object.values(properties).some(
    (child) =>
      isNestedObjectSchema(child) &&
      (depth >= maxDepth || hasOverDeepObject(child.properties, depth + 1, maxDepth)),
  );
}

/**
 * Copies `properties` into `out`, dissolving object-typed properties that sit
 * at or beyond `maxDepth`.
 *
 * - Below `maxDepth`, an object-typed property is kept as an object, but its
 *   own properties are processed recursively one level deeper. (Previously the
 *   depth counter only advanced inside the flatten branch, so for any
 *   `maxDepth >= 1` nothing was ever flattened.)
 * - At or beyond `maxDepth`, the object is dissolved: its children are written
 *   to `out` under `"<key>.<childKey>"`.
 * - A lifted child is reported as required only if the dissolved parent was
 *   required too; otherwise flattening would make fields of an optional
 *   object mandatory.
 *
 * @param properties   Property map to process.
 * @param required     Names in `properties` that are required.
 * @param prefix       Dot-path of dissolved ancestors ('' at a real object level).
 * @param currentDepth Depth of `properties` relative to the visited node.
 * @param maxDepth     Depth at which object-typed properties are dissolved.
 * @param out          Receives the resulting property map.
 * @param outRequired  Receives the resulting required keys.
 */
function flattenProperties(
  properties: Record<string, Record<string, unknown>>,
  required: Set<string>,
  prefix: string,
  currentDepth: number,
  maxDepth: number,
  out: Record<string, Record<string, unknown>>,
  outRequired: string[],
): void {
  for (const [key, schema] of Object.entries(properties)) {
    const fullKey = prefix ? `${prefix}.${key}` : key;
    const isRequired = required.has(key);

    if (!isNestedObjectSchema(schema)) {
      out[fullKey] = schema;
      if (isRequired) outRequired.push(fullKey);
      continue;
    }

    const childRequired = new Set<string>(
      Array.isArray(schema.required) ? (schema.required as string[]) : [],
    );

    if (currentDepth >= maxDepth) {
      // Too deep: dissolve this object and lift its children into `out`.
      flattenProperties(
        schema.properties,
        isRequired ? childRequired : new Set<string>(),
        fullKey,
        currentDepth + 1,
        maxDepth,
        out,
        outRequired,
      );
      continue;
    }

    if (hasOverDeepObject(schema.properties, currentDepth + 1, maxDepth)) {
      // Allowed level, but something underneath is too deep: rebuild this
      // object with its over-deep descendants flattened into it.
      const nestedProps: Record<string, Record<string, unknown>> = {};
      const nestedRequired: string[] = [];
      flattenProperties(
        schema.properties,
        childRequired,
        '',
        currentDepth + 1,
        maxDepth,
        nestedProps,
        nestedRequired,
      );
      const nested: Record<string, unknown> = { ...schema, properties: nestedProps };
      if (nestedRequired.length > 0) {
        nested.required = nestedRequired;
      } else {
        delete nested.required;
      }
      out[fullKey] = nested;
    } else {
      out[fullKey] = schema;
    }
    if (isRequired) outRequired.push(fullKey);
  }
}
/** Callback applied to every schema node; returns the (possibly replaced) node. */
type SchemaVisitor = (node: Record<string, unknown>) => Record<string, unknown>;

/**
 * Depth-first, post-order traversal of a JSON Schema tree.
 *
 * Children are rewritten before their parent is handed to `visitor`, in this
 * order: `properties` (in key order), a single-schema `items`, the members of
 * `oneOf` / `anyOf` / `allOf`, and finally an object-valued
 * `additionalProperties`. The input node is shallow-copied first, so the
 * caller's objects are not modified by the traversal itself.
 */
function walkSchema(
  schema: Record<string, unknown>,
  visitor: SchemaVisitor,
): Record<string, unknown> {
  const copy: Record<string, unknown> = { ...schema };
  const descend = (child: unknown): Record<string, unknown> =>
    walkSchema(child as Record<string, unknown>, visitor);

  // 1. properties — only plain (non-array) object values are sub-schemas
  const { properties } = copy;
  if (properties && typeof properties === 'object') {
    const rewritten: Record<string, unknown> = {};
    Object.entries(properties as Record<string, unknown>).forEach(([name, value]) => {
      const isSubSchema = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
      rewritten[name] = isSubSchema ? descend(value) : value;
    });
    copy.properties = rewritten;
  }

  // 2. items — tuple-style arrays are left untouched
  const { items } = copy;
  if (items && typeof items === 'object' && !Array.isArray(items)) {
    copy.items = descend(items);
  }

  // 3. schema combinators
  (['oneOf', 'anyOf', 'allOf'] as const).forEach((keyword) => {
    const members = copy[keyword];
    if (!Array.isArray(members)) return;
    copy[keyword] = (members as unknown[]).map((member) =>
      typeof member === 'object' && member !== null ? descend(member) : member,
    );
  });

  // 4. additionalProperties given as a schema (booleans are skipped)
  const extra = copy.additionalProperties;
  if (extra && typeof extra === 'object') {
    copy.additionalProperties = descend(extra);
  }

  return visitor(copy);
}
/**
 * Converts a camelCase, snake_case or kebab-case property name into a short
 * human-readable phrase, e.g. `maxUnionVariants` -> `"Max union variants"`.
 * Used to synthesize descriptions for providers that require one on every
 * property.
 *
 * Word boundaries are: lower-case/digit followed by upper-case (`item2Name`),
 * the end of an acronym (`parseHTMLString`), and runs of `_` or `-`.
 * Leading/trailing separators are ignored, so `_id` yields `"Id"` rather than
 * `" id"`.
 *
 * @param name Property name to humanize.
 * @returns The humanized phrase, or `name` unchanged when it contains no
 *          word characters at all (empty string or separators only).
 */
function humanizePropertyName(name: string): string {
  const words = name
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2') // fooBar / item2Name
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') // HTMLString -> HTML String
    .replace(/[_-]+/g, ' ')
    .toLowerCase()
    .split(/\s+/)
    // split() yields empty tokens for leading/trailing separators; drop them
    .filter((word) => word.length > 0);

  const [first, ...rest] = words;
  if (first === undefined) return name;

  // Capitalize the first word only
  const capitalized = first.charAt(0).toUpperCase() + first.slice(1);
  return [capitalized, ...rest].join(' ');
}
jsonSchema.oneOf = [...schema.options.values()].map((opt: ZodTypeAny) => zodToJsonSchema(opt), ); } else if (schema instanceof z.ZodNullable) { const inner = zodToJsonSchema(schema.unwrap()); return { oneOf: [inner, { type: 'null' }] } as any; } else if (schema instanceof z.ZodRecord) { jsonSchema.type = 'object'; jsonSchema.additionalProperties = zodToJsonSchema(schema.element); } else { jsonSchema.type = 'object'; } if (schema.description) { jsonSchema.description = schema.description; } return jsonSchema as any; } ================================================ FILE: packages/core/src/model/types.ts ================================================ import { z } from 'zod'; export interface InferenceUsage { inputTokens: number; outputTokens: number; totalTokens: number; } export interface InferenceResult { parsed: T; rawText?: string; usage: InferenceUsage; finishReason: 'stop' | 'length' | 'content-filter' | 'tool-calls' | 'error' | 'other'; } export const InferenceUsageSchema = z.object({ inputTokens: z.number(), outputTokens: z.number(), totalTokens: z.number(), }); ================================================ FILE: packages/core/src/page/content-extractor.ts ================================================ import TurndownService from 'turndown'; import type { Page } from 'playwright'; let turndownInstance: TurndownService | null = null; function getTurndown(): TurndownService { if (!turndownInstance) { turndownInstance = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced', emDelimiter: '*', }); // Remove scripts, styles, and other non-content elements turndownInstance.remove(['script', 'style', 'nav', 'footer', 'header', 'noscript']); // Preserve tables as markdown tables turndownInstance.addRule('table', { filter: 'table', replacement: (_content, node) => { const table = node as HTMLTableElement; return htmlTableToMarkdown(table); }, }); // Preserve code blocks with enhanced language detection from class attributes. 
/**
 * Renders an HTML table as a GitHub-flavoured Markdown table.
 *
 * Every `<tr>` that contains at least one `<th>`/`<td>` becomes a row; the
 * first such row is used as the header. Shorter rows are padded with empty
 * cells so that all rows have the same number of columns.
 *
 * Cell text is reduced to a single line: runs of whitespace (including line
 * breaks, which would otherwise end the Markdown row early) are collapsed to
 * one space, and literal `|` characters are escaped.
 *
 * @param table The table element to convert.
 * @returns The Markdown table wrapped in leading/trailing newlines, or an
 *          empty string when the table has no cells.
 */
function htmlTableToMarkdown(table: HTMLTableElement): string {
  const rows: string[][] = [];

  for (const row of table.querySelectorAll('tr')) {
    const cells: string[] = [];
    for (const cell of row.querySelectorAll('th, td')) {
      const text = (cell.textContent ?? '')
        .replace(/\s+/g, ' ') // a Markdown table row must stay on one line
        .trim()
        .replace(/\|/g, '\\|');
      cells.push(text);
    }
    if (cells.length > 0) {
      rows.push(cells);
    }
  }

  const [header, ...body] = rows;
  if (header === undefined) return '';

  // Pad rows to the same column count
  const maxCols = Math.max(...rows.map((r) => r.length));
  for (const row of rows) {
    while (row.length < maxCols) {
      row.push('');
    }
  }

  const toLine = (cells: string[]): string => `| ${cells.join(' | ')} |`;
  const lines = [toLine(header), toLine(header.map(() => '---')), ...body.map(toLine)];

  return '\n' + lines.join('\n') + '\n';
}
/**
 * Language identifiers accepted when a class token carries no recognizable
 * prefix (e.g. `class="hljs python"`).
 */
const KNOWN_LANGUAGES = new Set([
  'javascript', 'typescript', 'python', 'ruby', 'java', 'go', 'rust', 'c', 'cpp',
  'csharp', 'swift', 'kotlin', 'scala', 'php', 'perl', 'lua', 'bash', 'shell',
  'sh', 'zsh', 'powershell', 'sql', 'html', 'css', 'scss', 'less', 'json',
  'yaml', 'yml', 'xml', 'toml', 'ini', 'markdown', 'md', 'jsx', 'tsx',
  'graphql', 'r', 'matlab', 'dart', 'elixir', 'erlang', 'haskell', 'ocaml',
  'clojure', 'vim', 'dockerfile', 'makefile', 'cmake', 'protobuf', 'terraform', 'hcl',
]);

/**
 * Works out which language a `<code>` element is written in.
 *
 * Sources are consulted in priority order and the first hit wins:
 *   1. the `data-lang` attribute
 *   2. a `language-xxx` / `lang-xxx` class
 *   3. a `highlight-xxx` class
 *   4. a `brush: xxx` class (legacy SyntaxHighlighter)
 *   5. any class token that is itself a known language name
 *
 * @returns The lower-cased language identifier, or '' when nothing matched.
 */
function detectCodeLanguage(codeEl: HTMLElement | null): string {
  if (!codeEl) return '';

  const explicitLang = codeEl.getAttribute?.('data-lang') ?? '';
  if (explicitLang) return explicitLang.toLowerCase();

  const classAttr = codeEl.getAttribute?.('class') ?? '';
  if (!classAttr) return '';

  // Prefixed class conventions, most common first
  const prefixedPatterns = [/(?:language|lang)-(\w+)/, /highlight-(\w+)/, /brush:\s*(\w+)/];
  for (const pattern of prefixedPatterns) {
    const hit = classAttr.match(pattern);
    if (hit) return hit[1].toLowerCase();
  }

  // Last resort: a bare class token that names a language
  const bareMatch = classAttr
    .split(/\s+/)
    .map((token) => token.toLowerCase())
    .find((token) => KNOWN_LANGUAGES.has(token));
  return bareMatch ?? '';
}
/**
 * Extracts the page's main content as Markdown.
 *
 * The HTML of the first element matching
 * `main, article, [role="main"], .content, #content` is used, falling back to
 * `document.body`. The converted Markdown can then be windowed:
 *
 * - `startFromChar` (or, when absent, the offset tracked by `readingState`)
 *   selects where to start.
 * - `maxLength` caps the returned text; the cut is moved back to the last
 *   blank line when that falls within the final 20% of the window, and a
 *   "[... content truncated ...]" marker is appended.
 * - `extractLinks` appends a "## Links" section listing the page's anchors.
 *
 * When a `readingState` is supplied it is updated with the page URL and the
 * full content length, and afterwards positioned at the end of the text that
 * was actually returned — also when `startFromChar` was given explicitly.
 *
 * @param page    Playwright page to read from.
 * @param options Windowing and link-extraction options.
 * @returns The (possibly truncated) Markdown text.
 */
export async function extractMarkdown(
  page: Page,
  options?: MarkdownExtractionOptions,
): Promise<string> {
  const html = await page.evaluate(() => {
    // Try to get main content first
    const main = document.querySelector('main, article, [role="main"], .content, #content');
    if (main) return main.innerHTML;
    // Fallback to body
    return document.body?.innerHTML ?? '';
  });

  let markdown = htmlToMarkdown(html);
  const fullLength = markdown.length;

  // Update reading state if provided (resets the offset when the URL changed)
  const readingState = options?.readingState;
  if (readingState) {
    readingState.update(page.url(), fullLength);
  }

  // Determine the starting offset: explicit option takes priority,
  // then reading state's tracked position, then 0.
  const startOffset =
    options?.startFromChar ?? (readingState ? readingState.currentOffset : 0);

  if (startOffset > 0) {
    markdown = markdown.slice(startOffset);
  }

  // Apply max length
  let truncated = false;
  if (options?.maxLength && markdown.length > options.maxLength) {
    markdown = markdown.slice(0, options.maxLength);
    // Try to break at a paragraph boundary
    const lastParagraph = markdown.lastIndexOf('\n\n');
    if (lastParagraph > markdown.length * 0.8) {
      markdown = markdown.slice(0, lastParagraph);
    }
    truncated = true;
  }

  if (readingState) {
    // Position the tracker at the end of the slice that was returned.
    // Advancing by `markdown.length` alone is only right when the read started
    // at the tracked offset; with an explicit `startFromChar` the tracker
    // would otherwise point into text that was skipped or already read.
    // advance() takes a relative amount, so pass the delta (may be negative).
    const effectiveStart = Math.max(0, Math.min(startOffset, fullLength));
    const endOffset = effectiveStart + markdown.length;
    readingState.advance(endOffset - readingState.currentOffset);
  }

  if (truncated) {
    const remaining = fullLength - startOffset - markdown.length;
    markdown += `\n\n[... content truncated, ~${remaining} chars remaining]`;
  }

  // Append links section if requested
  if (options?.extractLinks) {
    const links = await extractLinks(page);
    if (links.length > 0) {
      markdown += '\n\n## Links\n';
      for (const link of links) {
        const marker = link.isExternal ? ' (external)' : '';
        markdown += `- [${link.text}](${link.url})${marker}\n`;
      }
    }
  }

  return markdown;
}
/**
 * Splits `text` into chunks of at most `maxChunkSize` characters.
 *
 * Boundaries are chosen from coarse to fine:
 *   1. paragraphs (blank lines) are packed greedily into a chunk;
 *   2. a paragraph that is too long on its own is split into sentences
 *      (after `.`, `!` or `?`), which are packed the same way;
 *   3. a sentence that is still too long is cut at `maxChunkSize` characters,
 *      so no chunk exceeds the limit. A cut never separates a UTF-16
 *      surrogate pair unless the limit is 1.
 *
 * Chunks are trimmed. Text that already fits is returned as a single chunk,
 * untouched. Step 3 is skipped when `maxChunkSize` is not a finite number
 * >= 1, because a hard cut is not meaningful for such limits.
 *
 * @param text         Text to split.
 * @param maxChunkSize Maximum chunk length in UTF-16 code units.
 * @returns The chunks in original order.
 */
export function chunkText(text: string, maxChunkSize: number): string[] {
  if (text.length <= maxChunkSize) return [text];

  const chunks: string[] = [];
  const canHardSplit = Number.isFinite(maxChunkSize) && maxChunkSize >= 1;
  const hardLimit = Math.floor(maxChunkSize);

  // Emits full-size pieces of an oversized fragment and returns the tail
  // (which fits in one chunk) so that packing can continue with it.
  const splitOversized = (fragment: string): string => {
    let rest = fragment;
    while (canHardSplit && rest.length > hardLimit) {
      let cut = hardLimit;
      const lastUnit = rest.charCodeAt(cut - 1);
      // Do not cut between the two halves of a surrogate pair
      if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
      const piece = rest.slice(0, cut).trim();
      if (piece) chunks.push(piece);
      rest = rest.slice(cut);
    }
    return rest;
  };

  const paragraphs = text.split(/\n\n+/);
  let currentChunk = '';

  for (const para of paragraphs) {
    // +2 accounts for the "\n\n" separator between packed paragraphs
    if (currentChunk.length + para.length + 2 > maxChunkSize) {
      if (currentChunk) {
        chunks.push(currentChunk.trim());
        currentChunk = '';
      }
      // If a single paragraph is too long, split by sentences
      if (para.length > maxChunkSize) {
        const sentences = para.split(/(?<=[.!?])\s+/);
        for (const sentence of sentences) {
          // +1 accounts for the space between packed sentences
          if (currentChunk.length + sentence.length + 1 > maxChunkSize) {
            if (currentChunk) chunks.push(currentChunk.trim());
            currentChunk = splitOversized(sentence);
          } else {
            currentChunk += (currentChunk ? ' ' : '') + sentence;
          }
        }
      } else {
        currentChunk = para;
      }
    } else {
      currentChunk += (currentChunk ? '\n\n' : '') + para;
    }
  }

  if (currentChunk) {
    chunks.push(currentChunk.trim());
  }

  return chunks;
}
'../errors.js'; import type { PageTreeNode, SelectorIndex, RenderedPageState } from './types.js'; import type { ElementRef } from '../types.js'; // ── Mock factories ── function makeMockPage(overrides: Record = {}) { return { viewportSize: () => ({ width: 1280, height: 800 }), evaluate: mock(() => Promise.resolve({ x: 0, y: 0 })), click: mock(() => Promise.resolve()), fill: mock(() => Promise.resolve()), mouse: { click: mock(() => Promise.resolve()), }, keyboard: { type: mock(() => Promise.resolve()), }, frames: () => [], ...overrides, } as any; } function makeMockCdpSession(overrides: Record = {}) { return { send: mock(() => Promise.resolve({})), ...overrides, } as any; } function makeNode(overrides: Partial = {}): PageTreeNode { return { tagName: 'div', nodeType: 'element', attributes: {}, children: [], isVisible: true, isInteractive: false, isClickable: false, isEditable: false, isScrollable: false, ...overrides, }; } // ── Tests ── describe('PageAnalyzer', () => { let service: PageAnalyzer; beforeEach(() => { service = new PageAnalyzer(); }); describe('constructor defaults', () => { test('has default viewport expansion of 0', () => { // The service is created with defaults, including viewportExpansion = 0 expect(service).toBeDefined(); }); test('accepts custom options', () => { const custom = new PageAnalyzer({ viewportExpansion: 500, maxElementsInDom: 100, maxIframes: 1, capturedAttributes: ['title'], }); expect(custom).toBeDefined(); }); }); describe('cache management', () => { test('getCachedTree returns null initially', () => { expect(service.getCachedTree()).toBeNull(); }); test('getCachedSelectorMap returns null initially', () => { expect(service.getCachedSelectorMap()).toBeNull(); }); test('clearCache resets tree and selector map', () => { // We can't set cachedTree directly, but clearCache should work on empty state service.clearCache(); expect(service.getCachedTree()).toBeNull(); expect(service.getCachedSelectorMap()).toBeNull(); }); }); 
describe('interaction recording', () => { test('getInteractedElements returns empty array initially', () => { expect(service.getInteractedElements()).toEqual([]); }); test('clearInteractedElements resets the list', () => { service.clearInteractedElements(); expect(service.getInteractedElements()).toEqual([]); }); test('getInteractedElements returns a copy', () => { const elements = service.getInteractedElements(); expect(elements).not.toBe(service.getInteractedElements()); }); }); describe('clickElementByIndex', () => { test('throws PageExtractionError when element not in selector map', async () => { const page = makeMockPage(); const cdp = makeMockCdpSession(); await expect( service.clickElementByIndex(page, cdp, 42), ).rejects.toThrow(PageExtractionError); }); test('Strategy 1: uses CDP box model when backendNodeId is available', async () => { const page = makeMockPage(); const cdp = makeMockCdpSession({ send: mock(() => Promise.resolve({ model: { content: [10, 10, 110, 10, 110, 60, 10, 60], }, }), ), }); // Inject a selector map with a backendNodeId const selectorMap: SelectorIndex = { 0: { cssSelector: '#btn', backendNodeId: 123, tagName: 'button', }, }; // Use the private cachedSelectorMap via prototype access (service as any).cachedSelectorMap = selectorMap; await service.clickElementByIndex(page, cdp, 0); // Should have used mouse.click with center coordinates expect(page.mouse.click).toHaveBeenCalledTimes(1); // Center of quad: ((10+110+110+10)/4, (10+10+60+60)/4) = (60, 35) expect(page.mouse.click).toHaveBeenCalledWith(60, 35); // Should have recorded the interaction const interactions = service.getInteractedElements(); expect(interactions).toHaveLength(1); expect(interactions[0].action).toBe('click'); expect(interactions[0].tagName).toBe('button'); }); test('Strategy 2: falls back to JS getBoundingClientRect when CDP fails', async () => { const evaluateMock = mock(() => Promise.resolve({ x: 50, y: 25 }), ); const page = makeMockPage({ evaluate: 
evaluateMock }); const cdp = makeMockCdpSession({ send: mock(() => Promise.reject(new Error('CDP failed'))), }); const selectorMap: SelectorIndex = { 0: { cssSelector: '#btn', backendNodeId: 123, tagName: 'button', }, }; (service as any).cachedSelectorMap = selectorMap; await service.clickElementByIndex(page, cdp, 0); // Should have called page.evaluate (JS fallback) expect(evaluateMock).toHaveBeenCalled(); // Then mouse.click with the returned coords expect(page.mouse.click).toHaveBeenCalledWith(50, 25); }); test('Strategy 3: falls back to CSS selector click when JS rect returns null', async () => { const evaluateMock = mock(() => Promise.resolve(null)); const page = makeMockPage({ evaluate: evaluateMock }); const cdp = makeMockCdpSession({ send: mock(() => Promise.reject(new Error('CDP failed'))), }); const selectorMap: SelectorIndex = { 0: { cssSelector: '.my-btn', backendNodeId: 123, tagName: 'button', }, }; (service as any).cachedSelectorMap = selectorMap; await service.clickElementByIndex(page, cdp, 0); // Should have fallen through to page.click(cssSelector) expect(page.click).toHaveBeenCalledWith('.my-btn', { timeout: 5000 }); }); test('uses CSS selector click when no backendNodeId', async () => { const evaluateMock = mock(() => Promise.resolve(null)); const page = makeMockPage({ evaluate: evaluateMock }); const cdp = makeMockCdpSession(); const selectorMap: SelectorIndex = { 0: { cssSelector: '#submit', tagName: 'button', // No backendNodeId }, }; (service as any).cachedSelectorMap = selectorMap; await service.clickElementByIndex(page, cdp, 0); expect(page.click).toHaveBeenCalledWith('#submit', { timeout: 5000 }); }); }); describe('clickAtCoordinates', () => { test('clicks at the specified coordinates', async () => { const page = makeMockPage(); await service.clickAtCoordinates(page, 100, 200); expect(page.mouse.click).toHaveBeenCalledWith(100, 200); }); }); describe('inputTextByIndex', () => { test('throws when element not in selector map', async () => { 
const page = makeMockPage(); const cdp = makeMockCdpSession(); await expect( service.inputTextByIndex(page, cdp, 99, 'hello'), ).rejects.toThrow(PageExtractionError); }); test('fills input with text when clearFirst is true (default)', async () => { const page = makeMockPage(); const cdp = makeMockCdpSession(); (service as any).cachedSelectorMap = { 0: { cssSelector: '#name', tagName: 'input' }, }; await service.inputTextByIndex(page, cdp, 0, 'Alice'); expect(page.fill).toHaveBeenCalledWith('#name', 'Alice'); expect(service.getInteractedElements()).toHaveLength(1); expect(service.getInteractedElements()[0].action).toBe('input'); }); test('types text without clearing when clearFirst is false', async () => { const page = makeMockPage(); const cdp = makeMockCdpSession(); (service as any).cachedSelectorMap = { 0: { cssSelector: '#name', tagName: 'input' }, }; await service.inputTextByIndex(page, cdp, 0, 'Bob', false); expect(page.click).toHaveBeenCalledWith('#name'); expect(page.keyboard.type).toHaveBeenCalledWith('Bob'); }); }); describe('getElementSelector', () => { test('returns undefined when no selector map cached', async () => { const result = await service.getElementSelector(0); expect(result).toBeUndefined(); }); test('returns CSS selector when element is in the map', async () => { (service as any).cachedSelectorMap = { 5: { cssSelector: '.item-5', tagName: 'div' }, }; const result = await service.getElementSelector(5); expect(result).toBe('.item-5'); }); }); describe('getElementByBackendNodeId', () => { test('returns selector with ID when available', async () => { const cdp = makeMockCdpSession({ send: mock(() => Promise.resolve({ node: { nodeName: 'DIV', attributes: ['id', 'main-content', 'class', 'wrapper'], }, }), ), }); const result = await service.getElementByBackendNodeId(cdp, 42); expect(result).toEqual({ selector: '#main-content' }); }); test('returns tag name when no ID attribute', async () => { const cdp = makeMockCdpSession({ send: mock(() => 
Promise.resolve({ node: { nodeName: 'BUTTON', attributes: ['class', 'primary'], }, }), ), }); const result = await service.getElementByBackendNodeId(cdp, 42); expect(result).toEqual({ selector: 'button' }); }); test('returns null when CDP call fails', async () => { const cdp = makeMockCdpSession({ send: mock(() => Promise.reject(new Error('not found'))), }); const result = await service.getElementByBackendNodeId(cdp, 42); expect(result).toBeNull(); }); test('returns null when node has no result', async () => { const cdp = makeMockCdpSession({ send: mock(() => Promise.resolve({ node: null })), }); const result = await service.getElementByBackendNodeId(cdp, 42); expect(result).toBeNull(); }); }); describe('collectHiddenElementHints (via private access)', () => { test('collects hints for elements below the viewport', () => { const root = makeNode({ children: [ makeNode({ tagName: 'button', isInteractive: true, isVisible: false, highlightIndex: 0 as ElementRef, ariaLabel: 'Submit form', rect: { x: 0, y: 2000, width: 100, height: 30 }, }), ], }); const viewport = { width: 1280, height: 800 }; const scroll = { x: 0, y: 0 }; const hints = (service as any).collectHiddenElementHints(root, viewport, scroll); expect(hints).toHaveLength(1); expect(hints[0]).toContain('Submit form'); expect(hints[0]).toContain('pages below'); }); test('collects hints for elements above the viewport', () => { const root = makeNode({ children: [ makeNode({ tagName: 'a', isInteractive: true, isVisible: false, highlightIndex: 1 as ElementRef, text: 'Top link', rect: { x: 0, y: 100, width: 80, height: 20 }, }), ], }); const viewport = { width: 1280, height: 800 }; const scroll = { x: 0, y: 1600 }; // scrolled way down const hints = (service as any).collectHiddenElementHints(root, viewport, scroll); expect(hints).toHaveLength(1); expect(hints[0]).toContain('Top link'); expect(hints[0]).toContain('pages above'); }); test('ignores visible or non-interactive elements', () => { const root = makeNode({ 
children: [ makeNode({ tagName: 'button', isInteractive: true, isVisible: true, // visible elements are not collected highlightIndex: 0 as ElementRef, rect: { x: 0, y: 2000, width: 100, height: 30 }, }), makeNode({ tagName: 'div', isInteractive: false, // non-interactive isVisible: false, highlightIndex: 1 as ElementRef, rect: { x: 0, y: 2000, width: 100, height: 30 }, }), ], }); const viewport = { width: 1280, height: 800 }; const scroll = { x: 0, y: 0 }; const hints = (service as any).collectHiddenElementHints(root, viewport, scroll); expect(hints).toHaveLength(0); }); }); describe('applyViewportThresholdFilter (via private access)', () => { test('removes highlightIndex from elements outside expanded viewport', () => { const outsideNode = makeNode({ tagName: 'button', highlightIndex: 0 as ElementRef, rect: { x: 0, y: 5000, width: 100, height: 30 }, }); const insideNode = makeNode({ tagName: 'input', highlightIndex: 1 as ElementRef, rect: { x: 0, y: 200, width: 200, height: 30 }, }); const root = makeNode({ children: [outsideNode, insideNode], }); const viewport = { width: 1280, height: 800 }; const scroll = { x: 0, y: 0 }; (service as any).applyViewportThresholdFilter(root, viewport, scroll); // The outside node should have its highlightIndex removed expect(outsideNode.highlightIndex).toBeUndefined(); // The inside node should keep its highlightIndex expect(insideNode.highlightIndex).toBe(1 as ElementRef); }); test('keeps elements within the viewport expansion margin', () => { const svc = new PageAnalyzer({ viewportExpansion: 500 }); const nearNode = makeNode({ tagName: 'a', highlightIndex: 0 as ElementRef, rect: { x: 0, y: 1100, width: 100, height: 30 }, }); const root = makeNode({ children: [nearNode] }); (svc as any).applyViewportThresholdFilter( root, { width: 1280, height: 800 }, { x: 0, y: 0 }, ); // y=1100 is within 0..800+500=1300, so should be kept expect(nearNode.highlightIndex).toBe(0 as ElementRef); }); test('removes elements far to the right of the 
viewport', () => { const farRightNode = makeNode({ tagName: 'button', highlightIndex: 0 as ElementRef, rect: { x: 5000, y: 100, width: 100, height: 30 }, }); const root = makeNode({ children: [farRightNode] }); (service as any).applyViewportThresholdFilter( root, { width: 1280, height: 800 }, { x: 0, y: 0 }, ); expect(farRightNode.highlightIndex).toBeUndefined(); }); }); describe('integrateShadowDOMChildren (via private access)', () => { test('merges shadow children into the children array', () => { const shadowChild = makeNode({ tagName: 'span', text: 'shadow' }); const regularChild = makeNode({ tagName: 'p', text: 'regular' }); const root = makeNode({ children: [regularChild], shadowChildren: [shadowChild], }); (service as any).integrateShadowDOMChildren(root); expect(root.children).toHaveLength(2); expect(root.children[0].tagName).toBe('span'); // shadow comes first expect(root.children[1].tagName).toBe('p'); expect(root.children[0].isShadowRoot).toBe(true); expect(root.children[0].parentNode).toBe(root); expect(root.shadowChildren).toBeUndefined(); }); test('handles nodes with no shadow children', () => { const root = makeNode({ children: [makeNode({ tagName: 'div' })], }); (service as any).integrateShadowDOMChildren(root); expect(root.children).toHaveLength(1); }); }); }); ================================================ FILE: packages/core/src/page/page-analyzer.ts ================================================ import type { CDPSession, Page } from 'playwright'; import { SnapshotBuilder } from './snapshot-builder.js'; import { TreeRenderer, type RendererOptions } from './renderer/tree-renderer.js'; import type { PageTreeNode, RenderedPageState, SelectorIndex, TargetInfo, TargetAllTrees, InteractedElement, } from './types.js'; import { PageExtractionError } from '../errors.js'; import { createLogger } from '../logging.js'; import { timed } from '../telemetry.js'; import type { ElementRef } from '../types.js'; const logger = createLogger('dom'); export 
interface PageAnalyzerOptions {
  /** Renderer overrides; spread after the analyzer's own renderer settings, so they take precedence. */
  serializer?: Partial<RendererOptions>;
  /** Attribute names handed to both the snapshot builder and the renderer. */
  capturedAttributes?: string[];
  /** Maximum number of child frames visited by `extractWithIframes` (default 3). */
  maxIframes?: number;
  /**
   * Margin in pixels added around the viewport before interactive elements
   * lose their highlight index (default 0). A negative value skips the filter.
   */
  viewportExpansion?: number;
  /** Element cap forwarded to the renderer (default 2000). */
  maxElementsInDom?: number;
}

/**
 * Builds a page tree from a CDP snapshot, renders it to text, and resolves
 * element indices from the last rendered state back to clickable targets.
 *
 * The tree and selector map of the most recent extraction are cached on the
 * instance; the click/input helpers read from that cache.
 */
export class PageAnalyzer {
  private snapshotProcessor: SnapshotBuilder;
  private serializer: TreeRenderer;
  private capturedAttributes: string[];
  private maxIframes: number;
  private viewportExpansion: number;
  private maxElementsInDom: number;
  private cachedTree: PageTreeNode | null = null;
  private cachedSelectorMap: SelectorIndex | null = null;
  private interactedElements: InteractedElement[] = [];
  private hiddenElementHints: string[] = [];

  constructor(options?: PageAnalyzerOptions) {
    this.snapshotProcessor = new SnapshotBuilder();
    this.capturedAttributes = options?.capturedAttributes ?? [
      'title',
      'type',
      'name',
      'role',
      'tabindex',
      'aria-label',
      'placeholder',
      'value',
      'alt',
      'aria-expanded',
    ];
    this.maxIframes = options?.maxIframes ?? 3;
    this.viewportExpansion = options?.viewportExpansion ?? 0;
    this.maxElementsInDom = options?.maxElementsInDom ?? 2000;
    this.serializer = new TreeRenderer({
      capturedAttributes: this.capturedAttributes,
      maxElementsInDom: this.maxElementsInDom,
      ...options?.serializer,
    });
  }

  /**
   * Extract and render the current page state, wrapped in the `dom-extract` timer.
   *
   * @throws PageExtractionError when any step of the extraction fails.
   */
  async extractState(page: Page, cdpSession: CDPSession): Promise<RenderedPageState> {
    const { result } = await timed('dom-extract', () => this._extractState(page, cdpSession));
    return result;
  }

  /**
   * Untimed extraction pipeline: snapshot -> tree -> shadow DOM merge ->
   * hidden-element hints -> viewport filter -> render. Updates the cached tree,
   * selector map and hints as a side effect.
   */
  private async _extractState(page: Page, cdpSession: CDPSession): Promise<RenderedPageState> {
    try {
      const { domSnapshot, axTree } = await this.snapshotProcessor.captureSnapshot(cdpSession);

      // Viewport falls back to a fixed size when Playwright reports none.
      const [viewportSize, scrollPosition, documentSize] = await Promise.all([
        page.viewportSize() ?? { width: 1280, height: 1100 },
        page.evaluate(() => ({ x: window.scrollX, y: window.scrollY })),
        page.evaluate(() => ({
          width: document.documentElement.scrollWidth,
          height: document.documentElement.scrollHeight,
        })),
      ]);

      const { root } = this.snapshotProcessor.buildTree(
        domSnapshot,
        axTree,
        viewportSize,
        this.capturedAttributes,
      );

      // Merge shadow roots into the regular child lists before any traversal.
      this.integrateShadowDOMChildren(root);

      // Hints must be gathered BEFORE the viewport filter: the filter clears
      // highlightIndex on exactly the off-viewport nodes the hints describe,
      // and collectHiddenElementHints only considers nodes that still have one.
      this.hiddenElementHints = this.collectHiddenElementHints(root, viewportSize, scrollPosition);

      // Strip highlight indices from interactive elements far outside the
      // (expanded) viewport so they do not clutter the rendered output.
      if (this.viewportExpansion >= 0) {
        this.applyViewportThresholdFilter(root, viewportSize, scrollPosition);
      }

      this.cachedTree = root;

      const state = this.serializer.serializeTree(root, scrollPosition, viewportSize, documentSize);
      this.cachedSelectorMap = state.selectorMap;

      // Append at most ten hints, then a count of the remainder.
      if (this.hiddenElementHints.length > 0) {
        state.tree += '\n\n--- Hidden interactive elements (scroll to access) ---\n';
        state.tree += this.hiddenElementHints.slice(0, 10).join('\n');
        if (this.hiddenElementHints.length > 10) {
          state.tree += `\n... and ${this.hiddenElementHints.length - 10} more`;
        }
      }

      logger.debug(
        `Extracted DOM: ${state.elementCount} elements, ${state.interactiveElementCount} interactive`,
      );
      return state;
    } catch (error) {
      throw new PageExtractionError(
        `Failed to extract DOM state: ${error instanceof Error ? error.message : String(error)}`,
        { cause: error instanceof Error ? error : undefined },
      );
    }
  }

  /**
   * Extract the main document tree plus one placeholder node per child frame.
   *
   * Frames whose body can be read through Playwright get an `[iframe: url]`
   * placeholder; the others are handed to `extractCrossOriginIframe`.
   * Per-frame failures are logged at debug level and skipped.
   *
   * @throws PageExtractionError when the main document cannot be extracted.
   */
  async extractWithIframes(page: Page, cdpSession: CDPSession): Promise<TargetAllTrees> {
    await this._extractState(page, cdpSession);
    const mainTree = this.cachedTree;
    if (!mainTree) {
      throw new PageExtractionError('Main document tree is unavailable after extraction');
    }

    const iframeTrees: TargetAllTrees['iframeTrees'] = [];

    try {
      // Index 0 is the main frame; visit at most maxIframes child frames.
      const childFrames = page.frames().slice(1, this.maxIframes + 1);
      const processedUrls = new Set<string>();

      for (const frame of childFrames) {
        try {
          const url = frame.url();
          if (!url || url === 'about:blank' || processedUrls.has(url)) continue;
          processedUrls.add(url);

          const targetInfo: TargetInfo = {
            targetId: url,
            type: 'iframe',
            url,
            attached: true,
          };

          // A readable, non-empty body means Playwright can evaluate in the frame.
          const html = await frame.evaluate(() => document.body?.innerHTML ?? '').catch(() => '');
          if (html) {
            iframeTrees.push({
              targetInfo,
              tree: {
                tagName: 'iframe',
                nodeType: 'element',
                attributes: { src: url },
                children: [],
                isVisible: true,
                isInteractive: false,
                isClickable: false,
                isEditable: false,
                isScrollable: false,
                text: `[iframe: ${url}]`,
              },
            });
            continue;
          }

          // Otherwise look the frame up as a separate CDP target.
          const iframeTree = await this.extractCrossOriginIframe(cdpSession, url);
          if (iframeTree) {
            iframeTrees.push({ targetInfo, tree: iframeTree });
          }
        } catch (error) {
          logger.debug(`Failed to extract iframe ${frame.url()}: ${error}`);
        }
      }
    } catch (error) {
      logger.debug(`Failed to extract iframe trees: ${error}`);
    }

    return { mainTree, iframeTrees };
  }

  /**
   * Find the CDP target whose URL matches `iframeUrl`, attach to it, request a
   * DOM snapshot and return a placeholder node for the frame. The session is
   * always detached afterwards.
   *
   * @returns the placeholder node, or null when no target matches or CDP fails.
   */
  private async extractCrossOriginIframe(
    cdpSession: CDPSession,
    iframeUrl: string,
  ): Promise<PageTreeNode | null> {
    try {
      const { targetInfos } = (await cdpSession.send('Target.getTargets', {})) as unknown as {
        targetInfos: Array<{ targetId: string; type: string; url: string; attached: boolean }>;
      };

      const iframeTarget = targetInfos.find((t) => t.type === 'iframe' && t.url === iframeUrl);
      if (!iframeTarget) {
        logger.debug(`No CDP target found for cross-origin iframe: ${iframeUrl}`);
        return null;
      }

      const { sessionId: iframeSessionId } = (await cdpSession.send('Target.attachToTarget', {
        targetId: iframeTarget.targetId,
        flatten: true,
      })) as unknown as { sessionId: string };

      try {
        // NOTE(review): the snapshot payload is never parsed below; only the
        // presence of a response object is checked. It looks like
        // Target.sendMessageToTarget may not return the command result
        // directly -- confirm against the CDP Target domain docs.
        const snapshotResult = (await cdpSession.send('Target.sendMessageToTarget', {
          sessionId: iframeSessionId,
          message: JSON.stringify({
            id: 1,
            method: 'DOMSnapshot.captureSnapshot',
            params: {
              computedStyles: ['display', 'visibility', 'opacity'],
              includeDOMRects: true,
            },
          }),
        })) as unknown;

        const iframeNode: PageTreeNode = {
          tagName: 'iframe',
          nodeType: 'element',
          attributes: { src: iframeUrl },
          children: [],
          isVisible: true,
          isInteractive: false,
          isClickable: false,
          isEditable: false,
          isScrollable: false,
          text: `[cross-origin iframe: ${iframeUrl}]`,
        };

        if (snapshotResult && typeof snapshotResult === 'object') {
          iframeNode.text = `[cross-origin iframe content: ${iframeUrl}]`;
        }

        return iframeNode;
      } finally {
        // Best-effort cleanup; a failed detach must not mask the result.
        await cdpSession
          .send('Target.detachFromTarget', { sessionId: iframeSessionId })
          .catch(() => {});
      }
    } catch (error) {
      logger.debug(`CDP cross-origin iframe extraction failed for ${iframeUrl}: ${error}`);
      return null;
    }
  }

  /**
   * Describe interactive, indexed, non-visible elements that lie above or
   * below the viewport, with the distance expressed in viewport heights.
   *
   * Only the element's top edge (`rect.y`) is compared against the viewport.
   */
  private collectHiddenElementHints(
    root: PageTreeNode,
    viewportSize: { width: number; height: number },
    scrollPosition: { x: number; y: number },
  ): string[] {
    const hints: string[] = [];
    const viewportTop = scrollPosition.y;
    const viewportBottom = viewportTop + viewportSize.height;

    const visit = (node: PageTreeNode): void => {
      const isCandidate =
        node.isInteractive && !node.isVisible && node.highlightIndex !== undefined;

      if (isCandidate && node.rect) {
        const elementY = node.rect.y;
        const desc = node.ariaLabel || node.text?.trim()?.slice(0, 50) || node.tagName;

        if (elementY > viewportBottom) {
          const pagesBelow = ((elementY - viewportBottom) / viewportSize.height).toFixed(1);
          hints.push(`${node.tagName} '${desc}' is ~${pagesBelow} pages below`);
        } else if (elementY < viewportTop) {
          const pagesAbove = ((viewportTop - elementY) / viewportSize.height).toFixed(1);
          hints.push(`${node.tagName} '${desc}' is ~${pagesAbove} pages above`);
        }
      }

      for (const child of node.children) {
        visit(child);
      }
    };

    visit(root);
    return hints;
  }

  /**
   * Clear `highlightIndex` on every node whose rect lies entirely outside the
   * viewport grown by `viewportExpansion` pixels on each side. Nodes stay in
   * the tree; only the index is removed.
   */
  private applyViewportThresholdFilter(
    root: PageTreeNode,
    viewportSize: { width: number; height: number },
    scrollPosition: { x: number; y: number },
  ): void {
    const expansion = this.viewportExpansion;
    const vpTop = scrollPosition.y - expansion;
    const vpBottom = scrollPosition.y + viewportSize.height + expansion;
    const vpLeft = scrollPosition.x - expansion;
    const vpRight = scrollPosition.x + viewportSize.width + expansion;

    const visit = (node: PageTreeNode): void => {
      if (node.highlightIndex !== undefined && node.rect) {
        const { x, y, width, height } = node.rect;
        const outsideVertically = y + height < vpTop || y > vpBottom;
        const outsideHorizontally = x + width < vpLeft || x > vpRight;

        if (outsideVertically || outsideHorizontally) {
          node.highlightIndex = undefined;
        }
      }

      for (const child of node.children) {
        visit(child);
      }
    };

    visit(root);
  }

  /**
   * Move each node's `shadowChildren` to the front of its `children`, marking
   * them as shadow roots and pointing their `parentNode` at the host, then
   * clear `shadowChildren`. Applied recursively to the whole tree.
   */
  private integrateShadowDOMChildren(root: PageTreeNode): void {
    const visit = (node: PageTreeNode): void => {
      const shadowChildren = node.shadowChildren;
      if (shadowChildren && shadowChildren.length > 0) {
        for (const shadowChild of shadowChildren) {
          shadowChild.parentNode = node;
          shadowChild.isShadowRoot = true;
        }
        node.children = [...shadowChildren, ...node.children];
        node.shadowChildren = undefined;
      }

      for (const child of node.children) {
        visit(child);
      }
    };

    visit(root);
  }

  /** CSS selector recorded for `index` in the last extraction, if any. */
  async getElementSelector(index: number): Promise<string | undefined> {
    return this.cachedSelectorMap?.[index]?.cssSelector;
  }

  /**
   * Resolve a backend node id to a selector: the element's id when it has one,
   * otherwise its lower-cased node name.
   *
   * @returns null when the node cannot be described.
   */
  async getElementByBackendNodeId(
    cdpSession: CDPSession,
    backendNodeId: number,
  ): Promise<{ selector: string } | null> {
    try {
      const result = (await cdpSession.send('DOM.describeNode', {
        backendNodeId,
      })) as { node: { nodeName: string; attributes?: string[] } };

      if (!result?.node) return null;

      // Attributes arrive as a flat [name, value, name, value, ...] list.
      const attrs = result.node.attributes ?? [];
      for (let i = 0; i < attrs.length; i += 2) {
        const id = attrs[i + 1];
        if (attrs[i] === 'id' && id) {
          // `#id` is only valid for plain identifiers; ids such as ":r1:" or
          // "123" need the attribute form with the value quoted and escaped.
          const selector = /^[A-Za-z_][\w-]*$/.test(id)
            ? `#${id}`
            : `[id="${id.replace(/["\\]/g, '\\$&')}"]`;
          return { selector };
        }
      }

      return { selector: result.node.nodeName.toLowerCase() };
    } catch {
      return null;
    }
  }

  /**
   * Click an indexed element, trying in order:
   * 1. the centre of the CDP box model content quad,
   * 2. the centre of `getBoundingClientRect()` for the CSS selector,
   * 3. Playwright's selector click (5 s timeout).
   *
   * @throws PageExtractionError when `index` is not in the cached selector map.
   */
  async clickElementByIndex(page: Page, cdpSession: CDPSession, index: number): Promise<void> {
    const selectorInfo = this.cachedSelectorMap?.[index];
    if (!selectorInfo) {
      throw new PageExtractionError(`Element with index ${index} not found in selector map`);
    }

    // Strategy 1: CDP box model
    if (selectorInfo.backendNodeId) {
      try {
        const { model } = (await cdpSession.send('DOM.getBoxModel', {
          backendNodeId: selectorInfo.backendNodeId,
        })) as { model: { content: number[] } };

        if (model?.content) {
          // The quad is four (x, y) corner pairs; average them for the centre.
          const [x1, y1, x2, y2, x3, y3, x4, y4] = model.content;
          await page.mouse.click((x1 + x2 + x3 + x4) / 4, (y1 + y2 + y3 + y4) / 4);
          this.recordInteraction(index, selectorInfo.tagName, 'click');
          return;
        }
      } catch {
        logger.debug(`CDP box model click failed for index ${index}, trying JS fallback`);
      }
    }

    // Strategy 2: bounding rect measured in the page
    try {
      const center = await page.evaluate((sel: string) => {
        const el = document.querySelector(sel);
        if (!el) return null;
        const r = el.getBoundingClientRect();
        return { x: r.x + r.width / 2, y: r.y + r.height / 2 };
      }, selectorInfo.cssSelector);

      if (center) {
        await page.mouse.click(center.x, center.y);
        this.recordInteraction(index, selectorInfo.tagName, 'click');
        return;
      }
    } catch {
      logger.debug(`JS rect click failed for index ${index}, trying CSS selector`);
    }

    // Strategy 3: selector click
    await page.click(selectorInfo.cssSelector, { timeout: 5000 });
    this.recordInteraction(index, selectorInfo.tagName, 'click');
  }

  /** Click at page coordinates. No interaction is recorded for this. */
  async clickAtCoordinates(page: Page, x: number, y: number): Promise<void> {
    await page.mouse.click(x, y);
  }

  /**
   * Enter text into an indexed element. With `clearFirst` the field is filled
   * (replacing its content); otherwise it is clicked and the text is typed.
   *
   * @throws PageExtractionError when `index` is not in the cached selector map.
   */
  async inputTextByIndex(
    page: Page,
    _cdpSession: CDPSession,
    index: number,
    text: string,
    clearFirst = true,
  ): Promise<void> {
    const selectorInfo = this.cachedSelectorMap?.[index];
    if (!selectorInfo) {
      throw new PageExtractionError(`Element with index ${index} not found in selector map`);
    }

    const selector = selectorInfo.cssSelector;
    if (clearFirst) {
      await page.fill(selector, text);
    } else {
      await page.click(selector);
      await page.keyboard.type(text);
    }

    this.recordInteraction(index, selectorInfo.tagName, 'input');
  }

  /** Append one entry to the interaction log, stamped with the current time. */
  private recordInteraction(index: number, tagName: string, action: string): void {
    this.interactedElements.push({
      index: index as ElementRef,
      tagName,
      action,
      timestamp: Date.now(),
    });
  }

  /** Copy of the interaction log. */
  getInteractedElements(): InteractedElement[] {
    return [...this.interactedElements];
  }

  clearInteractedElements(): void {
    this.interactedElements = [];
  }

  getCachedTree(): PageTreeNode | null {
    return this.cachedTree;
  }

  getCachedSelectorMap(): SelectorIndex | null {
    return this.cachedSelectorMap;
  }

  /** Drop the cached tree, selector map and hints. The interaction log is kept. */
  clearCache(): void {
    this.cachedTree = null;
    this.cachedSelectorMap = null;
    this.hiddenElementHints = [];
  }
}
/**
 * Drop interactive elements that are covered by another element painted on top.
 *
 * Visible nodes with a rect are bucketed by the 50px grid cell containing
 * their top-left corner; only nodes in the same bucket are compared. When two
 * of them overlap by at least half of the smaller rect's area, the one with
 * the lower `paintOrder` (missing counts as 0) is removed. Equal paint orders
 * keep both. Nodes without a rect, or not visible, are never removed.
 *
 * @returns the input array itself when empty, otherwise a filtered copy in
 *          the original order.
 */
export function filterByPaintOrder(nodes: PageTreeNode[]): PageTreeNode[] {
  if (nodes.length === 0) return nodes;

  const gridSize = 50;
  // Keep the rect next to its node so later comparisons need no assertions.
  const grid = new Map<string, Array<{ node: PageTreeNode; rect: DOMRect }>>();

  for (const node of nodes) {
    const rect = node.rect;
    if (!rect || !node.isVisible) continue;

    const key = `${Math.floor(rect.x / gridSize)},${Math.floor(rect.y / gridSize)}`;
    const bucket = grid.get(key);
    if (bucket) {
      bucket.push({ node, rect });
    } else {
      grid.set(key, [{ node, rect }]);
    }
  }

  const hidden = new Set<PageTreeNode>();

  for (const bucket of grid.values()) {
    if (bucket.length < 2) continue;

    for (let i = 0; i < bucket.length; i++) {
      for (let j = i + 1; j < bucket.length; j++) {
        const lower = bucket[i];
        const upper = bucket[j];
        if (!rectsOverlap(lower.rect, upper.rect, 0.5)) continue;

        const paintA = lower.node.paintOrder ?? 0;
        const paintB = upper.node.paintOrder ?? 0;
        if (paintA < paintB) {
          hidden.add(lower.node);
        } else if (paintB < paintA) {
          hidden.add(upper.node);
        }
      }
    }
  }

  return nodes.filter((n) => !hidden.has(n));
}

/**
 * True when the intersection of `a` and `b` covers at least `threshold`
 * (a 0..1 fraction) of the smaller rect's area. A zero-area rect never overlaps.
 */
function rectsOverlap(a: DOMRect, b: DOMRect, threshold: number): boolean {
  const overlapX = Math.max(0, Math.min(a.x + a.width, b.x + b.width) - Math.max(a.x, b.x));
  const overlapY = Math.max(0, Math.min(a.y + a.height, b.y + b.height) - Math.max(a.y, b.y));
  const overlapArea = overlapX * overlapY;
  const smallerArea = Math.min(a.width * a.height, b.width * b.height);
  return smallerArea > 0 && overlapArea / smallerArea >= threshold;
}
/**
 * Render the tree to indented text and build the selector map for the
 * interactive elements that survive paint-order and off-screen filtering.
 *
 * The text ends with a truncation marker when the element cap was hit and
 * with an "Off-screen interactive elements" section (at most 15 entries)
 * when any indexed element lies outside the current viewport.
 */
serializeTree(
  root: PageTreeNode,
  scrollPosition: { x: number; y: number },
  viewportSize: { width: number; height: number },
  documentSize: { width: number; height: number },
): RenderedPageState {
  const interactiveElements: PageTreeNode[] = [];
  this.collectInteractiveElements(root, interactiveElements);

  const paintFiltered = this.options.filterPaintOrder
    ? filterByPaintOrder(interactiveElements)
    : interactiveElements;

  // Off-canvas/degenerate elements are dropped; elements merely scrolled out
  // of view are kept and also reported through offScreenHidden.
  const offScreenHidden: PageTreeNode[] = [];
  const visibleElements = this.filterOffScreenElements(
    paintFiltered,
    scrollPosition,
    viewportSize,
    documentSize,
    offScreenHidden,
  );

  const selectorMap: SelectorIndex = {};
  for (const node of visibleElements) {
    if (node.highlightIndex === undefined) continue;
    selectorMap[node.highlightIndex] = {
      cssSelector: node.cssSelector ?? this.buildCssSelector(node),
      xpath: node.xpath,
      backendNodeId: node.backendNodeId,
      tagName: node.tagName,
      role: node.role,
      ariaLabel: node.ariaLabel,
      text: node.text?.trim()?.slice(0, 100),
    };
  }

  const lines: string[] = [];
  const maxElements = this.options.maxElementsInDom;
  const countingContext = { count: 0, maxReached: false };
  this.serializeNode(root, lines, 0, selectorMap, countingContext, maxElements);

  if (countingContext.maxReached) {
    lines.push(`\n[... DOM truncated at ${maxElements} elements]`);
  }

  const hiddenHints = this.formatHiddenElementHints(offScreenHidden, scrollPosition, viewportSize);
  if (hiddenHints.length > 0) {
    lines.push('');
    lines.push('--- Off-screen interactive elements ---');
    lines.push(...hiddenHints.slice(0, 15));
    if (hiddenHints.length > 15) {
      lines.push(`... and ${hiddenHints.length - 15} more off-screen elements`);
    }
  }

  return {
    tree: lines.join('\n'),
    selectorMap,
    // NOTE(review): this is the number of selector-map entries, not the number
    // of rendered elements tracked in countingContext -- confirm which one
    // consumers of elementCount expect.
    elementCount: Object.keys(selectorMap).length,
    interactiveElementCount: visibleElements.length,
    scrollPosition,
    viewportSize,
    documentSize,
    pixelsAbove: scrollPosition.y,
    pixelsBelow: Math.max(0, documentSize.height - scrollPosition.y - viewportSize.height),
  };
}

/**
 * Append the text form of `node` and its subtree to `lines`.
 *
 * Text nodes render as their trimmed text. Elements render as
 * `[index]<tag attrs>`, followed by either inline text and a closing tag on
 * the same line, or by their children and a closing tag on its own line.
 * Rendering stops once `ctx.count` exceeds `maxElements`.
 */
private serializeNode(
  node: PageTreeNode,
  lines: string[],
  depth: number,
  selectorMap: SelectorIndex,
  ctx: { count: number; maxReached: boolean },
  maxElements: number,
): void {
  if (depth > this.options.maxDepth || ctx.maxReached) return;
  if (!node.isVisible && node.nodeType === 'element' && node.children.length === 0) return;

  const indent = '\t'.repeat(depth);

  if (node.nodeType === 'text') {
    const text = node.text?.trim();
    if (text) {
      lines.push(`${indent}${text}`);
    }
    return;
  }

  // Invisible, non-interactive containers with nothing visible below them.
  if (!node.isVisible && !node.isInteractive && !this.hasVisibleDescendant(node)) {
    return;
  }

  // A collapsed <svg> becomes one placeholder line; its subtree is not visited.
  if (this.options.collapseSvg && node.tagName === 'svg') {
    const desc = this.resolveSvgDescription(node);
    const indexed = node.highlightIndex !== undefined && selectorMap[node.highlightIndex];
    lines.push(indexed ? `${indent}[${node.highlightIndex}]${desc}` : `${indent}${desc}`);
    ctx.count++;
    return;
  }

  // SVG internals (and, with collapseSvg off, everything but the <svg> itself).
  if (SVG_TAGS.has(node.tagName) && node.tagName !== 'svg') {
    return;
  }

  ctx.count++;
  if (ctx.count > maxElements) {
    ctx.maxReached = true;
    return;
  }

  // A redundant wrapper is skipped and its children rendered at the same depth.
  if (this.isRedundantWrapper(node)) {
    for (const child of node.children) {
      this.serializeNode(child, lines, depth, selectorMap, ctx, maxElements);
    }
    return;
  }

  const parts: string[] = [];

  // The index is shown only for elements that made it into the selector map.
  if (node.highlightIndex !== undefined && selectorMap[node.highlightIndex]) {
    parts.push(`[${node.highlightIndex}]`);
  }

  parts.push(`<${node.tagName}`);

  const attrParts: string[] = [];
  for (const attr of this.options.capturedAttributes) {
    const value = node.attributes[attr];
    if (value !== undefined && value !== '') {
      attrParts.push(`${attr}="${value}"`);
    }
  }
  if (node.role) {
    attrParts.push(`role="${node.role}"`);
  }
  if (node.ariaLabel && !node.attributes['aria-label']) {
    attrParts.push(`aria-label="${node.ariaLabel}"`);
  }
  if (attrParts.length > 0) {
    parts.push(` ${attrParts.join(' ')}`);
  }

  if (node.inputValue !== undefined) {
    parts.push(` value="${node.inputValue}"`);
  }

  parts.push('>');

  const closingTag = `</${node.tagName}>`;

  // Leaf text is rendered inline, with the closing tag on the same line.
  const inlineText = this.getInlineText(node);
  if (inlineText) {
    parts.push(inlineText, closingTag);
    lines.push(`${indent}${parts.join('')}`);
    return;
  }

  lines.push(`${indent}${parts.join('')}`);

  if (this.options.deduplicateSiblings) {
    this.serializeChildrenWithDedup(node.children, lines, depth + 1, selectorMap, ctx, maxElements);
  } else {
    for (const child of node.children) {
      this.serializeNode(child, lines, depth + 1, selectorMap, ctx, maxElements);
    }
  }

  // Closing tag only if there were children worth rendering.
  if (node.children.some((c) => c.isVisible || c.nodeType === 'text')) {
    lines.push(`${indent}${closingTag}`);
  }
}

/**
 * Render `children`, collapsing long runs of similar siblings.
 *
 * A run is a sequence of consecutive element nodes with the same tag name
 * that are neither interactive nor contain interactive descendants. Runs
 * longer than `siblingDeduplicateThreshold` render their first three members
 * followed by a one-line summary of the rest.
 */
private serializeChildrenWithDedup(
  children: PageTreeNode[],
  lines: string[],
  depth: number,
  selectorMap: SelectorIndex,
  ctx: { count: number; maxReached: boolean },
  maxElements: number,
): void {
  const threshold = this.options.siblingDeduplicateThreshold;
  const showCount = 3;
  const isCollapsible = (n: PageTreeNode): boolean =>
    n.nodeType === 'element' && !n.isInteractive && !this.hasInteractiveDescendant(n);

  let i = 0;
  while (i < children.length) {
    if (ctx.maxReached) return;

    const first = children[i];

    // Extend the run while following siblings share the tag and are collapsible.
    let runEnd = i + 1;
    if (isCollapsible(first)) {
      while (
        runEnd < children.length &&
        children[runEnd].tagName === first.tagName &&
        isCollapsible(children[runEnd])
      ) {
        runEnd++;
      }
    }

    const runLength = runEnd - i;
    if (runLength <= threshold) {
      this.serializeNode(first, lines, depth, selectorMap, ctx, maxElements);
      i++;
      continue;
    }

    for (let j = i; j < i + showCount && j < runEnd; j++) {
      this.serializeNode(children[j], lines, depth, selectorMap, ctx, maxElements);
    }
    const indent = '\t'.repeat(depth);
    lines.push(`${indent}... and ${runLength - showCount} more <${first.tagName}> elements`);
    i = runEnd;
  }
}
/** Tags treated as structure-only containers by isRedundantWrapper. */
private static readonly GENERIC_WRAPPER_TAGS = new Set(['div', 'span', 'section', 'article', 'main']);

/**
 * True when `node` is a generic, non-interactive, un-indexed container with
 * exactly one renderable child whose rect area is more than
 * `containmentThreshold` of the container's own area. Without rects on both
 * nodes the answer is false.
 */
private isRedundantWrapper(node: PageTreeNode): boolean {
  if (node.highlightIndex !== undefined || node.isInteractive) return false;
  if (!TreeRenderer.GENERIC_WRAPPER_TAGS.has(node.tagName)) return false;

  const renderable = node.children.filter(
    (c) => c.isVisible || c.isInteractive || c.nodeType === 'text',
  );
  if (renderable.length !== 1) return false;

  const parentRect = node.rect;
  const childRect = renderable[0].rect;
  if (!parentRect || !childRect) return false;

  const parentArea = parentRect.width * parentRect.height;
  const childArea = childRect.width * childRect.height;
  return parentArea > 0 && childArea / parentArea > this.options.containmentThreshold;
}

/**
 * Text to render on the same line as the opening tag: the node's own text
 * when it has no children, or the text of a sole text child. Null otherwise.
 */
private getInlineText(node: PageTreeNode): string | null {
  if (node.children.length === 0) {
    return node.text?.trim() || null;
  }

  const [only] = node.children;
  if (node.children.length === 1 && only.nodeType === 'text' && only.text) {
    return only.text.trim();
  }
  return null;
}

/** True when any descendant is visible or interactive. */
private hasVisibleDescendant(node: PageTreeNode): boolean {
  return node.children.some(
    (child) => child.isVisible || child.isInteractive || this.hasVisibleDescendant(child),
  );
}

/** True when any descendant is interactive or carries a highlight index. */
private hasInteractiveDescendant(node: PageTreeNode): boolean {
  return node.children.some(
    (child) =>
      child.isInteractive ||
      child.highlightIndex !== undefined ||
      this.hasInteractiveDescendant(child),
  );
}

/** Append every visible node that has a highlight index, in document order. */
private collectInteractiveElements(node: PageTreeNode, result: PageTreeNode[]): void {
  if (node.highlightIndex !== undefined && node.isVisible) {
    result.push(node);
  }
  for (const child of node.children) {
    this.collectInteractiveElements(child, result);
  }
}

/**
 * Build a ` > `-joined selector by walking `parentNode` links upwards until
 * an element with an id or the `html` element is reached. Siblings sharing a
 * tag name are disambiguated with `:nth-of-type`.
 */
private buildCssSelector(node: PageTreeNode): string {
  const parts: string[] = [];
  let current: PageTreeNode | undefined = node;

  while (current && current.tagName !== 'html') {
    const id = current.attributes['id'];
    if (id) {
      // `#id` is only valid for plain identifiers; ids such as ":r1:" or
      // "123" need the attribute form with the value quoted and escaped.
      parts.unshift(
        /^[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id="${id.replace(/["\\]/g, '\\$&')}"]`,
      );
      break;
    }

    const tag = current.tagName;
    const parent: PageTreeNode | undefined = current.parentNode;
    let selector = tag;
    if (parent) {
      const sameTagSiblings = parent.children.filter((c) => c.tagName === tag);
      if (sameTagSiblings.length > 1) {
        selector += `:nth-of-type(${sameTagSiblings.indexOf(current) + 1})`;
      }
    }

    parts.unshift(selector);
    current = parent;
  }

  return parts.join(' > ');
}

/**
 * Remove elements that cannot be reached by scrolling and report the ones
 * that can.
 *
 * Dropped: rects with non-positive width or height, and rects lying more than
 * 5000px beyond the document origin or extent (e.g. `left: -9999px`).
 * Kept: everything else, including elements without a rect. Kept elements
 * that do not intersect the current viewport are additionally pushed onto
 * `offScreenHidden`.
 */
private filterOffScreenElements(
  elements: PageTreeNode[],
  scrollPosition: { x: number; y: number },
  viewportSize: { width: number; height: number },
  documentSize: { width: number; height: number },
  offScreenHidden: PageTreeNode[],
): PageTreeNode[] {
  const offCanvasThreshold = 5000;

  const vpTop = scrollPosition.y;
  const vpBottom = scrollPosition.y + viewportSize.height;
  const vpLeft = scrollPosition.x;
  const vpRight = scrollPosition.x + viewportSize.width;

  const result: PageTreeNode[] = [];

  for (const node of elements) {
    if (!node.rect) {
      result.push(node);
      continue;
    }

    const { x, y, width, height } = node.rect;
    const nodeRight = x + width;
    const nodeBottom = y + height;

    if (width <= 0 || height <= 0) continue;

    const offCanvas =
      nodeRight < -offCanvasThreshold ||
      nodeBottom < -offCanvasThreshold ||
      x > documentSize.width + offCanvasThreshold ||
      y > documentSize.height + offCanvasThreshold;
    if (offCanvas) continue;

    const inViewport =
      nodeBottom >= vpTop && y <= vpBottom && nodeRight >= vpLeft && x <= vpRight;

    // Scrolled-out elements stay selectable; they are only flagged for hints.
    result.push(node);
    if (!inViewport) {
      offScreenHidden.push(node);
    }
  }

  return result;
}
*/ private formatHiddenElementHints( offScreenElements: PageTreeNode[], scrollPosition: { x: number; y: number }, viewportSize: { width: number; height: number }, ): string[] { if (offScreenElements.length === 0) return []; const vpBottom = scrollPosition.y + viewportSize.height; const vpTop = scrollPosition.y; const hints: string[] = []; for (const node of offScreenElements) { if (!node.rect) continue; const desc = this.getNodeDescription(node); const elementY = node.rect.y; if (elementY > vpBottom) { const pxBelow = elementY - vpBottom; const pagesBelow = (pxBelow / viewportSize.height).toFixed(1); hints.push(` ${node.tagName} "${desc}" ~${pagesBelow} pages below`); } else if (elementY + node.rect.height < vpTop) { const pxAbove = vpTop - (elementY + node.rect.height); const pagesAbove = (pxAbove / viewportSize.height).toFixed(1); hints.push(` ${node.tagName} "${desc}" ~${pagesAbove} pages above`); } else { // Off to the side hints.push(` ${node.tagName} "${desc}" off-screen horizontally`); } } return hints; } /** * Get a short human-readable description of a node for hint text. */ private getNodeDescription(node: PageTreeNode): string { if (node.ariaLabel) return node.ariaLabel.slice(0, 60); if (node.text) return node.text.trim().slice(0, 60); if (node.attributes['title']) return node.attributes['title'].slice(0, 60); if (node.attributes['placeholder']) return node.attributes['placeholder'].slice(0, 60); return node.tagName; } /** * Resolve the best description for an SVG, traversing nested SVG wrappers * to find the deepest aria-label or title. This collapses redundant * nested SVG containers into a single description. 
*/ private resolveSvgDescription(node: PageTreeNode): string { // Check the current node for labels const label = node.ariaLabel || node.attributes['aria-label'] || ''; const title = node.attributes['title'] || ''; // Look for nested SVGs that might carry a better description let deepLabel = ''; const visitSvgChildren = (n: PageTreeNode): void => { for (const child of n.children) { if (child.tagName === 'title' && child.text) { deepLabel = child.text.trim(); return; } if (child.tagName === 'svg') { // Nested SVG -- check it for labels const nested = child.ariaLabel || child.attributes['aria-label'] || child.attributes['title'] || ''; if (nested) { deepLabel = nested; return; } // Keep traversing deeper visitSvgChildren(child); if (deepLabel) return; } if (SVG_TAGS.has(child.tagName)) { visitSvgChildren(child); if (deepLabel) return; } } }; visitSvgChildren(node); return label || title || deepLabel || 'icon'; } } ================================================ FILE: packages/core/src/page/renderer.test.ts ================================================ import { test, expect, describe, beforeEach } from 'bun:test'; import { TreeRenderer } from './renderer/tree-renderer.js'; import type { PageTreeNode, SelectorIndex } from './types.js'; import type { ElementRef } from '../types.js'; // ── Helpers ── function makeNode(overrides: Partial = {}): PageTreeNode { return { tagName: 'div', nodeType: 'element', attributes: {}, children: [], isVisible: true, isInteractive: false, isClickable: false, isEditable: false, isScrollable: false, ...overrides, }; } function makeTextNode(text: string): PageTreeNode { return makeNode({ tagName: '', nodeType: 'text', text, children: [], }); } const defaultScroll = { x: 0, y: 0 }; const defaultViewport = { width: 1280, height: 800 }; const defaultDocSize = { width: 1280, height: 3000 }; // ── Tests ── describe('TreeRenderer', () => { let serializer: TreeRenderer; beforeEach(() => { serializer = new TreeRenderer({ capturedAttributes: 
        ['title', 'role', 'aria-label', 'placeholder'],
      filterPaintOrder: false,
    });
  });

  // Overall shape of the serialized state: tree text, counts, selector map.
  describe('basic tree serialization', () => {
    test('serializes a simple root with text child', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'body',
            children: [
              makeNode({
                tagName: 'h1',
                text: 'Hello World',
                children: [],
              }),
            ],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('h1');
      expect(state.tree).toContain('Hello World');
      // Scroll position and viewport size are echoed back unchanged.
      expect(state.scrollPosition).toEqual(defaultScroll);
      expect(state.viewportSize).toEqual(defaultViewport);
    });

    test('includes element count and interactive element count', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'button',
            isInteractive: true,
            isVisible: true,
            highlightIndex: 0 as ElementRef,
            text: 'Click me',
            cssSelector: '#btn',
          }),
          makeNode({
            tagName: 'p',
            text: 'Paragraph',
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.interactiveElementCount).toBeGreaterThanOrEqual(1);
      expect(state.elementCount).toBeGreaterThanOrEqual(1);
    });

    test('builds selector map for interactive elements with highlightIndex', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'button',
            isInteractive: true,
            isVisible: true,
            highlightIndex: 0 as ElementRef,
            cssSelector: '#submit-btn',
            text: 'Submit',
            role: 'button',
            ariaLabel: 'Submit form',
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      // The selector map entry is keyed by the node's highlight index.
      expect(state.selectorMap[0]).toBeDefined();
      expect(state.selectorMap[0].cssSelector).toBe('#submit-btn');
      expect(state.selectorMap[0].tagName).toBe('button');
      expect(state.selectorMap[0].role).toBe('button');
      expect(state.selectorMap[0].ariaLabel).toBe('Submit form');
    });

    test('includes highlight index in serialized output', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'a',
            isInteractive: true,
            isVisible: true,
            highlightIndex: 3 as ElementRef,
            cssSelector: 'a.link',
            text: 'Link text',
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      // The index is rendered in square brackets.
      expect(state.tree).toContain('[3]');
    });

    test('computes pixelsAbove and pixelsBelow', () => {
      const root = makeNode({ tagName: 'html' });

      // Scrolled 400px into a 2000px document with an 800px viewport.
      const state = serializer.serializeTree(
        root,
        { x: 0, y: 400 },
        { width: 1280, height: 800 },
        { width: 1280, height: 2000 },
      );

      expect(state.pixelsAbove).toBe(400);
      expect(state.pixelsBelow).toBe(800); // 2000 - 400 - 800
    });
  });

  // SVG subtrees are collapsed to a single described placeholder.
  describe('SVG collapse', () => {
    test('collapses SVG to placeholder with icon label', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'svg',
            isVisible: true,
            attributes: {},
            children: [
              makeNode({
                tagName: 'path',
                isVisible: true,
                attributes: { d: 'M0 0L10 10' },
              }),
            ],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      // No label anywhere, so the generic description is expected.
      expect(state.tree).toContain('icon');
    });

    test('uses aria-label from SVG if available', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'svg',
            isVisible: true,
            ariaLabel: 'Search icon',
            attributes: {},
            children: [],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('Search icon');
    });

    test('finds title in nested SVG structure', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'svg',
            isVisible: true,
            attributes: {},
            children: [
              // A <title> child supplies the description.
              makeNode({
                tagName: 'title',
                isVisible: true,
                text: 'Close button',
                nodeType: 'element',
                attributes: {},
                children: [],
              }),
            ],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('Close button');
    });

    test('includes highlight index on interactive SVG', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
makeNode({ tagName: 'svg', isVisible: true, isInteractive: true, highlightIndex: 5 as ElementRef, cssSelector: 'svg.icon', attributes: {}, children: [], }), ], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain('[5]'); }); test('does not collapse SVG when collapseSvg is disabled', () => { const noCollapse = new TreeRenderer({ collapseSvg: false, filterPaintOrder: false, }); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'svg', isVisible: true, attributes: {}, children: [ makeNode({ tagName: 'rect', isVisible: true, attributes: {}, }), ], }), ], }); const state = noCollapse.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); // Should not be collapsed to a single icon placeholder expect(state.tree).toContain(''); // Inner SVG elements (path, rect, etc.) are always skipped by the // SVG_TAGS filter, so they won't appear. The key difference is // collapseSvg=false does NOT produce the collapsed placeholder format. expect(state.tree).not.toContain('icon'); }); }); describe('sibling deduplication', () => { test('deduplicates runs of same-tag non-interactive siblings', () => { // Create 8 identical li elements (threshold = 5) const listItems = Array.from({ length: 8 }, (_, i) => makeNode({ tagName: 'li', isVisible: true, text: `Item ${i}`, children: [], }), ); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'ul', isVisible: true, children: listItems, }), ], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); // Should show first 3 and then "... and 5 more" summary expect(state.tree).toContain('Item 0'); expect(state.tree).toContain('Item 1'); expect(state.tree).toContain('Item 2'); expect(state.tree).toContain('... and 5 more
  • elements'); expect(state.tree).not.toContain('Item 7'); }); test('does not deduplicate when below threshold', () => { const items = Array.from({ length: 3 }, (_, i) => makeNode({ tagName: 'li', isVisible: true, text: `Item ${i}`, children: [], }), ); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'ul', isVisible: true, children: items, }), ], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain('Item 0'); expect(state.tree).toContain('Item 1'); expect(state.tree).toContain('Item 2'); expect(state.tree).not.toContain('... and'); }); test('does not deduplicate siblings with interactive descendants', () => { const items = Array.from({ length: 8 }, (_, i) => makeNode({ tagName: 'li', isVisible: true, children: [ makeNode({ tagName: 'a', isInteractive: i === 4, // one has interactive child isVisible: true, highlightIndex: i === 4 ? (10 as ElementRef) : undefined, text: `Link ${i}`, }), ], }), ); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'ul', isVisible: true, children: items, }), ], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); // Because item 4 has an interactive descendant, the run is broken // and items should not all be deduped away expect(state.tree).toContain('Link 4'); }); test('does not deduplicate when deduplicateSiblings is disabled', () => { const noDedup = new TreeRenderer({ deduplicateSiblings: false, filterPaintOrder: false, }); const items = Array.from({ length: 8 }, (_, i) => makeNode({ tagName: 'li', isVisible: true, text: `Item ${i}`, children: [], }), ); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'ul', isVisible: true, children: items, }), ], }); const state = noDedup.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).not.toContain('... 
and'); expect(state.tree).toContain('Item 7'); }); }); describe('max elements cap', () => { test('truncates tree when max elements is exceeded', () => { const small = new TreeRenderer({ maxElementsInDom: 5, filterPaintOrder: false, deduplicateSiblings: false, }); const children = Array.from({ length: 20 }, (_, i) => makeNode({ tagName: 'p', isVisible: true, text: `Para ${i}`, children: [], }), ); const root = makeNode({ tagName: 'html', children: [ makeNode({ tagName: 'body', isVisible: true, children, }), ], }); const state = small.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain('DOM truncated at 5 elements'); }); }); describe('containment threshold (redundant wrappers)', () => { test('skips redundant div wrapper when child fills parent', () => { const innerButton = makeNode({ tagName: 'button', isVisible: true, isInteractive: true, highlightIndex: 0 as ElementRef, cssSelector: 'button', text: 'Click', rect: { x: 0, y: 0, width: 200, height: 50 }, }); const wrapper = makeNode({ tagName: 'div', isVisible: true, isInteractive: false, rect: { x: 0, y: 0, width: 200, height: 50 }, children: [innerButton], }); const root = makeNode({ tagName: 'html', children: [wrapper], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); // The redundant div wrapper should be skipped in output; // the button should appear directly expect(state.tree).toContain('button'); expect(state.tree).toContain('Click'); }); test('does not skip wrapper when it has a highlightIndex', () => { const inner = makeNode({ tagName: 'span', isVisible: true, text: 'Text', rect: { x: 0, y: 0, width: 100, height: 20 }, }); const wrapper = makeNode({ tagName: 'div', isVisible: true, highlightIndex: 1 as ElementRef, cssSelector: 'div#parent', rect: { x: 0, y: 0, width: 100, height: 20 }, children: [inner], }); const root = makeNode({ tagName: 'html', children: [wrapper], }); const state = 
serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain(' { const inner = makeNode({ tagName: 'p', isVisible: true, text: 'Hello', rect: { x: 0, y: 0, width: 100, height: 20 }, }); const wrapper = makeNode({ tagName: 'nav', // not in genericTags set isVisible: true, rect: { x: 0, y: 0, width: 100, height: 20 }, children: [inner], }); const root = makeNode({ tagName: 'html', children: [wrapper], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain(' { test('filters out elements with degenerate rects (zero area)', () => { const zeroWidth = makeNode({ tagName: 'button', isVisible: true, isInteractive: true, highlightIndex: 0 as ElementRef, cssSelector: 'button.hidden', rect: { x: 0, y: 0, width: 0, height: 30 }, }); const root = makeNode({ tagName: 'html', children: [zeroWidth], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); // Zero-width element should be filtered from the selector map expect(state.selectorMap[0]).toBeUndefined(); }); test('filters out elements with extreme off-canvas positioning', () => { const offCanvas = makeNode({ tagName: 'a', isVisible: true, isInteractive: true, highlightIndex: 0 as ElementRef, cssSelector: 'a.sr-only', rect: { x: -10000, y: 0, width: 100, height: 20 }, }); const root = makeNode({ tagName: 'html', children: [offCanvas], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.selectorMap[0]).toBeUndefined(); }); test('keeps elements that are off-viewport but within document bounds', () => { const belowViewport = makeNode({ tagName: 'button', isVisible: true, isInteractive: true, highlightIndex: 0 as ElementRef, cssSelector: 'button.below', rect: { x: 100, y: 2000, width: 100, height: 30 }, }); const root = makeNode({ tagName: 'html', children: [belowViewport], }); const state = 
        serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      // Should be kept in selector map even though off-viewport
      expect(state.selectorMap[0]).toBeDefined();
      expect(state.selectorMap[0].cssSelector).toBe('button.below');
    });
  });

  // Scroll hints emitted for interactive elements outside the viewport.
  describe('hidden element hints formatting', () => {
    test('formats hints for off-screen elements below viewport', () => {
      const belowElement = makeNode({
        tagName: 'button',
        isVisible: true,
        isInteractive: true,
        highlightIndex: 0 as ElementRef,
        cssSelector: 'button.far',
        ariaLabel: 'Load more',
        rect: { x: 100, y: 2400, width: 100, height: 30 },
      });

      const root = makeNode({
        tagName: 'html',
        children: [belowElement],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('Off-screen interactive elements');
      expect(state.tree).toContain('Load more');
      expect(state.tree).toContain('pages below');
    });

    test('formats hints for elements above viewport', () => {
      const aboveElement = makeNode({
        tagName: 'a',
        isVisible: true,
        isInteractive: true,
        highlightIndex: 0 as ElementRef,
        cssSelector: 'a.header',
        ariaLabel: 'Home link',
        rect: { x: 100, y: 50, width: 100, height: 30 },
      });

      const root = makeNode({
        tagName: 'html',
        children: [aboveElement],
      });

      // Scrolled down so element is above
      const state = serializer.serializeTree(
        root,
        { x: 0, y: 1000 },
        defaultViewport,
        defaultDocSize,
      );

      expect(state.tree).toContain('Home link');
      expect(state.tree).toContain('pages above');
    });

    test('limits hints to 15 off-screen elements', () => {
      // 20 buttons, all below the 800px viewport.
      const children = Array.from({ length: 20 }, (_, i) =>
        makeNode({
          tagName: 'button',
          isVisible: true,
          isInteractive: true,
          highlightIndex: i as ElementRef,
          cssSelector: `button.item-${i}`,
          ariaLabel: `Button ${i}`,
          rect: { x: 100, y: 2000 + i * 100, width: 100, height: 30 },
        }),
      );

      const root = makeNode({
        tagName: 'html',
        children,
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      // Should cap at 15 and say "... and N more"
      expect(state.tree).toContain('more off-screen elements');
    });
  });

  // Which attributes and node properties end up in the rendered tag.
  describe('attributes serialization', () => {
    test('includes configured attributes in output', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'input',
            isVisible: true,
            attributes: {
              placeholder: 'Enter email',
              title: 'Email field',
            },
            children: [],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('placeholder="Enter email"');
      expect(state.tree).toContain('title="Email field"');
    });

    test('includes role and aria-label from node properties', () => {
      // role/ariaLabel are set as node properties here, not in `attributes`.
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'div',
            isVisible: true,
            role: 'navigation',
            ariaLabel: 'Main menu',
            attributes: {},
            children: [makeTextNode('Menu')],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('role="navigation"');
      expect(state.tree).toContain('aria-label="Main menu"');
    });

    test('includes input value in output', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'input',
            isVisible: true,
            inputValue: 'current text',
            attributes: {},
            children: [],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('value="current text"');
    });
  });

  // Text set on the element itself versus text carried by a text-node child.
  describe('text node handling', () => {
    test('renders text content inline for leaf elements', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'p',
            isVisible: true,
            text: 'Hello world',
            children: [],
          }),
        ],
      });

      const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize);

      expect(state.tree).toContain('Hello world');
    });

    test('renders text node children', () => {
      const root = makeNode({
        tagName: 'html',
        children: [
          makeNode({
            tagName: 'p',
            isVisible: true,
            children:
[makeTextNode('Some text content')], }), ], }); const state = serializer.serializeTree(root, defaultScroll, defaultViewport, defaultDocSize); expect(state.tree).toContain('Some text content'); }); }); }); ================================================ FILE: packages/core/src/page/snapshot-builder.ts ================================================ import type { CDPSession } from 'playwright'; import type { CDPSnapshotResult, AXNode, PageTreeNode, DOMRect, } from './types.js'; import { type ElementRef, elementIndex } from '../types.js'; const INTERACTIVE_TAGS = new Set([ 'a', 'button', 'input', 'select', 'textarea', 'details', 'summary', 'label', 'option', 'fieldset', 'legend', ]); const INTERACTIVE_ROLES = new Set([ 'button', 'link', 'textbox', 'checkbox', 'radio', 'combobox', 'listbox', 'menu', 'menuitem', 'menuitemcheckbox', 'menuitemradio', 'option', 'searchbox', 'slider', 'spinbutton', 'switch', 'tab', 'treeitem', 'gridcell', 'columnheader', 'rowheader', ]); const INVISIBLE_TAGS = new Set([ 'script', 'style', 'link', 'meta', 'head', 'noscript', 'template', ]); export class SnapshotBuilder { private indexCounter = 0; async captureSnapshot(cdpSession: CDPSession): Promise<{ domSnapshot: CDPSnapshotResult; axTree: AXNode; }> { const [domSnapshot, axTree] = await Promise.all([ cdpSession.send('DOMSnapshot.captureSnapshot', { computedStyles: ['display', 'visibility', 'opacity', 'overflow'], includeDOMRects: true, includePaintOrder: true, }) as Promise as Promise, cdpSession.send('Accessibility.getFullAXTree', {}) as Promise as Promise<{ nodes: AXNode[] }>, ]); // Convert flat AX tree list to the root node const rootAx: AXNode = axTree.nodes?.[0] ?? 
{ nodeId: '0', role: { value: 'WebArea' }, }; return { domSnapshot, axTree: rootAx }; } buildTree( snapshot: CDPSnapshotResult, axTree: AXNode, viewportSize: { width: number; height: number }, capturedAttributes: string[] = [], ): { root: PageTreeNode; indexCounter: number } { this.indexCounter = 0; const doc = snapshot.documents[0]; if (!doc) { return { root: this.createEmptyNode(), indexCounter: 0, }; } const { nodes, layout, strings } = doc; // Build backend node ID → AX node map const axNodeMap = new Map(); this.buildAXMap(axTree, axNodeMap); // Build layout index map const layoutMap = new Map(); for (let i = 0; i < layout.nodeIndex.length; i++) { const nodeIdx = layout.nodeIndex[i]; layoutMap.set(nodeIdx, { bounds: layout.bounds[i], text: layout.text[i] !== -1 ? strings[layout.text[i]] : undefined, paintOrder: layout.paintOrder?.[i], }); } // Build clickable set const clickableSet = new Set(); if (nodes.isClickable) { for (const idx of nodes.isClickable.index) { clickableSet.add(idx); } } // Build input value map const inputValueMap = new Map(); if (nodes.inputValue) { for (let i = 0; i < nodes.inputValue.index.length; i++) { const nodeIdx = nodes.inputValue.index[i]; const valueIdx = nodes.inputValue.value[i]; inputValueMap.set(nodeIdx, strings[valueIdx]); } } // Build the tree recursively const root = this.buildNodeTree( 0, nodes, strings, layoutMap, axNodeMap, clickableSet, inputValueMap, viewportSize, capturedAttributes, ); return { root, indexCounter: this.indexCounter }; } private buildNodeTree( nodeIndex: number, nodes: CDPSnapshotResult['documents'][0]['nodes'], strings: string[], layoutMap: Map, axNodeMap: Map, clickableSet: Set, inputValueMap: Map, viewportSize: { width: number; height: number }, capturedAttributes: string[], ): PageTreeNode { const nodeType = nodes.nodeType[nodeIndex]; const tagName = strings[nodes.nodeName[nodeIndex]]?.toLowerCase() ?? 
''; const backendNodeId = nodes.backendNodeId[nodeIndex]; // Check layout const layoutInfo = layoutMap.get(nodeIndex); let rect: DOMRect | undefined; let isVisible = false; if (layoutInfo) { const [x, y, w, h] = layoutInfo.bounds; rect = { x, y, width: w, height: h }; isVisible = w > 0 && h > 0 && !INVISIBLE_TAGS.has(tagName); } // Parse attributes const rawAttrs = nodes.attributes[nodeIndex] ?? []; const attributes: Record = {}; for (let i = 0; i < rawAttrs.length; i += 2) { const name = strings[rawAttrs[i]]; const value = strings[rawAttrs[i + 1]]; if (name && (capturedAttributes.length === 0 || capturedAttributes.includes(name))) { attributes[name] = value ?? ''; } } // Get AX info const axNode = axNodeMap.get(backendNodeId); const role = axNode?.role?.value; const ariaLabel = axNode?.name?.value; // Determine interactivity const isInteractive = INTERACTIVE_TAGS.has(tagName) || (role ? INTERACTIVE_ROLES.has(role) : false) || clickableSet.has(nodeIndex) || attributes['tabindex'] !== undefined || attributes['contenteditable'] === 'true'; const isEditable = tagName === 'input' || tagName === 'textarea' || attributes['contenteditable'] === 'true' || role === 'textbox' || role === 'searchbox'; const isScrollable = tagName === 'body' || tagName === 'html' || attributes['role'] === 'scrollbar'; // Build node const node: PageTreeNode = { tagName, nodeType: nodeType === 3 ? 'text' : 'element', text: nodeType === 3 ? strings[nodes.nodeValue[nodeIndex]] : layoutInfo?.text, attributes, children: [], isVisible, rect, role: role && role !== 'none' && role !== 'generic' ? 
role : undefined, ariaLabel, isInteractive, isClickable: clickableSet.has(nodeIndex) || INTERACTIVE_TAGS.has(tagName), isEditable, isScrollable, backendNodeId, paintOrder: layoutInfo?.paintOrder, inputValue: inputValueMap.get(nodeIndex), }; // Assign highlight index for interactive/visible elements if (isInteractive && isVisible) { node.highlightIndex = elementIndex(this.indexCounter++); } // Build children const childIndexes: number[] = nodes.childNodeIndexes?.[nodeIndex] ?? []; for (const childIdx of childIndexes) { const child = this.buildNodeTree( childIdx, nodes, strings, layoutMap, axNodeMap, clickableSet, inputValueMap, viewportSize, capturedAttributes, ); child.parentNode = node; node.children.push(child); } return node; } private buildAXMap(node: AXNode, map: Map): void { if (node.backendDOMNodeId) { map.set(node.backendDOMNodeId, node); } if (node.children) { for (const child of node.children) { this.buildAXMap(child, map); } } } private createEmptyNode(): PageTreeNode { return { tagName: 'html', nodeType: 'element', attributes: {}, children: [], isVisible: false, isInteractive: false, isClickable: false, isEditable: false, isScrollable: false, }; } } ================================================ FILE: packages/core/src/page/types.ts ================================================ import type { ElementRef } from '../types.js'; export interface DOMRect { x: number; y: number; width: number; height: number; } export interface TargetInfo { targetId: string; type: 'page' | 'iframe' | 'worker' | 'other'; url: string; title?: string; attached: boolean; } export interface TargetAllTrees { mainTree: PageTreeNode; iframeTrees: Array<{ targetInfo: TargetInfo; tree: PageTreeNode; parentNodeId?: number; }>; } export interface InteractedElement { index: ElementRef; tagName: string; text?: string; role?: string; ariaLabel?: string; action: string; timestamp: number; } export const MatchLevel = { EXACT: 'exact', PARTIAL: 'partial', FUZZY: 'fuzzy', NONE: 'none', } as 
const; export type MatchLevel = (typeof MatchLevel)[keyof typeof MatchLevel]; export interface SimplifiedNode { tag: string; text?: string; attrs: Record; children: SimplifiedNode[]; index?: ElementRef; isInteractive: boolean; } export interface PageTreeNode { tagName: string; nodeType: 'element' | 'text'; text?: string; attributes: Record; children: PageTreeNode[]; // Layout info isVisible: boolean; rect?: DOMRect; // A11y info role?: string; ariaLabel?: string; ariaExpanded?: boolean; // Interaction info isInteractive: boolean; isClickable: boolean; isEditable: boolean; isScrollable: boolean; // Index for LLM reference highlightIndex?: ElementRef; // Parent reference (not serialized) parentNode?: PageTreeNode; // CDP node info backendNodeId?: number; nodeId?: number; // Selector info cssSelector?: string; xpath?: string; // Shadow DOM isShadowRoot?: boolean; shadowChildren?: PageTreeNode[]; // Input state inputValue?: string; isChecked?: boolean; selectedOption?: string; // Paint order for z-index filtering paintOrder?: number; } export interface SelectorIndex { [index: number]: { cssSelector: string; xpath?: string; backendNodeId?: number; tagName: string; role?: string; ariaLabel?: string; text?: string; }; } export interface RenderedPageState { tree: string; selectorMap: SelectorIndex; elementCount: number; interactiveElementCount: number; scrollPosition: { x: number; y: number }; viewportSize: { width: number; height: number }; documentSize: { width: number; height: number }; pixelsAbove: number; pixelsBelow: number; } export interface CDPDOMNode { nodeType: number; nodeName: string; nodeValue: string; backendNodeId: number; childNodeIndexes?: number[]; attributes?: string[]; parentIndex?: number; contentDocumentIndex?: number; shadowRootType?: string; isClickable?: boolean; inputValue?: { value: string; type?: string }; currentSourceURL?: string; textValue?: string; layoutNodeIndex?: number; } export interface CDPLayoutNode { nodeIndex: number; bounds: 
number[]; text?: string; stackingContexts?: { index: number }[]; paintOrder?: number; isStackingContext?: boolean; } export interface CDPSnapshotResult { documents: Array<{ nodes: { nodeType: number[]; nodeName: number[]; nodeValue: number[]; backendNodeId: number[]; childNodeIndexes?: number[][]; attributes: Array; parentIndex: number[]; contentDocumentIndex?: { index: number[] }; shadowRootType?: { index: number[]; value: number[] }; isClickable?: { index: number[] }; inputValue?: { index: number[]; value: number[] }; currentSourceURL?: { index: number[]; value: number[] }; }; layout: { nodeIndex: number[]; bounds: number[][]; text: number[]; stackingContexts?: { index: number[] }; paintOrder?: number[]; styles: number[][]; }; textBoxes: { layoutIndex: number[]; bounds: number[][]; }; strings: string[]; }>; } export interface AXNode { nodeId: string; role: { value: string }; name?: { value: string }; description?: { value: string }; value?: { value: string }; properties?: Array<{ name: string; value: { value: unknown }; }>; children?: AXNode[]; backendDOMNodeId?: number; ignored?: boolean; } ================================================ FILE: packages/core/src/sandbox/file-access.ts ================================================ import * as fs from 'node:fs'; import * as path from 'node:path'; import { createLogger } from '../logging.js'; const logger = createLogger('filesystem'); const ALLOWED_EXTENSIONS = new Set([ '.txt', '.md', '.json', '.csv', '.html', '.xml', '.yaml', '.yml', '.js', '.ts', '.py', '.rb', '.go', '.rs', '.java', '.c', '.cpp', '.css', '.scss', '.less', '.svg', '.log', '.env', '.toml', '.ini', '.sh', '.bash', '.zsh', '.sql', '.graphql', ]); const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB export interface FileAccessOptions { sandboxDir: string; allowedExtensions?: Set; maxFileSize?: number; readOnly?: boolean; } export interface FileInfo { name: string; path: string; size: number; isDirectory: boolean; modifiedAt: Date; extension: string; } 
/** Snapshot of the bookkeeping a `FileAccess` instance maintains. */
export interface FileAccessState {
  /** Tracked files, keyed by their path relative to the sandbox root. */
  files: Map<string, FileInfo>;
  /** Sum of the sizes of all tracked files, in bytes. */
  totalSize: number;
  /** Number of read/write/list/delete operations performed so far. */
  operationCount: number;
}

/**
 * File operations confined to a single sandbox directory.
 *
 * Every public method takes a path relative to the sandbox root; paths that
 * resolve outside the root are rejected with "Path traversal detected".
 */
export class FileAccess {
  /** Extensions that `read` refuses to load as UTF-8 text. */
  private static readonly BINARY_EXTENSIONS: ReadonlySet<string> = new Set([
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp',
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.zip', '.tar', '.gz', '.7z', '.rar',
    '.exe', '.dll', '.so', '.dylib',
    '.mp3', '.mp4', '.avi', '.mkv', '.wav',
    '.woff', '.woff2', '.ttf', '.eot',
  ]);

  private sandboxDir: string;
  private allowedExtensions: Set<string>;
  private maxFileSize: number;
  private readOnly: boolean;
  private state: FileAccessState;

  /**
   * @param options Sandbox root plus optional extension allow-list, size limit
   *   and read-only flag. The sandbox directory is created if it is missing and
   *   its top-level files are indexed immediately.
   */
  constructor(options: FileAccessOptions) {
    this.sandboxDir = path.resolve(options.sandboxDir);
    this.allowedExtensions = options.allowedExtensions ?? ALLOWED_EXTENSIONS;
    this.maxFileSize = options.maxFileSize ?? MAX_FILE_SIZE;
    this.readOnly = options.readOnly ?? false;
    this.state = {
      files: new Map(),
      totalSize: 0,
      operationCount: 0,
    };

    // Ensure sandbox directory exists
    if (!fs.existsSync(this.sandboxDir)) {
      fs.mkdirSync(this.sandboxDir, { recursive: true });
    }

    // Index existing files
    this.indexDirectory();
  }

  /** Records every regular file directly inside the sandbox root (not recursive). */
  private indexDirectory(): void {
    try {
      const entries = fs.readdirSync(this.sandboxDir, { withFileTypes: true });
      for (const entry of entries) {
        if (!entry.isFile()) continue;
        const fullPath = path.join(this.sandboxDir, entry.name);
        const stat = fs.statSync(fullPath);
        this.state.files.set(entry.name, {
          name: entry.name,
          path: fullPath,
          size: stat.size,
          isDirectory: false,
          modifiedAt: stat.mtime,
          extension: path.extname(entry.name).toLowerCase(),
        });
        this.state.totalSize += stat.size;
      }
    } catch {
      // Best effort: an unreadable sandbox simply starts with an empty index.
      logger.debug('Failed to index sandbox directory');
    }
  }

  /**
   * Resolves a caller-supplied path against the sandbox root.
   *
   * @throws Error if the resolved path lies outside the sandbox.
   */
  private resolvePath(relativePath: string): string {
    const resolved = path.resolve(this.sandboxDir, relativePath);

    // Compare by path segments, not by string prefix: a plain startsWith()
    // would accept sibling directories such as "<sandbox>-evil/secret.txt".
    const fromRoot = path.relative(this.sandboxDir, resolved);
    const escapesSandbox =
      fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot);
    if (escapesSandbox) {
      throw new Error(`Path traversal detected: ${relativePath}`);
    }
    return resolved;
  }

  /**
   * Canonical key for `state.files`: the path relative to the sandbox root.
   * For top-level files this equals the bare file name used by the initial
   * index, so "a.txt" and "./a.txt" address the same entry.
   */
  private trackingKey(fullPath: string): string {
    return path.relative(this.sandboxDir, fullPath);
  }

  /** @throws Error if the file's extension is not in the allow-list. */
  private validateExtension(filePath: string): void {
    const ext = path.extname(filePath).toLowerCase();
    if (!this.allowedExtensions.has(ext)) {
      throw new Error(
        `File extension "${ext}" is not allowed. Allowed: ${[...this.allowedExtensions].join(', ')}`,
      );
    }
  }

  /** Extension-based check only; file contents are not inspected. */
  private isBinaryFile(filePath: string): boolean {
    return FileAccess.BINARY_EXTENSIONS.has(path.extname(filePath).toLowerCase());
  }

  /**
   * Reads a file as UTF-8 text.
   *
   * @throws Error if the path escapes the sandbox, the file is missing, has a
   *   known binary extension, or exceeds the configured size limit.
   */
  async read(relativePath: string): Promise<string> {
    const fullPath = this.resolvePath(relativePath);

    if (!fs.existsSync(fullPath)) {
      throw new Error(`File not found: ${relativePath}`);
    }
    if (this.isBinaryFile(fullPath)) {
      throw new Error(`Cannot read binary file: ${relativePath}`);
    }

    const stat = fs.statSync(fullPath);
    if (stat.size > this.maxFileSize) {
      throw new Error(
        `File too large: ${(stat.size / 1024 / 1024).toFixed(1)}MB (max: ${(this.maxFileSize / 1024 / 1024).toFixed(1)}MB)`,
      );
    }

    this.state.operationCount++;
    logger.debug(`Read file: ${relativePath} (${stat.size} bytes)`);
    return fs.readFileSync(fullPath, 'utf-8');
  }

  /**
   * Writes UTF-8 text, creating parent directories as needed.
   *
   * @throws Error if the sandbox is read-only, the path escapes the sandbox,
   *   the extension is not allowed, or the content exceeds the size limit.
   */
  async write(relativePath: string, content: string): Promise<void> {
    if (this.readOnly) {
      throw new Error('File system is read-only');
    }

    const fullPath = this.resolvePath(relativePath);
    this.validateExtension(fullPath);

    const contentSize = Buffer.byteLength(content, 'utf-8');
    if (contentSize > this.maxFileSize) {
      throw new Error(`Content too large: ${(contentSize / 1024 / 1024).toFixed(1)}MB`);
    }

    // Ensure parent directory exists
    const dir = path.dirname(fullPath);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }

    fs.writeFileSync(fullPath, content, 'utf-8');

    const key = this.trackingKey(fullPath);
    const previous = this.state.files.get(key);
    this.state.files.set(key, {
      name: path.basename(fullPath),
      path: fullPath,
      size: contentSize,
      isDirectory: false,
      modifiedAt: new Date(),
      extension: path.extname(fullPath).toLowerCase(),
    });
    // Overwrites replace the previously tracked size instead of adding to it.
    this.state.totalSize += contentSize - (previous?.size ?? 0);
    this.state.operationCount++;

    logger.debug(`Wrote file: ${relativePath} (${contentSize} bytes)`);
  }

  /**
   * Lists the entries of a directory inside the sandbox.
   *
   * @returns One `FileInfo` per entry, or an empty array if the directory does
   *   not exist.
   */
  async list(relativeDir = '.'): Promise<FileInfo[]> {
    const fullPath = this.resolvePath(relativeDir);
    if (!fs.existsSync(fullPath)) {
      return [];
    }

    const result: FileInfo[] = fs
      .readdirSync(fullPath, { withFileTypes: true })
      .map((entry) => {
        const entryPath = path.join(fullPath, entry.name);
        const stat = fs.statSync(entryPath);
        return {
          name: entry.name,
          path: entryPath,
          size: stat.size,
          isDirectory: entry.isDirectory(),
          modifiedAt: stat.mtime,
          extension: path.extname(entry.name).toLowerCase(),
        };
      });

    this.state.operationCount++;
    return result;
  }

  /**
   * Deletes a file.
   *
   * @throws Error if the sandbox is read-only, the path escapes the sandbox,
   *   or the file does not exist.
   */
  async delete(relativePath: string): Promise<void> {
    if (this.readOnly) {
      throw new Error('File system is read-only');
    }

    const fullPath = this.resolvePath(relativePath);
    if (!fs.existsSync(fullPath)) {
      throw new Error(`File not found: ${relativePath}`);
    }

    fs.unlinkSync(fullPath);

    // Only subtract what was actually counted; files that were never tracked
    // (e.g. pre-existing files in sub-directories) must not drive totalSize negative.
    const key = this.trackingKey(fullPath);
    const tracked = this.state.files.get(key);
    if (tracked) {
      this.state.files.delete(key);
      this.state.totalSize -= tracked.size;
    }
    this.state.operationCount++;

    logger.debug(`Deleted file: ${relativePath}`);
  }

  /** @returns Whether the path exists inside the sandbox. */
  async exists(relativePath: string): Promise<boolean> {
    return fs.existsSync(this.resolvePath(relativePath));
  }

  /** @returns A copy of the current state; mutating the returned map does not affect this instance. */
  getState(): FileAccessState {
    return {
      files: new Map(this.state.files),
      totalSize: this.state.totalSize,
      operationCount: this.state.operationCount,
    };
  }

  /** @returns The absolute sandbox root. */
  getSandboxDir(): string {
    return this.sandboxDir;
  }
}

================================================
FILE: packages/core/src/sandbox/index.ts
================================================
export { FileAccess, type FileAccessOptions, type FileInfo, type FileAccessState } from './file-access.js';

================================================
FILE: packages/core/src/telemetry.ts
================================================
import { createLogger } from './logging.js';

const logger = createLogger('perf');

/** A value paired with the wall-clock time it took to produce. */
export interface TimingResult<T> {
  result: T;
  durationMs: number;
}

/**
 * Wraps an async
function to measure and log its execution time.
 * Returns the result along with timing information.
 *
 * @param label Prefix used in the debug log line.
 * @param fn Async function to run and time.
 * @returns The value `fn` resolved with plus the elapsed milliseconds.
 * @throws Whatever `fn` rejects with; the failure duration is logged first.
 */
export async function timed<T>(
  label: string,
  fn: () => Promise<T>,
): Promise<TimingResult<T>> {
  const start = performance.now();
  try {
    const result = await fn();
    const durationMs = performance.now() - start;
    logger.debug(`${label}: ${durationMs.toFixed(1)}ms`);
    return { result, durationMs };
  } catch (error) {
    const durationMs = performance.now() - start;
    logger.debug(`${label}: FAILED after ${durationMs.toFixed(1)}ms`);
    throw error;
  }
}

/**
 * Creates a decorator-style wrapper that times all calls to the provided function.
 *
 * @param label Prefix used in the debug log line of every call.
 * @param fn Async function to wrap.
 * @returns A function with the same parameters and resolved value as `fn`.
 */
export function withTiming<Args extends unknown[], T>(
  label: string,
  fn: (...args: Args) => Promise<T>,
): (...args: Args) => Promise<T> {
  return async (...args: Args): Promise<T> => {
    const { result } = await timed(label, () => fn(...args));
    return result;
  };
}

/**
 * Simple stopwatch for manual timing control.
 * The clock starts when the instance is constructed.
 */
export class Stopwatch {
  private startTime: number;
  private splits: Array<{ label: string; timeMs: number }> = [];

  constructor() {
    this.startTime = performance.now();
  }

  /** Records a named split and returns the milliseconds elapsed since start. */
  split(label: string): number {
    const elapsed = performance.now() - this.startTime;
    this.splits.push({ label, timeMs: elapsed });
    return elapsed;
  }

  /** @returns Milliseconds elapsed since construction or the last `reset`. */
  elapsed(): number {
    return performance.now() - this.startTime;
  }

  /** Restarts the clock and discards all recorded splits. */
  reset(): void {
    this.startTime = performance.now();
    this.splits = [];
  }

  /** @returns A shallow copy of the recorded splits. */
  getSplits(): Array<{ label: string; timeMs: number }> {
    return [...this.splits];
  }

  /** @returns One line per split followed by a total line, newline-separated. */
  summary(): string {
    const lines = this.splits.map(
      (s) => ` ${s.label}: ${s.timeMs.toFixed(1)}ms`,
    );
    lines.push(` total: ${this.elapsed().toFixed(1)}ms`);
    return lines.join('\n');
  }
}

================================================
FILE: packages/core/src/types.ts
================================================
import { z } from 'zod';

// ── Branded types for compile-time safety ──

declare const __brand: unique symbol;
// NOTE(review): the generic parameter lists in this file were lost in extraction.
// The base types below follow the constructor helpers (string / number); the
// brand tag strings are assumed to mirror the type names — confirm against upstream.
type Brand<T, B extends string> = T & { readonly [__brand]: B };

export type TargetId = Brand<string, 'TargetId'>;
export type SessionId = Brand<string, 'SessionId'>;
export type ElementRef = Brand<number, 'ElementRef'>;
// NOTE(review): generic arguments were lost in extraction; the tag string is
// assumed to mirror the type name — confirm against upstream.
export type TabId = Brand<number, 'TabId'>;

/** Brands a raw string as a `TargetId` (compile-time only, no validation). */
export function targetId(id: string): TargetId {
  return id as TargetId;
}

/** Brands a raw string as a `SessionId` (compile-time only, no validation). */
export function sessionId(id: string): SessionId {
  return id as SessionId;
}

/** Brands a raw number as an `ElementRef` (compile-time only, no validation). */
export function elementIndex(index: number): ElementRef {
  return index as ElementRef;
}

/** Brands a raw number as a `TabId` (compile-time only, no validation). */
export function tabId(id: number): TabId {
  return id as TabId;
}

// ── Result type for error handling ──

// NOTE(review): type parameters reconstructed; the `E = Error` default is an assumption.
export type Result<T, E = Error> = { ok: true; value: T } | { ok: false; error: E };

/** Wraps a value in the success variant of `Result`. */
export function ok<T>(value: T): Result<T, never> {
  return { ok: true, value };
}

/** Wraps an error in the failure variant of `Result`. */
export function err<E>(error: E): Result<never, E> {
  return { ok: false, error };
}

// ── Position & geometry ──

export const PositionSchema = z.object({
  x: z.number(),
  y: z.number(),
});
export type Position = z.infer<typeof PositionSchema>;

export const RectSchema = z.object({
  x: z.number(),
  y: z.number(),
  width: z.number(),
  height: z.number(),
});
export type Rect = z.infer<typeof RectSchema>;

// ── Common enums ──

export const LogLevel = {
  DEBUG: 0,
  INFO: 1,
  WARN: 2,
  ERROR: 3,
} as const;
export type LogLevel = (typeof LogLevel)[keyof typeof LogLevel];

// ── Utility types ──

export type DeepPartial<T> = {
  [P in keyof T]?: T[P] extends object ? DeepPartial<T[P]> : T[P];
};

export type Awaitable<T> = T | Promise<T>;

================================================
FILE: packages/core/src/utils.ts
================================================
import { nanoid } from 'nanoid';

// ── ID generation ──

/** @returns A random nanoid of the given length (default 12). */
export function generateId(size = 12): string {
  return nanoid(size);
}

// ── URL matching ──

/**
 * Tests a URL against a host/path pattern.
 *
 * - `*` matches everything.
 * - A host of the form `*.example.com` matches `example.com` and any
 *   sub-domain of it.
 * - A path other than `/` or `/*` is treated as a prefix glob in which `*`
 *   matches any run of characters.
 * - If either argument cannot be parsed as a URL, falls back to a plain
 *   substring test.
 */
export function matchesUrlPattern(url: string, pattern: string): boolean {
  if (pattern === '*') return true;

  try {
    const urlObj = new URL(url);
    const patternObj = new URL(pattern.includes('://') ? pattern : `https://${pattern}`);

    if (patternObj.hostname.startsWith('*.')) {
      const baseDomain = patternObj.hostname.slice(2);
      // Require a label boundary: "*.example.com" must not match "evilexample.com".
      const hostMatches =
        urlObj.hostname === baseDomain || urlObj.hostname.endsWith(`.${baseDomain}`);
      if (!hostMatches) {
        return false;
      }
    } else if (urlObj.hostname !== patternObj.hostname) {
      return false;
    }

    if (patternObj.pathname !== '/' && patternObj.pathname !== '/*') {
      // Escape regex metacharacters first so that only "*" acts as a wildcard
      // (otherwise "." in "/v1.0/*" would match any character).
      const patternPath = patternObj.pathname
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*');
      const regex = new RegExp(`^${patternPath}`);
      if (!regex.test(urlObj.pathname)) {
        return false;
      }
    }

    return true;
  } catch {
    return url.includes(pattern);
  }
}

/**
 * Applies block/allow lists to a URL.
 *
 * A match in `blockedUrls` always denies. Otherwise, a non-empty `allowedUrls`
 * acts as an allow-list; with no allow-list every URL is permitted.
 */
export function isUrlPermitted(
  url: string,
  allowedUrls?: string[],
  blockedUrls?: string[],
): boolean {
  if (blockedUrls?.some((pattern) => matchesUrlPattern(url, pattern))) {
    return false;
  }
  if (allowedUrls && allowedUrls.length > 0) {
    return allowedUrls.some((pattern) => matchesUrlPattern(url, pattern));
  }
  return true;
}

// ── Text utilities ──

/** Strips control characters, collapses whitespace runs to one space, and trims. */
export function sanitizeText(text: string): string {
  return text
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Shortens `text` to at most `maxLength` characters, ending in `suffix` when cut.
 * If `maxLength` is too small to hold the suffix, the text is hard-cut instead.
 */
export function truncateText(text: string, maxLength: number, suffix = '...'): string {
  if (text.length <= maxLength) return text;
  if (maxLength < suffix.length) {
    // A negative slice end would count from the end of the string and return
    // far more than maxLength characters.
    return text.slice(0, Math.max(0, maxLength));
  }
  return text.slice(0, maxLength - suffix.length) + suffix;
}

/** Removes everything between `<` and `>`; this is not an HTML sanitizer. */
export function removeTags(html: string): string {
  return html.replace(/<[^>]*>/g, '');
}

// ── Timing ──

/** Resolves after `ms` milliseconds. */
export function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Races `promise` against a deadline.
 *
 * @throws Error with `message` if `promise` has not settled within `ms`.
 */
export async function withDeadline<T>(
  promise: Promise<T>,
  ms: number,
  message = 'Operation timed out',
): Promise<T> {
  let handle: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    handle = setTimeout(() => reject(new Error(message)), ms);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    // Without this the timer stays armed after the race is decided and keeps
    // the event loop alive for up to `ms`.
    clearTimeout(handle);
  }
}

/** Millisecond-resolution elapsed-time counter based on `Date.now()`. */
export class Timer {
  private startTime: number;

  constructor() {
    this.startTime = Date.now();
  }

  /** @returns Milliseconds since construction or the last `reset`. */
  elapsed(): number {
    return Date.now() - this.startTime;
  }

  /** @returns Seconds since construction or the last `reset`. */
  elapsedSeconds(): number {
    return this.elapsed() / 1000;
  }

  /** Restarts the counter. */
  reset(): void {
    this.startTime = Date.now();
  }
}

// ── Retry ──
/** Tuning knobs for {@link withRetry}. */
export interface RetryOptions {
  /** Number of retries after the first attempt. */
  maxRetries: number;
  /** Delay before the first retry, in milliseconds. */
  initialDelayMs: number;
  /** Upper bound applied to every delay, in milliseconds. */
  maxDelayMs: number;
  /** Multiplier applied to the delay after each failed attempt. */
  backoffFactor: number;
}

const DEFAULT_RETRY: RetryOptions = {
  maxRetries: 3,
  initialDelayMs: 1000,
  maxDelayMs: 30000,
  backoffFactor: 2,
};

/**
 * Runs `fn`, retrying with exponential backoff when it rejects.
 *
 * @param fn Operation to attempt.
 * @param options Overrides for the default retry settings.
 * @returns The first successful result.
 * @throws The last error once all attempts are exhausted. Non-Error rejections
 *   are wrapped in an `Error`.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  options: Partial<RetryOptions> = {},
): Promise<T> {
  const opts = { ...DEFAULT_RETRY, ...options };
  let lastError: Error | undefined;
  let delay = opts.initialDelayMs;

  for (let attempt = 0; attempt <= opts.maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < opts.maxRetries) {
        await sleep(Math.min(delay, opts.maxDelayMs));
        delay *= opts.backoffFactor;
      }
    }
  }

  // The loop body never runs when maxRetries is negative (or not a number);
  // surface that as a real Error instead of `throw undefined`.
  throw lastError ?? new Error(`withRetry made no attempts (maxRetries: ${opts.maxRetries})`);
}

// ── Misc ──

/**
 * Groups items by the key returned from `keyFn`, preserving input order
 * within each group.
 */
export function groupBy<T, K extends PropertyKey>(
  items: T[],
  keyFn: (item: T) => K,
): Record<K, T[]> {
  const groups = {} as Record<K, T[]>;
  for (const item of items) {
    const key = keyFn(item);
    // Own-property check: keys such as "toString" or "constructor" would
    // otherwise resolve to inherited Object.prototype members and break push().
    if (!Object.prototype.hasOwnProperty.call(groups, key)) {
      groups[key] = [];
    }
    groups[key].push(item);
  }
  return groups;
}

/**
 * Removes the common leading indentation from a multi-line string.
 * A blank first line and a blank last line are dropped. If every line is
 * blank, the input is returned unchanged.
 */
export function dedent(str: string): string {
  const lines = str.split('\n');
  if (lines[0]?.trim() === '') lines.shift();
  if (lines[lines.length - 1]?.trim() === '') lines.pop();

  // Smallest indentation among lines that contain something other than whitespace.
  let minIndent = Number.POSITIVE_INFINITY;
  for (const line of lines) {
    if (line.trim().length === 0) continue;
    const indent = line.match(/^\s*/)?.[0].length ?? 0;
    minIndent = Math.min(minIndent, indent);
  }

  if (minIndent === Number.POSITIVE_INFINITY) return str;
  return lines.map((line) => line.slice(minIndent)).join('\n');
}

// ── URL utilities ──

/**
 * Match a URL against a domain pattern like "*.example.com" or "example.com/path/*".
 * More comprehensive than matchesUrlPattern — handles port stripping, www normalization.
*/
export function matchUrlWithDomainPattern(url: string, pattern: string): boolean {
  try {
    const urlHost = new URL(url).hostname.replace(/^www\./, '');

    // Pattern can be a plain domain, wildcard domain, or full URL pattern
    if (pattern.startsWith('*.')) {
      const base = hostOfDomainPattern(pattern.slice(2));
      return urlHost === base || urlHost.endsWith(`.${base}`);
    }

    return urlHost === hostOfDomainPattern(pattern);
  } catch {
    return url.includes(pattern);
  }
}

/**
 * Reduces a domain pattern to a bare, lower-cased host: drops any path, a
 * trailing `:port`, and a leading `www.`. Patterns containing `://` are parsed
 * as URLs (and throw if they are malformed).
 */
function hostOfDomainPattern(pattern: string): string {
  if (pattern.includes('://')) {
    return new URL(pattern).hostname.replace(/^www\./, '');
  }
  const authority = pattern.split('/')[0] ?? '';
  return authority
    .replace(/:\d+$/, '')
    .replace(/^www\./, '')
    .toLowerCase();
}

const NEW_TAB_URLS = new Set([
  'about:blank',
  'about:newtab',
  'chrome://newtab/',
  'chrome://new-tab-page/',
  'edge://newtab/',
  'about:home',
]);

/** @returns True for the empty string and for the known blank/new-tab URLs. */
export function isNewTabPage(url: string): boolean {
  return url === '' || NEW_TAB_URLS.has(url);
}

/**
 * Remove unpaired surrogates from a string to prevent JSON serialization issues.
 */
export function sanitizeSurrogates(text: string): string {
  // NOTE(review): everything from the look-behind to the start of URL_REGEX's
  // character class was lost in extraction. The second alternative and the ''
  // replacement are reconstructed from the doc comment ("Remove") — confirm
  // against upstream before relying on them.
  return text.replace(
    /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g,
    '',
  );
}

// NOTE(review): the prefix `https?:\/\/[^\s<>` is reconstructed; only the tail
// of this pattern survived extraction — confirm against upstream.
const URL_REGEX = /https?:\/\/[^\s<>"{}|\\^`\[\]]+/g;

/**
 * Extract all URLs from a text string.
 */
export function extractUrls(text: string): string[] {
  return [...text.matchAll(URL_REGEX)].map((m) => m[0]);
}

/**
 * Escape special regex characters in a string.
*/
export function escapeRegExp(string: string): string {
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

================================================
FILE: packages/core/src/viewport/event-hub.ts
================================================
// NOTE(review): generic parameter lists in this file were lost in extraction and
// are reconstructed from usage — confirm against upstream.
type Handler<T = unknown> = (payload: T) => void;
type RequestHandler<Req = unknown, Res = unknown> = (payload: Req) => Promise<Res>;

/**
 * Typed in-process event bus with two channels:
 *
 * - fire-and-forget events (`on` / `once` / `emit` / `off`), any number of
 *   handlers per event, with a bounded history of emitted payloads;
 * - request/response calls (`onRequest` / `request`), one handler per event.
 */
export class EventHub<
  EventMap extends { [K in keyof EventMap]: EventMap[K] } = Record<string, unknown>,
  RequestMap extends { [K in keyof RequestMap]: { request: unknown; response: unknown } } = Record<
    string,
    { request: unknown; response: unknown }
  >,
> {
  private handlers = new Map<string, Set<Handler>>();
  private requestHandlers = new Map<string, RequestHandler>();
  private history: Array<{ event: string; payload: unknown; timestamp: number }> = [];
  private maxHistory: number;

  /** @param options `maxHistory` caps the number of retained history entries (default 100). */
  constructor(options?: { maxHistory?: number }) {
    this.maxHistory = options?.maxHistory ?? 100;
  }

  /**
   * Subscribes a handler to an event.
   *
   * @returns A function that removes this subscription.
   */
  on<K extends keyof EventMap & string>(event: K, handler: Handler<EventMap[K]>): () => void {
    let listeners = this.handlers.get(event);
    if (!listeners) {
      listeners = new Set();
      this.handlers.set(event, listeners);
    }
    listeners.add(handler as Handler);

    return () => {
      this.handlers.get(event)?.delete(handler as Handler);
    };
  }

  /**
   * Subscribes a handler that is removed before its first invocation runs.
   *
   * @returns A function that cancels the subscription if it has not fired yet.
   */
  once<K extends keyof EventMap & string>(event: K, handler: Handler<EventMap[K]>): () => void {
    const wrappedHandler: Handler<EventMap[K]> = (payload) => {
      off();
      handler(payload);
    };
    const off = this.on(event, wrappedHandler);
    return off;
  }

  /**
   * Records the event in history, then invokes every handler synchronously.
   * A throwing handler is logged and does not stop the remaining handlers.
   */
  emit<K extends keyof EventMap & string>(event: K, payload: EventMap[K]): void {
    this.recordHistory(event, payload);

    const listeners = this.handlers.get(event);
    if (!listeners) return;

    for (const handler of listeners) {
      try {
        handler(payload);
      } catch (error) {
        console.error(`Error in event handler for "${event}":`, error);
      }
    }
  }

  /**
   * Registers the handler for a request type, replacing any previous one.
   *
   * @returns A function that removes whichever handler is registered for `event`.
   */
  onRequest<K extends keyof RequestMap & string>(
    event: K,
    handler: RequestHandler<RequestMap[K]['request'], RequestMap[K]['response']>,
  ): () => void {
    this.requestHandlers.set(event, handler as RequestHandler);
    return () => {
      this.requestHandlers.delete(event);
    };
  }

  /**
   * Sends a request to the registered handler and waits for its response.
   *
   * @param timeoutMs Maximum time to wait, in milliseconds (default 30000).
   * @throws Error if no handler is registered or the handler does not settle in time.
   */
  async request<K extends keyof RequestMap & string>(
    event: K,
    payload: RequestMap[K]['request'],
    timeoutMs = 30000,
  ): Promise<RequestMap[K]['response']> {
    const handler = this.requestHandlers.get(event);
    if (!handler) {
      throw new Error(`No handler registered for request "${event}"`);
    }

    const pending = handler(payload);

    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Request "${event}" timed out after ${timeoutMs}ms`)),
        timeoutMs,
      );
    });

    try {
      const result = await Promise.race([pending, timeout]);
      return result as RequestMap[K]['response'];
    } finally {
      // Disarm the timer once the race is decided; otherwise every completed
      // request leaves a live timer behind for up to `timeoutMs`.
      clearTimeout(timer);
    }
  }

  /** Removes one handler, or every handler for `event` when `handler` is omitted. */
  off<K extends keyof EventMap & string>(event: K, handler?: Handler<EventMap[K]>): void {
    if (handler) {
      this.handlers.get(event)?.delete(handler as Handler);
    } else {
      this.handlers.delete(event);
    }
  }

  /** Drops all event handlers and all request handlers. History is kept. */
  removeAllListeners(): void {
    this.handlers.clear();
    this.requestHandlers.clear();
  }

  /** @returns A new array of history entries, optionally filtered by event name. */
  getHistory(event?: string): Array<{ event: string; payload: unknown; timestamp: number }> {
    if (event) {
      return this.history.filter((h) => h.event === event);
    }
    return [...this.history];
  }

  /** Discards all history entries. */
  clearHistory(): void {
    this.history = [];
  }

  /** Appends an entry and evicts the oldest ones beyond `maxHistory`. */
  private recordHistory(event: string, payload: unknown): void {
    this.history.push({ event, payload, timestamp: Date.now() });

    // Count the overflow explicitly: `slice(-maxHistory)` returns the whole
    // array when maxHistory is 0, which would disable the cap entirely.
    const overflow = this.history.length - Math.max(0, this.maxHistory);
    if (overflow > 0) {
      this.history.splice(0, overflow);
    }
  }
}

================================================
FILE: packages/core/src/viewport/events.ts
================================================
import type { ElementRef } from '../types.js';

// ── Event payload types ──

export interface NavigateEvent {
  url: string;
  waitUntil?: 'load' | 'domcontentloaded' | 'networkidle';
}

export interface ClickEvent {
  elementIndex: ElementRef;
  clickCount?: number;
}

export interface InputEvent {
  elementIndex: ElementRef;
  text: string;
  clearFirst?: boolean;
}

export interface SelectOptionEvent {
  elementIndex: ElementRef;
  value: string;
}

export interface ScrollEvent {
  direction: 'up' | 'down';
  amount?: number;
  elementIndex?: ElementRef;
}

export interface ScreenshotEvent {
  fullPage?: boolean;
}

export interface ScreenshotResult {
  base64: string;
  width: number;
  height: number;
}

export interface TabSwitchEvent {
  tabIndex: number;
}

export interface FileUploadEvent {
  elementIndex: ElementRef;
  filePaths: string[];
}

export interface KeyPressEvent {
  key: string;
}

export interface BrowserStateEvent {
  url: string;
  title: string;
  tabCount: number;
}

export interface DownloadEvent {
  url: string;
  suggestedFilename: string;
  path?: string;
}

export interface PopupEvent {
  url: string;
  type: 'popup' | 'dialog';
}

export interface SecurityEvent {
  type: 'navigation-blocked' | 'download-blocked' | 'popup-blocked';
  url: string;
  reason: string;
}

export interface CrashEvent {
  reason: string;
}

// ── Event map ──

export interface ViewportEventMap {
  'navigation': NavigateEvent;
  'click': ClickEvent;
  'input': InputEvent;
  'selection': SelectOptionEvent;
  'scroll': ScrollEvent;
  'capture': ScreenshotEvent;
  'capture-result': ScreenshotResult;
  'tab-changed': TabSwitchEvent;
  'tab-closed': { tabIndex: number };
  'tab-opened': { url: string };
  'file-uploaded': FileUploadEvent;
  'keystroke': KeyPressEvent;
  'viewport-state': BrowserStateEvent;
  'download': DownloadEvent;
  'popup': PopupEvent;
  'policy-violation': SecurityEvent;
  'crash': CrashEvent;
  'page-ready': { url: string };
  'content-ready': void;
  'shutdown': void;
}

// ── Request-response event map ──

export interface ViewportRequestMap {
  'get-screenshot': { request: ScreenshotEvent; response: ScreenshotResult };
  'get-state': { request: void; response: BrowserStateEvent };
}

================================================
FILE: packages/core/src/viewport/guard-base.ts
================================================
import type { Page, BrowserContext } from 'playwright';
import type { EventHub } from './event-hub.js';
import type { ViewportEventMap, ViewportRequestMap } from './events.js';

/** Handles a guard needs to observe and act on the browser. */
export interface GuardContext {
  page: Page;
  context: BrowserContext;
  eventBus: EventHub<ViewportEventMap, ViewportRequestMap>;
}

/**
 * Base class for browser watchdogs that monitor and react to browser events.
 * Each watchdog handles a specific concern (security, popups, downloads, etc.).
*/
export abstract class BaseGuard {
  protected ctx!: GuardContext;
  /** Undo callbacks registered during `setup`; run on `detach` or on a failed `attach`. */
  protected cleanupFns: Array<() => void> = [];
  private _active = false;

  /** True between a successful `attach` and the next `detach`. */
  get active(): boolean {
    return this._active;
  }

  abstract readonly name: string;
  abstract readonly priority: number;

  /**
   * Binds the guard to a context and runs its `setup`.
   *
   * @throws Whatever `setup` rejects with. In that case the guard is left
   *   inactive and every cleanup registered so far has been run.
   */
  async attach(ctx: GuardContext): Promise<void> {
    this.ctx = ctx;
    this._active = true;
    try {
      await this.setup();
    } catch (error) {
      // Roll back a half-finished setup so no listener stays registered on a
      // guard that reports itself as attached.
      this._active = false;
      this.runCleanups();
      throw error;
    }
  }

  /** Marks the guard inactive, runs all cleanups, then calls `teardown`. */
  async detach(): Promise<void> {
    this._active = false;
    this.runCleanups();
    await this.teardown();
  }

  protected abstract setup(): Promise<void>;

  protected async teardown(): Promise<void> {
    // Override if needed
  }

  /** Subscribes to an event-bus event and registers the unsubscribe as a cleanup. */
  protected onEvent<K extends keyof ViewportEventMap>(
    event: K,
    handler: (payload: ViewportEventMap[K]) => void,
  ): void {
    const off = this.ctx.eventBus.on(event, handler);
    this.cleanupFns.push(off);
  }

  /** Runs and clears the registered cleanups; individual failures are ignored. */
  private runCleanups(): void {
    const pending = this.cleanupFns;
    this.cleanupFns = [];
    for (const cleanup of pending) {
      try {
        cleanup();
      } catch {
        // Ignore cleanup errors
      }
    }
  }
}

================================================
FILE: packages/core/src/viewport/guards/blank-page.ts
================================================
import { BaseGuard } from '../guard-base.js';

/**
 * Handles about:blank pages. If the page navigates to about:blank,
 * attempts to navigate back to the previous page.
 */
export class BlankPageGuard extends BaseGuard {
  readonly name = 'about-blank';
  readonly priority = 400;

  protected async setup(): Promise<void> {
    // Bind to the page we attach to. `ctx.page` can be reassigned by other
    // guards, and the listener must be removed from the page it was added to.
    const page = this.ctx.page;

    const handler = () => {
      if (page.url() === 'about:blank') {
        page.goBack().catch(() => {
          // Cannot go back; ignore
        });
      }
    };

    page.on('framenavigated', handler);
    this.cleanupFns.push(() => page.off('framenavigated', handler));
  }
}

================================================
FILE: packages/core/src/viewport/guards/crash.ts
================================================
import { BaseGuard } from '../guard-base.js';

/**
 * Monitors for browser page crashes. Emits crash events
 * and attempts recovery by creating a new page.
*/
export class CrashGuard extends BaseGuard {
  readonly name = 'crash';
  readonly priority = 500;

  protected async setup(): Promise<void> {
    // Remember the page the listener is attached to: the handler below swaps
    // `ctx.page` for a fresh page, so cleanup must not read it from `ctx`.
    const watchedPage = this.ctx.page;

    const handler = () => {
      this.ctx.eventBus.emit('crash', {
        reason: 'Page crashed unexpectedly',
      });

      // Attempt recovery by creating a new page
      this.ctx.context
        .newPage()
        .then((newPage) => {
          this.ctx.page = newPage;
        })
        .catch(() => {
          // Recovery failed; context may be closed
        });
    };

    watchedPage.on('crash', handler);
    this.cleanupFns.push(() => watchedPage.off('crash', handler));
  }
}

================================================
FILE: packages/core/src/viewport/guards/default-handler.ts
================================================
import type { Dialog } from 'playwright';
import { BaseGuard } from '../guard-base.js';

/**
 * Monitors for default browser actions that need to be handled:
 * every dialog raised by the page is announced as a `popup` event of type
 * `dialog` and then accepted automatically.
 */
export class DefaultHandlerGuard extends BaseGuard {
  readonly name = 'default-action';
  readonly priority = 100;

  protected async setup(): Promise<void> {
    // Bind to the page we attach to so cleanup removes the listener from the
    // same page even if `ctx.page` is reassigned later.
    const page = this.ctx.page;

    const handler = async (dialog: Dialog) => {
      this.ctx.eventBus.emit('popup', {
        url: page.url(),
        type: 'dialog',
      });

      try {
        await dialog.accept();
      } catch {
        // Dialog may already be dismissed
      }
    };

    page.on('dialog', handler);
    this.cleanupFns.push(() => page.off('dialog', handler));
  }
}

================================================
FILE: packages/core/src/viewport/guards/downloads.ts
================================================
import type { Download } from 'playwright';
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as crypto from 'node:crypto';
import { BaseGuard } from '../guard-base.js';
import { createLogger } from '../../logging.js';

const logger = createLogger('watchdog:downloads');

// ── Options ──

export interface DownloadGuardOptions {
  /** Directory to save downloads to. Defaults to `open-browser-downloads` under `$TMPDIR` (or `/tmp`).
*/
  downloadsPath?: string;
  /** Automatically accept all downloads without prompting. Defaults to true. */
  autoAccept?: boolean;
  /** Settings for PDF printing when a page triggers a print-to-PDF download. */
  pdfSettings?: {
    printBackground: boolean;
    landscape: boolean;
  };
}

// ── Download tracking ──

export type DownloadStatus = 'started' | 'completed' | 'failed';

export interface DownloadInfo {
  url: string;
  suggestedFilename: string;
  savedPath?: string;
  status: DownloadStatus;
  startTime: number;
  endTime?: number;
  fileSize?: number;
}

// ── Watchdog ──

/**
 * Monitors for file downloads with full lifecycle tracking.
 *
 * Features:
 * - Configures CDP download behavior for reliable acceptance
 * - Tracks every download from start to completion/failure
 * - Deduplicates filenames with UUID suffixes when collisions occur
 * - Provides download history and a promise-based wait API
 */
export class DownloadGuard extends BaseGuard {
  readonly name = 'downloads';
  readonly priority = 300;

  private readonly options: Required<DownloadGuardOptions>;
  private readonly downloads = new Map<string, DownloadInfo>();
  private downloadCounter = 0;

  /**
   * Destination paths handed out to downloads that are still being saved.
   * Lets two concurrent downloads with the same name get distinct paths even
   * though neither file exists on disk yet.
   */
  private readonly claimedPaths = new Set<string>();

  /**
   * Listeners waiting for the next download to complete.
   * Each call to `waitForDownload` pushes a resolver here;
   * it is removed once a download completes or the timeout fires.
   */
  private pendingWaiters: Array<{
    resolve: (info: DownloadInfo) => void;
    reject: (err: Error) => void;
    timer: ReturnType<typeof setTimeout>;
  }> = [];

  constructor(options?: DownloadGuardOptions) {
    super();

    const defaultPath = path.join(
      (typeof process !== 'undefined' && process.env.TMPDIR) || '/tmp',
      'open-browser-downloads',
    );

    this.options = {
      downloadsPath: options?.downloadsPath ?? defaultPath,
      autoAccept: options?.autoAccept ?? true,
      pdfSettings: options?.pdfSettings ?? {
        printBackground: true,
        landscape: false,
      },
    };
  }

  // ── Setup / Teardown ──

  protected async setup(): Promise<void> {
    // Ensure the downloads directory exists.
    this.ensureDownloadsDir();

    // Try to enable CDP-level auto-accept so the browser never shows a
    // "Save As" dialog, even for cross-origin downloads.
    await this.configureCdpDownloadBehavior();

    // Listen for Playwright download events on the page. The page is captured
    // so cleanup removes the listener from the page it was added to.
    const page = this.ctx.page;
    const handler = (download: Download) => {
      this.handleDownload(download).catch((err) => {
        logger.error('Unhandled error processing download', err);
      });
    };

    page.on('download', handler);
    this.cleanupFns.push(() => page.off('download', handler));

    logger.debug(`Downloads watchdog active – saving to ${this.options.downloadsPath}`);
  }

  protected async teardown(): Promise<void> {
    // Reject any pending waiters so they don't hang forever.
    for (const waiter of this.pendingWaiters) {
      clearTimeout(waiter.timer);
      waiter.reject(new Error('DownloadGuard detached before download completed'));
    }
    this.pendingWaiters = [];

    logger.debug('Downloads watchdog detached');
  }

  // ── CDP configuration ──

  /** Sets `Page.setDownloadBehavior` to "allow" when `autoAccept` is on; failures only warn. */
  private async configureCdpDownloadBehavior(): Promise<void> {
    if (!this.options.autoAccept) return;

    try {
      const cdpSession = await this.ctx.page.context().newCDPSession(this.ctx.page);

      // NOTE(review): the cast's type arguments were lost in extraction and are reconstructed.
      await (cdpSession.send('Page.setDownloadBehavior', {
        behavior: 'allow',
        downloadPath: this.options.downloadsPath,
      }) as Promise<unknown> as Promise<void>);

      this.cleanupFns.push(() => {
        cdpSession.detach().catch(() => {
          // Session may already be closed.
        });
      });

      logger.debug('CDP download behavior set to "allow"');
    } catch (err) {
      // CDP may not be available (e.g. Firefox). Fall back to Playwright-only handling.
      logger.warn('Could not set CDP download behavior – falling back to Playwright handling', err);
    }
  }

  // ── Download handler ──

  /**
   * Tracks one download from start to finish: emits a `download` event when it
   * begins, saves it under a unique path, emits a second `download` event with
   * the saved path, and resolves pending waiters. Failures are recorded and
   * logged, not rethrown.
   */
  private async handleDownload(download: Download): Promise<void> {
    const id = `dl_${++this.downloadCounter}`;
    const suggestedFilename = download.suggestedFilename();
    const url = download.url();

    const info: DownloadInfo = {
      url,
      suggestedFilename,
      status: 'started',
      startTime: Date.now(),
    };
    this.downloads.set(id, info);

    logger.info(`Download started: ${suggestedFilename} (${url})`);

    // Emit the initial event so consumers know a download has begun.
    this.ctx.eventBus.emit('download', {
      url,
      suggestedFilename,
    });

    let destPath: string | undefined;
    try {
      destPath = this.resolveUniquePath(suggestedFilename);

      // Save the file to our chosen path.
      await download.saveAs(destPath);

      // Gather file size.
      let fileSize: number | undefined;
      try {
        fileSize = fs.statSync(destPath).size;
      } catch {
        // File may have been moved/deleted by another process.
      }

      info.savedPath = destPath;
      info.status = 'completed';
      info.endTime = Date.now();
      info.fileSize = fileSize;

      const elapsed = info.endTime - info.startTime;
      logger.info(
        `Download completed: ${suggestedFilename} → ${destPath} (${formatBytes(fileSize)} in ${elapsed}ms)`,
      );

      // Emit a follow-up download event with the saved path.
      this.ctx.eventBus.emit('download', {
        url,
        suggestedFilename,
        path: destPath,
      });

      // Resolve any pending waiters.
      this.notifyWaiters(info);
    } catch (err) {
      info.status = 'failed';
      info.endTime = Date.now();

      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Download failed: ${suggestedFilename} – ${reason}`);
    } finally {
      // Once saved, the file on disk guards the name; on failure the name is free again.
      if (destPath !== undefined) {
        this.claimedPaths.delete(destPath);
      }
    }
  }

  // ── Filename collision handling ──

  /**
   * Returns a path inside the downloads directory. If a file with the same
   * name already exists (or is currently being written by another download),
   * a short UUID is inserted before the extension.
   */
  private resolveUniquePath(suggestedFilename: string): string {
    // The suggested name originates from the remote side. Keep only its final
    // segment so separators or ".." cannot move the file outside the
    // downloads directory.
    const lastSegment = path.basename(suggestedFilename);
    const safeName =
      lastSegment === '' || lastSegment === '.' || lastSegment === '..' ? 'download' : lastSegment;

    const candidate = path.join(this.options.downloadsPath, safeName);
    if (!fs.existsSync(candidate) && !this.claimedPaths.has(candidate)) {
      this.claimedPaths.add(candidate);
      return candidate;
    }

    const ext = path.extname(safeName);
    const base = path.basename(safeName, ext);
    const uuid = crypto.randomUUID().slice(0, 8);
    const uniqueName = `${base}-${uuid}${ext}`;

    logger.debug(`File "${suggestedFilename}" already exists – saving as "${uniqueName}"`);

    const uniquePath = path.join(this.options.downloadsPath, uniqueName);
    this.claimedPaths.add(uniquePath);
    return uniquePath;
  }

  // ── Directory helpers ──

  private ensureDownloadsDir(): void {
    if (!fs.existsSync(this.options.downloadsPath)) {
      fs.mkdirSync(this.options.downloadsPath, { recursive: true });
      logger.debug(`Created downloads directory: ${this.options.downloadsPath}`);
    }
  }

  // ── Public API ──

  /**
   * Returns a snapshot of all tracked downloads (both in-progress and finished).
   */
  getDownloadHistory(): DownloadInfo[] {
    return Array.from(this.downloads.values());
  }

  /**
   * Returns a promise that resolves with the `DownloadInfo` of the next
   * download that completes (or rejects after `timeout` ms).
   *
   * @param timeout Maximum milliseconds to wait. Defaults to 30 000 ms.
   */
  waitForDownload(timeout = 30_000): Promise<DownloadInfo> {
    return new Promise<DownloadInfo>((resolve, reject) => {
      const timer = setTimeout(() => {
        this.removePendingWaiter(waiter);
        reject(new Error(`waitForDownload timed out after ${timeout}ms`));
      }, timeout);

      const waiter = { resolve, reject, timer };
      this.pendingWaiters.push(waiter);
    });
  }

  // ── Waiter helpers ──

  /** Resolves and removes every pending waiter with the given download. */
  private notifyWaiters(info: DownloadInfo): void {
    const waiters = this.pendingWaiters.splice(0);
    for (const waiter of waiters) {
      clearTimeout(waiter.timer);
      waiter.resolve(info);
    }
  }

  private removePendingWaiter(waiter: (typeof this.pendingWaiters)[number]): void {
    const idx = this.pendingWaiters.indexOf(waiter);
    if (idx !== -1) {
      this.pendingWaiters.splice(idx, 1);
    }
  }
}

// ── Helpers ──

/** Formats a byte count as B / KB / MB with one decimal; `undefined` renders as "? bytes". */
function formatBytes(bytes: number | undefined): string {
  if (bytes == null) return '? bytes';
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}

================================================
FILE: packages/core/src/viewport/guards/har-capture.ts
================================================
import { writeFile, mkdir } from 'node:fs/promises';
import { dirname } from 'node:path';
import type { CDPSession } from 'playwright';
import { BaseGuard } from '../guard-base.js';

// ── HAR 1.2 types ──

interface HarRequest {
  method: string;
  url: string;
  httpVersion: string;
  headers: Array<{ name: string; value: string }>;
  queryString: Array<{ name: string; value: string }>;
  headersSize: number;
  bodySize: number;
}

interface HarResponse {
  status: number;
  statusText: string;
  httpVersion: string;
  headers: Array<{ name: string; value: string }>;
  content: {
    size: number;
    mimeType: string;
  };
  headersSize: number;
  bodySize: number;
  redirectURL: string;
}

interface HarEntry {
  startedDateTime: string;
  time: number;
  request: HarRequest;
  response: HarResponse;
  // NOTE(review): the type arguments of this Record were lost in extraction;
  // `<string, unknown>` is an assumption — confirm against upstream.
  cache: Record<string, unknown>;
  timings: {
    send: number;
    wait: number;
    receive: number;
  };
}

interface
/** Request data captured from `Network.requestWillBeSent`, kept until the request ends. */
interface PendingRequest {
  requestId: string;
  /** CDP monotonic timestamp in seconds; only used to compute durations. */
  startTime: number;
  /** Wall-clock start time in seconds since the Unix epoch. */
  wallTime: number;
  method: string;
  url: string;
  headers: Record<string, string>;
}

/** Subset of the CDP `Network.Response` object needed to build a HAR response. */
interface ResponseInfo {
  status: number;
  statusText: string;
  headers: Record<string, string>;
  mimeType: string;
  encodedDataLength: number;
}

/**
 * Records network traffic in HAR 1.2 format using CDP Network domain events.
 * On teardown, writes the complete HAR log to the configured output path.
 *
 * One entry is produced per request hop: when CDP reports a redirect (a new
 * `requestWillBeSent` carrying `redirectResponse` for an already known
 * `requestId`), the previous hop is closed with that redirect response
 * before the follow-up request is tracked.
 */
export class HarCaptureGuard extends BaseGuard {
  readonly name = 'har-recording';
  readonly priority = 500;

  private readonly outputPath: string;
  private cdpSession: CDPSession | null = null;
  private pendingRequests = new Map<string, PendingRequest>();
  private responses = new Map<string, ResponseInfo>();
  private entries: HarEntry[] = [];

  /**
   * @param outputPath File the HAR document is written to on teardown.
   */
  constructor(outputPath: string) {
    super();
    this.outputPath = outputPath;
  }

  /**
   * Opens a CDP session for the page, subscribes to the Network events and
   * then enables the Network domain.
   */
  protected async setup(): Promise<void> {
    const session = await this.ctx.page.context().newCDPSession(this.ctx.page);
    this.cdpSession = session;

    // Registered right away so the session is released even if a later
    // step of setup fails.
    this.cleanupFns.push(() => {
      this.cdpSession?.detach().catch(() => {
        // Ignore detach errors during cleanup
      });
    });

    session.on('Network.requestWillBeSent', (params) => {
      const { requestId, request, timestamp, wallTime, redirectResponse } = params as {
        requestId: string;
        request: { method: string; url: string; headers: Record<string, string> };
        timestamp: number;
        wallTime?: number;
        redirectResponse?: ResponseInfo;
      };

      if (redirectResponse) {
        // Same requestId, next hop: close the previous hop with the
        // redirect response before the pending request is replaced.
        this.responses.set(requestId, toResponseInfo(redirectResponse));
        this.finalizeEntry(requestId, timestamp, redirectResponse.encodedDataLength);
      }

      this.pendingRequests.set(requestId, {
        requestId,
        startTime: timestamp,
        wallTime: wallTime ?? Date.now() / 1000,
        method: request.method,
        url: request.url,
        headers: request.headers,
      });
    });

    session.on('Network.responseReceived', (params) => {
      const { requestId, response } = params as {
        requestId: string;
        response: ResponseInfo;
      };
      this.responses.set(requestId, toResponseInfo(response));
    });

    session.on('Network.loadingFinished', (params) => {
      const { requestId, timestamp, encodedDataLength } = params as {
        requestId: string;
        timestamp: number;
        encodedDataLength: number;
      };
      this.finalizeEntry(requestId, timestamp, encodedDataLength);
    });

    session.on('Network.loadingFailed', (params) => {
      const { requestId, timestamp } = params as {
        requestId: string;
        timestamp: number;
      };
      // Still record failed requests with a zero-length response
      this.finalizeEntry(requestId, timestamp, 0);
    });

    // Enable last so no event can arrive before its listener exists.
    await session.send('Network.enable');
  }

  /**
   * Turns a tracked request (and its response, if one was seen) into a HAR
   * entry and forgets the tracking state for that `requestId`.
   *
   * @param requestId CDP request identifier.
   * @param endTimestamp CDP monotonic timestamp (seconds) at which the hop ended.
   * @param encodedDataLength Bytes received for the hop; used as body/content size.
   */
  private finalizeEntry(requestId: string, endTimestamp: number, encodedDataLength: number): void {
    const pending = this.pendingRequests.get(requestId);
    const response = this.responses.get(requestId);

    // Always release both records, including a response whose request was
    // never seen (e.g. it started before the Network domain was enabled).
    this.pendingRequests.delete(requestId);
    this.responses.delete(requestId);

    if (!pending) return;

    // Both timestamps are monotonic seconds, so their difference is a duration.
    const elapsedMs = Math.max(0, (endTimestamp - pending.startTime) * 1000);

    const harRequest: HarRequest = {
      method: pending.method,
      url: pending.url,
      httpVersion: 'HTTP/1.1',
      headers: toHeaderArray(pending.headers),
      queryString: parseQueryString(pending.url),
      headersSize: -1,
      bodySize: -1,
    };

    const harResponse: HarResponse = response
      ? {
          status: response.status,
          statusText: response.statusText,
          httpVersion: 'HTTP/1.1',
          headers: toHeaderArray(response.headers),
          content: {
            size: encodedDataLength,
            mimeType: response.mimeType,
          },
          headersSize: -1,
          bodySize: encodedDataLength,
          // Header names arrive in whatever case the server sent them.
          redirectURL: findHeader(response.headers, 'location') ?? '',
        }
      : {
          status: 0,
          statusText: '',
          httpVersion: 'HTTP/1.1',
          headers: [],
          content: { size: 0, mimeType: '' },
          headersSize: -1,
          bodySize: 0,
          redirectURL: '',
        };

    this.entries.push({
      // HAR wants a calendar date, so use the wall-clock start time rather
      // than the monotonic timestamp.
      startedDateTime: new Date(pending.wallTime * 1000).toISOString(),
      time: elapsedMs,
      request: harRequest,
      response: harResponse,
      cache: {},
      timings: {
        send: 0,
        wait: elapsedMs,
        receive: 0,
      },
    });
  }

  /**
   * Serialises the collected entries as a HAR 1.2 document and writes it to
   * `outputPath`, creating the parent directory when necessary.
   */
  protected override async teardown(): Promise<void> {
    const har = {
      log: {
        version: '1.2',
        creator: {
          name: 'open-browser',
          version: '1.0.0',
        },
        entries: this.entries,
      },
    };

    await mkdir(dirname(this.outputPath), { recursive: true });
    await writeFile(this.outputPath, JSON.stringify(har, null, 2), 'utf-8');
  }
}

// ── Helpers ──

/** Copies only the fields this guard needs from a CDP response object. */
function toResponseInfo(response: ResponseInfo): ResponseInfo {
  return {
    status: response.status,
    statusText: response.statusText,
    headers: response.headers,
    mimeType: response.mimeType,
    encodedDataLength: response.encodedDataLength,
  };
}

/** Returns the value of the first header whose name matches case-insensitively. */
function findHeader(headers: Record<string, string>, name: string): string | undefined {
  const wanted = name.toLowerCase();
  for (const [key, value] of Object.entries(headers)) {
    if (key.toLowerCase() === wanted) return value;
  }
  return undefined;
}

/** Converts a header map into the `{ name, value }` list used by HAR. */
function toHeaderArray(headers: Record<string, string>): Array<{ name: string; value: string }> {
  return Object.entries(headers).map(([name, value]) => ({ name, value }));
}

/**
 * Extracts the query parameters of `url` as a `{ name, value }` list.
 * Returns an empty list when `url` cannot be parsed.
 */
function parseQueryString(url: string): Array<{ name: string; value: string }> {
  try {
    const parsed = new URL(url);
    return [...parsed.searchParams.entries()].map(([name, value]) => ({ name, value }));
  } catch {
    return [];
  }
}
// ── Options ──

export interface PageReadyGuardOptions {
  /**
   * Milliseconds of mutation silence required before the DOM is considered
   * "stable". Defaults to 500 ms.
   */
  idleTimeoutMs?: number;

  /**
   * Debounce interval for grouping rapid-fire mutation callbacks.
   * Defaults to 100 ms.
   */
  debounceMs?: number;
}

// ── Load-state tracking ──

export type LoadState = 'domcontentloaded' | 'load' | 'networkidle';

// ── Watchdog ──

/**
 * Monitors DOM readiness and mutation activity.
 *
 * Features:
 * - Listens for the Playwright page events `domcontentloaded` and `load`,
 *   and waits for the `networkidle` load state of each document
 * - Injects a MutationObserver via `page.evaluate` to detect in-page DOM
 *   changes and determine when the page has "settled"
 * - Emits `content-ready` on the event bus once the DOM is stable
 *   (no mutations for `idleTimeoutMs`); this happens again for every new
 *   document and after every burst of mutations
 * - Exposes `waitForDomStable()` for external consumers
 * - Tracks cumulative mutation count for debugging
 */
export class PageReadyGuard extends BaseGuard {
  readonly name = 'dom';
  readonly priority = 200;

  private readonly idleTimeoutMs: number;
  private readonly debounceMs: number;

  /** Which lifecycle states the current document has reached. */
  private reachedStates = new Set<LoadState>();

  /** Running total of mutations reported by the page (useful for debugging). */
  private mutationCount = 0;

  /** Whether we currently consider the DOM to be stable. */
  private stable = false;

  /**
   * Incremented for every new document; lets a pending `networkidle` wait
   * detect that it belongs to a document that has since been replaced.
   */
  private documentEpoch = 0;

  /** Timer handle for the idle-detection window. */
  private idleTimer: ReturnType<typeof setTimeout> | null = null;

  /** Timer handle for the debounce window. */
  private debounceTimer: ReturnType<typeof setTimeout> | null = null;

  /** Resolvers for external callers waiting on `waitForDomStable`. */
  private stableWaiters: Array<{
    resolve: () => void;
    reject: (err: Error) => void;
    timer: ReturnType<typeof setTimeout>;
  }> = [];

  /** Name of the function exposed to the page via `page.exposeFunction`. */
  private readonly exposedFnName = '__ob_dom_mutation';

  constructor(options?: PageReadyGuardOptions) {
    super();
    this.idleTimeoutMs = options?.idleTimeoutMs ?? 500;
    this.debounceMs = options?.debounceMs ?? 100;
  }

  // ── Setup ──

  protected async setup(): Promise<void> {
    this.reachedStates.clear();
    this.mutationCount = 0;
    this.stable = false;

    // 1. Standard lifecycle events.
    this.setupLifecycleListeners();

    // 2. MutationObserver bridge via an exposed function.
    await this.setupMutationObserver();

    logger.debug(
      `DOM watchdog active (idleTimeout=${this.idleTimeoutMs}ms, debounce=${this.debounceMs}ms)`,
    );
  }

  // ── Teardown ──

  protected async teardown(): Promise<void> {
    this.clearTimers();

    // Reject pending waiters.
    for (const waiter of this.stableWaiters) {
      clearTimeout(waiter.timer);
      waiter.reject(new Error('PageReadyGuard detached before DOM became stable'));
    }
    this.stableWaiters = [];

    logger.debug(
      `DOM watchdog detached (observed ${this.mutationCount} mutation batches)`,
    );
  }

  // ── Lifecycle listeners ──

  private setupLifecycleListeners(): void {
    const onDomContentLoaded = () => {
      // A new document replaced the previous one: whatever was known about
      // the old document (reached states, stability) no longer applies.
      this.documentEpoch++;
      this.reachedStates.clear();
      this.stable = false;

      this.reachedStates.add('domcontentloaded');
      logger.debug('Page reached domcontentloaded');
      this.resetIdleTimer();

      // `networkidle` has to be awaited again for the new document.
      void this.watchNetworkIdle(this.documentEpoch);
    };

    const onLoad = () => {
      this.reachedStates.add('load');
      logger.debug('Page reached load');
      this.resetIdleTimer();
    };

    this.ctx.page.on('domcontentloaded', onDomContentLoaded);
    this.ctx.page.on('load', onLoad);

    this.cleanupFns.push(
      () => this.ctx.page.off('domcontentloaded', onDomContentLoaded),
      () => this.ctx.page.off('load', onLoad),
    );

    // `networkidle` is not a page event – wait for it asynchronously so
    // setup is not blocked. Fire-and-forget; we do not await.
    void this.watchNetworkIdle(this.documentEpoch);
  }

  /**
   * Waits for the page to reach `networkidle` and records it, unless the
   * guard was detached or a newer document appeared in the meantime.
   *
   * @param epoch Value of `documentEpoch` when the wait was started.
   */
  private async watchNetworkIdle(epoch: number): Promise<void> {
    try {
      await this.ctx.page.waitForLoadState('networkidle');
      if (!this.active || epoch !== this.documentEpoch) return;
      this.reachedStates.add('networkidle');
      logger.debug('Page reached networkidle');
      this.resetIdleTimer();
    } catch {
      // Navigation may have occurred or page closed – ignore.
    }
  }

  // ── MutationObserver bridge ──

  private async setupMutationObserver(): Promise<void> {
    // Expose a function so the in-page MutationObserver can call back into Node.
    try {
      await this.ctx.page.exposeFunction(this.exposedFnName, (count: number) => {
        this.onMutationBatch(count);
      });
    } catch {
      // Function may already be exposed from a previous attach cycle.
      logger.debug('Mutation bridge function already exposed – reusing');
    }

    // Inject the observer. We re-inject on every `domcontentloaded` so it
    // survives navigations.
    const injectObserver = async () => {
      try {
        await this.ctx.page.evaluate((fnName: string) => {
          const win = window as unknown as Record<string, unknown>;

          // Avoid double-installing on the same document.
          if (win.__ob_observer_installed) return;
          win.__ob_observer_installed = true;

          let pending = 0;
          const observer = new MutationObserver((mutations) => {
            pending += mutations.length;
          });

          observer.observe(document.documentElement, {
            childList: true,
            subtree: true,
            attributes: true,
            characterData: true,
          });

          // Flush accumulated mutation count periodically rather than on
          // every single micro-mutation.
          setInterval(() => {
            if (pending > 0) {
              const count = pending;
              pending = 0;
              const fn = win[fnName];
              if (typeof fn === 'function') fn(count);
            }
          }, 50);
        }, this.exposedFnName);
      } catch {
        // Page may have navigated away or closed.
      }
    };

    // Inject immediately for the current document...
    await injectObserver();

    // ...and re-inject on future navigations.
    const onDomContentLoaded = () => {
      void injectObserver();
    };
    this.ctx.page.on('domcontentloaded', onDomContentLoaded);
    this.cleanupFns.push(() => this.ctx.page.off('domcontentloaded', onDomContentLoaded));
  }

  // ── Mutation handling ──

  private onMutationBatch(count: number): void {
    // The exposed bridge function outlives teardown, so the page may keep
    // calling it; once detached, no timers may be armed and nothing emitted.
    if (!this.active) return;

    this.mutationCount += count;
    this.stable = false;

    // Debounce: delay the idle-timer reset so we don't restart it on
    // every single mutation callback.
    if (this.debounceTimer) {
      clearTimeout(this.debounceTimer);
    }

    this.debounceTimer = setTimeout(() => {
      this.debounceTimer = null;
      this.resetIdleTimer();
    }, this.debounceMs);
  }

  // ── Idle detection ──

  private resetIdleTimer(): void {
    if (this.idleTimer) {
      clearTimeout(this.idleTimer);
    }

    this.idleTimer = setTimeout(() => {
      this.idleTimer = null;
      this.markStable();
    }, this.idleTimeoutMs);
  }

  private markStable(): void {
    if (this.stable) return;
    this.stable = true;

    logger.debug(
      `DOM stable after ${this.mutationCount} mutation batches ` +
        `(states: ${[...this.reachedStates].join(', ') || 'none'})`,
    );

    this.ctx.eventBus.emit('content-ready', undefined as void);
    this.notifyStableWaiters();
  }

  // ── Public API ──

  /**
   * Returns a promise that resolves once the DOM is considered stable
   * (no mutations for `idleTimeoutMs`).
   *
   * If the DOM is already stable the promise resolves immediately. The
   * promise rejects when `timeout` elapses first, or when the guard is
   * torn down while the caller is still waiting.
   *
   * @param timeout Maximum milliseconds to wait. Defaults to 10 000 ms.
   */
  waitForDomStable(timeout = 10_000): Promise<void> {
    if (this.stable) {
      return Promise.resolve();
    }

    return new Promise<void>((resolve, reject) => {
      const timer = setTimeout(() => {
        this.removeStableWaiter(waiter);
        reject(new Error(`waitForDomStable timed out after ${timeout}ms`));
      }, timeout);

      const waiter = { resolve, reject, timer };
      this.stableWaiters.push(waiter);
    });
  }

  /**
   * Returns the set of lifecycle states the current document has reached.
   * The returned set is live: it is updated in place as states are reached
   * and emptied when a new document starts loading.
   */
  getReachedStates(): ReadonlySet<LoadState> {
    return this.reachedStates;
  }

  /**
   * Returns the total number of mutations reported since the watchdog was
   * attached.
   */
  getMutationCount(): number {
    return this.mutationCount;
  }

  /**
   * Whether the DOM is currently considered stable.
   */
  isStable(): boolean {
    return this.stable;
  }

  // ── Waiter helpers ──

  private notifyStableWaiters(): void {
    const waiters = this.stableWaiters.splice(0);
    for (const waiter of waiters) {
      clearTimeout(waiter.timer);
      waiter.resolve();
    }
  }

  private removeStableWaiter(waiter: (typeof this.stableWaiters)[number]): void {
    const idx = this.stableWaiters.indexOf(waiter);
    if (idx !== -1) {
      this.stableWaiters.splice(idx, 1);
    }
  }

  // ── Timer cleanup ──

  private clearTimers(): void {
    if (this.idleTimer) {
      clearTimeout(this.idleTimer);
      this.idleTimer = null;
    }
    if (this.debounceTimer) {
      clearTimeout(this.debounceTimer);
      this.debounceTimer = null;
    }
  }
}
/**
 * Grants the configured permissions to the origin of the page's current URL.
 *
 * Does nothing when there is no CDP session, when the URL cannot be parsed,
 * when the origin is not http(s), or when the origin equals the one that was
 * granted last. The origin is only remembered while a grant is in flight or
 * has succeeded, so a failed grant is retried on the next navigation.
 *
 * @throws Whatever `Browser.grantPermissions` rejects with.
 */
private async grantForCurrentPage(): Promise<void> {
  if (!this.cdpSession) return;

  let origin: string;
  try {
    origin = new URL(this.ctx.page.url()).origin;
  } catch {
    return;
  }

  // Skip non-http origins and avoid re-granting for the same origin
  if (!origin.startsWith('http') || origin === this.lastOrigin) return;

  // Claim the origin before awaiting so several navigation events for the
  // same origin do not issue duplicate grants.
  this.lastOrigin = origin;

  // CDP types require PermissionType[] but we accept string[] for ergonomics
  type SendFn = (method: string, params: Record<string, unknown>) => Promise<unknown>;

  // NOTE(review): no browserContextId is passed, so CDP applies the grant to
  // the default browser context – confirm that this is the context in use.
  try {
    await (this.cdpSession.send as unknown as SendFn)(
      'Browser.grantPermissions',
      { permissions: this.permissions, origin },
    );
  } catch (err) {
    // Forget the origin again so the grant is attempted on the next navigation.
    if (this.lastOrigin === origin) {
      this.lastOrigin = null;
    }
    throw err;
  }
}
/**
 * Persists browser storage state to a file so it can be reused by a later
 * session.
 *
 * `save()` writes the full Playwright storage state of the context; on setup
 * only the `cookies` section of a previously saved file is loaded back into
 * the context.
 */
export class PersistenceGuard extends BaseGuard {
  readonly name = 'storage-state';
  readonly priority = 600;

  private readonly storagePath: string;

  /**
   * @param storagePath JSON file used for both restoring and saving.
   */
  constructor(storagePath: string) {
    super();
    this.storagePath = storagePath;
  }

  /**
   * Best-effort restore: a missing, unreadable or malformed state file
   * (or cookies the context refuses) simply means starting fresh.
   */
  protected async setup(): Promise<void> {
    let raw: string;
    try {
      raw = await readFile(this.storagePath, 'utf-8');
    } catch {
      // No saved state available.
      return;
    }

    try {
      const saved = JSON.parse(raw) as {
        cookies?: Array<{
          name: string;
          value: string;
          domain: string;
          path: string;
          expires?: number;
          httpOnly?: boolean;
          secure?: boolean;
          sameSite?: 'Strict' | 'Lax' | 'None';
        }>;
      };

      if (saved.cookies) {
        await this.ctx.context.addCookies(saved.cookies);
      }
    } catch {
      // Saved state is invalid; start fresh.
    }
  }

  /**
   * Writes the context's current storage state to `storagePath` as
   * pretty-printed JSON, creating the parent directory if needed.
   */
  async save(): Promise<void> {
    const snapshot = await this.ctx.context.storageState();
    const serialized = JSON.stringify(snapshot, null, 2);

    await mkdir(dirname(this.storagePath), { recursive: true });
    await writeFile(this.storagePath, serialized, 'utf-8');
  }
}
/**
 * Watches the browser context for newly created pages (popups, new tabs or
 * windows). For each one it emits a `tab-opened` event carrying the page URL
 * and then brings the new page to the front.
 */
export class PopupGuard extends BaseGuard {
  readonly name = 'popups';
  readonly priority = 150;

  protected async setup(): Promise<void> {
    const onPage = async (page: Page): Promise<void> => {
      // The page may be closed before it finishes loading; announce it anyway.
      await page.waitForLoadState('domcontentloaded').catch(() => undefined);

      this.ctx.eventBus.emit('tab-opened', { url: page.url() });

      // Focusing fails when the page is already gone; that is fine.
      await page.bringToFront().catch(() => undefined);
    };

    this.ctx.context.on('page', onPage);
    this.cleanupFns.push(() => this.ctx.context.off('page', onPage));
  }
}

/**
 * Serves `get-screenshot` requests from the event bus by capturing a PNG of
 * the guarded page.
 */
export class ScreenshotGuard extends BaseGuard {
  readonly name = 'screenshot';
  readonly priority = 700;

  protected async setup(): Promise<void> {
    /**
     * Captures the page (full page when requested) and returns it as
     * base64 together with the current viewport size, or 0×0 when the page
     * reports no viewport.
     */
    const capture = async (event: ScreenshotEvent): Promise<ScreenshotResult> => {
      const png = await this.ctx.page.screenshot({
        type: 'png',
        fullPage: event?.fullPage ?? false,
      });
      const size = this.ctx.page.viewportSize();

      return {
        base64: png.toString('base64'),
        width: size?.width ?? 0,
        height: size?.height ?? 0,
      };
    };

    // `onRequest` hands back its own unsubscribe function.
    const unsubscribe = this.ctx.eventBus.onRequest('get-screenshot', capture);
    this.cleanupFns.push(unsubscribe);
  }
}
/**
 * Enforces the URL policy on navigations.
 *
 * Every request of the page is routed through this guard. Navigation
 * requests whose URL is rejected by `isUrlPermitted` are aborted and
 * reported as a `policy-violation` event; everything else is passed on to
 * the next matching route handler (or the network when there is none).
 */
export class UrlPolicyGuard extends BaseGuard {
  readonly name = 'policy-violation';
  readonly priority = 50;

  private readonly allowedUrls: string[];
  private readonly blockedUrls: string[];

  /**
   * @param allowedUrls Allow-list handed to `isUrlPermitted`.
   * @param blockedUrls Block-list handed to `isUrlPermitted`.
   */
  constructor(allowedUrls: string[] = [], blockedUrls: string[] = []) {
    super();
    this.allowedUrls = allowedUrls;
    this.blockedUrls = blockedUrls;
  }

  protected async setup(): Promise<void> {
    const handler = async (route: Route): Promise<void> => {
      const request = route.request();
      const url = request.url();

      const violatesPolicy =
        request.isNavigationRequest() &&
        !isUrlPermitted(url, this.allowedUrls, this.blockedUrls);

      if (violatesPolicy) {
        this.ctx.eventBus.emit('policy-violation', {
          type: 'navigation-blocked',
          url,
          reason: `URL not allowed by security policy: ${url}`,
        });
        await route.abort('blockedbyclient');
        return;
      }

      // `fallback()` rather than `continue()`: continuing would send the
      // request straight to the network and skip every other route handler
      // registered for this page or its context.
      await route.fallback();
    };

    await this.ctx.page.route('**/*', handler);

    this.cleanupFns.push(() => {
      this.ctx.page.unroute('**/*', handler).catch(() => {
        // Ignore errors during cleanup
      });
    });
  }
}
// ── Options ──

export interface VideoRecordingOptions {
  /** Path for the Playwright trace archive (.zip). */
  outputPath: string;

  /**
   * Recording mode. `'tracing'` uses Playwright's built-in tracing API
   * (screenshots + DOM snapshots). `'screencast'` falls back to CDP
   * Page.startScreencast for raw frame capture. `'auto'` tries tracing
   * first and falls back to screencast on failure.
   *
   * @default 'auto'
   */
  mode?: 'tracing' | 'screencast' | 'auto';

  /**
   * Maximum frames per second for CDP screencast mode.
   * Ignored when using Playwright tracing.
   *
   * @default 5
   */
  maxFrameRate?: number;

  /**
   * Screencast image format.
   * @default 'jpeg'
   */
  format?: 'jpeg' | 'png';

  /**
   * Screencast image quality (1-100). Only applies to JPEG.
   * @default 60
   */
  quality?: number;

  /**
   * Maximum width of captured screencast frames in pixels.
   * The browser scales down if the viewport is larger.
   *
   * @default 1280
   */
  maxWidth?: number;

  /**
   * Maximum height of captured screencast frames in pixels.
   * @default 720
   */
  maxHeight?: number;
}

// ── Resolved defaults ──

interface ResolvedOptions {
  outputPath: string;
  mode: 'tracing' | 'screencast' | 'auto';
  maxFrameRate: number;
  format: 'jpeg' | 'png';
  quality: number;
  maxWidth: number;
  maxHeight: number;
}

/**
 * Fills in the documented defaults and normalises the numeric options.
 *
 * - `maxFrameRate`, `maxWidth`, `maxHeight`: a value that is missing, not
 *   finite, or not greater than zero is replaced by the default.
 * - `quality`: rounded and clamped to the documented 1–100 range; a missing
 *   or non-finite value becomes the default.
 *
 * Valid values are passed through unchanged.
 *
 * @param opts Options as supplied by the caller.
 * @returns A fully populated options object.
 */
function resolveOptions(opts: VideoRecordingOptions): ResolvedOptions {
  const positiveOr = (value: number | undefined, fallback: number): number =>
    value !== undefined && Number.isFinite(value) && value > 0 ? value : fallback;

  const quality =
    opts.quality !== undefined && Number.isFinite(opts.quality)
      ? Math.min(100, Math.max(1, Math.round(opts.quality)))
      : 60;

  return {
    outputPath: opts.outputPath,
    mode: opts.mode ?? 'auto',
    maxFrameRate: positiveOr(opts.maxFrameRate, 5),
    format: opts.format ?? 'jpeg',
    quality,
    maxWidth: positiveOr(opts.maxWidth, 1280),
    maxHeight: positiveOr(opts.maxHeight, 720),
  };
}
/**
 * Records browser activity using Playwright's tracing API or CDP
 * Page.startScreencast as a fallback.
 *
 * - **Tracing mode** captures screenshots and DOM snapshots and writes the
 *   trace archive to `outputPath` on teardown.
 * - **Screencast mode** collects frames delivered by CDP in memory and, on
 *   teardown, writes them as numbered image files plus a `manifest.json`
 *   into a `screencast-frames` directory next to `outputPath`.
 *
 * Screencast capture can be paused and resumed; frames that arrive while
 * paused are acknowledged but discarded.
 */
export class VideoCaptureGuard extends BaseGuard {
  readonly name = 'video-recording';
  readonly priority = 500;

  private readonly options: ResolvedOptions;

  // ── Tracing state ──
  private tracingStarted = false;

  // ── Screencast state ──
  private cdpSession: CDPSession | null = null;
  private screencastActive = false;
  private paused = false;
  private frameCount = 0;
  private readonly frames: Array<{ data: string; timestamp: number }> = [];

  constructor(options: VideoRecordingOptions) {
    super();
    this.options = resolveOptions(options);
  }

  // ── Setup ──

  /**
   * Starts recording according to `options.mode`: tracing first for
   * `'tracing'` and `'auto'`, CDP screencast for `'screencast'` or as the
   * `'auto'` fallback. Failures are logged, never thrown.
   */
  protected async setup(): Promise<void> {
    // Start every attach cycle from a clean slate so frames and counters of
    // an earlier recording are not mixed into this one.
    this.paused = false;
    this.frameCount = 0;
    this.frames.length = 0;

    const { mode } = this.options;

    if (mode === 'tracing' || mode === 'auto') {
      const tracingOk = await this.startTracing();
      if (tracingOk) return;

      if (mode === 'tracing') {
        logger.warn('Tracing failed and mode is "tracing" – recording will be unavailable');
        return;
      }

      logger.info('Tracing unavailable, falling back to CDP screencast');
    }

    await this.startScreencast();
  }

  // ── Teardown ──

  /** Stops whichever recording mechanism is running and persists its output. */
  protected override async teardown(): Promise<void> {
    if (this.tracingStarted) {
      await this.stopTracing();
    } else if (this.screencastActive) {
      await this.stopScreencast();
    }
  }

  // ── Pause / Resume ──

  /**
   * Temporarily pauses frame capture (screencast only).
   * Has no effect in tracing mode or when already paused.
   */
  pause(): void {
    if (!this.screencastActive || this.paused) return;
    this.paused = true;
    logger.debug('Screencast paused');
  }

  /**
   * Resumes frame capture after a pause (screencast only).
   */
  resume(): void {
    if (!this.screencastActive || !this.paused) return;
    this.paused = false;
    logger.debug('Screencast resumed');
  }

  /** Whether the recording is currently paused. */
  get isPaused(): boolean {
    return this.paused;
  }

  /** Number of frames captured so far (screencast mode). */
  get capturedFrameCount(): number {
    return this.frameCount;
  }

  // ── Tracing ──

  /**
   * Starts Playwright tracing with screenshots and snapshots.
   *
   * @returns `true` when tracing is running, `false` when it could not be started.
   */
  private async startTracing(): Promise<boolean> {
    try {
      await this.ctx.context.tracing.start({
        screenshots: true,
        snapshots: true,
      });
      this.tracingStarted = true;
      logger.info('Playwright tracing started');
      return true;
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.debug(`Could not start tracing: ${reason}`);
      return false;
    }
  }

  /** Stops tracing and writes the archive to `outputPath`; failures are logged. */
  private async stopTracing(): Promise<void> {
    try {
      await mkdir(dirname(this.options.outputPath), { recursive: true });
      await this.ctx.context.tracing.stop({
        path: this.options.outputPath,
      });
      logger.info(`Trace saved to ${this.options.outputPath}`);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Failed to save trace: ${reason}`);
    }
    this.tracingStarted = false;
  }

  // ── Screencast ──

  /**
   * Opens a CDP session and starts `Page.startScreencast`. On failure the
   * error is logged and the session (if one was opened) is released.
   */
  private async startScreencast(): Promise<void> {
    try {
      const session = await this.ctx.page.context().newCDPSession(this.ctx.page);
      this.cdpSession = session;

      session.on('Page.screencastFrame', (params) => {
        const { data, metadata, sessionId } = params as {
          data: string;
          metadata: { timestamp: number };
          sessionId: number;
        };

        // Acknowledge the frame so the browser keeps sending them.
        this.cdpSession?.send('Page.screencastFrameAck', { sessionId }).catch(() => {
          // Ignore ack errors; session may have closed.
        });

        if (this.paused) return;

        this.frameCount++;
        this.frames.push({ data, timestamp: metadata.timestamp });

        if (this.frameCount % 50 === 0) {
          logger.debug(`Screencast: captured ${this.frameCount} frames`);
        }
      });

      // A frame rate of zero (or a non-number) would produce Infinity/NaN
      // here; fall back to "every frame" in that case.
      const frameStep = Math.round(60 / this.options.maxFrameRate);
      const everyNthFrame = Number.isFinite(frameStep) ? Math.max(1, frameStep) : 1;

      await session.send('Page.startScreencast', {
        format: this.options.format,
        quality: this.options.format === 'jpeg' ? this.options.quality : undefined,
        maxWidth: this.options.maxWidth,
        maxHeight: this.options.maxHeight,
        everyNthFrame,
      });

      this.screencastActive = true;

      this.cleanupFns.push(() => {
        this.cdpSession?.detach().catch(() => {
          // Ignore detach errors during cleanup.
        });
      });

      logger.info(
        `CDP screencast started (${this.options.maxWidth}x${this.options.maxHeight}, ` +
          `${this.options.format} q${this.options.quality}, ~${this.options.maxFrameRate} fps)`,
      );
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Failed to start CDP screencast: ${reason}`);

      // No cleanup callback was registered yet, so release the session here.
      const orphan = this.cdpSession;
      this.cdpSession = null;
      await orphan?.detach().catch(() => {
        // Ignore detach errors; the session may already be closed.
      });
    }
  }

  /** Stops the screencast and writes the buffered frames to disk. */
  private async stopScreencast(): Promise<void> {
    if (!this.cdpSession) return;

    try {
      await this.cdpSession.send('Page.stopScreencast');
    } catch {
      // Session may already be closed.
    }

    this.screencastActive = false;
    logger.info(`Screencast stopped – ${this.frameCount} frames captured`);

    try {
      await this.saveFrames();
    } finally {
      // The base64 payloads are no longer needed once the save was attempted.
      this.frames.length = 0;
    }
  }

  /**
   * Writes every buffered frame as `frame-NNNNN.<ext>` into the
   * `screencast-frames` directory next to `outputPath`, followed by a
   * `manifest.json` listing file names and frame timestamps.
   */
  private async saveFrames(): Promise<void> {
    if (this.frames.length === 0) {
      logger.debug('No screencast frames to save');
      return;
    }

    const framesDir = join(dirname(this.options.outputPath), 'screencast-frames');
    await mkdir(framesDir, { recursive: true });

    const ext = this.options.format === 'png' ? 'png' : 'jpg';
    const manifest: Array<{ file: string; timestamp: number }> = [];

    for (const [index, frame] of this.frames.entries()) {
      const filename = `frame-${String(index).padStart(5, '0')}.${ext}`;
      await writeFile(join(framesDir, filename), Buffer.from(frame.data, 'base64'));
      manifest.push({ file: filename, timestamp: frame.timestamp });
    }

    // Write a JSON manifest alongside the frames for downstream tooling.
    const manifestPath = join(framesDir, 'manifest.json');
    await writeFile(manifestPath, JSON.stringify(manifest, null, 2), 'utf-8');

    logger.info(`Saved ${this.frames.length} frames to ${framesDir}`);
  }
}
});

    test('persistAfterClose defaults to false', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.persistAfterClose).toBe(false);
    });

    test('relaxedSecurity defaults to false', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.relaxedSecurity).toBe(false);
    });

    test('includes CHROME_AUTOMATION_FLAGS in extraArgs', () => {
      const opts = LaunchProfile.create().build();
      for (const arg of CHROME_AUTOMATION_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });

    test('includes disabled components feature flag', () => {
      const opts = LaunchProfile.create().build();
      // find() returns the first --disable-features switch in the argument list.
      const disableFeatures = opts.extraArgs.find((a) =>
        a.startsWith('--disable-features='),
      );
      expect(disableFeatures).toBeDefined();
      for (const component of CHROME_STRIPPED_FEATURES) {
        expect(disableFeatures).toContain(component);
      }
    });

    test('includes window-size arg', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.extraArgs).toContain('--window-size=1280,1100');
    });

    test('proxy is undefined by default', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.proxy).toBeUndefined();
    });

    test('userDataDir is undefined by default', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.userDataDir).toBeUndefined();
    });

    test('channelName is undefined by default', () => {
      const opts = LaunchProfile.create().build();
      expect(opts.channelName).toBeUndefined();
    });
  });

  describe('.headless()', () => {
    test('headless(true) sets headless to true', () => {
      const opts = LaunchProfile.create().headless(true).build();
      expect(opts.headless).toBe(true);
    });

    test('headless(false) sets headless to false', () => {
      const opts = LaunchProfile.create().headless(false).build();
      expect(opts.headless).toBe(false);
    });

    test('headless() with no argument defaults to true', () => {
      const opts = LaunchProfile.create().headless().build();
      expect(opts.headless).toBe(true);
    });
  });

  // The builder has no headful() method; headful mode is headless(false).
  describe('.headful() equivalent', () => {
    test('headless(false) creates headful mode', () => {
      const opts = LaunchProfile.create().headless(false).build();
      expect(opts.headless).toBe(false);
    });
  });

  describe('.stealthMode()', () => {
    test('adds stealth args when enabled', () => {
      const opts = LaunchProfile.create().stealthMode().build();
      for (const arg of ANTI_DETECTION_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });

    test('does not add stealth args when disabled', () => {
      const opts = LaunchProfile.create().stealthMode(false).build();
      // ANTI_DETECTION_FLAGS[1] is --disable-features=AutomationControlled
      // which won't be in the base args (only in ANTI_DETECTION_FLAGS)
      // But CHROME_AUTOMATION_FLAGS also contains --disable-blink-features=AutomationControlled
      // so check for the features one specifically
      const stealthOnlyArg = '--disable-features=AutomationControlled';
      const hasStealthOnlyArg = opts.extraArgs.some(
        (a) => a === stealthOnlyArg,
      );
      expect(hasStealthOnlyArg).toBe(false);
    });

    test('returns this for chaining', () => {
      const profile = LaunchProfile.create();
      const result = profile.stealthMode();
      expect(result).toBe(profile);
    });
  });

  describe('.dockerMode()', () => {
    test('adds docker args when enabled', () => {
      const opts = LaunchProfile.create().dockerMode().build();
      for (const arg of CONTAINER_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });

    test('does not add docker args when disabled', () => {
      const opts = LaunchProfile.create().dockerMode(false).build();
      // --no-sandbox should not be present when docker mode is off
      expect(opts.extraArgs).not.toContain('--no-sandbox');
    });
  });

  describe('.deterministicRendering()', () => {
    test('adds deterministic rendering args when enabled', () => {
      const opts = LaunchProfile.create().deterministicRendering().build();
      for (const arg of REPRODUCIBLE_RENDER_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });

    test('does not add deterministic args when disabled', () => {
      const opts = LaunchProfile.create().deterministicRendering(false).build();
      expect(opts.extraArgs).not.toContain('--deterministic-mode');
    });
  });

  describe('.relaxedSecurity()', () => {
    test('adds security-disable args when enabled', () => {
      const opts = LaunchProfile.create().relaxedSecurity().build();
      expect(opts.relaxedSecurity).toBe(true);
      for (const arg of RELAXED_SECURITY_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });

    test('does not add security args when disabled', () => {
      const opts = LaunchProfile.create().relaxedSecurity(false).build();
      expect(opts.relaxedSecurity).toBe(false);
      expect(opts.extraArgs).not.toContain('--disable-web-security');
    });
  });

  describe('.downloadsPath()', () => {
    test('adds download-default-directory arg', () => {
      const opts = LaunchProfile.create()
        .downloadsPath('/tmp/downloads')
        .build();
      expect(opts.extraArgs).toContain(
        '--download-default-directory=/tmp/downloads',
      );
    });
  });

  describe('.maxIframes()', () => {
    // Only chaining is asserted here; the stored value is not checked in the build output.
    test('returns this for chaining', () => {
      const profile = LaunchProfile.create();
      const result = profile.maxIframes(5);
      expect(result).toBe(profile);
    });
  });

  describe('.addExtension()', () => {
    test('adds single extension path to load-extension arg', () => {
      const opts = LaunchProfile.create()
        .addExtension('/path/to/ext1')
        .build();
      const loadExtArg = opts.extraArgs.find((a) =>
        a.startsWith('--load-extension='),
      );
      expect(loadExtArg).toBe('--load-extension=/path/to/ext1');
    });

    test('adds multiple extensions as comma-separated list', () => {
      const opts = LaunchProfile.create()
        .addExtension('/path/to/ext1')
        .addExtension('/path/to/ext2')
        .build();
      const loadExtArg = opts.extraArgs.find((a) =>
        a.startsWith('--load-extension='),
      );
      expect(loadExtArg).toBe(
        '--load-extension=/path/to/ext1,/path/to/ext2',
      );
    });

    test('no load-extension arg when no extensions added', () => {
      const opts = LaunchProfile.create().build();
      const loadExtArg = opts.extraArgs.find((a) =>
        a.startsWith('--load-extension='),
      );
      expect(loadExtArg).toBeUndefined();
    });
  });

  describe('.windowSize()', () => {
    test('sets custom window dimensions', () => {
      const opts = LaunchProfile.create().windowSize(1920, 1080).build();
      expect(opts.windowWidth).toBe(1920);
      expect(opts.windowHeight).toBe(1080);
      expect(opts.extraArgs).toContain('--window-size=1920,1080');
    });
  });

  describe('.proxy()', () => {
    test('sets proxy server', () => {
      const opts = LaunchProfile.create()
        .proxy('http://proxy:8080')
        .build();
      expect(opts.proxy).toEqual({
        server: 'http://proxy:8080',
        username: undefined,
        password: undefined,
      });
    });

    test('sets proxy with credentials', () => {
      const opts = LaunchProfile.create()
        .proxy('http://proxy:8080', 'user', 'pass')
        .build();
      expect(opts.proxy).toEqual({
        server: 'http://proxy:8080',
        username: 'user',
        password: 'pass',
      });
    });
  });

  describe('.userDataDir()', () => {
    test('sets user data directory', () => {
      const opts = LaunchProfile.create()
        .userDataDir('/tmp/chrome-data')
        .build();
      expect(opts.userDataDir).toBe('/tmp/chrome-data');
    });
  });

  describe('.browserBinary()', () => {
    test('sets browser binary path', () => {
      const opts = LaunchProfile.create()
        .browserBinary('/usr/bin/chromium')
        .build();
      expect(opts.browserBinaryPath).toBe('/usr/bin/chromium');
    });
  });

  describe('.persistAfterClose()', () => {
    test('sets persistAfterClose to true', () => {
      const opts = LaunchProfile.create().persistAfterClose().build();
      expect(opts.persistAfterClose).toBe(true);
    });

    test('sets persistAfterClose to false', () => {
      const opts = LaunchProfile.create().persistAfterClose(false).build();
      expect(opts.persistAfterClose).toBe(false);
    });
  });

  describe('.channel()', () => {
    test('sets channel name', () => {
      const opts = LaunchProfile.create().channel('chrome').build();
      expect(opts.channelName).toBe('chrome');
    });
  });

  describe('.extraArgs()', () => {
    test('appends extra args to the end', () => {
      const opts = LaunchProfile.create()
        .extraArgs('--custom-flag', '--another-flag')
        .build();
      expect(opts.extraArgs).toContain('--custom-flag');
      expect(opts.extraArgs).toContain('--another-flag');
    });

    test('user extra args can override earlier args', () => {
      const opts = LaunchProfile.create()
        .extraArgs('--override=value')
        .build();
      // The user arg should be at the end of the array (after CHROME_AUTOMATION_FLAGS)
      const lastArgs = opts.extraArgs.slice(-1);
      expect(lastArgs).toContain('--override=value');
    });
  });

  describe('builder chaining', () => {
    test('multiple methods can be chained together', () => {
      const opts = LaunchProfile.create()
        .headless(false)
        .stealthMode()
        .dockerMode()
        .deterministicRendering()
        .windowSize(800, 600)
        .downloadsPath('/downloads')
        .addExtension('/ext')
        .persistAfterClose()
        .build();

      expect(opts.headless).toBe(false);
      expect(opts.persistAfterClose).toBe(true);
      expect(opts.windowWidth).toBe(800);
      expect(opts.windowHeight).toBe(600);
      expect(opts.extraArgs).toContain('--window-size=800,600');

      for (const arg of ANTI_DETECTION_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
      for (const arg of CONTAINER_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
      for (const arg of REPRODUCIBLE_RENDER_FLAGS) {
        expect(opts.extraArgs).toContain(arg);
      }
    });
  });
});

// Sanity checks on the exported flag lists themselves.
describe('CHROME_AUTOMATION_FLAGS', () => {
  test('is a non-empty array', () => {
    expect(Array.isArray(CHROME_AUTOMATION_FLAGS)).toBe(true);
    expect(CHROME_AUTOMATION_FLAGS.length).toBeGreaterThan(10);
  });

  test('contains essential flags', () => {
    expect(CHROME_AUTOMATION_FLAGS).toContain('--no-first-run');
    expect(CHROME_AUTOMATION_FLAGS).toContain('--disable-popup-blocking');
    expect(CHROME_AUTOMATION_FLAGS).toContain('--disable-infobars');
  });

  test('all entries are strings starting with --', () => {
    for (const arg of CHROME_AUTOMATION_FLAGS) {
      expect(typeof arg).toBe('string');
      expect(arg.startsWith('--')).toBe(true);
    }
  });
});

describe('CHROME_STRIPPED_FEATURES', () => {
  test('is a non-empty array', () => {
    expect(Array.isArray(CHROME_STRIPPED_FEATURES)).toBe(true);
    expect(CHROME_STRIPPED_FEATURES.length).toBeGreaterThan(10);
  });

  test('contains known components', () => {
    expect(CHROME_STRIPPED_FEATURES).toContain('Translate');
    expect(CHROME_STRIPPED_FEATURES).toContain('MediaRouter');
    expect(CHROME_STRIPPED_FEATURES).toContain('Prerender2');
  });

  test('all entries are non-empty strings', () => {
    for (const component of CHROME_STRIPPED_FEATURES) {
      expect(typeof component).toBe('string');
      expect(component.length).toBeGreaterThan(0);
    }
  });
});

================================================
FILE: packages/core/src/viewport/launch-profile.ts
================================================
import type { LaunchOptions } from './types.js';
import { Config } from '../config/config.js';

/**
 * Chrome default args for automation — standard flags to disable
 * background noise, throttling, and other non-essential features.
 */
export const CHROME_AUTOMATION_FLAGS = [
  '--no-first-run',
  '--no-default-browser-check',
  '--disable-background-networking',
  '--disable-background-timer-throttling',
  '--disable-backgrounding-occluded-windows',
  '--disable-breakpad',
  '--disable-component-update',
  '--disable-default-apps',
  '--disable-dev-shm-usage',
  '--disable-extensions-except=',
  '--disable-hang-monitor',
  '--disable-ipc-flooding-protection',
  '--disable-popup-blocking',
  '--disable-prompt-on-repost',
  '--disable-renderer-backgrounding',
  '--disable-sync',
  '--disable-translate',
  '--metrics-recording-only',
  '--no-pings',
  '--password-store=basic',
  '--use-mock-keychain',
  '--disable-blink-features=AutomationControlled',
  '--disable-infobars',
  '--disable-session-crashed-bubble',
  '--force-color-profile=srgb',
];

/**
 * Chrome disabled-components flag values that reduce resource usage
 * and prevent interfering background services.
*/
export const CHROME_STRIPPED_FEATURES = [
  'InterestFeedContentSuggestions',
  'Translate',
  'OptimizationHints',
  'MediaRouter',
  'DialMediaRouteProvider',
  'CalculatorTool',
  'CrashedTabFinder',
  'AutofillServerCommunication',
  'BackgroundTracing',
  'NtpTiles',
  'OneGoogleBar',
  'ReadLater',
  'NTPArticleSuggestions',
  'CrossDeviceSync',
  'PrivacySandboxSettings4',
  'SidePanelPinning',
  'HistoryEmbeddings',
  'PrivacySandboxPromptV2',
  'GlobalMediaControls',
  'ComposeService',
  'AutofillFeature',
  'NTPSigninPromo',
  'Prerender2',
  'TabGroupsSave',
];

/**
 * Switches appended by `LaunchProfile.build()` when stealth mode is enabled.
 * The first entry also appears in CHROME_AUTOMATION_FLAGS.
 * NOTE(review): the second entry is a separate `--disable-features=` switch;
 * confirm how the browser combines it with the one built from
 * CHROME_STRIPPED_FEATURES.
 */
export const ANTI_DETECTION_FLAGS = [
  '--disable-blink-features=AutomationControlled',
  '--disable-features=AutomationControlled',
];

/**
 * Switches appended by `LaunchProfile.build()` when Docker mode is enabled
 * (set explicitly via `dockerMode()` or by `autoDetect()`).
 */
export const CONTAINER_FLAGS = [
  '--no-sandbox',
  '--disable-gpu',
  '--disable-software-rasterizer',
  '--disable-setuid-sandbox',
  '--single-process',
];

/**
 * Switches appended by `LaunchProfile.build()` when `relaxedSecurity` is set.
 * NOTE(review): carries its own `--disable-features=` switch; see the note on
 * ANTI_DETECTION_FLAGS.
 */
export const RELAXED_SECURITY_FLAGS = [
  '--disable-web-security',
  '--disable-site-isolation-trials',
  '--disable-features=IsolateOrigins,site-per-process',
];

/**
 * Switches appended by `LaunchProfile.build()` when deterministic rendering
 * is enabled. `--force-color-profile=srgb` is also in CHROME_AUTOMATION_FLAGS.
 */
export const REPRODUCIBLE_RENDER_FLAGS = [
  '--deterministic-mode',
  '--disable-skia-runtime-opts',
  '--disable-font-subpixel-positioning',
  '--force-color-profile=srgb',
  '--disable-lcd-text',
];

/**
 * Builder pattern for browser profile configuration.
 * Replaces the Python ViewportConfig with a fluent API.
*/
export class LaunchProfile {
  /** Values set explicitly through the builder; defaults are filled in by build(). */
  private options: Partial<LaunchOptions> = {};
  private _stealthMode = false;
  private _dockerMode = false;
  private _deterministicRendering = false;
  // Stored by maxIframes(); nothing in this class reads it when building.
  private _maxIframes = 3;
  private _downloadsPath?: string;
  private _extensions: string[] = [];

  /** Creates an empty profile; equivalent to `new LaunchProfile()`. */
  static create(): LaunchProfile {
    return new LaunchProfile();
  }

  /** Sets headless mode (defaults to `true` when called without an argument). */
  headless(value = true): this {
    this.options.headless = value;
    return this;
  }

  /** Enables or disables the RELAXED_SECURITY_FLAGS switches in build(). */
  relaxedSecurity(value = true): this {
    this.options.relaxedSecurity = value;
    return this;
  }

  /** Sets the window size used for both the options and `--window-size`. */
  windowSize(width: number, height: number): this {
    this.options.windowWidth = width;
    this.options.windowHeight = height;
    return this;
  }

  /** Sets the proxy server and optional credentials. */
  proxy(server: string, username?: string, password?: string): this {
    this.options.proxy = { server, username, password };
    return this;
  }

  /** Sets the browser user data directory. */
  userDataDir(dir: string): this {
    this.options.userDataDir = dir;
    return this;
  }

  /** Sets the path of the browser executable. */
  browserBinary(path: string): this {
    this.options.browserBinaryPath = path;
    return this;
  }

  /** Sets the `persistAfterClose` option (defaults to `true`). */
  persistAfterClose(value = true): this {
    this.options.persistAfterClose = value;
    return this;
  }

  /** Sets the browser channel name. */
  channel(name: string): this {
    this.options.channelName = name;
    return this;
  }

  /** Appends switches that build() places after all generated ones. */
  extraArgs(...args: string[]): this {
    this.options.extraArgs = [...(this.options.extraArgs ?? []), ...args];
    return this;
  }

  /** Enables or disables the ANTI_DETECTION_FLAGS switches in build(). */
  stealthMode(value = true): this {
    this._stealthMode = value;
    return this;
  }

  /** Enables or disables the CONTAINER_FLAGS switches in build(). */
  dockerMode(value = true): this {
    this._dockerMode = value;
    return this;
  }

  /** Enables or disables the REPRODUCIBLE_RENDER_FLAGS switches in build(). */
  deterministicRendering(value = true): this {
    this._deterministicRendering = value;
    return this;
  }

  /** Sets the directory passed as `--download-default-directory`. */
  downloadsPath(path: string): this {
    this._downloadsPath = path;
    return this;
  }

  /** Records the iframe limit. The value is not part of the build() output. */
  maxIframes(max: number): this {
    this._maxIframes = max;
    return this;
  }

  /** Adds an unpacked extension directory to load at launch. */
  addExtension(extensionPath: string): this {
    this._extensions.push(extensionPath);
    return this;
  }

  /**
   * Auto-detect and apply Docker settings if running inside a container.
   */
  autoDetect(): this {
    if (Config.isDocker()) {
      this._dockerMode = true;
      // Force headless in Docker if no display
      if (!Config.hasDisplay()) {
        this.options.headless = true;
      }
    }
    return this;
  }

  /**
   * Produces the final launch options.
   *
   * Argument order: CHROME_AUTOMATION_FLAGS, the stripped-features switch,
   * mode-specific flags, a merged `--disable-features` switch (only when more
   * than one such switch was generated), window size, extensions, downloads,
   * and finally the caller's extra args so they can override anything earlier.
   *
   * @returns A fully populated LaunchOptions object with defaults applied.
   */
  build(): LaunchOptions {
    const disableFeaturesPrefix = '--disable-features=';
    const args = [...CHROME_AUTOMATION_FLAGS];

    // Disabled components
    args.push('--disable-component-extensions-with-background-pages');
    args.push(`${disableFeaturesPrefix}${CHROME_STRIPPED_FEATURES.join(',')}`);

    // Mode-specific args
    if (this._stealthMode) {
      args.push(...ANTI_DETECTION_FLAGS);
    }
    if (this._dockerMode) {
      args.push(...CONTAINER_FLAGS);
    }
    if (this._deterministicRendering) {
      args.push(...REPRODUCIBLE_RENDER_FLAGS);
    }
    if (this.options.relaxedSecurity) {
      args.push(...RELAXED_SECURITY_FLAGS);
    }

    // When a switch is repeated the browser keeps only its last value, so the
    // stealth / relaxed-security `--disable-features` entries would otherwise
    // replace the stripped-features list. The individual switches stay in
    // place and one merged switch is added after them.
    const featureSwitches = args.filter((arg) => arg.startsWith(disableFeaturesPrefix));
    if (featureSwitches.length > 1) {
      const mergedFeatures = new Set(
        featureSwitches.flatMap((arg) =>
          arg
            .slice(disableFeaturesPrefix.length)
            .split(',')
            .filter((feature) => feature.length > 0),
        ),
      );
      args.push(`${disableFeaturesPrefix}${[...mergedFeatures].join(',')}`);
    }

    // Window size
    const width = this.options.windowWidth ?? 1280;
    const height = this.options.windowHeight ?? 1100;
    args.push(`--window-size=${width},${height}`);

    // Extensions
    if (this._extensions.length > 0) {
      const extensionPaths = this._extensions.join(',');
      // CHROME_AUTOMATION_FLAGS carries an empty `--disable-extensions-except=`.
      // Supply the real allow-list alongside `--load-extension`, as the
      // Playwright extension recipe does, so the requested extensions load.
      args.push(`--disable-extensions-except=${extensionPaths}`);
      args.push(`--load-extension=${extensionPaths}`);
    }

    // Downloads
    if (this._downloadsPath) {
      args.push(`--download-default-directory=${this._downloadsPath}`);
    }

    // User extra args (last, so they can override)
    if (this.options.extraArgs) {
      args.push(...this.options.extraArgs);
    }

    return {
      headless: this.options.headless ?? true,
      relaxedSecurity: this.options.relaxedSecurity ?? false,
      extraArgs: args,
      windowWidth: width,
      windowHeight: height,
      proxy: this.options.proxy,
      userDataDir: this.options.userDataDir,
      browserBinaryPath: this.options.browserBinaryPath,
      persistAfterClose: this.options.persistAfterClose ?? false,
      channelName: this.options.channelName,
    };
  }
}

================================================
FILE: packages/core/src/viewport/types.ts
================================================
import { z } from 'zod';
import type { TabId } from '../types.js';

export interface TabDescriptor {
  tabId: TabId;
  url: string;
  title: string;
  isActive: boolean;
}

export interface ViewportSnapshot {
  url: string;
  title: string;
  tabs: TabDescriptor[];
  activeTabIndex: number;
  screenshot?: string;
  domTree?: string;
  // NOTE(review): the type arguments of this Record appear to be missing — restore from the original definition.
  selectorMap?: Record;
  pixelsAbove?: number;
  pixelsBelow?: number;
}

export interface ViewportHistory {
  url: string;
  title: string;
  tabs: TabDescriptor[];
  interactedElements: Array<{
    index: number;
    description: string;
    action: string;
  }>;
  screenshot?: string;
}

export const LaunchOptionsSchema = z.object({
  headless: z.boolean().default(true),
  relaxedSecurity: z.boolean().default(false),
  extraArgs: z.array(z.string()).default([]),
  windowWidth: z.number().default(1280),
  windowHeight: z.number().default(1100),
  proxy: z
    .object({
      server: z.string(),
      username: z.string().optional(),
      password: z.string().optional(),
    })
    .optional(),
  userDataDir: z.string().optional(),
  browserBinaryPath: z.string().optional(),
  persistAfterClose: z.boolean().default(false),
  channelName: z.string().optional(),
});

export type LaunchOptions = z.infer<typeof LaunchOptionsSchema>;

export interface PageState {
  url: string;
  title: string;
  content?: string;
  screenshot?: string;
}

================================================
FILE: packages/core/src/viewport/viewport.ts
================================================
import {
  chromium,
  type Browser,
  type BrowserContext,
  type Page,
  type CDPSession,
} from 'playwright';
import { EventHub } from './event-hub.js';
import type { ViewportEventMap, ViewportRequestMap } from './events.js';
import type { LaunchOptions, ViewportSnapshot, TabDescriptor } from './types.js';
import { LaunchProfile } from './launch-profile.js';
import { BaseGuard, type GuardContext } from './guard-base.js';
import { LaunchFailedError, ViewportCrashedError } from '../errors.js';
import { tabId, targetId, type TargetId } from '../types.js';
import { createLogger } from '../logging.js';
import { timed } from '../telemetry.js';
import { isNewTabPage } from '../utils.js';

// Watchdogs
import { LocalInstanceGuard } from './guards/local-instance.js';
import { UrlPolicyGuard } from './guards/url-policy.js';
import { DefaultHandlerGuard } from './guards/default-handler.js';
import { PopupGuard } from './guards/popups.js';
import { PageReadyGuard } from './guards/page-ready.js';
import { DownloadGuard } from './guards/downloads.js';
import { BlankPageGuard } from './guards/blank-page.js';
import { CrashGuard } from './guards/crash.js';
import { PersistenceGuard } from './guards/persistence.js';
import { ScreenshotGuard } from './guards/screenshot.js';

const logger = createLogger('browser-session');

// ── Multi-target tracking ──

/** Represents a single CDP target (page, iframe, service worker, etc.) */
export interface Target {
  targetId: TargetId;
  type: 'page' | 'iframe' | 'service_worker' | 'worker' | 'other';
  url: string;
  title: string;
}

/** Viewport dimensions as detected via CDP */
export interface ViewportInfo {
  width: number;
  height: number;
  deviceScaleFactor: number;
  isMobile: boolean;
}

export interface ViewportOptions {
  /** Launch options (or use LaunchProfile) */
  launchOptions?: Partial<LaunchOptions>;
  /** Pre-built browser profile */
  profile?: LaunchProfile;
  /** Connect to existing browser via WebSocket URL */
  wsEndpoint?: string;
  /** Connect to existing browser via CDP URL */
  cdpUrl?: string;
  /** Headless mode shortcut */
  headless?: boolean;
  /** Allowed URLs for security watchdog */
  allowedUrls?: string[];
  /** Blocked URLs for security watchdog */
  blockedUrls?: string[];
  /** Storage state file path */
  storageStatePath?: string;
  /** Extra watchdogs */
  watchdogs?: BaseGuard[];
  /** Minimum wait after page load (ms) */
  minWaitPageLoadMs?: number;
  /** Wait for network idle (ms) */
  waitForNetworkIdleMs?: number;
  /** Max wait for page load (ms) */
  maxWaitPageLoadMs?: number;
  /** Max reconnection attempts */
  maxReconnectAttempts?: number;
  /** Delay between reconnection attempts (ms) */
  reconnectDelayMs?: number;
}

export class Viewport {
  private browser: Browser | null = null;
  private context: BrowserContext | null = null;
  private _currentPage: Page | null = null;
  private cdpSession: CDPSession | null = null;

  // NOTE(review): EventHub may take type arguments (ViewportEventMap /
  // ViewportRequestMap are imported above) — confirm against event-hub.ts.
  readonly eventBus: EventHub;
  private watchdogs: BaseGuard[] = [];
  private options: ViewportOptions;
  private launchOptions: LaunchOptions;
  private _isConnected = false;

  private readonly minWaitPageLoadMs: number;
  private readonly waitForNetworkIdleMs: number;
  private readonly maxWaitPageLoadMs: number;
  private readonly maxReconnectAttempts: number;
  private readonly reconnectDelayMs: number;

  /** Tracks known CDP targets keyed by targetId */
  private knownTargets = new Map<string, Target>();

  /** Cached viewport info, invalidated on page/tab switch */
  private cachedViewport: ViewportInfo | null = null;

  /** Tracks whether a reconnection is currently in progress */
  private reconnecting = false;

  /**
   * Resolves launch options (a supplied profile wins over `launchOptions`,
   * and the `headless` shortcut wins over `launchOptions.headless`) and
   * applies defaults for the timing and reconnection settings.
   */
  constructor(options: ViewportOptions = {}) {
    this.options = options;
    this.eventBus = new EventHub({ maxHistory: 200 });

    if (options.profile) {
      this.launchOptions = options.profile.build();
    } else {
      this.launchOptions = {
        headless: options.headless ?? options.launchOptions?.headless ?? true,
        relaxedSecurity: options.launchOptions?.relaxedSecurity ?? false,
        extraArgs: options.launchOptions?.extraArgs ?? [],
        windowWidth: options.launchOptions?.windowWidth ?? 1280,
        windowHeight: options.launchOptions?.windowHeight ?? 1100,
        proxy: options.launchOptions?.proxy,
        userDataDir: options.launchOptions?.userDataDir,
        browserBinaryPath: options.launchOptions?.browserBinaryPath,
        persistAfterClose: options.launchOptions?.persistAfterClose ?? false,
        channelName: options.launchOptions?.channelName,
      };
    }

    this.minWaitPageLoadMs = options.minWaitPageLoadMs ?? 500;
    this.waitForNetworkIdleMs = options.waitForNetworkIdleMs ?? 1000;
    this.maxWaitPageLoadMs = options.maxWaitPageLoadMs ?? 5000;
    this.maxReconnectAttempts = options.maxReconnectAttempts ?? 3;
    this.reconnectDelayMs = options.reconnectDelayMs ?? 1000;
  }

  get isConnected(): boolean {
    return this._isConnected;
  }

  /** The active page. Throws ViewportCrashedError when there is none. */
  get currentPage(): Page {
    if (!this._currentPage) {
      throw new ViewportCrashedError('No active page');
    }
    return this._currentPage;
  }

  /** The active browser context. Throws ViewportCrashedError when there is none. */
  get browserContext(): BrowserContext {
    if (!this.context) {
      throw new ViewportCrashedError('No active browser context');
    }
    return this.context;
  }

  get cdp(): CDPSession | null {
    return this.cdpSession;
  }

  // ── Lifecycle ──

  /**
   * Connects to or launches the browser, picks (or creates) a context and
   * page, opens a CDP session, wires event handlers, initializes guards and
   * emits the initial lifecycle events.
   *
   * @throws LaunchFailedError when any step fails. Whatever was created up to
   *   that point (browser, CDP session, attached guards) is released first.
   */
  async start(): Promise<void> {
    const { durationMs } = await timed('browser-session.start', async () => {
      try {
        logger.info('Starting browser session');

        if (this.options.wsEndpoint) {
          logger.debug(`Connecting via WebSocket: ${this.options.wsEndpoint}`);
          this.browser = await chromium.connect(this.options.wsEndpoint);
        } else if (this.options.cdpUrl) {
          logger.debug(`Connecting via CDP: ${this.options.cdpUrl}`);
          this.browser = await chromium.connectOverCDP(this.options.cdpUrl);
        } else {
          this.browser = await this.launchBrowser();
        }

        const contexts = this.browser.contexts();
        if (contexts.length > 0) {
          this.context = contexts[0];
          logger.debug('Reusing existing browser context');
        } else {
          this.context = await this.createContext();
          logger.debug('Created new browser context');
        }

        const pages = this.context.pages();
        if (pages.length > 0) {
          this._currentPage = pages[0];
        } else {
          this._currentPage = await this.context.newPage();
        }

        // Create CDP session
        this.cdpSession = await this._currentPage.context().newCDPSession(this._currentPage);

        this._isConnected = true;

        // Wire up disconnect detection on the browser
        this.setupDisconnectHandler();

        // Discover initial targets
        await this.refreshTargets();

        // Detect initial viewport via CDP
        this.cachedViewport = null;
        await this.detectViewport();

        // Initialize watchdogs
        await this.initializeWatchdogs();

        // Set up page lifecycle listeners on the context
        this.setupPageLifecycleListeners();

        const pageUrl = this._currentPage.url();
        const pageTitle = await this._currentPage.title();

        // Emit initial lifecycle events
        this.eventBus.emit('content-ready', undefined as any);
        if (!isNewTabPage(pageUrl)) {
          this.eventBus.emit('page-ready', { url: pageUrl });
        }
        this.eventBus.emit('viewport-state', {
          url: pageUrl,
          title: pageTitle,
          tabCount: this.context.pages().length,
        });

        logger.info(`Browser session started: ${pageUrl}`);
      } catch (error) {
        // Do not leave a half-started browser process or connection behind.
        await this.releaseFailedStart();
        throw new LaunchFailedError(
          `Failed to start browser: ${error instanceof Error ? error.message : String(error)}`,
          { cause: error instanceof Error ? error : undefined },
        );
      }
    });

    logger.debug(`start() completed in ${durationMs.toFixed(1)}ms`);
  }

  /**
   * Best-effort teardown after a failed start(): detaches guards and the CDP
   * session, resets internal state and closes the browser. Never throws.
   */
  private async releaseFailedStart(): Promise<void> {
    const abandoned = this.browser;
    // Clear the reference first: the 'disconnected' handler ignores browsers
    // that are no longer current, so closing this one is not reported as a crash.
    this.browser = null;

    await this.cleanupForReconnect();

    if (abandoned) {
      try {
        await abandoned.close();
      } catch {
        // The browser may already be gone.
      }
    }
  }

  /**
   * Marks the session disconnected and emits a `crash` event when the current
   * browser disconnects. Events from a browser this session has already
   * replaced or released are ignored.
   */
  private setupDisconnectHandler(): void {
    const browser = this.browser;
    if (!browser) return;

    browser.on('disconnected', () => {
      if (this.browser !== browser) return;

      logger.warn('Browser disconnected');
      this._isConnected = false;
      this.eventBus.emit('crash', { reason: 'Browser disconnected unexpectedly' });
    });
  }

  /**
   * Listens for new pages on the context: emits `tab-opened`, refreshes the
   * target list and emits an updated `viewport-state`.
   */
  private setupPageLifecycleListeners(): void {
    if (!this.context) return;

    // Track new pages (tabs) being created
    this.context.on('page', async (page: Page) => {
      const url = page.url();
      logger.debug(`New page created: ${url}`);
      this.eventBus.emit('tab-opened', { url });

      // Refresh target list when new pages appear
      await this.refreshTargets();

      // Emit browser-state update
      try {
        this.eventBus.emit('viewport-state', {
          url: this._currentPage?.url() ?? url,
          title: this._currentPage ? await this._currentPage.title() : '',
          tabCount: this.context?.pages().length ?? 1,
        });
      } catch {
        // Page might be closed already
      }

      // When the new page loads, log it (no event is emitted from here)
      page.on('load', () => {
        const loadedUrl = page.url();
        if (!isNewTabPage(loadedUrl)) {
          logger.debug(`Page loaded in new tab: ${loadedUrl}`);
        }
      });
    });
  }

  // ── Multi-target tracking ──

  /**
   * Queries CDP for the current list of targets (pages, iframes, workers, etc.)
* and updates the internal target map.
   */
  async getTargets(): Promise<Target[]> {
    await this.refreshTargets();
    return Array.from(this.knownTargets.values());
  }

  /**
   * Replaces the contents of `knownTargets` with the result of CDP
   * `Target.getTargets`. Does nothing without a CDP session; a failed query
   * is logged at debug level and leaves the map as it was.
   */
  private async refreshTargets(): Promise<void> {
    if (!this.cdpSession) return;

    try {
      // One await is enough: the resolved value is the payload itself.
      const { targetInfos } = (await this.cdpSession.send('Target.getTargets')) as {
        targetInfos: Array<{ targetId: string; type: string; url: string; title: string }>;
      };

      this.knownTargets.clear();
      for (const info of targetInfos) {
        this.knownTargets.set(info.targetId, {
          targetId: targetId(info.targetId),
          type: normalizeTargetType(info.type),
          url: info.url,
          title: info.title,
        });
      }

      logger.debug(`Refreshed targets: ${this.knownTargets.size} found`);
    } catch (error) {
      logger.debug(
        `Failed to refresh targets: ${error instanceof Error ? error.message : String(error)}`,
      );
    }
  }

  /**
   * Find a target by its targetId.
   */
  findTarget(id: TargetId): Target | undefined {
    return this.knownTargets.get(id);
  }

  /**
   * Get only page-type targets, filtering out new-tab pages.
   */
  async getPageTargets(): Promise<Target[]> {
    const targets = await this.getTargets();
    return targets.filter((t) => t.type === 'page' && !isNewTabPage(t.url));
  }

  // ── Viewport detection via CDP ──

  /**
   * Detects the actual viewport dimensions by evaluating JavaScript in the page
   * via CDP Runtime.evaluate. This is more accurate than Playwright's viewportSize()
   * because it reflects the real rendered viewport including device pixel ratio.
*/
  async detectViewport(): Promise<ViewportInfo> {
    if (this.cachedViewport) {
      return this.cachedViewport;
    }

    // Used when there is no CDP session or detection fails: the configured
    // window size at scale factor 1. The fallback is cached like a real result.
    const useLaunchDefaults = (): ViewportInfo => {
      const fallback: ViewportInfo = {
        width: this.launchOptions.windowWidth,
        height: this.launchOptions.windowHeight,
        deviceScaleFactor: 1,
        isMobile: false,
      };
      this.cachedViewport = fallback;
      return fallback;
    };

    const session = this.cdpSession;
    if (!session) {
      // Fallback to launch options if no CDP session
      return useLaunchDefaults();
    }

    try {
      const { result: evaluation } = await timed('detectViewport', async () => {
        const response = await session.send('Runtime.evaluate', {
          expression: `JSON.stringify({
            width: window.innerWidth,
            height: window.innerHeight,
            deviceScaleFactor: window.devicePixelRatio || 1,
            isMobile: /Mobi|Android/i.test(navigator.userAgent)
          })`,
          returnByValue: true,
        });
        return response as { result: { value?: unknown } };
      });

      const serialized = evaluation.result.value;
      if (typeof serialized !== 'string') {
        throw new Error('Runtime.evaluate returned no value');
      }

      // Validate the payload instead of trusting its shape; anything
      // unexpected falls through to the launch-option defaults below.
      const payload: unknown = JSON.parse(serialized);
      const { width, height, deviceScaleFactor, isMobile } = payload as Partial<ViewportInfo>;
      if (
        typeof width !== 'number' ||
        typeof height !== 'number' ||
        typeof deviceScaleFactor !== 'number' ||
        typeof isMobile !== 'boolean'
      ) {
        throw new Error('Unexpected viewport payload');
      }

      const detected: ViewportInfo = { width, height, deviceScaleFactor, isMobile };
      this.cachedViewport = detected;
      logger.debug(
        `Viewport detected: ${detected.width}x${detected.height} @${detected.deviceScaleFactor}x`,
      );
      return detected;
    } catch (error) {
      logger.warn(
        `Viewport detection failed, using defaults: ${error instanceof Error ? error.message : String(error)}`,
      );
      return useLaunchDefaults();
    }
  }

  /** Invalidates the cached viewport, forcing a fresh CDP detection on next access. */
  invalidateViewportCache(): void {
    this.cachedViewport = null;
  }

  // ── Reconnection logic ──

  /**
   * Attempts to reconnect to the browser after a disconnect. Uses the original
   * connection method (wsEndpoint, cdpUrl, or local launch). Retries up to
   * maxReconnectAttempts with exponential backoff.
   *
   * Returns true if reconnection succeeded, false otherwise.
*/
  async reconnect(): Promise<boolean> {
    if (this.reconnecting) {
      logger.warn('Reconnection already in progress, skipping');
      return false;
    }

    this.reconnecting = true;
    logger.info('Attempting to reconnect browser session');

    try {
      // Clean up current state without emitting close event
      await this.cleanupForReconnect();

      let delay = this.reconnectDelayMs;

      for (let attempt = 1; attempt <= this.maxReconnectAttempts; attempt++) {
        logger.info(`Reconnect attempt ${attempt}/${this.maxReconnectAttempts}`);

        try {
          if (this.options.wsEndpoint) {
            this.browser = await chromium.connect(this.options.wsEndpoint);
          } else if (this.options.cdpUrl) {
            this.browser = await chromium.connectOverCDP(this.options.cdpUrl);
          } else {
            // For locally launched browsers, we need to launch a new instance
            this.browser = await this.launchBrowser();
          }

          // Re-establish context
          const contexts = this.browser.contexts();
          if (contexts.length > 0) {
            this.context = contexts[0];
          } else {
            this.context = await this.createContext();
          }

          // Re-establish page
          const pages = this.context.pages();
          if (pages.length > 0) {
            this._currentPage = pages[0];
          } else {
            this._currentPage = await this.context.newPage();
          }

          // Re-create CDP session
          this.cdpSession = await this._currentPage.context().newCDPSession(this._currentPage);

          this._isConnected = true;
          this.cachedViewport = null;

          // Re-wire handlers
          this.setupDisconnectHandler();
          this.setupPageLifecycleListeners();

          // Refresh targets after reconnect
          await this.refreshTargets();

          // Re-initialize watchdogs
          await this.initializeWatchdogs();

          logger.info(`Reconnected successfully on attempt ${attempt}`);

          // Emit lifecycle events for the reconnected state
          const url = this._currentPage.url();
          const title = await this._currentPage.title();
          this.eventBus.emit('viewport-state', {
            url,
            title,
            tabCount: this.context.pages().length,
          });

          return true;
        } catch (error) {
          logger.warn(
            `Reconnect attempt ${attempt} failed: ${error instanceof Error ? error.message : String(error)}`,
          );

          // Release whatever this attempt managed to create (a launched or
          // connected browser, a CDP session, attached guards) so failed
          // attempts do not accumulate.
          await this.cleanupForReconnect();

          if (attempt < this.maxReconnectAttempts) {
            await new Promise<void>((resolve) => setTimeout(resolve, delay));
            delay *= 2; // Exponential backoff
          }
        }
      }

      logger.error(`All ${this.maxReconnectAttempts} reconnect attempts failed`);
      this.eventBus.emit('crash', { reason: 'Reconnection failed after all attempts' });
      return false;
    } finally {
      this.reconnecting = false;
    }
  }

  /**
   * Cleans up internal state in preparation for a reconnect attempt,
   * without emitting lifecycle events or clearing the event bus.
   * Every step is best-effort; this method does not throw.
   */
  private async cleanupForReconnect(): Promise<void> {
    // Detach watchdogs
    for (const watchdog of this.watchdogs) {
      try {
        await watchdog.detach();
      } catch {
        // Ignore detach errors during reconnect
      }
    }
    this.watchdogs = [];

    // Detach CDP session
    if (this.cdpSession) {
      try {
        await this.cdpSession.detach();
      } catch {
        // Ignore
      }
      this.cdpSession = null;
    }

    const browser = this.browser;
    if (browser) {
      const isRemote = Boolean(this.options.wsEndpoint || this.options.cdpUrl);
      // A locally launched browser is always closed. A remote one is normally
      // already disconnected at this point; it is only closed when its
      // connection is still open (e.g. after a failed reconnect attempt).
      if (!isRemote || browser.isConnected()) {
        try {
          await browser.close();
        } catch {
          // Ignore
        }
      }
    }

    this.browser = null;
    this.context = null;
    this._currentPage = null;
    this._isConnected = false;
    this.knownTargets.clear();
    this.cachedViewport = null;
  }

  // ── DOM stability ──

  /**
   * Waits for the DOM to stop mutating. Uses a MutationObserver injected via
   * page.evaluate to detect when no DOM changes occur for a quiet period.
*
   * Resolves when the quiet period elapses, when the overall timeout fires,
   * or when the page navigates/closes during the wait; it never rejects.
   *
   * @param timeout - Maximum time to wait in ms (default: 3000)
   * @param quietPeriodMs - How long the DOM must be silent to be considered stable (default: 300)
   */
  async waitForStableDOM(timeout = 3000, quietPeriodMs = 300): Promise {
    const page = this.currentPage;
    let interrupted = false;

    const { durationMs } = await timed('waitForStableDOM', async () => {
      try {
        await page.evaluate(
          ({ timeoutMs, quietMs }) => {
            return new Promise<void>((resolve) => {
              // document.body is null while the document is still being parsed;
              // observing the root element keeps the wait meaningful in that case.
              const root = document.body ?? document.documentElement;
              if (!root) {
                resolve();
                return;
              }

              let quietTimer: ReturnType<typeof setTimeout> | undefined;
              let overallTimer: ReturnType<typeof setTimeout> | undefined;

              // Single exit path: stop observing, cancel both timers, resolve.
              const finish = (): void => {
                observer.disconnect();
                clearTimeout(quietTimer);
                clearTimeout(overallTimer);
                resolve();
              };

              // Every mutation restarts the quiet-period countdown.
              const observer = new MutationObserver(() => {
                clearTimeout(quietTimer);
                quietTimer = setTimeout(finish, quietMs);
              });

              observer.observe(root, {
                childList: true,
                subtree: true,
                attributes: true,
                characterData: true,
              });

              // Start the quiet period timer immediately -- if no mutations
              // happen at all, we resolve after quietMs
              quietTimer = setTimeout(finish, quietMs);

              // Overall timeout: resolve even if mutations keep happening
              overallTimer = setTimeout(finish, timeoutMs);
            });
          },
          { timeoutMs: timeout, quietMs: quietPeriodMs },
        );
      } catch (error) {
        // If the page navigated or was closed, just return
        interrupted = true;
        logger.debug(
          `waitForStableDOM interrupted: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
    });

    // Only claim stability when the in-page wait actually completed.
    if (!interrupted) {
      logger.debug(`DOM stabilized in ${durationMs.toFixed(1)}ms`);
    }
  }

  // ── Visible HTML extraction ──

  /**
   * Returns the HTML of elements currently visible in the viewport.
   * Uses IntersectionObserver logic evaluated in-page to collect only
   * elements that are within the visible area, then serializes them.
*/
  async getVisibleHtml(): Promise {
    const page = this.currentPage;

    const { result: html } = await timed('getVisibleHtml', async () => {
      return page.evaluate(() => {
        function isInViewport(el: Element): boolean {
          const rect = el.getBoundingClientRect();
          // Element is at least partially visible
          return (
            rect.bottom > 0 &&
            rect.right > 0 &&
            rect.top < window.innerHeight &&
            rect.left < window.innerWidth &&
            rect.width > 0 &&
            rect.height > 0
          );
        }

        function isVisible(el: Element): boolean {
          const style = window.getComputedStyle(el);
          return (
            style.display !== 'none' &&
            style.visibility !== 'hidden' &&
            style.opacity !== '0' &&
            isInViewport(el)
          );
        }

        // Walk the DOM and collect visible top-level elements
        const visibleParts: string[] = [];
        const body = document.body;
        if (!body) return '';

        // Collect direct children of body that are visible
        for (const child of Array.from(body.children)) {
          if (!isVisible(child)) continue;

          // Computed styles only exist for nodes attached to the document, so
          // hidden descendants are detected on the live subtree and then removed
          // from the clone by position. cloneNode(true) preserves document
          // order, so index i in both querySelectorAll results is the same node.
          const liveDescendants = Array.from(child.querySelectorAll('*'));
          const clone = child.cloneNode(true) as Element;
          const clonedDescendants = Array.from(clone.querySelectorAll('*'));

          liveDescendants.forEach((live, i) => {
            const s = window.getComputedStyle(live);
            if (s.display === 'none' || s.visibility === 'hidden') {
              clonedDescendants[i]?.remove();
            }
          });

          visibleParts.push(clone.outerHTML);
        }

        if (visibleParts.length === 0) {
          // Fallback: return the body's innerHTML truncated
          return body.innerHTML.slice(0, 50000);
        }

        return visibleParts.join('\n');
      });
    });

    return html;
  }

  // ── Launch & context setup (existing) ──

  /** Launches a local Chromium instance configured from the launch options. */
  private async launchBrowser(): Promise {
    const args = this.buildChromiumArgs();
    logger.debug(`Launching chromium with ${args.length} args`);

    return chromium.launch({
      headless: this.launchOptions.headless,
      args,
      // Empty strings are normalized to undefined so they are treated as "not set"
      executablePath: this.launchOptions.browserBinaryPath || undefined,
      channel: this.launchOptions.channelName || undefined,
      proxy: this.launchOptions.proxy
        ? {
            server: this.launchOptions.proxy.server,
            username: this.launchOptions.proxy.username,
            password: this.launchOptions.proxy.password,
          }
        : undefined,
    });
  }

  /**
   * Builds the Chromium command-line flags: window size, caller-supplied extra
   * args, and the web-security relaxations when relaxedSecurity is enabled.
   */
  private buildChromiumArgs(): string[] {
    const args = [
      `--window-size=${this.launchOptions.windowWidth},${this.launchOptions.windowHeight}`,
      ...this.launchOptions.extraArgs,
    ];

    if (this.launchOptions.relaxedSecurity) {
      args.push(
        '--disable-web-security',
        '--disable-site-isolation-trials',
        '--disable-features=IsolateOrigins,site-per-process',
      );
    }

    return args;
  }

  /** Creates a browser context sized to the configured window, with downloads enabled. */
  private async createContext(): Promise {
    const context = await this.browser!.newContext({
      viewport: {
        width: this.launchOptions.windowWidth,
        height: this.launchOptions.windowHeight,
      },
      userAgent: undefined, // Use default
      javaScriptEnabled: true,
      ignoreHTTPSErrors: this.launchOptions.relaxedSecurity,
      acceptDownloads: true,
    });
    return context;
  }

  private async initializeWatchdogs(): Promise {
    const ctx: GuardContext = {
      page: this._currentPage!,
      context: this.context!,
      eventBus: this.eventBus,
    };

    // Create default watchdogs
    this.watchdogs = [
      new LocalInstanceGuard(),
      new UrlPolicyGuard(this.options.allowedUrls, this.options.blockedUrls),
      new DefaultHandlerGuard(),
      new PopupGuard(),
      new PageReadyGuard(),
      new DownloadGuard(),
      new BlankPageGuard(),
      new CrashGuard(),
      new ScreenshotGuard(),
      ...(this.options.watchdogs ??
[]),
    ];

    const storagePath = this.options.storageStatePath;
    if (storagePath) {
      this.watchdogs.push(new PersistenceGuard(storagePath));
    }

    // Sort by priority (lower = higher priority)
    this.watchdogs.sort((left, right) => left.priority - right.priority);

    // Attach all watchdogs, one after another, in priority order
    for (const guard of this.watchdogs) {
      await guard.attach(ctx);
    }

    logger.debug(`Initialized ${this.watchdogs.length} watchdogs`);
  }

  // ── Navigation & interaction (existing, enhanced) ──

  /**
   * Navigates the current page to `url`, waits for it to settle, then refreshes
   * cached state and announces the new page on the event bus.
   *
   * Errors from `goto` whose message mentions "Timeout" are tolerated because
   * the page may still be loading; other Error instances are rethrown.
   */
  async navigate(url: string): Promise {
    const target = this.currentPage;
    logger.debug(`Navigating to: ${url}`);

    try {
      await target.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: this.maxWaitPageLoadMs,
      });
    } catch (error) {
      // Timeout is OK, page might still be loading
      const tolerated = !(error instanceof Error) || error.message.includes('Timeout');
      if (!tolerated) {
        throw error;
      }
    }

    await this.waitForPageReady();

    // Invalidate viewport cache after navigation (page dimensions may change)
    this.cachedViewport = null;

    // Refresh targets (navigation may create/destroy targets)
    await this.refreshTargets();

    this.eventBus.emit('page-ready', { url: target.url() });

    const currentUrl = target.url();
    const title = await target.title();
    const tabCount = this.context!.pages().length;
    this.eventBus.emit('viewport-state', { url: currentUrl, title, tabCount });
  }

  /**
   * Sleeps for the configured minimum, then waits (bounded by
   * waitForNetworkIdleMs) for the network to go idle. Never throws on timeout.
   */
  async waitForPageReady(): Promise {
    const target = this.currentPage;

    // Minimum wait
    await new Promise((done) => setTimeout(done, this.minWaitPageLoadMs));

    // Wait for network idle
    try {
      await target.waitForLoadState('networkidle', { timeout: this.waitForNetworkIdleMs });
    } catch {
      // Timeout is OK
    }
  }

  /** Clicks the element matching `selector`, giving up after 5 seconds. */
  async click(selector: string): Promise {
    const target = this.currentPage;
    await target.click(selector, { timeout: 5000 });
  }

  /** Fills the element matching `selector` with `text`. */
  async type(selector: string, text: string): Promise {
    const target = this.currentPage;
    await target.fill(selector, text);
  }

  /** Presses `key` on the current page's keyboard. */
  async pressKey(key: string): Promise {
    const { keyboard } = this.currentPage;
    await keyboard.press(key);
  }

  async screenshot(fullPage = false): Promise<{ base64: string; width: number; height: number }> {
    const page = this.currentPage;
    const buffer = await page.screenshot({
      fullPage,
      type: 'png',
    });
    const
base64 = buffer.toString('base64');
    const viewport = page.viewportSize();

    // Reported dimensions are the viewport size (or the launch window size when
    // no viewport is set), also when fullPage is requested.
    return {
      base64,
      width: viewport?.width ?? this.launchOptions.windowWidth,
      height: viewport?.height ?? this.launchOptions.windowHeight,
    };
  }

  /** Returns the current page's URL and title plus a descriptor for every open tab. */
  async getState(): Promise {
    const page = this.currentPage;
    const pages = this.context!.pages();
    const activeIndex = pages.indexOf(page);

    const tabs: TabDescriptor[] = pages.map((p, i) => ({
      tabId: tabId(i),
      url: p.url(),
      title: '', // Will be populated async
      isActive: i === activeIndex,
    }));

    // Get titles in parallel
    await Promise.all(
      tabs.map(async (tab, i) => {
        try {
          tab.title = await pages[i].title();
        } catch {
          // A tab that is closing or crashed keeps an empty title
          tab.title = '';
        }
      }),
    );

    return {
      url: page.url(),
      title: await page.title(),
      tabs,
      activeTabIndex: activeIndex,
    };
  }

  /**
   * Makes the tab at `tabIndex` the current page.
   *
   * @throws Error when `tabIndex` is outside the range of open tabs.
   */
  async switchTab(tabIndex: number): Promise {
    const pages = this.context!.pages();
    if (tabIndex < 0 || tabIndex >= pages.length) {
      throw new Error(`Invalid tab index: ${tabIndex}. Available tabs: ${pages.length}`);
    }

    this._currentPage = pages[tabIndex];
    await this._currentPage.bringToFront();

    // Re-create CDP session for new page
    this.cdpSession = await this._currentPage.context().newCDPSession(this._currentPage);

    // Invalidate viewport cache when switching tabs
    this.cachedViewport = null;

    // Refresh target list
    await this.refreshTargets();

    this.eventBus.emit('tab-changed', { tabIndex });
  }

  /**
   * Closes the tab at `tabIndex` (default: the current tab) and makes the tab
   * now occupying that position, or the last tab, the current page.
   *
   * @throws Error when only one tab is open or when the index is out of range.
   */
  async closeTab(tabIndex?: number): Promise {
    const pages = this.context!.pages();
    const index = tabIndex ?? pages.indexOf(this.currentPage);

    if (pages.length <= 1) {
      throw new Error('Cannot close the last tab');
    }

    // Same validation as switchTab. Without it an out-of-range index, or a
    // current page that is no longer in the context (indexOf === -1), would
    // surface as an opaque TypeError on `undefined.close()`.
    if (!Number.isInteger(index) || index < 0 || index >= pages.length) {
      throw new Error(`Invalid tab index: ${index}. Available tabs: ${pages.length}`);
    }

    const pageToClose = pages[index];
    await pageToClose.close();

    // Switch to remaining page
    // NOTE(review): this also moves focus when a background tab was closed --
    // confirm that is intended.
    const remainingPages = this.context!.pages();
    if (remainingPages.length > 0) {
      const newIndex = Math.min(index, remainingPages.length - 1);
      this._currentPage = remainingPages[newIndex];
      await this._currentPage.bringToFront();
      this.cdpSession = await this._currentPage.context().newCDPSession(this._currentPage);
    }

    // Invalidate caches
    this.cachedViewport = null;

    // Refresh targets after closing a tab
    await this.refreshTargets();

    this.eventBus.emit('tab-closed', { tabIndex: index });
  }

  /** Opens a new tab, makes it the current page and optionally navigates it to `url`. */
  async newTab(url?: string): Promise {
    const page = await this.context!.newPage();
    this._currentPage = page;

    // Bind the CDP session and drop the cached viewport BEFORE navigating.
    // navigate() refreshes targets and emits state events; anything that uses
    // this.cdpSession during that work must talk to the new page, not the old tab.
    this.cdpSession = await page.context().newCDPSession(page);
    this.cachedViewport = null;

    if (url) {
      await this.navigate(url);
    }
  }

  /** Evaluates `expression` in the current page and returns its result. */
  async evaluate(expression: string): Promise {
    return this.currentPage.evaluate(expression) as Promise;
  }

  /** Adopts an externally supplied page as the current page and re-binds the CDP session to it. */
  async setPage(page: Page): Promise {
    this._currentPage = page;
    this.cdpSession = await page.context().newCDPSession(page);
    this.cachedViewport = null;
  }

  // ── Cleanup ──

  /**
   * Shuts the session down: detaches watchdogs and the CDP session, closes the
   * browser unless persistAfterClose is set, resets internal state, then emits
   * 'shutdown' and removes all event-bus listeners.
   */
  async close(): Promise {
    logger.info('Closing browser session');

    // Detach all watchdogs. A failing guard must not stop the rest of the
    // shutdown; otherwise the browser below would never be closed.
    for (const watchdog of this.watchdogs) {
      try {
        await watchdog.detach();
      } catch (error) {
        logger.warn(
          `Watchdog detach failed during close: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
    }
    this.watchdogs = [];

    // Close CDP session
    if (this.cdpSession) {
      try {
        await this.cdpSession.detach();
      } catch {
        // Ignore
      }
      this.cdpSession = null;
    }

    // Close browser
    if (this.browser && !this.launchOptions.persistAfterClose) {
      try {
        await this.browser.close();
      } catch {
        // Ignore
      }
    }

    this.browser = null;
    this.context = null;
    this._currentPage = null;
    this._isConnected = false;
    this.knownTargets.clear();
    this.cachedViewport = null;

    this.eventBus.emit('shutdown', undefined as any);
    this.eventBus.removeAllListeners();

    logger.info('Browser session closed');
  }

  // AsyncDisposable support
  async
[Symbol.asyncDispose](): Promise {
    await this.close();
  }
}

// ── Helpers ──

/**
 * Maps a raw CDP target type onto the narrower union used by this module.
 * 'page', 'iframe' and 'service_worker' pass through, both 'worker' and
 * 'shared_worker' collapse to 'worker', and anything else becomes 'other'.
 */
function normalizeTargetType(
  cdpType: string,
): 'page' | 'iframe' | 'service_worker' | 'worker' | 'other' {
  if (cdpType === 'page' || cdpType === 'iframe' || cdpType === 'service_worker') {
    return cdpType;
  }
  if (cdpType === 'worker' || cdpType === 'shared_worker') {
    return 'worker';
  }
  return 'other';
}

================================================
FILE: packages/core/src/viewport/visual-tracer.ts
================================================
import type { Page } from 'playwright';

/** Tunables for the in-page demo overlays drawn by VisualTracer. */
export interface VisualTracerOptions {
  highlightColor?: string;
  highlightDuration?: number;
  annotationFontSize?: number;
  showTimeline?: boolean;
  showCoordinates?: boolean;
  actionColors?: Record;
}

const DEFAULT_OPTIONS: Required = {
  highlightColor: 'rgba(255, 0, 0, 0.3)',
  highlightDuration: 2000,
  annotationFontSize: 14,
  showTimeline: false,
  showCoordinates: false,
  actionColors: {
    click: '#ff4444',
    scroll: '#44aaff',
    type: '#44cc44',
    navigate: '#ff9900',
    default: '#aa44ff',
  },
};

// Marker attribute stamped on every overlay root so they can be found again
const OVERLAY_ATTR = 'data-demo-mode-overlay';

export class VisualTracer {
  private options: Required;

  /** Layers caller options over the defaults; action colors are merged key by key. */
  constructor(options?: VisualTracerOptions) {
    const actionColors = { ...DEFAULT_OPTIONS.actionColors, ...options?.actionColors };
    this.options = { ...DEFAULT_OPTIONS, ...options, actionColors };
  }

  // ───────────────────────────────────────────
  // Existing methods
  // ───────────────────────────────────────────

  async highlightElement(page: Page, selector: string, label?: string): Promise {
    await page.evaluate(
      ({ selector, color, duration, label, fontSize, attr }) => {
        const element = document.querySelector(selector);
        if (!element) return;

        const rect = element.getBoundingClientRect();
        const overlay = document.createElement('div');
        overlay.setAttribute(attr, '');
        overlay.style.cssText = ` position: fixed; left: ${rect.left}px; top: ${rect.top}px; width: ${rect.width}px;
height: ${rect.height}px; background: ${color}; border: 2px solid red; pointer-events: none; z-index: 999999; transition: opacity 0.3s; `;

        if (label) {
          const labelEl = document.createElement('div');
          labelEl.textContent = label;
          labelEl.style.cssText = ` position: absolute; top: -24px; left: 0; background: red; color: white; padding: 2px 6px; font-size: ${fontSize}px; font-family: monospace; border-radius: 3px; white-space: nowrap; `;
          overlay.appendChild(labelEl);
        }

        document.body.appendChild(overlay);

        // Fade out after `duration`, then remove once the 0.3s transition is done
        setTimeout(() => {
          overlay.style.opacity = '0';
          setTimeout(() => overlay.remove(), 300);
        }, duration);
      },
      {
        selector,
        color: this.options.highlightColor,
        duration: this.options.highlightDuration,
        label,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows a toast in the bottom-right corner naming the action, with optional
   * details on a second line. The toast fades out after two seconds.
   */
  async showAction(page: Page, action: string, details?: string): Promise {
    await page.evaluate(
      ({ action, details, fontSize, attr }) => {
        const toast = document.createElement('div');
        toast.setAttribute(attr, '');
        toast.style.cssText = ` position: fixed; bottom: 20px; right: 20px; background: rgba(0, 0, 0, 0.8); color: white; padding: 12px 20px; border-radius: 8px; font-family: monospace; font-size: ${fontSize}px; z-index: 999999; max-width: 400px; transition: opacity 0.3s; `;

        // Build the content from text nodes rather than innerHTML: `action` and
        // `details` are caller-supplied strings and must never be parsed as
        // markup inside the page.
        const actionEl = document.createElement('strong');
        actionEl.textContent = action;
        toast.appendChild(actionEl);

        if (details) {
          const detailsEl = document.createElement('div');
          detailsEl.textContent = details;
          toast.appendChild(detailsEl);
        }

        document.body.appendChild(toast);

        setTimeout(() => {
          toast.style.opacity = '0';
          setTimeout(() => toast.remove(), 300);
        }, 2000);
      },
      { action, details, fontSize: this.options.annotationFontSize, attr: OVERLAY_ATTR },
    );
  }

  // ───────────────────────────────────────────
  // Action-specific visual overlays
  // ───────────────────────────────────────────

  /**
   * Shows an expanding circle animation at the given click coordinates.
   * Optionally displays a label next to the click point.
   */
  async highlightClick(page: Page, x: number, y: number, label?: string): Promise {
    await page.evaluate(
      ({ x, y, label, color, duration, fontSize, attr }) => {
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        // Inject keyframes for the expanding ring
        const styleEl = document.createElement('style');
        styleEl.textContent = ` @keyframes demo-click-ring { 0% { transform: translate(-50%, -50%) scale(0); opacity: 1; } 70% { opacity: 0.6; } 100% { transform: translate(-50%, -50%) scale(1); opacity: 0; } } `;
        container.appendChild(styleEl);

        // Create three staggered rings for a ripple effect
        for (let i = 0; i < 3; i++) {
          const ring = document.createElement('div');
          ring.style.cssText = ` position: fixed; left: ${x}px; top: ${y}px; width: 60px; height: 60px; border: 3px solid ${color}; border-radius: 50%; pointer-events: none; animation: demo-click-ring ${duration * 0.6}ms ease-out ${i * 120}ms forwards; `;
          container.appendChild(ring);
        }

        // Small filled dot at click center
        const dot = document.createElement('div');
        dot.style.cssText = ` position: fixed; left: ${x}px; top: ${y}px; width: 10px; height: 10px; background: ${color}; border-radius: 50%; transform: translate(-50%, -50%); pointer-events: none; transition: opacity 0.3s; `;
        container.appendChild(dot);

        // Optional label
        if (label) {
          const labelEl =
document.createElement('div');
          labelEl.textContent = label;
          labelEl.style.cssText = ` position: fixed; left: ${x + 16}px; top: ${y - 12}px; background: ${color}; color: white; padding: 2px 8px; font-size: ${fontSize}px; font-family: monospace; border-radius: 3px; white-space: nowrap; pointer-events: none; `;
          container.appendChild(labelEl);
        }

        document.body.appendChild(container);

        setTimeout(() => {
          // The container needs its own opacity transition; without it the
          // overlay vanishes abruptly instead of fading before removal.
          container.style.transition = 'opacity 0.3s';
          container.style.opacity = '0';
          setTimeout(() => container.remove(), 300);
        }, duration);
      },
      {
        x,
        y,
        label,
        color: this.options.actionColors.click,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows an arrow animation indicating the scroll direction.
   */
  async highlightScroll(page: Page, direction: 'up' | 'down'): Promise {
    await page.evaluate(
      ({ direction, color, duration, fontSize, attr }) => {
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        const styleEl = document.createElement('style');
        const translateY = direction === 'up' ? '-40px' : '40px';
        styleEl.textContent = ` @keyframes demo-scroll-arrow { 0% { opacity: 0; transform: translateX(-50%) translateY(0); } 30% { opacity: 1; } 100% { opacity: 0; transform: translateX(-50%) translateY(${translateY}); } } `;
        container.appendChild(styleEl);

        const arrowChar = direction === 'up' ? '\u25B2' : '\u25BC';

        // Show three staggered arrows along the right side
        for (let i = 0; i < 3; i++) {
          const arrow = document.createElement('div');
          // NOTE(review): these are percentages of the viewport height, so the
          // later arrows (100%/140% up, 120% down) land at or below the bottom
          // edge -- confirm the intended spacing.
          const topOffset = direction === 'up' ? 60 + i * 40 : 40 + i * 40;
          arrow.textContent = arrowChar;
          arrow.style.cssText = ` position: fixed; right: 30px; top: ${topOffset}%; transform: translateX(-50%); color: ${color}; font-size: ${fontSize * 2}px; pointer-events: none; animation: demo-scroll-arrow ${duration * 0.5}ms ease-out ${i * 150}ms forwards; `;
          container.appendChild(arrow);
        }

        // Direction label
        const label = document.createElement('div');
        label.textContent = `Scroll ${direction}`;
        label.style.cssText = ` position: fixed; right: 12px; top: 50%; transform: translateY(-50%); background: ${color}; color: white; padding: 4px 12px; font-size: ${fontSize}px; font-family: monospace; border-radius: 4px; pointer-events: none; transition: opacity 0.3s; `;
        container.appendChild(label);

        document.body.appendChild(container);

        setTimeout(() => {
          // Give the container an opacity transition so the fade actually animates
          container.style.transition = 'opacity 0.3s';
          container.style.opacity = '0';
          setTimeout(() => container.remove(), 300);
        }, duration);
      },
      {
        direction,
        color: this.options.actionColors.scroll,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows a keyboard icon animation near the target element with a preview of the text being typed.
*/
  async highlightType(page: Page, selector: string, text: string): Promise {
    await page.evaluate(
      ({ selector, text, color, duration, fontSize, attr }) => {
        const element = document.querySelector(selector);
        if (!element) return;

        const rect = element.getBoundingClientRect();
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        const styleEl = document.createElement('style');
        styleEl.textContent = ` @keyframes demo-type-blink { 0%, 100% { border-right-color: transparent; } 50% { border-right-color: white; } } @keyframes demo-type-fadein { 0% { opacity: 0; transform: translateY(4px); } 100% { opacity: 1; transform: translateY(0); } } `;
        container.appendChild(styleEl);

        // Highlight the target element
        const highlight = document.createElement('div');
        highlight.style.cssText = ` position: fixed; left: ${rect.left - 2}px; top: ${rect.top - 2}px; width: ${rect.width + 4}px; height: ${rect.height + 4}px; border: 2px solid ${color}; border-radius: 3px; pointer-events: none; transition: opacity 0.3s; `;
        container.appendChild(highlight);

        // Keyboard icon (simplified as a unicode symbol + label)
        const kbIcon = document.createElement('div');
        kbIcon.style.cssText = ` position: fixed; left: ${rect.left}px; top: ${rect.bottom + 6}px; display: flex; align-items: center; gap: 6px; animation: demo-type-fadein 0.2s ease-out forwards; pointer-events: none; `;

        const iconSpan = document.createElement('span');
        iconSpan.textContent = '\u2328';
        iconSpan.style.cssText = ` font-size: ${fontSize * 1.4}px; color: ${color}; `;
        kbIcon.appendChild(iconSpan);

        // Text preview bubble with blinking cursor; long input is cut to 40 chars
        const textBubble = document.createElement('div');
        const truncated = text.length > 40 ? `${text.slice(0, 37)}...` : text;
        textBubble.textContent = truncated;
        textBubble.style.cssText = ` background: ${color}; color: white; padding: 3px 10px; font-size: ${fontSize}px; font-family: monospace; border-radius: 4px; white-space: nowrap; border-right: 2px solid white; animation: demo-type-blink 0.7s step-end infinite; `;
        kbIcon.appendChild(textBubble);

        container.appendChild(kbIcon);
        document.body.appendChild(container);

        setTimeout(() => {
          // The container needs its own opacity transition; without it the
          // overlay vanishes abruptly instead of fading before removal.
          container.style.transition = 'opacity 0.3s';
          container.style.opacity = '0';
          setTimeout(() => container.remove(), 300);
        }, duration);
      },
      {
        selector,
        text,
        color: this.options.actionColors.type,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows a URL bar-like overlay at the top of the viewport to indicate navigation.
   */
  async highlightNavigation(page: Page, url: string): Promise {
    await page.evaluate(
      ({ url, color, duration, fontSize, attr }) => {
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        const styleEl = document.createElement('style');
        styleEl.textContent = ` @keyframes demo-nav-slide { 0% { transform: translateY(-100%); opacity: 0; } 15% { transform: translateY(0); opacity: 1; } 85% { transform: translateY(0); opacity: 1; } 100% { transform: translateY(-100%); opacity: 0; } } @keyframes demo-nav-progress { 0% { width: 0%; } 100% { width: 100%; } } `;
        container.appendChild(styleEl);

        // URL bar
        const bar = document.createElement('div');
        bar.style.cssText = ` position: fixed; top: 0; left: 0; right: 0; background: rgba(0, 0, 0, 0.9); padding: 10px 16px; display: flex; align-items: center; gap: 10px; animation: demo-nav-slide ${duration}ms ease-in-out forwards; border-bottom: 2px solid ${color}; `;

        // Globe icon
        const globe = document.createElement('span');
        globe.textContent = '\uD83C\uDF10';
        globe.style.cssText = `font-size: ${fontSize * 1.2}px;`;
        bar.appendChild(globe);

        // URL text in a pill (set via textContent, so the URL is never parsed as markup)
        const urlPill = document.createElement('div');
        urlPill.style.cssText = ` flex: 1; background: rgba(255, 255, 255, 0.1); border: 1px solid rgba(255, 255, 255, 0.2); border-radius: 20px; padding: 6px 14px; color: white; font-size: ${fontSize}px; font-family: monospace; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; `;
        urlPill.textContent = url;
        bar.appendChild(urlPill);

        // Navigate label
        const label = document.createElement('div');
        label.textContent = 'Navigate';
        label.style.cssText = ` background: ${color}; color: white; padding: 4px 10px; font-size: ${fontSize - 2}px; font-family: monospace; border-radius: 4px; white-space: nowrap; `;
        bar.appendChild(label);

        container.appendChild(bar);

        // Progress bar
        const progress = document.createElement('div');
        progress.style.cssText = ` position: fixed; top: 0; left: 0; height: 3px; background: ${color}; animation: demo-nav-progress ${duration * 0.7}ms ease-out forwards; z-index: 1; `;
        container.appendChild(progress);

        document.body.appendChild(container);

        // The slide keyframes already animate the bar out, so just remove afterwards
        setTimeout(() => container.remove(), duration + 100);
      },
      {
        url,
        color: this.options.actionColors.navigate,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  // ───────────────────────────────────────────
  // Multi-element and composite overlays
  // ───────────────────────────────────────────

  /**
   * Highlights multiple elements with numbered labels, useful for showing a sequence of targets.
*/
  async showElementSequence(
    page: Page,
    elements: Array<{ selector: string; label: string }>,
  ): Promise {
    await page.evaluate(
      ({ elements, color, duration, fontSize, attr }) => {
        const SVG_NS = 'http://www.w3.org/2000/svg';

        const root = document.createElement('div');
        root.setAttribute(attr, '');
        root.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        const keyframes = document.createElement('style');
        keyframes.textContent = ` @keyframes demo-seq-appear { 0% { transform: scale(0); opacity: 0; } 60% { transform: scale(1.15); } 100% { transform: scale(1); opacity: 1; } } `;
        root.appendChild(keyframes);

        // One rect per requested element; a zero-sized placeholder stands in
        // for selectors that match nothing so indices stay aligned.
        const rects: DOMRect[] = elements.map(({ selector }) => {
          const match = document.querySelector(selector);
          return match ? match.getBoundingClientRect() : new DOMRect(0, 0, 0, 0);
        });

        // Dashed SVG connectors between the centers of consecutive elements
        if (rects.length > 1) {
          const svg = document.createElementNS(SVG_NS, 'svg');
          svg.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; `;

          for (let i = 0; i + 1 < rects.length; i++) {
            const from = rects[i];
            const to = rects[i + 1];
            // Skip pairs where either end is missing or has zero width
            if (from.width === 0 || to.width === 0) continue;

            const line = document.createElementNS(SVG_NS, 'line');
            const attrs: Array<[string, string]> = [
              ['x1', String(from.left + from.width / 2)],
              ['y1', String(from.top + from.height / 2)],
              ['x2', String(to.left + to.width / 2)],
              ['y2', String(to.top + to.height / 2)],
              ['stroke', color],
              ['stroke-width', '2'],
              ['stroke-dasharray', '6,4'],
              ['opacity', '0.5'],
            ];
            for (const [key, value] of attrs) {
              line.setAttribute(key, value);
            }
            svg.appendChild(line);
          }

          root.appendChild(svg);
        }

        // Highlight box, numbered badge and text label for each element found
        for (let index = 0; index < elements.length; index++) {
          const { selector, label } = elements[index];
          const el = document.querySelector(selector);
          if (!el) continue;

          const rect = el.getBoundingClientRect();

          // Highlight box
          const box = document.createElement('div');
          box.style.cssText = ` position: fixed; left: ${rect.left - 3}px; top: ${rect.top - 3}px; width: ${rect.width + 6}px; height: ${rect.height + 6}px; border: 2px solid ${color}; border-radius: 4px; pointer-events: none; animation: demo-seq-appear 0.3s ease-out ${index * 150}ms both; `;
          root.appendChild(box);

          // Numbered badge (1-based position in the sequence)
          const badge = document.createElement('div');
          badge.style.cssText = ` position: fixed; left: ${rect.left - 12}px; top: ${rect.top - 12}px; width: 24px; height: 24px; background: ${color}; color: white; border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: ${fontSize - 2}px; font-family: monospace; font-weight: bold; pointer-events: none; animation: demo-seq-appear 0.3s ease-out ${index * 150}ms both; `;
          badge.textContent = String(index + 1);
          root.appendChild(badge);

          // Label text
          const caption = document.createElement('div');
          caption.textContent = label;
          caption.style.cssText = ` position: fixed; left: ${rect.left + 16}px; top: ${rect.top - 28}px; background: ${color}; color: white; padding: 2px 8px; font-size: ${fontSize}px; font-family: monospace; border-radius: 3px; white-space: nowrap; pointer-events: none; animation: demo-seq-appear 0.3s ease-out ${index * 150 + 80}ms both; `;
          root.appendChild(caption);
        }

        document.body.appendChild(root);

        // Fade the whole overlay out after `duration`, then remove it
        setTimeout(() => {
          root.style.opacity = '0';
          root.style.transition = 'opacity 0.3s';
          setTimeout(() => root.remove(), 300);
        }, duration);
      },
      {
        elements,
        color: this.options.actionColors.default,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows a horizontal timeline panel at the bottom of the viewport summarizing actions taken.
*/
  async showTimeline(
    page: Page,
    steps: Array<{ action: string; timestamp: number; success: boolean }>,
  ): Promise {
    await page.evaluate(
      ({ steps, colors, duration, fontSize, attr }) => {
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        const styleEl = document.createElement('style');
        styleEl.textContent = ` @keyframes demo-timeline-slide { 0% { transform: translateY(100%); opacity: 0; } 10% { transform: translateY(0); opacity: 1; } 90% { transform: translateY(0); opacity: 1; } 100% { transform: translateY(100%); opacity: 0; } } @keyframes demo-timeline-dot { 0% { transform: scale(0); } 60% { transform: scale(1.3); } 100% { transform: scale(1); } } `;
        container.appendChild(styleEl);

        // Timeline panel
        const panel = document.createElement('div');
        panel.style.cssText = ` position: fixed; bottom: 0; left: 0; right: 0; background: rgba(0, 0, 0, 0.92); padding: 14px 20px 18px; animation: demo-timeline-slide ${duration}ms ease-in-out forwards; border-top: 2px solid rgba(255, 255, 255, 0.15); `;

        // Title
        const title = document.createElement('div');
        title.textContent = 'Action Timeline';
        title.style.cssText = ` color: rgba(255, 255, 255, 0.6); font-size: ${fontSize - 2}px; font-family: monospace; margin-bottom: 10px; text-transform: uppercase; letter-spacing: 1px; `;
        panel.appendChild(title);

        // Timeline track
        const track = document.createElement('div');
        track.style.cssText = ` display: flex; align-items: center; gap: 0; overflow-x: auto; padding-bottom: 4px; `;

        steps.forEach((step, index) => {
          // Step item
          const item = document.createElement('div');
          item.style.cssText = ` display: flex; align-items: center; flex-shrink: 0; `;

          // Dot: colored by action type on success, red on failure. Only own
          // keys are consulted so an action named e.g. "constructor" cannot
          // resolve to an inherited Object.prototype member.
          const actionKey = step.action.toLowerCase();
          const ownColor = Object.prototype.hasOwnProperty.call(colors, actionKey)
            ? colors[actionKey]
            : undefined;
          const dotColor = step.success ? (ownColor || colors.default) : '#ff4444';
          const dot = document.createElement('div');
          dot.style.cssText = ` width: 14px; height: 14px; border-radius: 50%; background: ${dotColor}; border: 2px solid ${step.success ? 'transparent' : '#ff0000'}; flex-shrink: 0; animation: demo-timeline-dot 0.3s ease-out ${index * 100}ms both; `;
          item.appendChild(dot);

          // Label beside the dot: action name and local time of the step
          const label = document.createElement('div');
          const time = new Date(step.timestamp).toLocaleTimeString([], {
            hour: '2-digit',
            minute: '2-digit',
            second: '2-digit',
          });

          // Built with textContent instead of innerHTML: step.action is
          // caller-supplied and must never be parsed as markup in the page.
          const actionEl = document.createElement('div');
          actionEl.textContent = step.action;
          actionEl.style.cssText = `color: white; font-size: ${fontSize - 2}px; font-family: monospace; white-space: nowrap;`;
          label.appendChild(actionEl);

          const timeEl = document.createElement('div');
          timeEl.textContent = time;
          timeEl.style.cssText = `color: rgba(255, 255, 255, 0.5); font-size: ${fontSize - 4}px; font-family: monospace; white-space: nowrap;`;
          label.appendChild(timeEl);

          label.style.cssText = ` margin-left: 4px; margin-right: 4px; `;
          item.appendChild(label);

          track.appendChild(item);

          // Connector line between steps
          if (index < steps.length - 1) {
            const connector = document.createElement('div');
            connector.style.cssText = ` width: 30px; height: 2px; background: rgba(255, 255, 255, 0.2); flex-shrink: 0; margin: 0 2px; `;
            track.appendChild(connector);
          }
        });

        panel.appendChild(track);
        container.appendChild(panel);
        document.body.appendChild(container);

        // The slide keyframes already animate the panel out, so just remove afterwards
        setTimeout(() => container.remove(), duration + 100);
      },
      {
        steps,
        colors: this.options.actionColors,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  /**
   * Shows a crosshair and coordinate text at the given position.
   */
  async showCoordinates(page: Page, x: number, y: number): Promise {
    await page.evaluate(
      ({ x, y, color, duration, fontSize, attr }) => {
        const container = document.createElement('div');
        container.setAttribute(attr, '');
        container.style.cssText = ` position: fixed; left: 0; top: 0; width: 100%; height: 100%; pointer-events: none; z-index: 999999; `;

        // Horizontal crosshair line
        const hLine = document.createElement('div');
        hLine.style.cssText = ` position: fixed; left: 0; top: ${y}px; width: 100%; height: 1px; background: ${color}; opacity: 0.4; pointer-events: none; `;
        container.appendChild(hLine);

        // Vertical crosshair line
        const vLine = document.createElement('div');
        vLine.style.cssText = ` position: fixed; left: ${x}px; top: 0; width: 1px; height: 100%; background: ${color}; opacity: 0.4; pointer-events: none; `;
        container.appendChild(vLine);

        // Center crosshair marks (thicker, shorter lines)
        const crossSize = 12;
        const marks = [
          // Left mark
          { left: x - crossSize, top: y, width: crossSize - 3, height: 2 },
          // Right mark
          { left: x + 3, top: y, width: crossSize - 3, height: 2 },
          // Top mark
          { left: x, top: y - crossSize, width: 2, height: crossSize - 3 },
          // Bottom mark
          { left: x, top: y + 3, width: 2, height: crossSize - 3 },
        ];

        for (const m of marks) {
          const mark = document.createElement('div');
          mark.style.cssText = ` position: fixed; left: ${m.left}px; top: ${m.top}px; width: ${m.width}px; height: ${m.height}px; background: ${color}; pointer-events: none; `;
          container.appendChild(mark);
        }

        // Coordinate label (rounded to whole pixels)
        const label = document.createElement('div');
        label.textContent = `(${Math.round(x)}, ${Math.round(y)})`;
        label.style.cssText = ` position: fixed; left: ${x + 14}px; top: ${y + 14}px; background: rgba(0, 0, 0, 0.8); color: ${color}; padding: 3px 8px; font-size: ${fontSize}px; font-family: monospace; border-radius: 3px; border: 1px solid ${color}; white-space: nowrap; pointer-events: none; `;
        container.appendChild(label);

        document.body.appendChild(container);

        // Fade the overlay out after `duration`, then remove it
        setTimeout(() => {
          container.style.opacity = '0';
          container.style.transition = 'opacity 0.3s';
          setTimeout(() => container.remove(), 300);
        }, duration);
      },
      {
        x,
        y,
        color: this.options.actionColors.default,
        duration: this.options.highlightDuration,
        fontSize: this.options.annotationFontSize,
        attr: OVERLAY_ATTR,
      },
    );
  }

  // ───────────────────────────────────────────
  // Cleanup
  // ───────────────────────────────────────────

  /**
   * Removes all demo-mode overlays currently on the page.
*/
  async clearOverlays(page: Page): Promise<void> {
    await page.evaluate(
      ({ attr }) => {
        // Every overlay root is tagged with this attribute when it is created.
        const overlays = document.querySelectorAll(`[${attr}]`);
        for (const overlay of overlays) {
          overlay.remove();
        }
      },
      { attr: OVERLAY_ATTR },
    );
  }
}

================================================
FILE: packages/core/tsconfig.json
================================================
{
  "extends": "../../tsconfig.base.json",
  "compilerOptions": {
    "rootDir": "src",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"]
}

================================================
FILE: packages/sandbox/package.json
================================================
{
  "name": "@open-browser/sandbox",
  "version": "1.1.0",
  "description": "Sandboxed execution environment for Open Browser",
  "type": "module",
  "main": "src/index.ts",
  "types": "src/index.ts",
  "exports": {
    ".": "./src/index.ts"
  },
  "scripts": {
    "build": "tsc --noEmit",
    "test": "bun test"
  },
  "dependencies": {
    "open-browser": "workspace:*"
  },
  "license": "MIT"
}

================================================
FILE: packages/sandbox/src/index.ts
================================================
export { Sandbox } from './sandbox.js';
export type {
  SandboxOptions,
  SandboxResult,
  SandboxError,
  SandboxErrorCategory,
  SandboxMetrics,
  CapturedOutput,
  ResourceSnapshot,
} from './types.js';

================================================
FILE: packages/sandbox/src/sandbox.ts
================================================
import type {
  SandboxOptions,
  SandboxResult,
  SandboxError,
  SandboxMetrics,
  CapturedOutput,
  ResourceSnapshot,
  SandboxErrorCategory,
} from './types.js';
import { Viewport, Agent, type AgentOptions, type CommandResult } from 'open-browser';

// ── Defaults ──

/**
 * Values used for every sandbox option the caller does not supply.
 * Typed as `Required<SandboxOptions>` so that adding a new option without a
 * default is a compile error.
 */
const DEFAULT_OPTIONS: Required<SandboxOptions> = {
  timeout: 300_000, // 5 minutes
  maxMemoryMB: 512,
  allowedDomains: [],
  blockedDomains: [],
  enableNetworking: true,
  enableFileAccess: false,
  workDir: process.cwd(),
  resourceCheckIntervalMs: 1_000,
  captureOutput: true,
  stepLimit: 100,
};

// ── Resource Monitor ──

/**
 * Monitors
memory and CPU usage during sandbox execution.
 * Takes periodic snapshots and detects OOM conditions.
 *
 * All figures describe the current Node/Bun process as reported by
 * `process.memoryUsage()` and `process.cpuUsage()`. The OOM check compares
 * the process RSS against the configured limit.
 */
class ResourceMonitor {
  private intervalId: ReturnType<typeof setInterval> | null = null;
  private snapshots: ResourceSnapshot[] = [];
  private peakMemoryMB = 0;
  private startCpuUsage: NodeJS.CpuUsage | null = null;
  private readonly limitMB: number;
  private onOOM: (() => void) | null = null;

  /**
   * @param limitMB - RSS threshold in megabytes above which the OOM callback fires.
   */
  constructor(limitMB: number) {
    this.limitMB = limitMB;
  }

  /**
   * Begins sampling: takes one snapshot immediately and then one every
   * `intervalMs` milliseconds.
   *
   * @param intervalMs - Sampling period in milliseconds.
   * @param onOOM - Invoked on every snapshot whose RSS exceeds the limit;
   *   callers that need once-only semantics must guard themselves.
   */
  start(intervalMs: number, onOOM: () => void): void {
    // Restarting must not orphan a previously scheduled interval.
    if (this.intervalId !== null) {
      clearInterval(this.intervalId);
    }

    this.startCpuUsage = process.cpuUsage();
    this.onOOM = onOOM;
    this.takeSnapshot();
    this.intervalId = setInterval(() => {
      this.takeSnapshot();
    }, intervalMs);
  }

  /**
   * Stops periodic sampling and records one final snapshot.
   */
  stop(): void {
    if (this.intervalId !== null) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }

    // Monitoring is over: the final snapshot is for metrics only and must not
    // fire the OOM callback after the caller has finished.
    this.onOOM = null;

    // Final snapshot
    this.takeSnapshot();
  }

  /**
   * Records current memory/CPU figures, updates the peak RSS and fires the
   * OOM callback when RSS is above the limit.
   */
  private takeSnapshot(): void {
    const BYTES_PER_MB = 1024 * 1024;
    const mem = process.memoryUsage();
    // CPU usage is reported relative to start() when available.
    const cpu = this.startCpuUsage ? process.cpuUsage(this.startCpuUsage) : process.cpuUsage();
    const rssMB = mem.rss / BYTES_PER_MB;

    this.snapshots.push({
      timestampMs: Date.now(),
      heapUsedMB: mem.heapUsed / BYTES_PER_MB,
      heapTotalMB: mem.heapTotal / BYTES_PER_MB,
      rssMB,
      externalMB: mem.external / BYTES_PER_MB,
      // process.cpuUsage() reports microseconds.
      cpuUserMs: cpu.user / 1000,
      cpuSystemMs: cpu.system / 1000,
    });

    if (rssMB > this.peakMemoryMB) {
      this.peakMemoryMB = rssMB;
    }

    // Check OOM condition against RSS (total process memory)
    if (rssMB > this.limitMB && this.onOOM) {
      this.onOOM();
    }
  }

  /** Highest RSS seen in any snapshot, in MB rounded to two decimals. */
  getPeakMemoryMB(): number {
    return Math.round(this.peakMemoryMB * 100) / 100;
  }

  /** User + system CPU time since start() in whole milliseconds; 0 if never started. */
  getCpuTimeMs(): number {
    if (!this.startCpuUsage) return 0;
    const usage = process.cpuUsage(this.startCpuUsage);
    return Math.round((usage.user + usage.system) / 1000);
  }

  /** Returns a copy of the recorded snapshots (the snapshot objects themselves are shared). */
  getSnapshots(): ResourceSnapshot[] {
    return [...this.snapshots];
  }

  /** Current process RSS in MB rounded to two decimals, read at call time. */
  getCurrentMemoryMB(): number {
    const mem = process.memoryUsage();
    return Math.round((mem.rss / (1024 * 1024)) * 100) / 100;
  }
}

// ── Output Capture ──

/**
 * Captures
stdout and stderr output during execution.
 * Intercepts process.stdout.write and process.stderr.write.
 *
 * While active, every chunk is recorded and then forwarded to the original
 * writer, so output remains visible in real time. `stop()` reinstalls the
 * exact function objects that were present when `start()` ran.
 */
class OutputCapture {
  private stdoutChunks: string[] = [];
  private stderrChunks: string[] = [];
  private originalStdoutWrite: typeof process.stdout.write | null = null;
  private originalStderrWrite: typeof process.stderr.write | null = null;
  /** Flushes bytes still buffered in the per-stream decoders. */
  private pendingFlushes: Array<() => void> = [];
  private active = false;

  /**
   * Starts capturing. Calling it while already active is a no-op.
   * Any output recorded by a previous capture is discarded.
   */
  start(): void {
    if (this.active) return;
    this.active = true;
    this.stdoutChunks = [];
    this.stderrChunks = [];
    this.pendingFlushes = [];

    // Keep the untouched originals (not bound copies) so stop() can put back
    // exactly what was there before.
    this.originalStdoutWrite = process.stdout.write;
    this.originalStderrWrite = process.stderr.write;

    process.stdout.write = this.createInterceptor(
      process.stdout,
      this.originalStdoutWrite,
      this.stdoutChunks,
    );
    process.stderr.write = this.createInterceptor(
      process.stderr,
      this.originalStderrWrite,
      this.stderrChunks,
    );
  }

  /**
   * Stops capturing and restores the original write functions.
   * Calling it while inactive is a no-op.
   */
  stop(): void {
    if (!this.active) return;
    this.active = false;

    if (this.originalStdoutWrite) {
      process.stdout.write = this.originalStdoutWrite;
      this.originalStdoutWrite = null;
    }
    if (this.originalStderrWrite) {
      process.stderr.write = this.originalStderrWrite;
      this.originalStderrWrite = null;
    }

    // Emit whatever incomplete byte sequences the decoders were still holding.
    for (const flush of this.pendingFlushes) {
      flush();
    }
    this.pendingFlushes = [];
  }

  /** Returns everything captured so far, concatenated per stream. */
  getOutput(): CapturedOutput {
    return {
      stdout: this.stdoutChunks.join(''),
      stderr: this.stderrChunks.join(''),
    };
  }

  /**
   * Builds a replacement `write` that records each chunk into `sink` and then
   * forwards the call, with all original arguments, to `original`.
   */
  private createInterceptor(
    stream: NodeJS.WriteStream,
    original: typeof process.stdout.write,
    sink: string[],
  ): typeof process.stdout.write {
    // One streaming decoder per stream: a multi-byte character split across
    // two binary chunks is decoded correctly instead of becoming U+FFFD.
    const decoder = new TextDecoder();
    this.pendingFlushes.push(() => {
      const rest = decoder.decode();
      if (rest) sink.push(rest);
    });

    return ((chunk: string | Uint8Array, ...args: unknown[]): boolean => {
      sink.push(typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true }));
      // Still write to the original stream for real-time visibility
      return Reflect.apply(original, stream, [chunk, ...args]) as boolean;
    }) as typeof process.stdout.write;
  }
}

// ── Sandbox ──

/**
 * Sandboxed execution environment for browser automation.
 * Runs agent tasks in an isolated context with resource limits,
 * output capture, and comprehensive metrics.
*/
export class Sandbox {
  private options: Required<SandboxOptions>;

  /**
   * @param options - Overrides for the default sandbox settings. Properties
   *   that are absent or explicitly `undefined` fall back to the defaults.
   */
  constructor(options?: SandboxOptions) {
    // A plain spread would let `{ timeout: undefined }` wipe out the default
    // (and make the timeout fire immediately), so drop undefined entries first.
    const overrides = Object.fromEntries(
      Object.entries(options ?? {}).filter(([, value]) => value !== undefined),
    ) as SandboxOptions;
    this.options = { ...DEFAULT_OPTIONS, ...overrides };
  }

  /**
   * Run an agent task inside the sandbox with resource monitoring,
   * output capture, and timeout enforcement.
   *
   * A headless browser is created for the run and closed afterwards. Failures
   * are reported through the returned result (`success: false` plus a
   * classified `error`) rather than by throwing.
   *
   * @param agentOptions - Agent configuration. The browser is supplied by the
   *   sandbox; `allowedUrls`, `blockedUrls` and `stepLimit` in `settings` are
   *   overridden by the sandbox options. Caller-provided `onStepStart` /
   *   `onStepEnd` hooks are still invoked.
   *   NOTE(review): the `'browser'` key in the `Omit` is inferred from usage — confirm.
   * @returns The outcome of the run including metrics and captured output.
   */
  async run(agentOptions: Omit<AgentOptions, 'browser'>): Promise<SandboxResult> {
    const startTime = Date.now();
    const resourceMonitor = new ResourceMonitor(this.options.maxMemoryMB);
    const outputCapture = new OutputCapture();

    // Track visited URLs and step/action counts
    const visitedUrls = new Set<string>();
    let stepsExecuted = 0;
    let totalActions = 0;

    // OOM abort controller
    let oomTriggered = false;
    const abortController = new AbortController();

    const { allowedDomains, blockedDomains } = this.options;
    const browser = new Viewport({
      headless: true,
      // Empty lists mean "no restriction" and are passed as undefined.
      allowedUrls: allowedDomains.length > 0 ? allowedDomains : undefined,
      blockedUrls: blockedDomains.length > 0 ? blockedDomains : undefined,
    });

    // Remember the page URL, ignoring blank pages and pages that are not ready.
    const recordCurrentUrl = (): void => {
      try {
        const url = browser.currentPage?.url();
        if (url && url !== 'about:blank') {
          visitedUrls.add(url);
        }
      } catch {
        // Page may not be ready
      }
    };

    // Start resource monitoring with OOM callback
    resourceMonitor.start(this.options.resourceCheckIntervalMs, () => {
      // The monitor may report on every sample; abort only once.
      if (!oomTriggered) {
        oomTriggered = true;
        abortController.abort();
      }
    });

    // Start output capture
    if (this.options.captureOutput) {
      outputCapture.start();
    }

    try {
      await browser.start();

      const agent = new Agent({
        ...agentOptions,
        browser,
        settings: {
          ...agentOptions.settings,
          allowedUrls: allowedDomains,
          blockedUrls: blockedDomains,
          stepLimit: this.options.stepLimit,
        },
        onStepStart: (step) => {
          stepsExecuted = step;
          // Track URL at step start
          recordCurrentUrl();
          // Delegate to caller's onStepStart if provided
          agentOptions.onStepStart?.(step);
        },
        onStepEnd: (step, results) => {
          totalActions += results.length;
          // Track URL at step end (may have changed)
          recordCurrentUrl();
agentOptions.onStepEnd?.(step, results);
        },
      });

      // Race the agent execution against timeout and OOM. The timeout timer is
      // cancelled as soon as the race settles so that it cannot keep the
      // process alive for the remainder of the timeout period.
      const timeoutCancel = new AbortController();
      let result: SandboxResult;
      try {
        result = await Promise.race([
          this.executeAgent(agent, startTime),
          this.createTimeoutPromise(startTime, timeoutCancel.signal),
          this.createOOMPromise(abortController.signal, startTime, resourceMonitor),
        ]);
      } finally {
        timeoutCancel.abort();
      }

      // Build metrics
      const metrics = this.buildMetrics(
        startTime,
        resourceMonitor,
        stepsExecuted,
        visitedUrls,
        totalActions,
      );

      return {
        ...result,
        // Keep a value the winning branch already reported (the OOM branch
        // reports the peak); otherwise fall back to the current reading.
        memoryUsageMB: result.memoryUsageMB ?? resourceMonitor.getCurrentMemoryMB(),
        capturedOutput: this.options.captureOutput ? outputCapture.getOutput() : undefined,
        metrics,
      };
    } catch (error) {
      // Anything thrown by browser start-up or the agent becomes a failed result.
      const sandboxError = this.classifyError(error, oomTriggered);
      const metrics = this.buildMetrics(
        startTime,
        resourceMonitor,
        stepsExecuted,
        visitedUrls,
        totalActions,
      );

      return {
        success: false,
        error: sandboxError,
        errorMessage: sandboxError.message,
        duration: Date.now() - startTime,
        memoryUsageMB: resourceMonitor.getCurrentMemoryMB(),
        capturedOutput: this.options.captureOutput ? outputCapture.getOutput() : undefined,
        metrics,
      };
    } finally {
      // Always clean up in reverse order
      resourceMonitor.stop();
      if (this.options.captureOutput) {
        outputCapture.stop();
      }
      await this.forceCleanup(browser);
    }
  }

  /**
   * Execute the agent and wrap the result.
   *
   * @param agent - Agent to run.
   * @param startTime - `Date.now()` value taken when the sandbox run began.
   * @returns The agent's success flag and final result plus the elapsed time.
   */
  private async executeAgent(agent: Agent, startTime: number): Promise<SandboxResult> {
    const result = await agent.run();
    return {
      success: result.success,
      output: result.finalResult,
      duration: Date.now() - startTime,
    };
  }

  /**
   * Create a timeout promise that resolves with a timeout error.
   *
   * The promise never rejects. It resolves with a failed result once
   * `options.timeout` milliseconds have passed.
   *
   * @param startTime - `Date.now()` value taken when the sandbox run began.
   * @param cancelSignal - Optional signal; when it aborts, the pending timer
   *   is cleared and the promise stays unsettled.
   */
  private createTimeoutPromise(
    startTime: number,
    cancelSignal?: AbortSignal,
  ): Promise<SandboxResult> {
    return new Promise<SandboxResult>((resolve) => {
      // Already cancelled: do not schedule anything.
      if (cancelSignal?.aborted) return;

      const onCancel = (): void => clearTimeout(timer);
      const timer = setTimeout(() => {
        cancelSignal?.removeEventListener('abort', onCancel);
        const message = `Sandbox timeout after ${this.options.timeout}ms`;
        resolve({
          success: false,
          error: { category: 'timeout', message },
          errorMessage: message,
          duration: Date.now() - startTime,
        });
      }, this.options.timeout);

      cancelSignal?.addEventListener('abort', onCancel, { once: true });
    });
  }

  /**
   * Create a promise that resolves with an OOM failure result when the
   * AbortSignal fires. It never rejects.
*/
  private createOOMPromise(
    signal: AbortSignal,
    startTime: number,
    monitor: ResourceMonitor,
  ): Promise<SandboxResult> {
    return new Promise<SandboxResult>((resolve) => {
      const onAbort = (): void => {
        const peakMB = monitor.getPeakMemoryMB();
        const message = `Memory limit exceeded: ${peakMB}MB > ${this.options.maxMemoryMB}MB`;
        resolve({
          success: false,
          error: { category: 'oom', message },
          errorMessage: message,
          duration: Date.now() - startTime,
          memoryUsageMB: peakMB,
        });
      };

      // The signal may already have fired before this promise was created.
      if (signal.aborted) {
        onAbort();
      } else {
        signal.addEventListener('abort', onAbort, { once: true });
      }
    });
  }

  /**
   * Classify an error into a SandboxError with the appropriate category.
   *
   * Classification is a case-sensitive substring match on the error message.
   * Rules are tried in order (crash, timeout, agent error, browser error) and
   * the first match wins; unmatched errors are `'unknown'`.
   *
   * @param error - The thrown value; non-Error values are stringified.
   * @param oomTriggered - When true the error is reported as `'oom'`
   *   regardless of its message.
   */
  private classifyError(error: unknown, oomTriggered: boolean): SandboxError {
    const stack = error instanceof Error ? error.stack : undefined;

    if (oomTriggered) {
      return {
        category: 'oom',
        message: 'Execution terminated due to memory limit exceeded',
        stack,
      };
    }

    const message = error instanceof Error ? error.message : String(error);

    // Order matters: earlier rules take precedence over later ones.
    const rules: ReadonlyArray<{ category: SandboxErrorCategory; needles: readonly string[] }> = [
      // Detect browser crashes
      {
        category: 'crash',
        needles: ['browser has been closed', 'Target page', 'Target closed', 'Protocol error'],
      },
      // Detect timeout patterns
      { category: 'timeout', needles: ['timeout', 'Timeout', 'ETIMEDOUT'] },
      // Detect agent-specific errors
      { category: 'agent_error', needles: ['Agent', 'maximum steps', 'stuck in a loop'] },
      // Detect browser/navigation errors
      { category: 'browser_error', needles: ['net::ERR_', 'Navigation', 'navigation'] },
    ];

    const matched = rules.find((rule) => rule.needles.some((needle) => message.includes(needle)));
    return { category: matched?.category ?? 'unknown', message, stack };
  }

  /**
   * Build metrics from the execution data.
*/
  private buildMetrics(
    startTime: number,
    monitor: ResourceMonitor,
    stepsExecuted: number,
    visitedUrls: Set<string>,
    totalActions: number,
  ): SandboxMetrics {
    return {
      durationMs: Date.now() - startTime,
      peakMemoryMB: monitor.getPeakMemoryMB(),
      stepsExecuted,
      pagesVisited: visitedUrls.size,
      // Copy so later changes to the set do not alter the reported metrics.
      visitedUrls: [...visitedUrls],
      totalActions,
      cpuTimeMs: monitor.getCpuTimeMs(),
    };
  }

  /**
   * Force cleanup of browser resources. Catches and ignores errors
   * since the browser may already be crashed or closed.
   *
   * Waits at most 5 seconds for `browser.close()`; after that the method
   * returns even if the close has not finished.
   */
  private async forceCleanup(browser: Viewport): Promise<void> {
    let graceTimer: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        browser.close(),
        // Give cleanup 5 seconds max, then move on
        new Promise<void>((resolve) => {
          graceTimer = setTimeout(resolve, 5_000);
        }),
      ]);
    } catch {
      // Browser may already be closed or crashed - ignore
    } finally {
      // Without this, a fast close would still leave a live timer that keeps
      // the process from exiting for up to 5 seconds.
      clearTimeout(graceTimer);
    }
  }

  /**
   * Get the current sandbox configuration.
   *
   * @returns A shallow copy of the resolved options (array values are shared).
   */
  getOptions(): Readonly<Required<SandboxOptions>> {
    return { ...this.options };
  }
}

================================================
FILE: packages/sandbox/src/types.ts
================================================
// ── Sandbox configuration ──

export interface SandboxOptions {
  /** Maximum execution time in milliseconds (default: 300000 = 5 minutes) */
  timeout?: number;
  /** Maximum memory usage in MB (default: 512) */
  maxMemoryMB?: number;
  /** Domains the agent is allowed to visit */
  allowedDomains?: string[];
  /** Domains the agent is blocked from visiting */
  blockedDomains?: string[];
  /** Whether network access is allowed (default: true) */
  enableNetworking?: boolean;
  /** Whether file system access is allowed (default: false) */
  enableFileAccess?: boolean;
  /** Working directory for the sandbox */
  workDir?: string;
  /** Interval in ms to check resource usage (default: 1000) */
  resourceCheckIntervalMs?: number;
  /** Whether to capture stdout/stderr from the agent execution (default: true) */
  captureOutput?: boolean;
  /** Maximum number of agent steps (default: 100) */
  stepLimit?: number;
}

// ── Sandbox error categories ──

export type SandboxErrorCategory =
  | 'timeout'
  | 'oom'
  | 'crash'
  |
'agent_error' | 'browser_error' | 'unknown';

/** Structured description of why a sandbox run failed. */
export interface SandboxError {
  /** Coarse failure class assigned by the sandbox */
  category: SandboxErrorCategory;
  /** Human-readable error message */
  message: string;
  /** Original stack trace if available */
  stack?: string;
}

// ── Output capture ──

/** Text written to the process output streams while capture was active. */
export interface CapturedOutput {
  /** Concatenation of everything written to stdout */
  stdout: string;
  /** Concatenation of everything written to stderr */
  stderr: string;
}

// ── Metrics ──

export interface SandboxMetrics {
  /** Total execution time in milliseconds */
  durationMs: number;
  /** Peak memory usage in MB */
  peakMemoryMB: number;
  /** Number of agent steps executed */
  stepsExecuted: number;
  /** Number of unique pages visited */
  pagesVisited: number;
  /** URLs of pages visited */
  visitedUrls: string[];
  /** Number of actions taken across all steps */
  totalActions: number;
  /** CPU time used (user + system) in milliseconds */
  cpuTimeMs: number;
}

// ── Sandbox result ──

/** Outcome of a sandbox run. */
export interface SandboxResult {
  /** Whether the run succeeded; false for timeouts, OOM and thrown errors */
  success: boolean;
  /** Final result reported by the agent, when there is one */
  output?: string;
  /** Classified failure details, present when the run failed */
  error?: SandboxError;
  /** Legacy string error for backwards compatibility */
  errorMessage?: string;
  /** Elapsed wall-clock time in milliseconds */
  duration: number;
  /** Process memory (RSS) in MB reported with the result */
  memoryUsageMB?: number;
  /** Captured stdout/stderr from the execution */
  capturedOutput?: CapturedOutput;
  /** Detailed execution metrics */
  metrics?: SandboxMetrics;
}

// ── Resource monitor state ──

/** One sample of process resource usage; memory values are in megabytes. */
export interface ResourceSnapshot {
  /** `Date.now()` at the moment the sample was taken */
  timestampMs: number;
  /** Used heap as reported by `process.memoryUsage()` */
  heapUsedMB: number;
  /** Total allocated heap as reported by `process.memoryUsage()` */
  heapTotalMB: number;
  /** Resident set size of the process */
  rssMB: number;
  /** External memory as reported by `process.memoryUsage()` */
  externalMB: number;
  /** User CPU time in milliseconds */
  cpuUserMs: number;
  /** System CPU time in milliseconds */
  cpuSystemMs: number;
}

================================================
FILE: packages/sandbox/tsconfig.json
================================================
{
  "extends": "../../tsconfig.base.json",
  "compilerOptions": {
    "rootDir": "src",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"]
}

================================================
FILE: tsconfig.base.json
================================================
{
  "compilerOptions": {
    "target": "ESNext",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "esModuleInterop": true,
    "strict": true,
    "skipLibCheck": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "outDir": "dist",
    "rootDir": "src",
"composite": true, "incremental": true, "resolveJsonModule": true, "isolatedModules": true, "forceConsistentCasingInFileNames": true, "noUnusedLocals": false, "noUnusedParameters": false, "noFallthroughCasesInSwitch": true, "allowImportingTsExtensions": true, "noEmit": true, "types": ["bun"] } } ================================================ FILE: tsconfig.json ================================================ { "compilerOptions": { "target": "ESNext", "module": "ESNext", "moduleResolution": "bundler", "lib": ["ESNext"], "outDir": "dist", "rootDir": "src", "strict": true, "esModuleInterop": true, "declaration": true, "declarationMap": true, "sourceMap": true, "resolveJsonModule": true, "forceConsistentCasingInFileNames": true, "skipLibCheck": true }, "include": ["src/**/*.ts"], "exclude": ["node_modules", "dist", "**/__tests__/**"] }