Repository: anthropics/claude-quickstarts Branch: main Commit: 4b2549e8093a Files: 197 Total size: 761.7 KB Directory structure: gitextract_5euojtzz/ ├── .github/ │ ├── pull_request_template.md │ └── workflows/ │ ├── build.yaml │ ├── reusable_build_step.yaml │ └── tests.yaml ├── .pre-commit-config.yaml ├── CLAUDE.md ├── LICENSE ├── README.md ├── agents/ │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── agent.py │ ├── agent_demo.ipynb │ ├── test_message_params.py │ ├── tools/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── calculator_mcp.py │ │ ├── code_execution.py │ │ ├── file_tools.py │ │ ├── mcp_tool.py │ │ ├── think.py │ │ └── web_search.py │ └── utils/ │ ├── __init__.py │ ├── connections.py │ ├── history_util.py │ └── tool_util.py ├── autonomous-coding/ │ ├── .gitignore │ ├── README.md │ ├── agent.py │ ├── autonomous_agent_demo.py │ ├── client.py │ ├── progress.py │ ├── prompts/ │ │ ├── app_spec.txt │ │ ├── coding_prompt.md │ │ └── initializer_prompt.md │ ├── prompts.py │ ├── requirements.txt │ ├── security.py │ └── test_security.py ├── browser-use-demo/ │ ├── .dockerignore │ ├── .gitignore │ ├── CHANGELOG.md │ ├── Dockerfile │ ├── NOTICE │ ├── README.md │ ├── browser_use_demo/ │ │ ├── __init__.py │ │ ├── browser_tool_utils/ │ │ │ ├── __init__.py │ │ │ ├── browser_dom_script.js │ │ │ ├── browser_element_script.js │ │ │ ├── browser_form_input_script.js │ │ │ ├── browser_key_map.py │ │ │ └── browser_text_script.js │ │ ├── display_constants.py │ │ ├── loop.py │ │ ├── message_handler.py │ │ ├── message_renderer.py │ │ ├── requirements.txt │ │ ├── streamlit.py │ │ └── tools/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── browser.py │ │ ├── collection.py │ │ └── coordinate_scaling.py │ ├── build.sh │ ├── docker-compose.yml │ ├── image/ │ │ ├── .config/ │ │ │ └── tint2/ │ │ │ ├── applications/ │ │ │ │ ├── firefox-custom.desktop │ │ │ │ ├── gedit.desktop │ │ │ │ └── terminal.desktop │ │ │ └── tint2rc │ │ ├── entrypoint.sh │ │ ├── http_server.py │ │ ├── index.html │ │ 
├── mutter_startup.sh │ │ ├── novnc_startup.sh │ │ ├── start_all.sh │ │ ├── static_content/ │ │ │ └── index.html │ │ ├── tint2_startup.sh │ │ ├── x11vnc_startup.sh │ │ └── xvfb_startup.sh │ ├── pytest.ini │ ├── setup.py │ ├── test-requirements.txt │ ├── tests/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_integration.py │ │ ├── test_message_renderer.py │ │ ├── test_sampling_loop.py │ │ └── test_streamlit_helpers.py │ └── validate_env.py ├── computer-use-demo/ │ ├── .gitignore │ ├── .zed/ │ │ └── settings.json │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── README.md │ ├── computer_use_demo/ │ │ ├── __init__.py │ │ ├── loop.py │ │ ├── requirements.txt │ │ ├── streamlit.py │ │ └── tools/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bash.py │ │ ├── collection.py │ │ ├── computer.py │ │ ├── edit.py │ │ ├── groups.py │ │ └── run.py │ ├── dev-requirements.txt │ ├── image/ │ │ ├── .config/ │ │ │ └── tint2/ │ │ │ ├── applications/ │ │ │ │ ├── firefox-custom.desktop │ │ │ │ ├── gedit.desktop │ │ │ │ └── terminal.desktop │ │ │ └── tint2rc │ │ ├── .streamlit/ │ │ │ └── config.toml │ │ ├── entrypoint.sh │ │ ├── http_server.py │ │ ├── index.html │ │ ├── mutter_startup.sh │ │ ├── novnc_startup.sh │ │ ├── start_all.sh │ │ ├── static_content/ │ │ │ └── index.html │ │ ├── tint2_startup.sh │ │ ├── x11vnc_startup.sh │ │ └── xvfb_startup.sh │ ├── pyproject.toml │ ├── ruff.toml │ ├── setup.sh │ └── tests/ │ ├── conftest.py │ ├── loop_test.py │ ├── streamlit_test.py │ └── tools/ │ ├── bash_test.py │ ├── computer_test.py │ └── edit_test.py ├── customer-support-agent/ │ ├── .eslintrc.json │ ├── .gitignore │ ├── README.md │ ├── amplify.yml │ ├── app/ │ │ ├── api/ │ │ │ └── chat/ │ │ │ └── route.ts │ │ ├── globals.css │ │ ├── layout.tsx │ │ ├── lib/ │ │ │ ├── customer_support_categories.json │ │ │ └── utils.ts │ │ └── page.tsx │ ├── components/ │ │ ├── ChatArea.tsx │ │ ├── FullSourceModal.tsx │ │ ├── LeftSidebar.tsx │ │ ├── RightSidebar.tsx │ │ ├── 
TopNavBar.tsx │ │ ├── theme-provider.tsx │ │ └── ui/ │ │ ├── avatar.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── dialog.tsx │ │ ├── dropdown-menu.tsx │ │ ├── input.tsx │ │ └── textarea.tsx │ ├── components.json │ ├── config.ts │ ├── lib/ │ │ └── utils.ts │ ├── next.config.mjs │ ├── package.json │ ├── postcss.config.mjs │ ├── styles/ │ │ └── themes.js │ ├── tailwind.config.ts │ └── tsconfig.json ├── financial-data-analyst/ │ ├── .eslintrc.json │ ├── .gitignore │ ├── README.md │ ├── app/ │ │ ├── api/ │ │ │ └── finance/ │ │ │ └── route.ts │ │ ├── finance/ │ │ │ └── page.tsx │ │ ├── globals.css │ │ ├── layout.tsx │ │ └── page.tsx │ ├── components/ │ │ ├── ChartRenderer.tsx │ │ ├── FilePreview.tsx │ │ ├── TopNavBar.tsx │ │ ├── theme-provider.tsx │ │ └── ui/ │ │ ├── avatar.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── chart.tsx │ │ ├── dropdown-menu.tsx │ │ ├── textarea.tsx │ │ ├── toast.tsx │ │ └── toaster.tsx │ ├── components.json │ ├── hooks/ │ │ └── use-toast.ts │ ├── lib/ │ │ └── utils.ts │ ├── next.config.mjs │ ├── package.json │ ├── postcss.config.mjs │ ├── tailwind.config.ts │ ├── tsconfig.json │ ├── types/ │ │ └── chart.ts │ └── utils/ │ └── fileHandling.ts └── pyproject.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/pull_request_template.md ================================================ ## Description ## Quickstart - [ ] Computer Use Demo - [ ] Customer Support Agent - [ ] Financial Data Analyst - [ ] N/A ## Type of Change - [ ] Bug fix - [ ] New feature - [ ] Documentation update - [ ] Code refactoring - [ ] Other (please describe): ## Testing - [ ] Added/updated unit tests - [ ] Tested manually - [ ] Verified in development environment ## Screenshots ## Additional Notes ================================================ FILE: .github/workflows/build.yaml ================================================ 
env:
  REGISTRY: ghcr.io/anthropics/anthropic-quickstarts
name: build
on:
  pull_request:
    paths:
      - .github/**
      - computer-use-demo/**
  push:
    branches:
      - main
    paths:
      - .github/**
      - computer-use-demo/**
jobs:
  # Per-architecture image builds; each one delegates to the reusable workflow.
  build-amd64:
    uses: ./.github/workflows/reusable_build_step.yaml
    with:
      platform: amd64
      builder: ubuntu-latest-16core
      registry: ghcr.io/anthropics/anthropic-quickstarts
      tag_prefix: computer-use-demo-
      context: computer-use-demo
    permissions:
      contents: read
      packages: write
  build-arm64:
    uses: ./.github/workflows/reusable_build_step.yaml
    with:
      platform: arm64
      builder: ubuntu-22.04-arm64-16core
      registry: ghcr.io/anthropics/anthropic-quickstarts
      tag_prefix: computer-use-demo-
      context: computer-use-demo
    permissions:
      contents: read
      packages: write
  # Combines the two per-architecture tags into multi-arch manifests.
  merge:
    runs-on: ubuntu-latest
    needs:
      - build-arm64
      - build-amd64
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - name: Login to ghcr.io
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{github.actor}}
          password: ${{secrets.GITHUB_TOKEN}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Set image tag
        run: |
          echo "SHORT_SHA=$(git rev-parse --short ${{ github.sha }})" >> "$GITHUB_ENV"
      - name: Create SHA manifest and push
        run: |
          docker buildx imagetools create -t \
            ${REGISTRY}:computer-use-demo-${SHORT_SHA} \
            ${REGISTRY}:computer-use-demo-${SHORT_SHA}-amd64 \
            ${REGISTRY}:computer-use-demo-${SHORT_SHA}-arm64
      - name: Create latest manifest and push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        run: |
          docker buildx imagetools create -t \
            ${REGISTRY}:computer-use-demo-latest \
            ${REGISTRY}:computer-use-demo-latest-amd64 \
            ${REGISTRY}:computer-use-demo-latest-arm64

================================================
FILE: .github/workflows/reusable_build_step.yaml
================================================
# Reusable workflow: builds the image for one platform, smoke-tests the
# running container (Streamlit, VNC, noVNC, landing page), then pushes it.
on:
  workflow_call:
    inputs:
      platform:
        required: true
        type: string
      builder:
        required: true
        type: string
      registry:
        required: true
        type: string
      tag_prefix:
        required: false
        type: string
      context:
        required: false
        type: string
jobs:
  build:
    runs-on: ${{ inputs.builder }}
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - name: Login to ghcr.io
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{github.actor}}
          password: ${{secrets.GITHUB_TOKEN}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Set image tag
        run: |
          short_sha=$(git rev-parse --short ${{ github.sha }})
          echo "TAG=${{ inputs.registry }}:${{ inputs.tag_prefix }}${short_sha}" >> "$GITHUB_ENV"
      - name: Build Docker image
        uses: docker/build-push-action@v5
        with:
          platforms: linux/${{ inputs.platform }}
          context: ${{ inputs.context || '.' }}
          push: false
          tags: ${{ env.TAG }}
          cache-from: type=gha,scope=computer-use-${{ inputs.platform }}
          cache-to: type=gha,mode=max,scope=computer-use-${{ inputs.platform }}
          load: true
      - name: Run container
        # Streamlit is probed on 8501 below, so publish that port (was 8051).
        run: docker run -d -p 8501:8501 ${{ env.TAG }}
      - name: Check streamlit
        run: |
          timeout=60
          start_time=$(date +%s)
          docker_id=$(docker ps --filter "ancestor=${{ env.TAG }}" --format "{{.ID}}")
          # Exported so the later check steps can reuse the container id.
          echo "docker_id=$docker_id" >> "$GITHUB_ENV"
          while true; do
            current_time=$(date +%s)
            elapsed=$((current_time - start_time))
            if [ $elapsed -ge $timeout ]; then
              echo "Timeout reached. Container did not respond within $timeout seconds."
              exit 1
            fi
            response=$(docker exec $docker_id curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:8501 || echo "000")
            if [ "$response" = "200" ]; then
              echo "Container responded with 200 OK"
              exit 0
            fi
            # Pause between probes instead of spinning on `docker exec`.
            sleep 1
          done
      - name: Check VNC
        run: docker exec $docker_id nc localhost 5900 -z
      - name: Check noVNC
        run: docker exec $docker_id curl -s -o /dev/null -w "%{http_code}" http://localhost:6080 | grep -q 200 || exit 1
      - name: Check landing page
        run: docker exec $docker_id curl -s -o /dev/null -w "%{http_code}" http://localhost:8080 | grep -q 200 || exit 1
      - name: Determine push tags
        # Pull requests push only the SHA tag; other events also push the
        # per-platform "latest" tag.
        run: |
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "PUSH_TAGS=${TAG}-${{ inputs.platform }}" >> "$GITHUB_ENV"
          else
            echo "PUSH_TAGS=${TAG}-${{ inputs.platform }},${{ inputs.registry }}:${{ inputs.tag_prefix }}latest-${{ inputs.platform }}" >> "$GITHUB_ENV"
          fi
      - name: Push Docker image
        uses: docker/build-push-action@v5
        with:
          platforms: linux/${{ inputs.platform }}
          context: ${{ inputs.context || '.' }}
          push: true
          tags: ${{ env.PUSH_TAGS }}
          cache-from: type=gha,scope=computer-use-${{ inputs.platform }}
          cache-to: type=gha,mode=max,scope=computer-use-${{ inputs.platform }}

================================================
FILE: .github/workflows/tests.yaml
================================================
name: tests on: pull_request: paths: - .github/** - computer-use-demo/** push: branches: - main paths: - .github/** - computer-use-demo/** jobs: ruff: runs-on: ubuntu-latest defaults: run: working-directory: computer-use-demo steps: - uses: actions/checkout@v4 - uses: astral-sh/ruff-action@v1 with: src: "computer-use-demo" pyright: runs-on: ubuntu-latest defaults: run: working-directory: computer-use-demo steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: cache: "pip" python-version: "3.11.6" - run: | python -m venv .venv source .venv/bin/activate pip install -r dev-requirements.txt - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH - uses: jakebailey/pyright-action@v1 
with: working-directory: computer-use-demo pytest: runs-on: ubuntu-latest defaults: run: working-directory: computer-use-demo steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: cache: "pip" python-version: "3.11.6" - run: | python -m venv .venv source .venv/bin/activate pip install -r dev-requirements.txt - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH - run: pytest tests --junitxml=junit/test-results.xml ================================================ FILE: .pre-commit-config.yaml ================================================ files: ^computer-use-demo/ default_stages: [pre-commit, pre-push] repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.6.7 hooks: - id: ruff name: Run `ruff` to autofix lint errors args: [--fix-only] - id: ruff name: Run `ruff` to format code entry: ruff format --force-exclude - id: ruff name: Run `ruff` to lint code - repo: https://github.com/RobertCraigie/pyright-python rev: v1.1.384 hooks: - id: pyright ================================================ FILE: CLAUDE.md ================================================ # Claude Quickstarts Development Guide ## Legal - When changes are made to files that have a copyright notice add them to that subdirectory's CHANGELOG.md file. ## Computer-Use Demo ### Setup & Development - **Setup environment**: `./setup.sh` - **Build Docker**: `docker build . 
-t computer-use-demo:local` - **Run container**: `docker run -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY -v $(pwd)/computer_use_demo:/home/computeruse/computer_use_demo/ -v $HOME/.anthropic:/home/computeruse/.anthropic -p 5900:5900 -p 8501:8501 -p 6080:6080 -p 8080:8080 -it computer-use-demo:local` ### Testing & Code Quality - **Lint**: `ruff check .` - **Format**: `ruff format .` - **Typecheck**: `pyright` - **Run tests**: `pytest` - **Run single test**: `pytest tests/path_to_test.py::test_name -v` ### Code Style - **Python**: snake_case for functions/variables, PascalCase for classes - **Imports**: Use isort with combine-as-imports - **Error handling**: Use custom ToolError for tool errors - **Types**: Add type annotations for all parameters and returns - **Classes**: Use dataclasses and abstract base classes ## Customer Support Agent ### Setup & Development - **Install dependencies**: `npm install` - **Run dev server**: `npm run dev` (full UI) - **UI variants**: `npm run dev:left` (left sidebar), `npm run dev:right` (right sidebar), `npm run dev:chat` (chat only) - **Lint**: `npm run lint` - **Build**: `npm run build` (full UI), see package.json for variants ### Code Style - **TypeScript**: Strict mode with proper interfaces - **Components**: Function components with React hooks - **Formatting**: Follow ESLint Next.js configuration - **UI components**: Use shadcn/ui components library ## Financial Data Analyst ### Setup & Development - **Install dependencies**: `npm install` - **Run dev server**: `npm run dev` - **Lint**: `npm run lint` - **Build**: `npm run build` ### Code Style - **TypeScript**: Strict mode with proper type definitions - **Components**: Function components with type annotations - **Visualization**: Use Recharts library for data visualization - **State management**: React hooks for state ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Anthropic Permission 
is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Claude Quickstarts Claude Quickstarts is a collection of projects designed to help developers quickly get started with building applications using the Claude API. Each quickstart provides a foundation that you can easily build upon and customize for your specific needs. ## Getting Started To use these quickstarts, you'll need a Claude API key. If you don't have one yet, you can sign up for free at [console.anthropic.com](https://console.anthropic.com). ## Available Quickstarts ### Customer Support Agent A customer support agent powered by Claude. This project demonstrates how to leverage Claude's natural language understanding and generation capabilities to create an AI-assisted customer support system with access to a knowledge base. 
[Go to Customer Support Agent Quickstart](./customer-support-agent) ### Financial Data Analyst A financial data analyst powered by Claude. This project demonstrates how to leverage Claude's capabilities with interactive data visualization to analyze financial data via chat. [Go to Financial Data Analyst Quickstart](./financial-data-analyst) ### Computer Use Demo An environment and tools that Claude can use to control a desktop computer. This project demonstrates how to leverage the computer use capabilities of Claude, including support for the latest `computer_use_20251124` tool version with zoom actions. [Go to Computer Use Demo Quickstart](./computer-use-demo) ### Browser Tools API Demo A complete reference implementation for browser automation powered by Claude. This project demonstrates how to leverage Claude's browser tools API for web interaction, including navigation, DOM inspection, and form manipulation using Playwright. [Go to Browser Tools API Demo Quickstart](./browser-use-demo) ### Autonomous Coding Agent An autonomous coding agent powered by the Claude Agent SDK. This project demonstrates a two-agent pattern (initializer + coding agent) that can build complete applications over multiple sessions, with progress persisted via git and a feature list that the agent works through incrementally. [Go to Autonomous Coding Agent Quickstart](./autonomous-coding) ## General Usage Each quickstart project comes with its own README and setup instructions. Generally, you'll follow these steps: 1. Clone this repository 2. Navigate to the specific quickstart directory 3. Install the required dependencies 4. Set up your Claude API key as an environment variable 5. 
Run the quickstart application ## Explore Further To deepen your understanding of working with Claude and the Claude API, check out these resources: - [Claude API Documentation](https://docs.claude.com) - [Claude Cookbooks](https://github.com/anthropics/claude-cookbooks) - A collection of code snippets and guides for common tasks - [Claude API Fundamentals Course](https://github.com/anthropics/courses/tree/master/anthropic_api_fundamentals) ## Contributing We welcome contributions to the Claude Quickstarts repository! If you have ideas for new quickstart projects or improvements to existing ones, please open an issue or submit a pull request. ## Community and Support - Join our [Anthropic Discord community](https://www.anthropic.com/discord) for discussions and support - Check out the [Anthropic support documentation](https://support.anthropic.com) for additional help ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. ================================================ FILE: agents/.gitignore ================================================ # Python bytecode __pycache__/ *.py[cod] *$py.class # Jupyter Notebook .ipynb_checkpoints */.ipynb_checkpoints/* # MacOS .DS_Store # Environment variables .env # Virtual environments venv/ env/ .venv/ # MCP logs *.log # Distribution / packaging dist/ build/ *.egg-info/ ================================================ FILE: agents/README.md ================================================ # Agents A minimal educational implementation of LLM agents using the Claude API. > **Note:** This is NOT an SDK, but a reference implementation of key concepts ## Overview & Core Components This repo demonstrates how to [build effective agents](https://www.anthropic.com/engineering/building-effective-agents) with the Claude API. It shows how sophisticated AI behaviors can emerge from a simple foundation: LLMs using tools in a loop. 
This implementation is not prescriptive - the core logic is <300 lines of code and deliberately lacks production features. Feel free to translate these patterns to your language and production stack ([Claude Code](https://docs.claude.com/en/docs/agents-and-tools/claude-code/overview) can help!) It contains three components: - `agent.py`: Manages Claude API interactions and tool execution - `tools/`: Tool implementations (both native and MCP tools) - `utils/`: Utilities for message history and MCP server connections ## Usage ```python from agents.agent import Agent from agents.tools.think import ThinkTool # Create an agent with both local tools and MCP server tools agent = Agent( name="MyAgent", system="You are a helpful assistant.", tools=[ThinkTool()], # Local tools mcp_servers=[ { "type": "stdio", "command": "python", "args": ["-m", "mcp_server"], }, ] ) # Run the agent response = agent.run("What should I consider when buying a new laptop?") ``` From this foundation, you can add domain-specific tools, optimize performance, or implement custom response handling. We remain deliberately unopinionated - this backbone simply gets you started with fundamentals. 
## Requirements - Python 3.8+ - Claude API key (set as `ANTHROPIC_API_KEY` environment variable) - `anthropic` Python library - `mcp` Python library ================================================ FILE: agents/__init__.py ================================================ """Core agent implementations.""" from .agent import Agent, ModelConfig from .tools.base import Tool __all__ = ["Agent", "ModelConfig", "Tool"] ================================================ FILE: agents/agent.py ================================================ """Agent implementation with Claude API and tools.""" import asyncio import os from contextlib import AsyncExitStack from dataclasses import dataclass from typing import Any from anthropic import Anthropic from .tools.base import Tool from .utils.connections import setup_mcp_connections from .utils.history_util import MessageHistory from .utils.tool_util import execute_tools @dataclass class ModelConfig: """Configuration settings for Claude model parameters.""" # Available models include: # - claude-sonnet-4-20250514 (default) # - claude-opus-4-20250514 # - claude-haiku-4-5-20251001 # - claude-3-5-sonnet-20240620 # - claude-3-haiku-20240307 model: str = "claude-sonnet-4-20250514" max_tokens: int = 4096 temperature: float = 1.0 context_window_tokens: int = 180000 class Agent: """Claude-powered agent with tool use capabilities.""" def __init__( self, name: str, system: str, tools: list[Tool] | None = None, mcp_servers: list[dict[str, Any]] | None = None, config: ModelConfig | None = None, verbose: bool = False, client: Anthropic | None = None, message_params: dict[str, Any] | None = None, ): """Initialize an Agent. 
Args: name: Agent identifier for logging system: System prompt for the agent tools: List of tools available to the agent mcp_servers: MCP server configurations config: Model configuration with defaults verbose: Enable detailed logging client: Anthropic client instance message_params: Additional parameters for client.messages.create(). These override any conflicting parameters from config. """ self.name = name self.system = system self.verbose = verbose self.tools = list(tools or []) self.config = config or ModelConfig() self.mcp_servers = mcp_servers or [] self.message_params = message_params or {} self.client = client or Anthropic( api_key=os.environ.get("ANTHROPIC_API_KEY", "") ) self.history = MessageHistory( model=self.config.model, system=self.system, context_window_tokens=self.config.context_window_tokens, client=self.client, ) if self.verbose: print(f"\n[{self.name}] Agent initialized") def _prepare_message_params(self) -> dict[str, Any]: """Prepare parameters for client.messages.create() call. Returns a dict with base parameters from config, with any message_params overriding conflicting keys. 
""" return { "model": self.config.model, "max_tokens": self.config.max_tokens, "temperature": self.config.temperature, "system": self.system, "messages": self.history.format_for_api(), "tools": [tool.to_dict() for tool in self.tools], **self.message_params, } async def _agent_loop(self, user_input: str) -> list[dict[str, Any]]: """Process user input and handle tool calls in a loop""" if self.verbose: print(f"\n[{self.name}] Received: {user_input}") await self.history.add_message("user", user_input, None) tool_dict = {tool.name: tool for tool in self.tools} while True: self.history.truncate() params = self._prepare_message_params() # Merge headers properly - default beta header can be overridden by message_params default_headers = {"anthropic-beta": "code-execution-2025-05-22"} if "extra_headers" in params: # Pop extra_headers from params and merge with defaults custom_headers = params.pop("extra_headers") merged_headers = {**default_headers, **custom_headers} else: merged_headers = default_headers response = self.client.messages.create( **params, extra_headers=merged_headers ) tool_calls = [ block for block in response.content if block.type == "tool_use" ] if self.verbose: for block in response.content: if block.type == "text": print(f"\n[{self.name}] Output: {block.text}") elif block.type == "tool_use": params_str = ", ".join( [f"{k}={v}" for k, v in block.input.items()] ) print( f"\n[{self.name}] Tool call: " f"{block.name}({params_str})" ) await self.history.add_message( "assistant", response.content, response.usage ) if tool_calls: tool_results = await execute_tools( tool_calls, tool_dict, ) if self.verbose: for block in tool_results: print( f"\n[{self.name}] Tool result: " f"{block.get('content')}" ) await self.history.add_message("user", tool_results) else: return response async def run_async(self, user_input: str) -> list[dict[str, Any]]: """Run agent with MCP tools asynchronously.""" async with AsyncExitStack() as stack: original_tools = list(self.tools) 
try: mcp_tools = await setup_mcp_connections( self.mcp_servers, stack ) self.tools.extend(mcp_tools) return await self._agent_loop(user_input) finally: self.tools = original_tools def run(self, user_input: str) -> list[dict[str, Any]]: """Run agent synchronously""" return asyncio.run(self.run_async(user_input)) ================================================ FILE: agents/agent_demo.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Agent Demo with Simplified Connection Management\n", "\n", "This notebook demonstrates an agent using multiple tool types with simplified connection management:\n", "1. **Local Calculator**: Mathematical expressions via MCP\n", "2. **Brave Search**: Web search via MCP (if API key available)\n", "3. **Think Tool**: Internal reasoning tool" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "import nest_asyncio\n", "nest_asyncio.apply()\n", "\n", "parent_dir = os.path.dirname(os.getcwd())\n", "sys.path.insert(0, parent_dir)\n", "from agents.agent import Agent, ModelConfig\n", "from agents.tools.think import ThinkTool\n", "from agents.tools.web_search import WebSearchServerTool\n", "from agents.tools.code_execution import CodeExecutionServerTool" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. 
Configure tools and MCP servers" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Calculator server configured: Yes\n", "Brave API key available: Yes\n", "Brave search server configured: Yes\n" ] } ], "source": [ "# Standard Python tool\n", "think_tool = ThinkTool()\n", "\n", "# Python MCP server\n", "calculator_server_path = os.path.abspath(os.path.join(os.getcwd(), \"tools/calculator_mcp.py\"))\n", "calculator_server = {\n", " \"type\": \"stdio\",\n", " \"command\": \"python\",\n", " \"args\": [calculator_server_path]\n", "}\n", "print(f\"Calculator server configured: {'Yes' if calculator_server else 'No'}\")\n", "\n", "\n", "# Brave MCP server written in TypeScript\n", "\n", "brave_api_key = os.environ.get(\"BRAVE_API_KEY_BASE_DATA\", \"\")\n", "print(f\"Brave API key available: {'Yes' if brave_api_key else 'No'}\")\n", "brave_search_server = {\n", " \"type\": \"stdio\",\n", " \"command\": \"npx\",\n", " \"args\": [\"-y\", \"@modelcontextprotocol/server-brave-search\"],\n", " \"env\": {\n", " \"BRAVE_API_KEY\": brave_api_key, \n", " \"PATH\": f\"{os.path.dirname('npx')}:\" + os.environ.get(\"PATH\", \"\")\n", " }\n", "}\n", "print(f\"Brave search server configured: {'Yes' if brave_search_server else 'No'}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Initialize agent with combined tools" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Multi-Tool Agent] Agent initialized\n" ] } ], "source": [ "# Create agent config \n", "system_prompt = \"\"\"\n", "You are a helpful assistant with access to:\n", "1. Web search (brave_web_search, brave_local_search)\n", "2. Mathematical calculator (calculate) \n", "3. 
A tool to think and reason (think)\n", "\n", "Always use the most appropriate tool for each task.\n", "\"\"\"\n", "\n", "\n", "# Initialize agent with standard tools and MCP servers\n", "agent = Agent(\n", " name=\"Multi-Tool Agent\",\n", " system=system_prompt,\n", " tools=[think_tool], \n", " mcp_servers=[brave_search_server, calculator_server], \n", " config=ModelConfig(\n", " model=\"claude-3-7-sonnet-20250219\", \n", " max_tokens=4096,\n", " temperature=1.0\n", " ),\n", " verbose=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. Run queries" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 3 MCP tools from 2 servers.\n", "\n", "[Multi-Tool Agent] Received: What's the square root of the OKC population in 2022\n", "\n", "[Multi-Tool Agent] Output: I'll help you find the square root of Oklahoma City's population in 2022. First, I need to look up the population data, then I can calculate the square root.\n", "\n", "[Multi-Tool Agent] Tool call: brave_web_search(query=Oklahoma City population 2022 census data)\n", "\n", "[Multi-Tool Agent] Tool result: Title: Oklahoma Cities by Population (2025)\n", "Description: The most populous cities in Oklahoma are Oklahoma City with 702,767, Tulsa at 411,894, Norman with 130,046, Broken Arrow at 119,194, and Edmond with 98,103. You can get the most current Census population data for all 834 Oklahoma cities, sorted from the most populous cities in Oklahoma to the ...\n", "URL: https://www.oklahoma-demographics.com/cities_by_population\n", "\n", "Title: U.S. Census Bureau QuickFacts: Oklahoma City city, Oklahoma\n", "Description: QuickFacts provides statistics for all states and counties. Also for cities and towns with a population of 5,000 or more. ... 
Methodology differences may exist between data sources, and so estimates from different sources are not comparable.\n", "URL: https://www.census.gov/quickfacts/fact/table/oklahomacitycityoklahoma/PST045224\n", "\n", "Title: Oklahoma City lands in top 20 of largest US cities by population: New census data\n", "Description: More: 10 Oklahoma cities where population is growing the fastest · Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people. And between the 2010 and 2020 Censuses, Oklahoma City was one of 14 American cities that added more than 100,000 people.\n", "URL: https://www.oklahoman.com/story/news/2024/05/17/oklahoma-city-population-top-20-us-census-data-growth/73729737007/\n", "\n", "Title: Oklahoma City - Wikipedia\n", "Description: Oklahoma City (/ˌoʊkləˈhoʊmə ... it ranks 20th in population among U.S. cities and 8th in the Southern United States, with a population of 681,054 at the 2020 census....\n", "URL: https://en.wikipedia.org/wiki/Oklahoma_City\n", "\n", "Title: Oklahoma City city, Oklahoma - Census Bureau Profile\n", "Description: We're sorry but this website doesn't work properly without JavaScript enabled. Please enable it to continue\n", "URL: https://data.census.gov/profile/Oklahoma_City_city,_Oklahoma?g=160XX00US4055000\n", "\n", "Title: Oklahoma City, Oklahoma Population 2025\n", "Description: Oklahoma City is a city located in Cleveland County, Oklahoma. Oklahoma City has a 2025 population of 715,893. Oklahoma City is currently growing at a rate of 0.93% annually and its population has increased by 4.8% since the most recent census, which recorded a population of 683,078 in 2020.\n", "URL: https://worldpopulationreview.com/us-cities/oklahoma/oklahoma-city\n", "\n", "Title: Oklahoma Demographics | Current Census Data\n", "Description: Get the Oklahoma demographic data that you need. You can get the most current Oklahoma data from the US Census Bureau without spending 40 high-value hours searching the internet. 
The Oklahoma population is 4,053,824 people according to the US Census Bureau's 2023 Population Estimates Program.\n", "URL: https://www.oklahoma-demographics.com/\n", "\n", "Title: Oklahoma City, OK - Profile data - Census Reporter\n", "Description: a little less than the figure in the Oklahoma City, OK Metro Area: 23.5 14,717,310 (±0.2 / ±145,724)\n", "URL: https://censusreporter.org/profiles/16000US4055000-oklahoma-city-ok/\n", "\n", "Title: Oklahoma population by year, county, race, & more | USAFacts\n", "Description: The ages, races, and population density of Oklahoma tell a story. Understand the shifts in demographic trends with these charts visualizing decades of population data. ... Florida K-12 public teachers are paid the lowest average salary when adjusted for cost of living.Published on November 8, 2022 ... The Census ...\n", "URL: https://usafacts.org/data/topics/people-society/population-and-demographics/our-changing-population/state/oklahoma/\n", "\n", "Title: Resident Population in Oklahoma City, OK (MSA) (OKCPOP) | FRED | St. Louis Fed\n", "Description: Graph and download economic data for Resident Population in Oklahoma City, OK (MSA) (OKCPOP) from 2000 to 2024 about Oklahoma City, OK, residents, population, and USA.\n", "URL: https://fred.stlouisfed.org/series/OKCPOP\n", "\n", "[Multi-Tool Agent] Output: Based on the search results, I can see multiple population figures for Oklahoma City in different years. 
The most relevant information for 2022 specifically isn't directly provided, but I can see:\n", "\n", "- 2020 census: 681,054 people\n", "- 2023 estimate showing growth of 7,589 people from 2022\n", "- 2025 projection: 715,893 people\n", "\n", "Let me search more specifically for the 2022 population:\n", "\n", "[Multi-Tool Agent] Tool call: brave_web_search(query=Oklahoma City population 2022 exact figure)\n", "\n", "[Multi-Tool Agent] Tool result: Title: Oklahoma City, Oklahoma Population 2025\n", "Description: Oklahoma City is a city located in Cleveland County, Oklahoma. Oklahoma City has a 2025 population of 715,893.\n", "URL: https://worldpopulationreview.com/us-cities/oklahoma/oklahoma-city\n", "\n", "Title: Oklahoma City Metro Area Population 1950-2025\n", "Description: The metro area population of Oklahoma City in 2022 was 1,008,000, a 1% increase from 2021.\n", "URL: https://www.macrotrends.net/cities/23088/oklahoma-city/population\n", "\n", "Title: Oklahoma City city, Oklahoma - Census Bureau Profile\n", "Description: We're sorry but this website doesn't work properly without JavaScript enabled. Please enable it to continue\n", "URL: https://data.census.gov/profile/Oklahoma_City_city,_Oklahoma?g=160XX00US4055000\n", "\n", "Title: Oklahoma City - Wikipedia\n", "Description: The Oklahoma City metropolitan area has an estimated 1.49 million residents, the largest metropolitan area in the state and 42nd-most populous in the nation. The city limits extend into Canadian, Cleveland, and Pottawatomie counties; however, areas beyond Oklahoma County primarily consist of ...\n", "URL: https://en.wikipedia.org/wiki/Oklahoma_City\n", "\n", "Title: U.S. Census Bureau QuickFacts: Oklahoma City city, Oklahoma\n", "Description: QuickFacts provides statistics for all states and counties. 
Also for cities and towns with a population of 5,000 or more.\n", "URL: https://www.census.gov/quickfacts/oklahomacitycityoklahoma\n", "\n", "Title: Oklahoma City, OK | Data USA\n", "Description: As of 2023, 12% of Oklahoma City, OK residents (82.6k people) were born outside of the United States, which is lower than the national average of 13.8%. In 2022, the percentage of foreign-born citizens in Oklahoma City, OK was 11.8%, meaning that the rate has been increasing.\n", "URL: https://datausa.io/profile/geo/oklahoma-city-ok\n", "\n", "Title: Oklahoma City lands in top 20 of largest US cities by population: New census data\n", "Description: More: 10 Oklahoma cities where population is growing the fastest · Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people.\n", "URL: https://www.oklahoman.com/story/news/2024/05/17/oklahoma-city-population-top-20-us-census-data-growth/73729737007/\n", "\n", "Title: Oklahoma Cities by Population (2025)\n", "Description: The most populous cities in Oklahoma are Oklahoma City with 702,767, Tulsa at 411,894, Norman with 130,046, Broken Arrow at 119,194, and Edmond with 98,103.\n", "URL: https://www.oklahoma-demographics.com/cities_by_population\n", "\n", "Title: Oklahoma City, OK\n", "Description: Oklahoma City, Oklahoma has a population of 688,700. The median household income is $66,700 and 15% of residents live below the federal poverty level. 
The racial and ethnic composition is 22% Hispanic, 51% White, 13% Black, 4% Asian and 2% American Indian or Alaska Native.\n", "URL: https://bigcitieshealthdata.org/city/oklahoma-city-ok/\n", "\n", "Title: Oklahoma Cities by Population 2025\n", "Description: Oklahoma City is the biggest city in Oklahoma, with a population of 715,893 in 2025.\n", "URL: https://worldpopulationreview.com/us-cities/oklahoma\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Multi-Tool Agent] Output: I still don't see a specific figure for exactly 2022, but I can gather enough information to make a reasonable estimate.\n", "\n", "[Multi-Tool Agent] Tool call: think(thought=From the search results, I can see:\n", "1. The 2020 census population was 681,054\n", "2. Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people\n", "3. This implies the 2022 population was approximately 7,589/0.01 = 758,900 people\n", "4. But this seems too high compared to other figures like 702,767 and 688,700 mentioned\n", "\n", "Let me reconsider. If Oklahoma City added 7,589 people between 2022-2023 and that represented a 1% growth, then the 2022 population would be:\n", "7,589 ÷ 0.01 = 758,900\n", "\n", "However, this contradicts other data points. Perhaps the clearest estimate is from the second search result which states: \"The metro area population of Oklahoma City in 2022 was 1,008,000\". But this refers to the metro area (larger than just the city proper).\n", "\n", "Another data point from the first search states \"Oklahoma City has a 2025 population of 715,893\" and mentions it \"has increased by 4.8% since the most recent census, which recorded a population of 683,078 in 2020.\"\n", "\n", "If the growth was relatively steady from 2020 to 2025, the annual growth rate would be about 0.96% per year. 
\n", "Using that rate, the 2022 population would be approximately:\n", "683,078 × (1.0096)² = 683,078 × 1.0193 = 696,272\n", "\n", "This seems most consistent with the other data points. I'll use 696,272 as the best estimate for Oklahoma City's 2022 population.)\n", "\n", "[Multi-Tool Agent] Tool result: Thinking complete!\n", "\n", "[Multi-Tool Agent] Output: Based on the available data, I can estimate that Oklahoma City's population in 2022 was approximately 696,272 people. Now I'll calculate the square root of this number.\n", "\n", "[Multi-Tool Agent] Tool call: calculator(number1=696272, number2=0, operator=sqrt)\n", "\n", "[Multi-Tool Agent] Tool result: Result: 834.4291461831855\n", "\n", "[Multi-Tool Agent] Output: The square root of Oklahoma City's population in 2022 (estimated at 696,272 people) is approximately 834.43.\n" ] }, { "data": { "text/plain": [ "Message(id='msg_016wLY8JCd5baUgaWEHcrage', content=[TextBlock(text=\"The square root of Oklahoma City's population in 2022 (estimated at 696,272 people) is approximately 834.43.\", type='text')], model='claude-3-7-sonnet-20250219', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=5, output_tokens=34, cache_creation_input_tokens=150, cache_read_input_tokens=3881, service_tier='standard'))" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example query\n", "agent.run(\"What's the square root of the OKC population in 2022\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 3 MCP tools from 2 servers.\n", "\n", "[Multi-Tool Agent] Received: How many bananas will fit in an Toyota GR86?\n", "\n", "[Multi-Tool Agent] Output: That's an interesting question! 
To answer how many bananas will fit in a Toyota GR86, I'll need to find information about the car's cargo capacity and then estimate how many bananas could fit in that space.\n", "\n", "[Multi-Tool Agent] Tool call: brave_web_search(query=Toyota GR86 cargo capacity trunk volume)\n", "\n", "[Multi-Tool Agent] Tool result: Title: Cargo space. What can it fit? : r/GR86\n", "Description: 11 votes, 36 comments. 42K subscribers in the GR86 community. For anyone interested in the Toyota GR86 & Subaru BRZ ('22+).\n", "URL: https://www.reddit.com/r/GR86/comments/16xh6ii/cargo_space_what_can_it_fit/\n", "\n", "Title: 2024 Toyota GR86 Review, Pricing, and Specs\n", "Description: The 2024 Toyota GR86 turns every day into an autocross day thanks to its playful chassis and easy-to-slide rear-wheel drive antics.\n", "URL: https://www.caranddriver.com/toyota/gr-86/specs\n", "\n", "Title: 2023 Toyota GR86 Interior, Cargo Space & Seating | U.S. News\n", "Description: "You might not think practicality was a consideration with the GR86, but it is surprisingly useful for weekly errands. Its relatively small 6.3 cubic feet of trunk space is hampered by its short height. But when the seats are folded, Toyota says the GR86 can hold a set of its own replacement ...\n", "URL: https://cars.usnews.com/cars-trucks/toyota/gr86/2023/interior\n", "\n", "Title: 2025 Toyota GR86 Review, Pricing, and Specs\n", "Description: DIMENSIONS Wheelbase: 101.4 in Length: 167.9 in Width: 69.9 in Height: 51.6 in Passenger Volume: 78 ft3 Trunk Volume: 6 ft3 Curb Weight: 2838 lb · C/D TEST RESULTS 60 mph: 5.4 sec 100 mph: 13.8 sec 1/4-Mile: 14.0 sec @ 101 mph 130 mph: 27.2 sec Results above omit 1-ft rollout of 0.3 sec.\n", "URL: https://www.caranddriver.com/toyota/gr-86\n", "\n", "Title: Toyota GR86 dimensions, boot space and similars\n", "Description: Toyota GR86 dimensions with interior photos, boot space and measurements of length, width and height. 
Previous cars and similars in size.\n", "URL: https://www.automobiledimension.com/model/toyota/gr86\n", "\n", "Title: Trunk Size | Toyota GR86 Forum - GT86 Forum, Subaru BRZ Forum, Scion ...\n", "Description: It still surprises me that I can fit so much in the trunk. I am able to fit my entire paintball gear bag, pods, cleats, 2 cases of paint, mask case, small ice chest, and my extra clothes and smaller items. Gotta enjoy the little things with these beautiful cars!\n", "URL: https://www.gr86.org/threads/trunk-size.8574/\n", "\n", "Title: Toyota GR86 Cargo Space | Toyota GR86 Forum - GT86 Forum, Subaru BRZ Forum, Scion ...\n", "Description: Not sure if serious or market differences, but it is mandated in all cars in the U.S. if the trunk is much bigger than a shoebox lol. ... This is an older thread, you may not receive a response, and could be reviving an old thread. Please consider creating a new thread. ... Toyota GR86 Forum ...\n", "URL: https://www.gr86.org/threads/toyota-gr86-cargo-space.372/\n", "\n", "Title: GR86 - GR86 Boot Dimensions | Toyota GR Forum for owners of GR Yaris, GR86, GR Supra and GR Corolla\n", "Description: During my time with the GR86, I tried to take accurate measurements of the boot dimensions (in centimetres), as shown below. Sorry the pictures are not exactly to scale - my MS Paint skills are rather limited! = The boot opening is 60cm at its narrowest (at the bottom), 80cm at the widest...\n", "URL: https://www.gr-zoo.com/threads/gr86-boot-dimensions.2567/\n", "\n", "Title: 2025 Toyota GR86 Interior, Cargo Space & Seating | U.S. News\n", "Description: Few sports cars have large trunks, and the GR86 is no exception. It has just 6.3 cubic feet of trunk space, which is not much even for this class. 
You can fit several grocery bags or a couple of carry-on bags with the rear seats upright.\n", "URL: https://cars.usnews.com/cars-trucks/toyota/gr86/interior\n", "\n", "Title: Toyota GR86 (2025) boot space & practicality\n", "Description: Read what our experts think about the Toyota GR86's practicality, boot size and overall dimensions, how it compares to its rivals and more.\n", "URL: https://www.parkers.co.uk/toyota/gr86/review/practicality/\n", "\n", "[Multi-Tool Agent] Output: Now I need to get information about the average size of a banana to make this calculation:\n", "\n", "[Multi-Tool Agent] Tool call: brave_web_search(query=average banana size dimensions volume)\n", "\n", "[Multi-Tool Agent] Tool result: Title: Banana for scale - Universal converter - Unit conversion for all!\n", "Description: Introducing the banana as the universal measurement unit, all conversion-related confusion can be avoided. Use this handy app to convert imperial and metric measurements to universal bananas.\n", "URL: http://bananaforscale.info/\n", "\n", "Title: How big is a banana? - Quora\n", "Description: Answer (1 of 10): How big is a banana? Is that your banana or my banana? Musa Ingens, the world's largest banana tree growing in the forest of Papua New Guinea and Indonesia. It is the tallest species of bananas and the world's largest herbaceous plant. The size of the Musa Ingens banana is imp...\n", "URL: https://www.quora.com/How-big-is-a-banana\n", "\n", "Title: How Many Calories and Carbs Are in a Banana?\n", "Description: Bananas are extremely healthy and nutritious. Most people know this, but many wonder how many calories and carbs bananas contain.\n", "URL: https://www.healthline.com/nutrition/bananas-calories-carbs\n", "\n", "Title: Banana - Wikipedia\n", "Description: As of 2018, bananas are exported in larger volume and to a larger value than any other fruit. 
In 2022, world production of bananas and plantains combined was 179 million tonnes, led by India and China with a combined total of 26% of global production. Other major producers were Uganda, Indonesia, ...\n", "URL: https://en.wikipedia.org/wiki/Banana\n", "\n", "Title: On the Usefulness of Bananas: A New System of Measurement | by Liam Foley | Medium\n", "Description: According to a study conducted by Mahmoud Soltani of the University of Tehran, on average, a fully ripe banana fruit of the Cavendish variety measures 200.25 millimetres in length, with a mass of 180.56 grams, and an ellipsoid volume of 156.10 cubic centimetres.\n", "URL: https://medium.com/@liamfoley2430/on-the-usefulness-of-bananas-a-new-system-of-measurement-76c50d8bb89a\n", "\n", "Title: Banana-based Length & Distance Unit Converter | Convert To Bananas\n", "Description: Convert any length to bananas (and to useful units as well)\n", "URL: https://www.converttobananas.com/\n", "\n", "Title: r/atlanticdiscussions on Reddit: HOW LONG IS THE, UM, AVERAGE BANANA? And why do you all want to know so bad?\n", "Description: As a writer on the internet, I often feel as though I’m getting a peek into the American mind. Recently, while looking at search data, I discovered…\n", "URL: https://www.reddit.com/r/atlanticdiscussions/comments/pbenqf/how_long_is_the_um_average_banana_and_why_do_you/\n", "\n", "Title: How Long Is the, Um, Average Banana?\n", "Description: Bananas come in a total of five sizes — extra small, small, medium, large and extra large. But how long is the average banana?\n", "URL: https://melmagazine.com/en-us/story/how-long-is-a-banana\n", "\n", "Title: r/theydidthemath on Reddit: [Request] Assuming the bananas are normal size how many rooms would 102,769 bananas fill?\n", "Description: The average volume of a banana is 156.1 cm³. 
Assuming you mean whole, uncrushed bananas, google search results peg the packing effeciency around 65%.\n", "URL: https://www.reddit.com/r/theydidthemath/comments/rdiyis/request_assuming_the_bananas_are_normal_size_how/\n", "\n", "Title: How big is your banana?\n", "Description: BMI Calculator Enter your details: Height (cm): Weight (kg): BMI (kg/m2) is Category is\n", "URL: https://healthieryou.net.au/wp-content/uploads/2015/11/How-big-is-your-banana-Article.pdf\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Multi-Tool Agent] Output: Now I have the information I need to calculate how many bananas will fit in a Toyota GR86. Let me think through this:\n", "\n", "[Multi-Tool Agent] Tool call: think(thought=From the search results, I found:\n", "\n", "1. Toyota GR86 trunk capacity:\n", " - 6.3 cubic feet of trunk space (according to multiple sources)\n", " - This equals approximately 178,385 cubic centimeters (6.3 ft³ × 28,316.85 cm³/ft³)\n", "\n", "2. Average banana volume:\n", " - A fully ripe Cavendish banana has a volume of approximately 156.10 cubic centimeters\n", "\n", "3. Packing efficiency:\n", " - When packing irregular objects like bananas, we need to account for the space between them\n", " - A Reddit comment mentioned packing efficiency of around 65% for bananas\n", " - This means only about 65% of the total volume can be filled with bananas due to gaps\n", "\n", "So to calculate how many bananas will fit in the Toyota GR86 trunk:\n", "- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\n", "- Number of bananas: 115,950 cm³ ÷ 156.10 cm³ = 743 bananas\n", "\n", "However, the Toyota GR86 also has an interior cabin with a passenger volume of approximately 78 ft³ (according to one of the search results). 
If we were to fill the entire car with bananas (which is not practical but answers the hypothetical question):\n", "\n", "Total car volume = 78 ft³ (passenger) + 6.3 ft³ (trunk) = 84.3 ft³ = 2,387,110 cm³\n", "\n", "With 65% packing efficiency:\n", "- Available volume: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\n", "- Number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = 9,940 bananas\n", "\n", "So, approximately 9,940 bananas could theoretically fit in an entire Toyota GR86 if completely filled, or about 743 bananas in just the trunk.)\n", "\n", "[Multi-Tool Agent] Tool result: Thinking complete!\n", "\n", "[Multi-Tool Agent] Output: # How Many Bananas Will Fit in a Toyota GR86\n", "\n", "Based on my research, I can provide you with an estimate of how many bananas would fit in a Toyota GR86.\n", "\n", "## The Data:\n", "\n", "- **Toyota GR86 trunk capacity**: 6.3 cubic feet (approximately 178,385 cubic centimeters)\n", "- **Toyota GR86 passenger volume**: 78 cubic feet\n", "- **Average banana volume**: 156.10 cubic centimeters per banana\n", "- **Packing efficiency**: Approximately 65% (accounting for inevitable gaps between irregularly shaped bananas)\n", "\n", "## The Calculation:\n", "\n", "If we're talking about filling just the trunk:\n", "- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\n", "- Number of bananas in trunk: 115,950 cm³ ÷ 156.10 cm³ = approximately **743 bananas**\n", "\n", "If we're considering the entire car (including passenger compartment):\n", "- Total car volume: 84.3 cubic feet (2,387,110 cm³)\n", "- Available volume with packing efficiency: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\n", "- Total number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = approximately **9,940 bananas**\n", "\n", "So, you could fit approximately 743 bananas in the trunk alone, or nearly 10,000 bananas if you filled the entire Toyota GR86 (though that would leave no room for a driver!).\n" ] }, { "data": { "text/plain": [ 
"Message(id='msg_01NC46TLNZSG97ZH5v7b7YcN', content=[TextBlock(text=\"# How Many Bananas Will Fit in a Toyota GR86\\n\\nBased on my research, I can provide you with an estimate of how many bananas would fit in a Toyota GR86.\\n\\n## The Data:\\n\\n- **Toyota GR86 trunk capacity**: 6.3 cubic feet (approximately 178,385 cubic centimeters)\\n- **Toyota GR86 passenger volume**: 78 cubic feet\\n- **Average banana volume**: 156.10 cubic centimeters per banana\\n- **Packing efficiency**: Approximately 65% (accounting for inevitable gaps between irregularly shaped bananas)\\n\\n## The Calculation:\\n\\nIf we're talking about filling just the trunk:\\n- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\\n- Number of bananas in trunk: 115,950 cm³ ÷ 156.10 cm³ = approximately **743 bananas**\\n\\nIf we're considering the entire car (including passenger compartment):\\n- Total car volume: 84.3 cubic feet (2,387,110 cm³)\\n- Available volume with packing efficiency: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\\n- Total number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = approximately **9,940 bananas**\\n\\nSo, you could fit approximately 743 bananas in the trunk alone, or nearly 10,000 bananas if you filled the entire Toyota GR86 (though that would leave no room for a driver!).\", type='text')], model='claude-3-7-sonnet-20250219', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=4, output_tokens=385, cache_creation_input_tokens=574, cache_read_input_tokens=6630, service_tier='standard'))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "await agent.run_async(\"How many bananas will fit in an Toyota GR86?\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. Example with Anthropic Server Tools\n", "\n", "This example demonstrates using Anthropic's native server tools for web search and code execution." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Server Tools Agent] Agent initialized\n" ] } ], "source": [ "# Create Anthropic server tools\n", "web_search_tool = WebSearchServerTool(\n", " name=\"web_search\",\n", " max_uses=5, # Limit to 5 searches per request\n", " blocked_domains=[\"example.com\"] # Example of blocking specific domains\n", ")\n", "\n", "code_execution_tool = CodeExecutionServerTool()\n", "\n", "# Initialize agent with server tools\n", "server_agent = Agent(\n", " name=\"Server Tools Agent\",\n", " system=\"\"\"\n", "You are a helpful assistant with access to:\n", "1. Web search for finding current information\n", "2. Code execution for running Python code\n", "3. Think tool for complex reasoning\n", "\n", "Use these tools effectively to answer questions that require current data or calculations.\n", "\"\"\",\n", " tools=[think_tool, web_search_tool, code_execution_tool],\n", " config=ModelConfig(\n", " model=\"claude-sonnet-4-20250514\",\n", " max_tokens=4096,\n", " temperature=0.7\n", " ),\n", " verbose=True\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Server Tools Agent] Received: \n", "Search for the current population of Tokyo, Japan. \n", "Then write and execute Python code to calculate how many people that would be per square kilometer, \n", "given that Tokyo's area is approximately 2,194 square kilometers.\n", "\n", "\n", "[Server Tools Agent] Output: I'll help you find Tokyo's current population and then calculate the population density. 
Let me start by searching for the current population data.\n", "\n", "[Server Tools Agent] Output: Now let me search for more specific information about Tokyo's current population, as the results show different figures for different definitions of \"Tokyo\" (city proper vs metropolitan area).\n", "\n", "[Server Tools Agent] Tool call: think(thought=Now I have a clearer picture of Tokyo's population. From the search results, I can see there are different definitions of \"Tokyo\":\n", "\n", "1. Tokyo Metropolis (Tokyo-to) - the prefecture/administrative area: Around 14 million people as of 2024\n", "2. The 23 Special Wards (central Tokyo): Around 9.2-9.7 million people\n", "3. Greater Tokyo Area (metropolitan area): Around 37 million people\n", "\n", "For the calculation with the given area of 2,194 square kilometers, this seems to refer to the Tokyo Metropolis (Tokyo-to) area, as the search results mention:\n", "- \"metropolis population is 13.9 Million as on 2019 and has area of 2,194 Sq km\" (from source 17)\n", "- The surface area of the city metropolis comes to approximately 2,187.66 km² (from source 2)\n", "\n", "So I should use approximately 14 million as the current population for Tokyo Metropolis for the calculation with the 2,194 sq km area.\n", "\n", "Let me write Python code to calculate the population density.)\n", "\n", "[Server Tools Agent] Tool result: Thinking complete!\n", "\n", "[Server Tools Agent] Output: Now I'll write and execute Python code to calculate Tokyo's population density based on the information I found.\n", "\n", "[Server Tools Agent] Output: Based on my search and calculations, here are the key findings:\n", "\n", "## Tokyo's Current Population\n", "\n", "\n", "\n", "[Server Tools Agent] Output: Tokyo Metropolis (Tokyo-to) has a population of over 14 million people as of 2023-2024\n", "\n", "[Server Tools Agent] Output: . 
The search results show different figures depending on the specific definition of \"Tokyo\":\n", "\n", "- \n", "\n", "[Server Tools Agent] Output: The 23 wards have a population of 9.2 million\n", "\n", "[Server Tools Agent] Output: \n", "- \n", "\n", "[Server Tools Agent] Output: The metropolis has a population that exceeds 13 million\n", "\n", "[Server Tools Agent] Output: \n", "- \n", "\n", "[Server Tools Agent] Output: The Greater Tokyo Area has 41 million residents as of 2024\n", "\n", "[Server Tools Agent] Output: \n", "\n", "## Population Density Calculation\n", "\n", "Using the current population of approximately **14 million people** for Tokyo Metropolis and the given area of **2,194 square kilometers**, the calculation shows:\n", "\n", "**Tokyo's population density is approximately 6,381 people per square kilometer**\n", "\n", "This aligns well with existing data, as \n", "\n", "[Server Tools Agent] Output: one source indicates Tokyo metropolis has a density of 6,349 persons per square kilometer\n", "\n", "[Server Tools Agent] Output: , confirming our calculation is accurate.\n", "\n", "## Context\n", "\n", "This makes Tokyo one of the most densely populated metropolitan areas in the world. 
\n", "\n", "[Server Tools Agent] Output: The greater Tokyo metropolitan area is estimated to be over 36 million people and is home to 25% of Japan's population, making it the most populous metropolitan area in the world\n", "\n", "[Server Tools Agent] Output: .\n" ] }, { "data": { "text/plain": [ "Message(id='msg_018nAgEXuVMSJCTSrSjcZtrj', content=[TextBlock(text=\"Now I'll write and execute Python code to calculate Tokyo's population density based on the information I found.\", type='text'), TextBlock(text=None, type='server_tool_use', id='srvtoolu_01JAxi9fcajoScWQWEG1bJ2S', name='code_execution', input={'code': '# Tokyo population and area data\\n# Based on search results, Tokyo Metropolis (Tokyo-to) has approximately 14 million people\\n# and covers 2,194 square kilometers\\n\\ntokyo_population = 14_000_000 # Current population of Tokyo Metropolis (2024)\\ntokyo_area_sq_km = 2194 # Area in square kilometers\\n\\n# Calculate population density\\npopulation_density = tokyo_population / tokyo_area_sq_km\\n\\nprint(\"Tokyo Population Density Calculation\")\\nprint(\"=\" * 40)\\nprint(f\"Tokyo Metropolis Population: {tokyo_population:,} people\")\\nprint(f\"Tokyo Metropolis Area: {tokyo_area_sq_km:,} square kilometers\")\\nprint(f\"Population Density: {population_density:,.2f} people per square kilometer\")\\n\\n# Let\\'s also show some context with rounded numbers\\nprint(f\"\\\\nRounded: Approximately {population_density:,.0f} people per square kilometer\")\\n\\n# For comparison, let\\'s also calculate for different population estimates\\nprint(\"\\\\n\" + \"=\"*50)\\nprint(\"COMPARISON WITH DIFFERENT POPULATION ESTIMATES:\")\\nprint(\"=\"*50)\\n\\n# Different population estimates from search results\\nestimates = [\\n (\"Tokyo Metropolis (conservative estimate)\", 13_500_000),\\n (\"Tokyo Metropolis (current estimate)\", 14_000_000),\\n (\"Tokyo Metropolis (high estimate)\", 14_300_000),\\n]\\n\\nfor description, population in estimates:\\n density = population / 
tokyo_area_sq_km\\n print(f\"{description}: {density:,.0f} people/km²\")'}), TextBlock(text=None, type='code_execution_tool_result', tool_use_id='srvtoolu_01JAxi9fcajoScWQWEG1bJ2S', content={'type': 'code_execution_result', 'stdout': 'Tokyo Population Density Calculation\\n========================================\\nTokyo Metropolis Population: 14,000,000 people\\nTokyo Metropolis Area: 2,194 square kilometers\\nPopulation Density: 6,381.04 people per square kilometer\\n\\nRounded: Approximately 6,381 people per square kilometer\\n\\n==================================================\\nCOMPARISON WITH DIFFERENT POPULATION ESTIMATES:\\n==================================================\\nTokyo Metropolis (conservative estimate): 6,153 people/km²\\nTokyo Metropolis (current estimate): 6,381 people/km²\\nTokyo Metropolis (high estimate): 6,518 people/km²\\n', 'stderr': '', 'return_code': 0, 'content': []}), TextBlock(text=\"Based on my search and calculations, here are the key findings:\\n\\n## Tokyo's Current Population\\n\\n\", type='text'), TextBlock(text='Tokyo Metropolis (Tokyo-to) has a population of over 14 million people as of 2023-2024', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'With a population of over 14 million in the city proper in 2023, it is one of the most populous urban areas in the world. ', 'url': 'https://en.wikipedia.org/wiki/Tokyo', 'title': 'Tokyo - Wikipedia', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDNkmb67meDKwvfNSWBoMey/C2mf4C76VGRm/IjB/p9OlOe00CdTjisZ+Soyr8DL2IDH8J+LioeVOOHK7L/3HfUnClNRaZvtHREGHnFQqE0Pko8rhETBI/USi53r6+Vq2i1sYBA=='}, {'type': 'web_search_result_location', 'cited_text': 'The area within the dashed white line is home to 14 million people, as of 2020. 
', 'url': 'http://www.statsmapsnpix.com/2022/09/how-big-is-tokyo.html', 'title': 'Stats, Maps n Pix: How big is Tokyo?', 'encrypted_index': 'EpABCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDN3Dl6f/w5+R2DpZzRoMZiaU1oKXtaWciBkzIjCeqzTw6VNVH9T8pJcQGylS/KdoN8tamtX70LsJ7ygZeDUBnbVxfCebu8Xfpv9tdJAqFN2KZA2z6Q/22MX5gewO0+OfwUtrGAQ='}, {'type': 'web_search_result_location', 'cited_text': 'The area within the dashed white line is home to 14 million people, as of 2020. ', 'url': 'http://www.statsmapsnpix.com/2022/09/how-big-is-tokyo.html', 'title': 'Stats, Maps n Pix: How big is Tokyo?', 'encrypted_index': 'EpABCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDPgOVKuta9qNyug8dRoMzPFdKW3fR2mvZIK7IjCUZybt1VVAMoVOA3VEOCprNID+ZQWavx0Z9ug53tjoeanpJvyGUIqn65McFNAKksEqFDn6oY/apm+AjLJk908w1NyD4w0OGAQ='}]), TextBlock(text='. The search results show different figures depending on the specific definition of \"Tokyo\":\\n\\n- ', type='text'), TextBlock(text='The 23 wards have a population of 9.2 million', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. ', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDGHmgL6AaaF2lV1RIBoMst2h5K/VIu2RGDTwIjBLqvkNQPtkPP1V49Wf9IxwZBOyM+HgBvi7rd6yR/Wkjr9LpT3HnIQOT5FetsLzqUoqE5k8jG1Xsz7hyicRcL/9UCJHKU8YBA=='}]), TextBlock(text='\\n- ', type='text'), TextBlock(text='The metropolis has a population that exceeds 13 million', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. 
', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDHsQQzoI+Hz+TiOD7xoMgVi2a9XbrLmJlkNKIjCXGiOzb7fzGY/ueYkCuE280j3RfDhLCJ3YWi6gVboWe2BidSHKRwJBoPbhHAaWwVYqEwZ9kwdNZiabocE1Kbh3m6GW/SMYBA=='}, {'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. ', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDHolkkUxMX/ORAlhZBoMoDhnLJx+SR+8R8aTIjDBKIO2DJG07HcLucjTH1y1LCGBRw1mv/t061+rSVpTbV7iY5DXREoEUGCcMFVasUgqFT9KXif7+m2jD/0yN42wWd02HG+jbRgE'}]), TextBlock(text='\\n- ', type='text'), TextBlock(text='The Greater Tokyo Area has 41 million residents as of 2024', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The Greater Tokyo Area, which includes Tokyo and parts of six neighboring prefectures, is the most populous metropolitan area in the world, with 41 mi...', 'url': 'https://en.wikipedia.org/wiki/Tokyo', 'title': 'Tokyo - Wikipedia', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDPu/83J4h+//6WejfhoMabArnZYopPgIgBadIjABzh8N5tuANZ8z7yijY+HpuAP+iw6WDSwqolJU6364du78SvzlqUXbWLujKQdIhMcqE1fHzX0qTHJmNVRZHIt3AcmEBLwYBA=='}]), TextBlock(text=\"\\n\\n## Population Density Calculation\\n\\nUsing the current population of approximately **14 million people** for Tokyo Metropolis and the given area of **2,194 square kilometers**, the calculation shows:\\n\\n**Tokyo's population density is approximately 6,381 people per square kilometer**\\n\\nThis aligns well with existing data, as \", type='text'), TextBlock(text='one source indicates Tokyo metropolis has a density of 6,349 persons per square kilometer', type='text', citations=[{'type': 
'web_search_result_location', 'cited_text': 'Tokyo population in 2023 is estimated to be 35.8 million, population in 2022 is estimated to be 35.6 million, in 2021 is estimated to be 37.5 million,...', 'url': 'https://www.populationu.com/cities/tokyo-population', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDH+CHv/68jATiDYFzRoMKun6Si5c+OHOiG9WIjAq6lZhNchC2RxsNLZMgheItJe0yOT53PlDChrsyxJztTkx8aSELTlrH+2V953cZhYqFUMI2gojH59prn81fC8mSgHNmvaKKBgE'}]), TextBlock(text=', confirming our calculation is accurate.\\n\\n## Context\\n\\nThis makes Tokyo one of the most densely populated metropolitan areas in the world. ', type='text'), TextBlock(text=\"The greater Tokyo metropolitan area is estimated to be over 36 million people and is home to 25% of Japan's population, making it the most populous metropolitan area in the world\", type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The greater Tokyo metropolitan area, which is spread over 3 prefectures, is much larger and has a population that is estimated to be over 36 million. 
...', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDC3eHB8MpV+6PvSJ9hoMGBBbLOOC80uQmSO4IjCujon0QmWYTtDWe+Po+0YK+bXYbwcEwKmurDHUK1IuU6w2rvi9Rq2XeSecqpnOhNgqFU8JodAjBnotfaYFd1dee+V5IBAlIxgE'}]), TextBlock(text='.', type='text')], model='claude-sonnet-4-20250514', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=9, output_tokens=841, cache_creation_input_tokens=928, cache_read_input_tokens=48328, service_tier='standard', server_tool_use={'web_search_requests': 0}), container={'id': 'container_011CQTnr9bhVu3fPTnaT85oA', 'expires_at': '2025-06-24T22:55:24.157374+00:00'})" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example 1: Use web search to find current information and code execution for analysis\n", "server_agent.run(\"\"\"\n", "Search for the current population of Tokyo, Japan. \n", "Then write and execute Python code to calculate how many people that would be per square kilometer, \n", "given that Tokyo's area is approximately 2,194 square kilometers.\n", "\"\"\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: agents/test_message_params.py ================================================ #!/usr/bin/env python3 """Test suite for Agent message_params functionality. This module tests the ability to pass custom parameters to the Claude API through the Agent's message_params argument, including headers, metadata, and API parameters. 
""" import os import sys # Add parent directory to path for imports sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from agents.agent import Agent, ModelConfig class TestMessageParams: """Test cases for message_params functionality.""" def __init__(self, verbose: bool = True): """Initialize test suite. Args: verbose: Whether to print detailed output """ self.verbose = verbose self.passed = 0 self.failed = 0 def _print(self, message: str) -> None: """Print message if verbose mode is on.""" if self.verbose: print(message) def _run_test(self, test_name: str, test_func: callable) -> None: """Run a single test and track results. Args: test_name: Name of the test test_func: Test function to execute """ self._print(f"\n{'='*60}") self._print(f"Running: {test_name}") self._print('='*60) try: test_func() self.passed += 1 self._print(f"✓ {test_name} PASSED") except Exception as e: self.failed += 1 self._print(f"✗ {test_name} FAILED: {str(e)}") if self.verbose: import traceback traceback.print_exc() def test_basic_agent(self) -> None: """Test agent without message_params to ensure backward compatibility.""" agent = Agent( name="BasicAgent", system="You are a helpful assistant. Be very brief.", verbose=False ) response = agent.run("What is 2+2?") # response is a list of message content blocks assert any("4" in str(block.get("text", "")) for block in response if block.get("type") == "text") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") self._print(f"Response: {response_text}") def test_custom_headers(self) -> None: """Test passing custom headers through message_params.""" agent = Agent( name="HeaderAgent", system="You are a helpful assistant. 
Be very brief.", verbose=False, message_params={ "extra_headers": { "X-Custom-Header": "test-value", "X-Request-ID": "test-12345" } } ) # Verify headers are stored assert "extra_headers" in agent.message_params assert agent.message_params["extra_headers"]["X-Custom-Header"] == "test-value" response = agent.run("What is 3+3?") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") assert "6" in response_text self._print(f"Response with custom headers: {response_text}") def test_beta_headers(self) -> None: """Test passing beta feature headers.""" agent = Agent( name="BetaAgent", system="You are a helpful assistant. Be very brief.", verbose=False, message_params={ "extra_headers": { "anthropic-beta": "files-api-2025-04-14" } } ) # The API call should succeed even with beta headers response = agent.run("What is 5*5?") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") assert "25" in response_text self._print(f"Response with beta headers: {response_text}") def test_metadata(self) -> None: """Test passing valid metadata fields.""" agent = Agent( name="MetadataAgent", system="You are a helpful assistant. 
Be very brief.", verbose=False, message_params={ "metadata": { "user_id": "test-user-123" } } ) response = agent.run("What is 10/2?") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") assert "5" in response_text self._print(f"Response with metadata: {response_text}") def test_api_parameters(self) -> None: """Test passing various API parameters.""" agent = Agent( name="ParamsAgent", system="You are a helpful assistant.", verbose=False, message_params={ "top_k": 10, "top_p": 0.95, "temperature": 0.7 } ) # Verify parameters are passed through params = agent._prepare_message_params() assert params["top_k"] == 10 assert params["top_p"] == 0.95 assert params["temperature"] == 0.7 response = agent.run("Say 'test'") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") assert response_text self._print(f"Response with custom params: {response_text}") def test_parameter_override(self) -> None: """Test that message_params override config defaults.""" config = ModelConfig( temperature=1.0, max_tokens=100 ) agent = Agent( name="OverrideAgent", system="You are a helpful assistant.", config=config, verbose=False, message_params={ "temperature": 0.5, # Should override config "max_tokens": 200 # Should override config } ) params = agent._prepare_message_params() assert params["temperature"] == 0.5 assert params["max_tokens"] == 200 self._print("Parameter override successful") def test_invalid_metadata_field(self) -> None: """Test that invalid metadata fields are properly rejected by the API.""" agent = Agent( name="InvalidAgent", system="You are a helpful assistant.", verbose=False, message_params={ "metadata": { "user_id": "valid", "invalid_field": "should-fail" } } ) try: agent.run("Test") # Should not reach here raise AssertionError("Expected API error for invalid metadata field") except Exception as e: assert "invalid_request_error" in str(e) or "metadata" in str(e).lower() 
self._print(f"Correctly rejected invalid metadata: {type(e).__name__}") def test_combined_parameters(self) -> None: """Test combining multiple parameter types.""" agent = Agent( name="CombinedAgent", system="You are a helpful assistant. Be very brief.", verbose=False, message_params={ "extra_headers": { "X-Test": "combined", "anthropic-beta": "files-api-2025-04-14" }, "metadata": { "user_id": "combined-test" }, "temperature": 0.8, "top_k": 5 } ) params = agent._prepare_message_params() assert params["extra_headers"]["X-Test"] == "combined" assert params["metadata"]["user_id"] == "combined-test" assert params["temperature"] == 0.8 assert params["top_k"] == 5 response = agent.run("What is 1+1?") response_text = next((block["text"] for block in response if block.get("type") == "text"), "") assert "2" in response_text self._print(f"Response with combined params: {response_text}") def run_all_tests(self) -> None: """Run all test cases.""" self._print("\nAgent message_params Test Suite") self._print("="*60) tests = [ ("Basic Agent (No message_params)", self.test_basic_agent), ("Custom Headers", self.test_custom_headers), ("Beta Feature Headers", self.test_beta_headers), ("Valid Metadata", self.test_metadata), ("API Parameters", self.test_api_parameters), ("Parameter Override", self.test_parameter_override), ("Invalid Metadata Field", self.test_invalid_metadata_field), ("Combined Parameters", self.test_combined_parameters), ] for test_name, test_func in tests: self._run_test(test_name, test_func) self._print(f"\n{'='*60}") self._print(f"Test Results: {self.passed} passed, {self.failed} failed") self._print("="*60) return self.failed == 0 def main(): """Run the test suite.""" # Check for API key if not os.environ.get("ANTHROPIC_API_KEY"): print("Error: Please set ANTHROPIC_API_KEY environment variable") sys.exit(1) # Run tests test_suite = TestMessageParams(verbose=True) success = test_suite.run_all_tests() # Exit with appropriate code sys.exit(0 if success else 1) if 
@dataclass
class Tool:
    """Common base for agent tools: a name, a description and a JSON input schema."""

    name: str
    description: str
    input_schema: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return the tool definition as the dict sent to the Claude API."""
        exported_fields = ("name", "description", "input_schema")
        return {field: getattr(self, field) for field in exported_fields}

    async def execute(self, **kwargs) -> str:
        """Run the tool; concrete subclasses are expected to override this.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Tool subclasses must implement execute method"
        raise NotImplementedError(message)
@dataclass
class CodeExecutionServerTool:
    """Descriptor for the code execution server tool.

    It only carries the tool's ``name`` and ``type`` identifiers and
    serialises them via ``to_dict``; it defines no ``execute`` method.
    """

    name: str = "code_execution"
    type: str = "code_execution_20250522"

    def to_dict(self) -> dict[str, Any]:
        """Serialise to the server tool format: ``type`` first, then ``name``."""
        return dict(type=self.type, name=self.name)
Operations: - read: Read the contents of a file - list: List files in a directory """, input_schema={ "type": "object", "properties": { "operation": { "type": "string", "enum": ["read", "list"], "description": "File operation to perform", }, "path": { "type": "string", "description": "File path for read or directory path", }, "max_lines": { "type": "integer", "description": "Maximum lines to read (0 means no limit)", }, "pattern": { "type": "string", "description": "File pattern to match", }, }, "required": ["operation", "path"], }, ) async def execute( self, operation: str, path: str, max_lines: int = 0, pattern: str = "*", ) -> str: """Execute a file read operation. Args: operation: The operation to perform (read or list) path: The file or directory path max_lines: Maximum lines to read (for read operation, 0 means no limit) pattern: File pattern to match (for list operation) Returns: Result of the operation as string """ if operation == "read": return await self._read_file(path, max_lines) elif operation == "list": return await self._list_files(path, pattern) else: return f"Error: Unsupported operation '{operation}'" async def _read_file(self, path: str, max_lines: int = 0) -> str: """Read a file from disk. 
Args: path: Path to the file to read max_lines: Maximum number of lines to read (0 means read entire file) """ try: file_path = Path(path) if not file_path.exists(): return f"Error: File not found at {path}" if not file_path.is_file(): return f"Error: {path} is not a file" def read_sync(): with open(file_path, encoding="utf-8", errors="replace") as f: if max_lines > 0: lines = [] for i, line in enumerate(f): if i >= max_lines: break lines.append(line) return "".join(lines) return f.read() return await asyncio.to_thread(read_sync) except Exception as e: return f"Error reading {path}: {str(e)}" async def _list_files(self, directory: str, pattern: str = "*") -> str: """List files in a directory.""" try: dir_path = Path(directory) if not dir_path.exists(): return f"Error: Directory not found at {directory}" if not dir_path.is_dir(): return f"Error: {directory} is not a directory" def list_sync(): search_pattern = f"{directory}/{pattern}" files = glob.glob(search_pattern) if not files: return f"No files found matching {directory}/{pattern}" file_list = [] for file_path in sorted(files): path_obj = Path(file_path) rel_path = str(file_path).replace(str(dir_path) + "/", "") if path_obj.is_dir(): file_list.append(f"📁 {rel_path}/") else: file_list.append(f"📄 {rel_path}") return "\n".join(file_list) return await asyncio.to_thread(list_sync) except Exception as e: return f"Error listing files in {directory}: {str(e)}" class FileWriteTool(Tool): """Tool for writing and editing files.""" def __init__(self): super().__init__( name="file_write", description=""" Write or edit files. 
Operations: - write: Create or completely replace a file - edit: Make targeted changes to parts of a file """, input_schema={ "type": "object", "properties": { "operation": { "type": "string", "enum": ["write", "edit"], "description": "File operation to perform", }, "path": { "type": "string", "description": "File path to write to or edit", }, "content": { "type": "string", "description": "Content to write", }, "old_text": { "type": "string", "description": "Text to replace (for edit operation)", }, "new_text": { "type": "string", "description": "Replacement text (for edit operation)", }, }, "required": ["operation", "path"], }, ) async def execute( self, operation: str, path: str, content: str = "", old_text: str = "", new_text: str = "", ) -> str: """Execute a file write operation. Args: operation: The operation to perform (write or edit) path: The file path content: Content to write (for write operation) old_text: Text to replace (for edit operation) new_text: Replacement text (for edit operation) Returns: Result of the operation as string """ if operation == "write": if not content: return "Error: content parameter is required" return await self._write_file(path, content) elif operation == "edit": if not old_text or not new_text: return ( "Error: both old_text and new_text parameters " "are required for edit operation" ) return await self._edit_file(path, old_text, new_text) else: return f"Error: Unsupported operation '{operation}'" async def _write_file(self, path: str, content: str) -> str: """Write content to a file.""" try: file_path = Path(path) os.makedirs(file_path.parent, exist_ok=True) def write_sync(): with open(file_path, "w", encoding="utf-8") as f: f.write(content) return ( f"Successfully wrote {len(content)} " f"characters to {path}" ) return await asyncio.to_thread(write_sync) except Exception as e: return f"Error writing to {path}: {str(e)}" async def _edit_file(self, path: str, old_text: str, new_text: str) -> str: """Make targeted changes to a 
file.""" try: file_path = Path(path) if not file_path.exists(): return f"Error: File not found at {path}" if not file_path.is_file(): return f"Error: {path} is not a file" def edit_sync(): try: with open( file_path, encoding="utf-8", errors="replace" ) as f: content = f.read() if old_text not in content: return ( f"Error: The specified text was not " f"found in {path}" ) # Count occurrences to warn about multiple matches count = content.count(old_text) if count > 1: # Edit with warning about multiple occurrences new_content = content.replace(old_text, new_text) with open(file_path, "w", encoding="utf-8") as f: f.write(new_content) return ( f"Warning: Found {count} occurrences. " f"All were replaced in {path}" ) else: # One occurrence, straightforward replacement new_content = content.replace(old_text, new_text) with open(file_path, "w", encoding="utf-8") as f: f.write(new_content) return f"Successfully edited {path}" except UnicodeDecodeError: return f"Error: {path} appears to be a binary file" return await asyncio.to_thread(edit_sync) except Exception as e: return f"Error editing {path}: {str(e)}" ================================================ FILE: agents/tools/mcp_tool.py ================================================ """Tools that interface with MCP servers.""" from typing import Any from .base import Tool from ..utils.connections import MCPConnection class MCPTool(Tool): def __init__( self, name: str, description: str, input_schema: dict[str, Any], connection: "MCPConnection", ): super().__init__( name=name, description=description, input_schema=input_schema ) self.connection = connection async def execute(self, **kwargs) -> str: """Execute the MCP tool with the given input_schema. 
class ThinkTool(Tool):
    """Scratchpad tool: takes a ``thought`` string and performs no action."""

    def __init__(self):
        thought_schema = {
            "type": "object",
            "properties": {
                "thought": {
                    "type": "string",
                    "description": "A thought to think about.",
                }
            },
            "required": ["thought"],
        }
        tool_description = (
            "Use the tool to think about something. "
            "It will not obtain new information or change the database, "
            "but just append the thought to the log. "
            "Use it when complex reasoning or some cache memory is needed."
        )
        super().__init__(
            name="think",
            description=tool_description,
            input_schema=thought_schema,
        )

    async def execute(self, thought: str) -> str:
        """Return a fixed acknowledgement; ``thought`` itself is not used or echoed."""
        return "Thinking complete!"
@dataclass
class WebSearchServerTool:
    """Descriptor for the web search server tool and its optional settings."""

    name: str = "web_search"
    type: str = "web_search_20250305"
    max_uses: Optional[int] = None
    allowed_domains: Optional[list[str]] = None
    blocked_domains: Optional[list[str]] = None
    user_location: Optional[dict[str, Any]] = None

    def to_dict(self) -> dict[str, Any]:
        """Serialise to the server tool format.

        ``type`` and ``name`` are always present; each optional setting is
        included only when it is not ``None``.
        """
        payload: dict[str, Any] = {"type": self.type, "name": self.name}

        optional_settings = (
            "max_uses",
            "allowed_domains",
            "blocked_domains",
            "user_location",
        )
        for setting in optional_settings:
            value = getattr(self, setting)
            if value is not None:
                payload[setting] = value

        return payload
class MCPConnectionStdio(MCPConnection):
    """MCP connection using standard input/output."""

    def __init__(
        self,
        command: str,
        args: list[str] | None = None,
        env: dict[str, str] | None = None,
    ):
        """Store the parameters used to launch the server process.

        Args:
            command: Executable to launch
            args: Command-line arguments; ``None`` (or empty) means no arguments
            env: Environment passed through to ``StdioServerParameters``
        """
        super().__init__()
        self.command = command
        # Copy into a fresh list: avoids the shared mutable-default pitfall and
        # normalises None, which create_mcp_connection passes when the config
        # has no "args" key.
        self.args = list(args) if args else []
        self.env = env

    async def _create_rw_context(self):
        """Build the stdio client context for the configured command."""
        return stdio_client(
            StdioServerParameters(
                command=self.command, args=self.args, env=self.env
            )
        )
appropriate MCP connection.""" conn_type = config.get("type", "stdio").lower() if conn_type == "stdio": if not config.get("command"): raise ValueError("Command is required for STDIO connections") return MCPConnectionStdio( command=config["command"], args=config.get("args"), env=config.get("env"), ) elif conn_type == "sse": if not config.get("url"): raise ValueError("URL is required for SSE connections") return MCPConnectionSSE( url=config["url"], headers=config.get("headers") ) else: raise ValueError(f"Unsupported connection type: {conn_type}") async def setup_mcp_connections( mcp_servers: list[dict[str, Any]] | None, stack: AsyncExitStack, ) -> list[MCPTool]: """Set up MCP server connections and create tool interfaces.""" if not mcp_servers: return [] mcp_tools = [] for config in mcp_servers: try: connection = create_mcp_connection(config) await stack.enter_async_context(connection) tool_definitions = await connection.list_tools() for tool_info in tool_definitions: mcp_tools.append( MCPTool( name=tool_info.name, description=tool_info.description or f"MCP tool: {tool_info.name}", input_schema=tool_info.inputSchema, connection=connection, ) ) except Exception as e: print(f"Error setting up MCP server {config}: {e}") print( f"Loaded {len(mcp_tools)} MCP tools from {len(mcp_servers)} servers." 
) return mcp_tools ================================================ FILE: agents/utils/history_util.py ================================================ """Message history with token tracking and prompt caching.""" from typing import Any class MessageHistory: """Manages chat history with token tracking and context management.""" def __init__( self, model: str, system: str, context_window_tokens: int, client: Any, enable_caching: bool = True, ): self.model = model self.system = system self.context_window_tokens = context_window_tokens self.messages: list[dict[str, Any]] = [] self.total_tokens = 0 self.enable_caching = enable_caching self.message_tokens: list[tuple[int, int]] = ( [] ) # List of (input_tokens, output_tokens) tuples self.client = client # set initial total tokens to system prompt try: system_token = ( self.client.messages.count_tokens( model=self.model, system=self.system, messages=[{"role": "user", "content": "test"}], ).input_tokens - 1 ) except Exception: system_token = len(self.system) / 4 self.total_tokens = system_token async def add_message( self, role: str, content: str | list[dict[str, Any]], usage: Any | None = None, ): """Add a message to the history and track token usage.""" if isinstance(content, str): content = [{"type": "text", "text": content}] message = {"role": role, "content": content} self.messages.append(message) if role == "assistant" and usage: total_input = ( usage.input_tokens + getattr(usage, "cache_read_input_tokens", 0) + getattr(usage, "cache_creation_input_tokens", 0) ) output_tokens = usage.output_tokens current_turn_input = total_input - self.total_tokens self.message_tokens.append((current_turn_input, output_tokens)) self.total_tokens += current_turn_input + output_tokens def truncate(self) -> None: """Remove oldest messages when context window limit is exceeded.""" if self.total_tokens <= self.context_window_tokens: return TRUNCATION_NOTICE_TOKENS = 25 TRUNCATION_MESSAGE = { "role": "user", "content": [ { "type": 
"text", "text": "[Earlier history has been truncated.]", } ], } def remove_message_pair(): self.messages.pop(0) self.messages.pop(0) if self.message_tokens: input_tokens, output_tokens = self.message_tokens.pop(0) self.total_tokens -= input_tokens + output_tokens while ( self.message_tokens and len(self.messages) >= 2 and self.total_tokens > self.context_window_tokens ): remove_message_pair() if self.messages and self.message_tokens: original_input_tokens, original_output_tokens = ( self.message_tokens[0] ) self.messages[0] = TRUNCATION_MESSAGE self.message_tokens[0] = ( TRUNCATION_NOTICE_TOKENS, original_output_tokens, ) self.total_tokens += ( TRUNCATION_NOTICE_TOKENS - original_input_tokens ) def format_for_api(self) -> list[dict[str, Any]]: """Format messages for Claude API with optional caching.""" result = [ {"role": m["role"], "content": m["content"]} for m in self.messages ] if self.enable_caching and self.messages: result[-1]["content"] = [ {**block, "cache_control": {"type": "ephemeral"}} for block in self.messages[-1]["content"] ] return result ================================================ FILE: agents/utils/tool_util.py ================================================ """Tool execution utility with parallel execution support.""" import asyncio from typing import Any async def _execute_single_tool( call: Any, tool_dict: dict[str, Any] ) -> dict[str, Any]: """Execute a single tool and handle errors.""" response = {"type": "tool_result", "tool_use_id": call.id} try: # Execute the tool directly result = await tool_dict[call.name].execute(**call.input) response["content"] = str(result) except KeyError: response["content"] = f"Tool '{call.name}' not found" response["is_error"] = True except Exception as e: response["content"] = f"Error executing tool: {str(e)}" response["is_error"] = True return response async def execute_tools( tool_calls: list[Any], tool_dict: dict[str, Any], parallel: bool = True ) -> list[dict[str, Any]]: """Execute multiple tools 
sequentially or in parallel.""" if parallel: return await asyncio.gather( *[_execute_single_tool(call, tool_dict) for call in tool_calls] ) else: return [ await _execute_single_tool(call, tool_dict) for call in tool_calls ] ================================================ FILE: autonomous-coding/.gitignore ================================================ # Agent-generated output directories generations/ # Log files logs/ ================================================ FILE: autonomous-coding/README.md ================================================ # Autonomous Coding Agent Demo A minimal harness demonstrating long-running autonomous coding with the Claude Agent SDK. This demo implements a two-agent pattern (initializer + coding agent) that can build complete applications over multiple sessions. ## Prerequisites **Required:** Install the latest versions of both Claude Code and the Claude Agent SDK: ```bash # Install Claude Code CLI (latest version required) npm install -g @anthropic-ai/claude-code # Install Python dependencies pip install -r requirements.txt ``` Verify your installations: ```bash claude --version # Should be latest version pip show claude-code-sdk # Check SDK is installed ``` **API Key:** Set your Anthropic API key: ```bash export ANTHROPIC_API_KEY='your-api-key-here' ``` ## Quick Start ```bash python autonomous_agent_demo.py --project-dir ./my_project ``` For testing with limited iterations: ```bash python autonomous_agent_demo.py --project-dir ./my_project --max-iterations 3 ``` ## Important Timing Expectations > **Warning: This demo takes a long time to run!** - **First session (initialization):** The agent generates a `feature_list.json` with 200 test cases. This takes several minutes and may appear to hang - this is normal. The agent is writing out all the features. - **Subsequent sessions:** Each coding iteration can take **5-15 minutes** depending on complexity. 
- **Full app:** Building all 200 features typically requires **many hours** of total runtime across multiple sessions. **Tip:** The 200 features parameter in the prompts is designed for comprehensive coverage. If you want faster demos, you can modify `prompts/initializer_prompt.md` to reduce the feature count (e.g., 20-50 features for a quicker demo). ## How It Works ### Two-Agent Pattern 1. **Initializer Agent (Session 1):** Reads `app_spec.txt`, creates `feature_list.json` with 200 test cases, sets up project structure, and initializes git. 2. **Coding Agent (Sessions 2+):** Picks up where the previous session left off, implements features one by one, and marks them as passing in `feature_list.json`. ### Session Management - Each session runs with a fresh context window - Progress is persisted via `feature_list.json` and git commits - The agent auto-continues between sessions (3 second delay) - Press `Ctrl+C` to pause; run the same command to resume ## Security Model This demo uses a defense-in-depth security approach (see `security.py` and `client.py`): 1. **OS-level Sandbox:** Bash commands run in an isolated environment 2. **Filesystem Restrictions:** File operations restricted to the project directory only 3. **Bash Allowlist:** Only specific commands are permitted: - File inspection: `ls`, `cat`, `head`, `tail`, `wc`, `grep` - Node.js: `npm`, `node` - Version control: `git` - Process management: `ps`, `lsof`, `sleep`, `pkill` (dev processes only) Commands not in the allowlist are blocked by the security hook. 
## Project Structure ``` autonomous-coding/ ├── autonomous_agent_demo.py # Main entry point ├── agent.py # Agent session logic ├── client.py # Claude SDK client configuration ├── security.py # Bash command allowlist and validation ├── progress.py # Progress tracking utilities ├── prompts.py # Prompt loading utilities ├── prompts/ │ ├── app_spec.txt # Application specification │ ├── initializer_prompt.md # First session prompt │ └── coding_prompt.md # Continuation session prompt └── requirements.txt # Python dependencies ``` ## Generated Project Structure After running, your project directory will contain: ``` my_project/ ├── feature_list.json # Test cases (source of truth) ├── app_spec.txt # Copied specification ├── init.sh # Environment setup script ├── claude-progress.txt # Session progress notes ├── .claude_settings.json # Security settings └── [application files] # Generated application code ``` ## Running the Generated Application After the agent completes (or pauses), you can run the generated application: ```bash cd generations/my_project # Run the setup script created by the agent ./init.sh # Or manually (typical for Node.js apps): npm install npm run dev ``` The application will typically be available at `http://localhost:3000` or similar (check the agent's output or `init.sh` for the exact URL). ## Command Line Options | Option | Description | Default | |--------|-------------|---------| | `--project-dir` | Directory for the project | `./autonomous_demo_project` | | `--max-iterations` | Max agent iterations | Unlimited | | `--model` | Claude model to use | `claude-sonnet-4-5-20250929` | ## Customization ### Changing the Application Edit `prompts/app_spec.txt` to specify a different application to build. ### Adjusting Feature Count Edit `prompts/initializer_prompt.md` and change the "200 features" requirement to a smaller number for faster demos. ### Modifying Allowed Commands Edit `security.py` to add or remove commands from `ALLOWED_COMMANDS`. 
async def run_agent_session(
    client: ClaudeSDKClient,
    message: str,
    project_dir: Path,
) -> tuple[str, str]:
    """
    Run a single agent session using Claude Agent SDK.

    Sends ``message`` to the client, then streams the response while echoing
    assistant text, tool invocations and tool results to stdout. Messages and
    content blocks are dispatched on their class *name*
    (``type(obj).__name__``), not on imported SDK types.

    Args:
        client: Claude SDK client
        message: The prompt to send
        project_dir: Project directory path (not read by this function)

    Returns:
        (status, response_text) where status is:
        - "continue" if the session finished; response_text is the
          concatenated assistant text
        - "error" if an exception was raised; response_text is str(exception)
    """
    print("Sending prompt to Claude Agent SDK...\n")

    try:
        # Send the query
        await client.query(message)

        # Collect response text and show tool use
        response_text = ""
        async for msg in client.receive_response():
            msg_type = type(msg).__name__

            # Handle AssistantMessage (text and tool use)
            if msg_type == "AssistantMessage" and hasattr(msg, "content"):
                for block in msg.content:
                    block_type = type(block).__name__

                    if block_type == "TextBlock" and hasattr(block, "text"):
                        response_text += block.text
                        print(block.text, end="", flush=True)
                    elif block_type == "ToolUseBlock" and hasattr(block, "name"):
                        print(f"\n[Tool: {block.name}]", flush=True)
                        if hasattr(block, "input"):
                            # Keep console output readable: cap at 200 chars.
                            input_str = str(block.input)
                            if len(input_str) > 200:
                                print(f" Input: {input_str[:200]}...", flush=True)
                            else:
                                print(f" Input: {input_str}", flush=True)

            # Handle UserMessage (tool results)
            elif msg_type == "UserMessage" and hasattr(msg, "content"):
                for block in msg.content:
                    block_type = type(block).__name__

                    if block_type == "ToolResultBlock":
                        result_content = getattr(block, "content", "")
                        is_error = getattr(block, "is_error", False)

                        # Check if command was blocked by security hook
                        # (substring heuristic on the result text).
                        if "blocked" in str(result_content).lower():
                            print(f" [BLOCKED] {result_content}", flush=True)
                        elif is_error:
                            # Show errors (truncated)
                            error_str = str(result_content)[:500]
                            print(f" [Error] {error_str}", flush=True)
                        else:
                            # Tool succeeded - just show brief confirmation
                            print(" [Done]", flush=True)

        print("\n" + "-" * 70 + "\n")
        return "continue", response_text

    except Exception as e:
        # Any failure ends this session; the caller decides whether to retry.
        print(f"Error during agent session: {e}")
        return "error", str(e)


async def run_autonomous_agent(
    project_dir: Path,
    model: str,
    max_iterations: Optional[int] = None,
) -> None:
    """
    Run the autonomous agent loop.

    Each iteration creates a fresh client and runs one session. The
    initializer prompt is used for as long as ``feature_list.json`` does not
    exist in ``project_dir``; once it exists, the coding prompt is used.

    Args:
        project_dir: Directory for the project (created if missing)
        model: Claude model to use
        max_iterations: Maximum number of iterations (None for unlimited)
    """
    print("\n" + "=" * 70)
    print(" AUTONOMOUS CODING AGENT DEMO")
    print("=" * 70)
    print(f"\nProject directory: {project_dir}")
    print(f"Model: {model}")
    if max_iterations:
        print(f"Max iterations: {max_iterations}")
    else:
        print("Max iterations: Unlimited (will run until completion)")
    print()

    # Create project directory
    project_dir.mkdir(parents=True, exist_ok=True)

    # Check if this is a fresh start or continuation
    tests_file = project_dir / "feature_list.json"
    is_first_run = not tests_file.exists()

    if is_first_run:
        print("Fresh start - will use initializer agent")
        print()
        print("=" * 70)
        print(" NOTE: First session takes 10-20+ minutes!")
        print(" The agent is generating 200 detailed test cases.")
        print(" This may appear to hang - it's working. Watch for [Tool: ...] output.")
        print("=" * 70)
        print()
        # Copy the app spec into the project directory for the agent to read
        copy_spec_to_project(project_dir)
    else:
        print("Continuing existing project")
        print_progress_summary(project_dir)

    # Main loop
    iteration = 0

    while True:
        iteration += 1

        # Check max iterations
        if max_iterations and iteration > max_iterations:
            print(f"\nReached max iterations ({max_iterations})")
            print("To continue, run the script again without --max-iterations")
            break

        # Print session header
        print_session_header(iteration, is_first_run)

        # Create client (fresh context)
        client = create_client(project_dir, model)

        # Choose prompt based on session type
        if is_first_run:
            prompt = get_initializer_prompt()
        else:
            prompt = get_coding_prompt()

        # Run session with async context manager
        async with client:
            status, _response = await run_agent_session(client, prompt, project_dir)

        # Leave initializer mode only once the feature list actually exists.
        # This is the same test used at startup, so an initializer session that
        # failed before writing feature_list.json is retried with the
        # initializer prompt instead of silently switching to the coding prompt.
        if is_first_run:
            is_first_run = not tests_file.exists()

        # Handle status
        if status == "continue":
            print(f"\nAgent will auto-continue in {AUTO_CONTINUE_DELAY_SECONDS}s...")
            print_progress_summary(project_dir)
            await asyncio.sleep(AUTO_CONTINUE_DELAY_SECONDS)

        elif status == "error":
            print("\nSession encountered an error")
            print("Will retry with a fresh session...")
            await asyncio.sleep(AUTO_CONTINUE_DELAY_SECONDS)

        # Small delay between sessions
        if max_iterations is None or iteration < max_iterations:
            print("\nPreparing next session...\n")
            await asyncio.sleep(1)

    # Final summary
    print("\n" + "=" * 70)
    print(" SESSION COMPLETE")
    print("=" * 70)
    print(f"\nProject directory: {project_dir}")
    print_progress_summary(project_dir)

    # Print instructions for running the generated application
    print("\n" + "-" * 70)
    print(" TO RUN THE GENERATED APPLICATION:")
    print("-" * 70)
    print(f"\n cd {project_dir.resolve()}")
    print(" ./init.sh # Run the setup script")
    print(" # Or manually:")
    print(" npm install && npm run dev")
    print("\n Then open http://localhost:3000 (or check init.sh for the URL)")
    print("-" * 70)

    print("\nDone!")
def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    Returns:
        Namespace with ``project_dir`` (Path), ``max_iterations`` (int or
        None) and ``model`` (str, defaulting to ``DEFAULT_MODEL``).
    """
    parser = argparse.ArgumentParser(
        description="Autonomous Coding Agent Demo - Long-running agent harness",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Start fresh project
  python autonomous_agent_demo.py --project-dir ./claude_clone

  # Use a specific model
  python autonomous_agent_demo.py --project-dir ./claude_clone --model claude-sonnet-4-5-20250929

  # Limit iterations for testing
  python autonomous_agent_demo.py --project-dir ./claude_clone --max-iterations 5

  # Continue existing project
  python autonomous_agent_demo.py --project-dir ./claude_clone

Environment Variables:
  ANTHROPIC_API_KEY    Your Anthropic API key (required)
        """,
    )

    parser.add_argument(
        "--project-dir",
        type=Path,
        default=Path("./autonomous_demo_project"),
        help="Directory for the project (default: generations/autonomous_demo_project). Relative paths automatically placed in generations/ directory.",
    )

    parser.add_argument(
        "--max-iterations",
        type=int,
        default=None,
        help="Maximum number of agent iterations (default: unlimited)",
    )

    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Claude model to use (default: {DEFAULT_MODEL})",
    )

    return parser.parse_args()


def _place_under_generations(project_dir: Path) -> Path:
    """Return ``project_dir`` relocated under ``generations/`` when needed.

    Absolute paths, and relative paths whose first component is already
    ``generations`` (followed by at least one more component), are returned
    unchanged. Every other relative path is prefixed with ``generations/``.
    """
    if project_dir.is_absolute():
        return project_dir
    # Compare path components rather than the string form: str(Path) uses the
    # platform separator, so a "generations/" prefix test never matches on
    # Windows and the directory would be nested a second time.
    parts = project_dir.parts
    if len(parts) >= 2 and parts[0] == "generations":
        return project_dir
    return Path("generations") / project_dir


def main() -> None:
    """Main entry point.

    Parses arguments, verifies that ``ANTHROPIC_API_KEY`` is set (printing
    instructions and returning if not), places the project directory under
    ``generations/`` and runs the agent loop. ``KeyboardInterrupt`` is reported
    and swallowed; any other exception is reported and re-raised.
    """
    args = parse_args()

    # Check for API key
    if not os.environ.get("ANTHROPIC_API_KEY"):
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        print("\nGet your API key from: https://console.anthropic.com/")
        print("\nThen set it:")
        print(" export ANTHROPIC_API_KEY='your-api-key-here'")
        return

    # Automatically place projects in generations/ directory unless already specified
    project_dir = _place_under_generations(args.project_dir)

    # Run the agent
    try:
        asyncio.run(
            run_autonomous_agent(
                project_dir=project_dir,
                model=args.model,
                max_iterations=args.max_iterations,
            )
        )
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        print("To resume, run the same command again")
    except Exception as e:
        print(f"\nFatal error: {e}")
        raise
""" import json import os from pathlib import Path from claude_code_sdk import ClaudeCodeOptions, ClaudeSDKClient from claude_code_sdk.types import HookMatcher from security import bash_security_hook # Puppeteer MCP tools for browser automation PUPPETEER_TOOLS = [ "mcp__puppeteer__puppeteer_navigate", "mcp__puppeteer__puppeteer_screenshot", "mcp__puppeteer__puppeteer_click", "mcp__puppeteer__puppeteer_fill", "mcp__puppeteer__puppeteer_select", "mcp__puppeteer__puppeteer_hover", "mcp__puppeteer__puppeteer_evaluate", ] # Built-in tools BUILTIN_TOOLS = [ "Read", "Write", "Edit", "Glob", "Grep", "Bash", ] def create_client(project_dir: Path, model: str) -> ClaudeSDKClient: """ Create a Claude Agent SDK client with multi-layered security. Args: project_dir: Directory for the project model: Claude model to use Returns: Configured ClaudeSDKClient Security layers (defense in depth): 1. Sandbox - OS-level bash command isolation prevents filesystem escape 2. Permissions - File operations restricted to project_dir only 3. 
Security hooks - Bash commands validated against an allowlist (see security.py for ALLOWED_COMMANDS) """ api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: raise ValueError( "ANTHROPIC_API_KEY environment variable not set.\n" "Get your API key from: https://console.anthropic.com/" ) # Create comprehensive security settings # Note: Using relative paths ("./**") restricts access to project directory # since cwd is set to project_dir security_settings = { "sandbox": {"enabled": True, "autoAllowBashIfSandboxed": True}, "permissions": { "defaultMode": "acceptEdits", # Auto-approve edits within allowed directories "allow": [ # Allow all file operations within the project directory "Read(./**)", "Write(./**)", "Edit(./**)", "Glob(./**)", "Grep(./**)", # Bash permission granted here, but actual commands are validated # by the bash_security_hook (see security.py for allowed commands) "Bash(*)", # Allow Puppeteer MCP tools for browser automation *PUPPETEER_TOOLS, ], }, } # Ensure project directory exists before creating settings file project_dir.mkdir(parents=True, exist_ok=True) # Write settings to a file in the project directory settings_file = project_dir / ".claude_settings.json" with open(settings_file, "w") as f: json.dump(security_settings, f, indent=2) print(f"Created security settings at {settings_file}") print(" - Sandbox enabled (OS-level bash isolation)") print(f" - Filesystem restricted to: {project_dir.resolve()}") print(" - Bash commands restricted to allowlist (see security.py)") print(" - MCP servers: puppeteer (browser automation)") print() return ClaudeSDKClient( options=ClaudeCodeOptions( model=model, system_prompt="You are an expert full-stack developer building a production-quality web application.", allowed_tools=[ *BUILTIN_TOOLS, *PUPPETEER_TOOLS, ], mcp_servers={ "puppeteer": {"command": "npx", "args": ["puppeteer-mcp-server"]} }, hooks={ "PreToolUse": [ HookMatcher(matcher="Bash", hooks=[bash_security_hook]), ], }, max_turns=1000, 
def count_passing_tests(project_dir: Path) -> tuple[int, int]:
    """
    Count passing and total tests in feature_list.json.

    The file is written by the agent, so its contents are not trusted: a
    missing, unreadable, undecodable or malformed file, or one whose top-level
    value is not a list, yields ``(0, 0)`` instead of raising.

    Args:
        project_dir: Directory containing feature_list.json

    Returns:
        (passing_count, total_count). ``total_count`` is the number of list
        entries; an entry counts as passing when it is a dict with a truthy
        ``"passes"`` value.
    """
    tests_file = project_dir / "feature_list.json"

    if not tests_file.exists():
        return 0, 0

    try:
        with open(tests_file, "r", encoding="utf-8") as f:
            tests = json.load(f)
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return 0, 0

    # Guard the shape: calling .get() on a non-dict entry (or iterating a
    # dict root, which yields string keys) would raise AttributeError.
    if not isinstance(tests, list):
        return 0, 0

    passing = sum(
        1 for test in tests if isinstance(test, dict) and test.get("passes", False)
    )
    return passing, len(tests)


def print_session_header(session_num: int, is_initializer: bool) -> None:
    """Print a formatted header for the session.

    Args:
        session_num: Session number shown in the header.
        is_initializer: Selects the "INITIALIZER" label; otherwise
            "CODING AGENT" is shown.
    """
    session_type = "INITIALIZER" if is_initializer else "CODING AGENT"

    print("\n" + "=" * 70)
    print(f" SESSION {session_num}: {session_type}")
    print("=" * 70)
    print()


def print_progress_summary(project_dir: Path) -> None:
    """Print a summary of current progress.

    Prints the passing/total counts with a percentage, or a placeholder line
    when ``count_passing_tests`` reports no tests.
    """
    passing, total = count_passing_tests(project_dir)

    if total > 0:
        percentage = (passing / total) * 100
        print(f"\nProgress: {passing}/{total} tests passing ({percentage:.1f}%)")
    else:
        print("\nProgress: feature_list.json not yet created")
The application should provide a clean, modern chat interface for interacting with Claude via the API, including features like conversation management, artifact rendering, project organization, multiple model selection, and advanced settings. The UI should closely match claude.ai's design using Tailwind CSS with a focus on excellent user experience and responsive design. You can use an API key located at /tmp/api-key for testing. You will not be allowed to read this file, but you can reference it in code. React with Vite Tailwind CSS (via CDN) React hooks and context React Router for navigation React Markdown for message rendering Syntax highlighting for code blocks Only launch on port {frontend_port} Node.js with Express SQLite with better-sqlite3 Claude API for chat completions Server-Sent Events for streaming responses RESTful endpoints SSE for real-time message streaming Integration with Claude API using Anthropic SDK - Repository includes .env with VITE_ANTHROPIC_API_KEY configured - Frontend dependencies pre-installed via pnpm - Backend code goes in /server directory - Install backend dependencies as needed - Clean, centered chat layout with message bubbles - Streaming message responses with typing indicator - Markdown rendering with proper formatting - Code blocks with syntax highlighting and copy button - LaTeX/math equation rendering - Image upload and display in messages - Multi-turn conversations with context - Message editing and regeneration - Stop generation button during streaming - Input field with auto-resize textarea - Character count and token estimation - Keyboard shortcuts (Enter to send, Shift+Enter for newline) - Artifact detection and rendering in side panel - Code artifact viewer with syntax highlighting - HTML/SVG preview with live rendering - React component preview - Mermaid diagram rendering - Text document artifacts - Artifact editing and re-prompting - Full-screen artifact view - Download artifact content - Artifact versioning and 
history - Create new conversations - Conversation list in sidebar - Rename conversations - Delete conversations - Search conversations by title/content - Pin important conversations - Archive conversations - Conversation folders/organization - Duplicate conversation - Export conversation (JSON, Markdown, PDF) - Conversation timestamps (created, last updated) - Unread message indicators - Create projects to group related conversations - Project knowledge base (upload documents) - Project-specific custom instructions - Share projects with team (mock feature) - Project settings and configuration - Move conversations between projects - Project templates - Project analytics (usage stats) - Model selector dropdown with the following models: - Claude Sonnet 4.5 (claude-sonnet-4-5-20250929) - default - Claude Haiku 4.5 (claude-haiku-4-5-20251001) - Claude Opus 4.1 (claude-opus-4-1-20250805) - Model capabilities display - Context window indicator - Model-specific pricing info (display only) - Switch models mid-conversation - Model comparison view - Global custom instructions - Project-specific custom instructions - Conversation-specific system prompts - Custom instruction templates - Preview how instructions affect responses - Theme selection (Light, Dark, Auto) - Font size adjustment - Message density (compact, comfortable, spacious) - Code theme selection - Language preferences - Accessibility options - Keyboard shortcuts reference - Data export options - Privacy settings - API key management - Temperature control slider - Max tokens adjustment - Top-p (nucleus sampling) control - System prompt override - Thinking/reasoning mode toggle - Multi-modal input (text + images) - Voice input (optional, mock UI) - Response suggestions - Related prompts - Conversation branching - Share conversation via link (read-only) - Export conversation formats - Conversation templates - Prompt library - Share artifacts - Team workspaces (mock UI) - Search across all conversations - Filter by 
project, date, model - Prompt library with categories - Example conversations - Quick actions menu - Command palette (Cmd/Ctrl+K) - Token usage display per message - Conversation cost estimation - Daily/monthly usage dashboard - Usage limits and warnings - API quota tracking - Welcome screen for new users - Feature tour highlights - Example prompts to get started - Quick tips and best practices - Keyboard shortcuts tutorial - Full keyboard navigation - Screen reader support - ARIA labels and roles - High contrast mode - Focus management - Reduced motion support - Mobile-first responsive layout - Touch-optimized interface - Collapsible sidebar on mobile - Swipe gestures for navigation - Adaptive artifact display - Progressive Web App (PWA) support - id, email, name, avatar_url - created_at, last_login - preferences (JSON: theme, font_size, etc.) - custom_instructions - id, user_id, name, description, color - custom_instructions, knowledge_base_path - created_at, updated_at - is_archived, is_pinned - id, user_id, project_id, title - model, created_at, updated_at, last_message_at - is_archived, is_pinned, is_deleted - settings (JSON: temperature, max_tokens, etc.) 
- token_count, message_count - id, conversation_id, role (user/assistant/system) - content, created_at, edited_at - tokens, finish_reason - images (JSON array of image data) - parent_message_id (for branching) - id, message_id, conversation_id - type (code/html/svg/react/mermaid/text) - title, identifier, language - content, version - created_at, updated_at - id, conversation_id, share_token - created_at, expires_at, view_count - is_public - id, user_id, title, description - prompt_template, category, tags (JSON) - is_public, usage_count - created_at, updated_at - id, user_id, project_id, name, parent_folder_id - created_at, position - id, folder_id, conversation_id - id, user_id, conversation_id, message_id - model, input_tokens, output_tokens - cost_estimate, created_at - id, user_id, key_name, api_key_hash - created_at, last_used_at - is_active - POST /api/auth/login - POST /api/auth/logout - GET /api/auth/me - PUT /api/auth/profile - GET /api/conversations - POST /api/conversations - GET /api/conversations/:id - PUT /api/conversations/:id - DELETE /api/conversations/:id - POST /api/conversations/:id/duplicate - POST /api/conversations/:id/export - PUT /api/conversations/:id/archive - PUT /api/conversations/:id/pin - POST /api/conversations/:id/branch - GET /api/conversations/:id/messages - POST /api/conversations/:id/messages - PUT /api/messages/:id - DELETE /api/messages/:id - POST /api/messages/:id/regenerate - GET /api/messages/stream (SSE endpoint) - GET /api/conversations/:id/artifacts - GET /api/artifacts/:id - PUT /api/artifacts/:id - DELETE /api/artifacts/:id - POST /api/artifacts/:id/fork - GET /api/artifacts/:id/versions - GET /api/projects - POST /api/projects - GET /api/projects/:id - PUT /api/projects/:id - DELETE /api/projects/:id - POST /api/projects/:id/knowledge - GET /api/projects/:id/conversations - PUT /api/projects/:id/settings - POST /api/conversations/:id/share - GET /api/share/:token - DELETE /api/share/:token - PUT 
/api/share/:token/settings - GET /api/prompts/library - POST /api/prompts/library - GET /api/prompts/:id - PUT /api/prompts/:id - DELETE /api/prompts/:id - GET /api/prompts/categories - GET /api/prompts/examples - GET /api/search/conversations?q=query - GET /api/search/messages?q=query - GET /api/search/artifacts?q=query - GET /api/search/prompts?q=query - GET /api/folders - POST /api/folders - PUT /api/folders/:id - DELETE /api/folders/:id - POST /api/folders/:id/items - DELETE /api/folders/:id/items/:conversationId - GET /api/usage/daily - GET /api/usage/monthly - GET /api/usage/by-model - GET /api/usage/conversations/:id - GET /api/settings - PUT /api/settings - GET /api/settings/custom-instructions - PUT /api/settings/custom-instructions - POST /api/claude/chat (proxy to Claude API) - POST /api/claude/chat/stream (streaming proxy) - GET /api/claude/models - POST /api/claude/images/upload - Three-column layout: sidebar (conversations), main (chat), panel (artifacts) - Collapsible sidebar with resize handle - Responsive breakpoints: mobile (single column), tablet (two column), desktop (three column) - Persistent header with project/model selector - Bottom input area with send button and options - New chat button (prominent) - Project selector dropdown - Search conversations input - Conversations list (grouped by date: Today, Yesterday, Previous 7 days, etc.) 
- Folder tree view (collapsible) - Settings gear icon at bottom - User profile at bottom - Conversation title (editable inline) - Model selector badge - Message history (scrollable) - Welcome screen for new conversations - Suggested prompts (empty state) - Input area with formatting toolbar - Attachment button for images - Send button with loading state - Stop generation button - Artifact header with title and type badge - Code editor or preview pane - Tabs for multiple artifacts - Full-screen toggle - Download button - Edit/Re-prompt button - Version selector - Close panel button - Settings modal (tabbed interface) - Share conversation modal - Export options modal - Project settings modal - Prompt library modal - Command palette overlay - Keyboard shortcuts reference - Primary: Orange/amber accent (#CC785C claude-style) - Background: White (light mode), Dark gray (#1A1A1A dark mode) - Surface: Light gray (#F5F5F5 light), Darker gray (#2A2A2A dark) - Text: Near black (#1A1A1A light), Off-white (#E5E5E5 dark) - Borders: Light gray (#E5E5E5 light), Dark gray (#404040 dark) - Code blocks: Monaco editor theme - Sans-serif system font stack (Inter, SF Pro, Roboto, system-ui) - Headings: font-semibold - Body: font-normal, leading-relaxed - Code: Monospace (JetBrains Mono, Consolas, Monaco) - Message text: text-base (16px), comfortable line-height - User messages: Right-aligned, subtle background - Assistant messages: Left-aligned, no background - Markdown formatting with proper spacing - Inline code with bg-gray-100 background - Code blocks with syntax highlighting - Copy button on code blocks - Primary: Orange/amber background, white text, rounded - Secondary: Border style with hover fill - Icon buttons: Square with hover background - Disabled state: Reduced opacity, no pointer events - Rounded borders with focus ring - Textarea auto-resize - Placeholder text in gray - Error states in red - Character counter - Subtle border or shadow - Rounded corners (8px) - Padding: 
p-4 to p-6 - Hover state: slight shadow increase - Smooth transitions (150-300ms) - Fade in for new messages - Slide in for sidebar - Typing indicator animation - Loading spinner for generation - Skeleton loaders for content 1. User types message in input field 2. Optional: Attach images via button 3. Click send or press Enter 4. Message appears in chat immediately 5. Typing indicator shows while waiting 6. Response streams in word by word 7. Code blocks render with syntax highlighting 8. Artifacts detected and rendered in side panel 9. Message complete, enable regenerate option 1. Assistant generates artifact in response 2. Artifact panel slides in from right 3. Content renders (code with highlighting or live preview) 4. User can edit artifact inline 5. "Re-prompt" button to iterate with Claude 6. Download or copy artifact content 7. Full-screen mode for detailed work 8. Close panel to return to chat focus 1. Click "New Chat" to start fresh conversation 2. Conversations auto-save with first message 3. Auto-generate title from first exchange 4. Click title to rename inline 5. Drag conversations into folders 6. Right-click for context menu (pin, archive, delete, export) 7. Search filters conversations in real-time 8. 
Click conversation to switch context Setup Project Foundation and Database - Initialize Express server with SQLite database - Set up Claude API client with streaming support - Create database schema with migrations - Implement authentication endpoints - Set up basic CORS and middleware - Create health check endpoint Build Core Chat Interface - Create main layout with sidebar and chat area - Implement message display with markdown rendering - Add streaming message support with SSE - Build input area with auto-resize textarea - Add code block syntax highlighting - Implement stop generation functionality - Add typing indicators and loading states Conversation Management - Create conversation list in sidebar - Implement new conversation creation - Add conversation switching - Build conversation rename functionality - Implement delete with confirmation - Add conversation search - Create conversation grouping by date Artifacts System - Build artifact detection from Claude responses - Create artifact rendering panel - Implement code artifact viewer - Add HTML/SVG live preview - Build artifact editing interface - Add artifact versioning - Implement full-screen artifact view Projects and Organization - Create projects CRUD endpoints - Build project selector UI - Implement project-specific custom instructions - Add folder system for conversations - Create drag-and-drop organization - Build project settings panel Advanced Features - Add model selection dropdown - Implement temperature and parameter controls - Build image upload functionality - Create message editing and regeneration - Add conversation branching - Implement export functionality Settings and Customization - Build settings modal with tabs - Implement theme switching (light/dark) - Add custom instructions management - Create keyboard shortcuts - Build prompt library - Add usage tracking dashboard Sharing and Collaboration - Implement conversation sharing with tokens - Create public share view - Add export to 
multiple formats - Build prompt templates - Create example conversations Polish and Optimization - Optimize for mobile responsiveness - Add command palette (Cmd+K) - Implement comprehensive keyboard navigation - Add onboarding flow - Create accessibility improvements - Performance optimization and caching - Streaming chat responses work smoothly - Artifact detection and rendering accurate - Conversation management intuitive and reliable - Project organization clear and useful - Image upload and display working - All CRUD operations functional - Interface matches claude.ai design language - Responsive on all device sizes - Smooth animations and transitions - Fast response times and minimal lag - Intuitive navigation and workflows - Clear feedback for all actions - Clean, maintainable code structure - Proper error handling throughout - Secure API key management - Optimized database queries - Efficient streaming implementation - Comprehensive testing coverage - Consistent with claude.ai visual design - Beautiful typography and spacing - Smooth animations and micro-interactions - Excellent contrast and accessibility - Professional, polished appearance - Dark mode fully implemented ================================================ FILE: autonomous-coding/prompts/coding_prompt.md ================================================ ## YOUR ROLE - CODING AGENT You are continuing work on a long-running autonomous development task. This is a FRESH context window - you have no memory of previous sessions. ### STEP 1: GET YOUR BEARINGS (MANDATORY) Start by orienting yourself: ```bash # 1. See your working directory pwd # 2. List files to understand project structure ls -la # 3. Read the project specification to understand what you're building cat app_spec.txt # 4. Read the feature list to see all work cat feature_list.json | head -50 # 5. Read progress notes from previous sessions cat claude-progress.txt # 6. Check recent git history git log --oneline -20 # 7. 
Count remaining tests cat feature_list.json | grep '"passes": false' | wc -l ``` Understanding the `app_spec.txt` is critical - it contains the full requirements for the application you're building. ### STEP 2: START SERVERS (IF NOT RUNNING) If `init.sh` exists, run it: ```bash chmod +x init.sh ./init.sh ``` Otherwise, start servers manually and document the process. ### STEP 3: VERIFICATION TEST (CRITICAL!) **MANDATORY BEFORE NEW WORK:** The previous session may have introduced bugs. Before implementing anything new, you MUST run verification tests. Run 1-2 of the feature tests marked as `"passes": true` that are most core to the app's functionality to verify they still work. For example, if this were a chat app, you should perform a test that logs into the app, sends a message, and gets a response. **If you find ANY issues (functional or visual):** - Mark that feature as "passes": false immediately - Add issues to a list - Fix all issues BEFORE moving to new features - This includes UI bugs like: * White-on-white text or poor contrast * Random characters displayed * Incorrect timestamps * Layout issues or overflow * Buttons too close together * Missing hover states * Console errors ### STEP 4: CHOOSE ONE FEATURE TO IMPLEMENT Look at feature_list.json and find the highest-priority feature with "passes": false. Focus on completing one feature perfectly and completing its testing steps in this session before moving on to other features. It's ok if you only complete one feature in this session, as there will be more sessions later that continue to make progress. ### STEP 5: IMPLEMENT THE FEATURE Implement the chosen feature thoroughly: 1. Write the code (frontend and/or backend as needed) 2. Test manually using browser automation (see Step 6) 3. Fix any issues discovered 4. Verify the feature works end-to-end ### STEP 6: VERIFY WITH BROWSER AUTOMATION **CRITICAL:** You MUST verify features through the actual UI. 
Use browser automation tools: - Navigate to the app in a real browser - Interact like a human user (click, type, scroll) - Take screenshots at each step - Verify both functionality AND visual appearance **DO:** - Test through the UI with clicks and keyboard input - Take screenshots to verify visual appearance - Check for console errors in browser - Verify complete user workflows end-to-end **DON'T:** - Only test with curl commands (backend testing alone is insufficient) - Use JavaScript evaluation to bypass UI (no shortcuts) - Skip visual verification - Mark tests passing without thorough verification ### STEP 7: UPDATE feature_list.json (CAREFULLY!) **YOU CAN ONLY MODIFY ONE FIELD: "passes"** After thorough verification, change: ```json "passes": false ``` to: ```json "passes": true ``` **NEVER:** - Remove tests - Edit test descriptions - Modify test steps - Combine or consolidate tests - Reorder tests **ONLY CHANGE "passes" FIELD AFTER VERIFICATION WITH SCREENSHOTS.** ### STEP 8: COMMIT YOUR PROGRESS Make a descriptive git commit: ```bash git add . git commit -m "Implement [feature name] - verified end-to-end - Added [specific changes] - Tested with browser automation - Updated feature_list.json: marked test #X as passing - Screenshots in verification/ directory " ``` ### STEP 9: UPDATE PROGRESS NOTES Update `claude-progress.txt` with: - What you accomplished this session - Which test(s) you completed - Any issues discovered or fixed - What should be worked on next - Current completion status (e.g., "45/200 tests passing") ### STEP 10: END SESSION CLEANLY Before context fills up: 1. Commit all working code 2. Update claude-progress.txt 3. Update feature_list.json if tests verified 4. Ensure no uncommitted changes 5. 
Leave app in working state (no broken features) --- ## TESTING REQUIREMENTS **ALL testing must use browser automation tools.** Available tools: - puppeteer_navigate - Start browser and go to URL - puppeteer_screenshot - Capture screenshot - puppeteer_click - Click elements - puppeteer_fill - Fill form inputs - puppeteer_evaluate - Execute JavaScript (use sparingly, only for debugging) Test like a human user with mouse and keyboard. Don't take shortcuts by using JavaScript evaluation. Don't use the puppeteer "active tab" tool. --- ## IMPORTANT REMINDERS **Your Goal:** Production-quality application with all 200+ tests passing **This Session's Goal:** Complete at least one feature perfectly **Priority:** Fix broken tests before implementing new features **Quality Bar:** - Zero console errors - Polished UI matching the design specified in app_spec.txt - All features work end-to-end through the UI - Fast, responsive, professional **You have unlimited time.** Take as long as needed to get it right. The most important thing is that you leave the code base in a clean state before terminating the session (Step 10). --- Begin by running Step 1 (Get Your Bearings). ================================================ FILE: autonomous-coding/prompts/initializer_prompt.md ================================================ ## YOUR ROLE - INITIALIZER AGENT (Session 1 of Many) You are the FIRST agent in a long-running autonomous development process. Your job is to set up the foundation for all future coding agents. ### FIRST: Read the Project Specification Start by reading `app_spec.txt` in your working directory. This file contains the complete specification for what you need to build. Read it carefully before proceeding. ### CRITICAL FIRST TASK: Create feature_list.json Based on `app_spec.txt`, create a file called `feature_list.json` with 200 detailed end-to-end test cases. This file is the single source of truth for what needs to be built. 
**Format:** ```json [ { "category": "functional", "description": "Brief description of the feature and what this test verifies", "steps": [ "Step 1: Navigate to relevant page", "Step 2: Perform action", "Step 3: Verify expected result" ], "passes": false }, { "category": "style", "description": "Brief description of UI/UX requirement", "steps": [ "Step 1: Navigate to page", "Step 2: Take screenshot", "Step 3: Verify visual requirements" ], "passes": false } ] ``` **Requirements for feature_list.json:** - Minimum 200 features total with testing steps for each - Both "functional" and "style" categories - Mix of narrow tests (2-5 steps) and comprehensive tests (10+ steps) - At least 25 tests MUST have 10+ steps each - Order features by priority: fundamental features first - ALL tests start with "passes": false - Cover every feature in the spec exhaustively **CRITICAL INSTRUCTION:** IT IS CATASTROPHIC TO REMOVE OR EDIT FEATURES IN FUTURE SESSIONS. Features can ONLY be marked as passing (change "passes": false to "passes": true). Never remove features, never edit descriptions, never modify testing steps. This ensures no functionality is missed. ### SECOND TASK: Create init.sh Create a script called `init.sh` that future agents can use to quickly set up and run the development environment. The script should: 1. Install any required dependencies 2. Start any necessary servers or services 3. Print helpful information about how to access the running application Base the script on the technology stack specified in `app_spec.txt`. ### THIRD TASK: Initialize Git Create a git repository and make your first commit with: - feature_list.json (complete with all 200+ features) - init.sh (environment setup script) - README.md (project overview and setup instructions) Commit message: "Initial setup: feature_list.json, init.sh, and project structure" ### FOURTH TASK: Create Project Structure Set up the basic project structure based on what's specified in `app_spec.txt`. 
This typically includes directories for frontend, backend, and any other components mentioned in the spec. ### OPTIONAL: Start Implementation If you have time remaining in this session, you may begin implementing the highest-priority features from feature_list.json. Remember: - Work on ONE feature at a time - Test thoroughly before marking "passes": true - Commit your progress before session ends ### ENDING THIS SESSION Before your context fills up: 1. Commit all work with descriptive messages 2. Create `claude-progress.txt` with a summary of what you accomplished 3. Ensure feature_list.json is complete and saved 4. Leave the environment in a clean, working state The next agent will continue from here with a fresh context window. --- **Remember:** You have unlimited time across many sessions. Focus on quality over speed. Production-ready is the goal. ================================================ FILE: autonomous-coding/prompts.py ================================================ """ Prompt Loading Utilities ======================== Functions for loading prompt templates from the prompts directory. 
import shutil
from pathlib import Path

# Directory holding the prompt templates and the app spec, located next to this module.
PROMPTS_DIR = Path(__file__).parent / "prompts"


def load_prompt(name: str) -> str:
    """
    Load a prompt template from the prompts directory.

    Args:
        name: Template name without the ".md" extension

    Returns:
        The template text

    Raises:
        OSError: If the template file cannot be read (e.g. it does not exist)
    """
    prompt_path = PROMPTS_DIR / f"{name}.md"
    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # platform's locale encoding.
    return prompt_path.read_text(encoding="utf-8")


def get_initializer_prompt() -> str:
    """Load the initializer prompt (prompts/initializer_prompt.md)."""
    return load_prompt("initializer_prompt")


def get_coding_prompt() -> str:
    """Load the coding agent prompt (prompts/coding_prompt.md)."""
    return load_prompt("coding_prompt")


def copy_spec_to_project(project_dir: Path) -> None:
    """
    Copy the app spec file into the project directory for the agent to read.

    An app_spec.txt that already exists in the project directory is left
    untouched, so calling this repeatedly never overwrites it.

    Args:
        project_dir: Directory that should receive app_spec.txt
    """
    spec_source = PROMPTS_DIR / "app_spec.txt"
    spec_dest = project_dir / "app_spec.txt"
    if not spec_dest.exists():
        shutil.copy(spec_source, spec_dest)
        print("Copied app_spec.txt to project directory")
""" import os import shlex # Allowed commands for development tasks # Minimal set needed for the autonomous coding demo ALLOWED_COMMANDS = { # File inspection "ls", "cat", "head", "tail", "wc", "grep", # File operations (agent uses SDK tools for most file ops, but cp/mkdir needed occasionally) "cp", "mkdir", "chmod", # For making scripts executable; validated separately # Directory "pwd", # Node.js development "npm", "node", # Version control "git", # Process management "ps", "lsof", "sleep", "pkill", # For killing dev servers; validated separately # Script execution "init.sh", # Init scripts; validated separately } # Commands that need additional validation even when in the allowlist COMMANDS_NEEDING_EXTRA_VALIDATION = {"pkill", "chmod", "init.sh"} def split_command_segments(command_string: str) -> list[str]: """ Split a compound command into individual command segments. Handles command chaining (&&, ||, ;) but not pipes (those are single commands). Args: command_string: The full shell command Returns: List of individual command segments """ import re # Split on && and || while preserving the ability to handle each segment # This regex splits on && or || that aren't inside quotes segments = re.split(r"\s*(?:&&|\|\|)\s*", command_string) # Further split on semicolons result = [] for segment in segments: sub_segments = re.split(r'(? list[str]: """ Extract command names from a shell command string. Handles pipes, command chaining (&&, ||, ;), and subshells. Returns the base command names (without paths). Args: command_string: The full shell command Returns: List of command names found in the string """ commands = [] # shlex doesn't treat ; as a separator, so we need to pre-process import re # Split on semicolons that aren't inside quotes (simple heuristic) # This handles common cases like "echo hello; ls" segments = re.split(r'(? tuple[bool, str]: """ Validate pkill commands - only allow killing dev-related processes. 
def validate_pkill_command(command_string: str) -> tuple[bool, str]:
    """
    Validate pkill commands - only allow killing dev-related processes.

    Uses shlex to parse the command, avoiding regex bypass vulnerabilities.
    The first token is taken to be the pkill invocation itself and is not
    inspected. Of the remaining tokens, those starting with "-" are treated
    as flags and ignored; the last non-flag token is the target. When the
    target contains a space (e.g. the quoted pattern given to ``pkill -f``),
    only its first word is compared against the allowlist.

    Args:
        command_string: The pkill command segment to validate

    Returns:
        Tuple of (is_allowed, reason_if_blocked)
    """
    # Allowed process names for pkill
    allowed_process_names = {
        "node",
        "npm",
        "npx",
        "vite",
        "next",
    }

    try:
        tokens = shlex.split(command_string)
    except ValueError:
        # Unbalanced quotes and similar: refuse rather than guess
        return False, "Could not parse pkill command"

    if not tokens:
        return False, "Empty pkill command"

    # Separate flags from arguments
    args = [token for token in tokens[1:] if not token.startswith("-")]

    if not args:
        return False, "pkill requires a process name"

    # The target is typically the last non-flag argument
    target = args[-1]

    # For -f flag (full command line match), extract the first word as process name
    # e.g., "pkill -f 'node server.js'" -> target is "node server.js", process is "node"
    if " " in target:
        words = target.split()
        if not words:
            # A quoted, whitespace-only argument (pkill ' ') has no first
            # word; block it instead of raising IndexError.
            return False, "pkill requires a process name"
        target = words[0]

    if target in allowed_process_names:
        return True, ""
    return False, f"pkill only allowed for dev processes: {allowed_process_names}"


def validate_chmod_command(command_string: str) -> tuple[bool, str]:
    """
    Validate chmod commands - only allow making files executable with +x.

    Accepted shape: ``chmod <mode> <file> [<file> ...]`` where the first token
    is exactly "chmod", no token starts with "-", and the mode is an optional
    run of u/g/o/a followed by "+x".

    Args:
        command_string: The chmod command segment to validate

    Returns:
        Tuple of (is_allowed, reason_if_blocked)
    """
    import re

    try:
        tokens = shlex.split(command_string)
    except ValueError:
        return False, "Could not parse chmod command"

    if not tokens or tokens[0] != "chmod":
        return False, "Not a chmod command"

    # Look for the mode argument
    # Valid modes: +x, u+x, a+x, etc. (anything ending with +x for execute permission)
    mode = None
    files = []

    for token in tokens[1:]:
        if token.startswith("-"):
            # Any dash-prefixed token is rejected outright. That covers flags
            # such as -R / --recursive and also removal modes such as "-x".
            return False, "chmod flags are not allowed"
        elif mode is None:
            # First non-flag token is the mode ...
            mode = token
        else:
            # ... every later one is a file operand
            files.append(token)

    if mode is None:
        return False, "chmod requires a mode"

    if not files:
        return False, "chmod requires at least one file"

    # Only allow +x variants (making files executable)
    # This matches: +x, u+x, g+x, o+x, a+x, ug+x, etc.
    if not re.match(r"^[ugoa]*\+x$", mode):
        return False, f"chmod only allowed with +x mode, got: {mode}"

    return True, ""
def validate_init_script(command_string: str) -> tuple[bool, str]:
    """
    Validate init.sh script execution.

    The command is allowed when its first token is exactly "./init.sh" or is
    any path ending in "/init.sh"; arguments after the script are not
    inspected. Invocations through an interpreter ("bash init.sh") are
    rejected because their first token is not the script.

    Args:
        command_string: The command segment to validate

    Returns:
        Tuple of (is_allowed, reason_if_blocked)
    """
    try:
        tokens = shlex.split(command_string)
    except ValueError:
        return False, "Could not parse init script command"

    if not tokens:
        return False, "Empty command"

    # The script is the first token (possibly followed by arguments)
    script = tokens[0]

    # Allow ./init.sh or paths ending in /init.sh
    if script == "./init.sh" or script.endswith("/init.sh"):
        return True, ""

    return False, f"Only ./init.sh is allowed, got: {script}"


def get_command_for_validation(cmd: str, segments: list[str]) -> str:
    """
    Find the first command segment that contains the given command.

    Args:
        cmd: The command name to find
        segments: List of command segments

    Returns:
        The first segment containing the command, or empty string if not found
    """
    for segment in segments:
        if cmd in extract_commands(segment):
            return segment

    return ""


async def bash_security_hook(input_data, tool_use_id=None, context=None):
    """
    Pre-tool-use hook that validates bash commands using an allowlist.

    Only commands in ALLOWED_COMMANDS are permitted. Commands listed in
    COMMANDS_NEEDING_EXTRA_VALIDATION are additionally checked by their
    dedicated validator, once for EVERY segment of the command string in
    which they appear.

    Args:
        input_data: Dict containing tool_name and tool_input
        tool_use_id: Optional tool use ID
        context: Optional context

    Returns:
        Empty dict to allow, or {"decision": "block", "reason": "..."} to block
    """
    if input_data.get("tool_name") != "Bash":
        return {}

    # Tolerate a missing or None tool_input instead of raising
    command = (input_data.get("tool_input") or {}).get("command", "")
    if not command:
        return {}

    # Extract all commands from the command string
    commands = extract_commands(command)

    if not commands:
        # Could not parse - fail safe by blocking
        return {
            "decision": "block",
            "reason": f"Could not parse command for security validation: {command}",
        }

    # Split into segments for per-command validation
    segments = split_command_segments(command)

    validators = {
        "pkill": validate_pkill_command,
        "chmod": validate_chmod_command,
        "init.sh": validate_init_script,
    }
    # Each sensitive command name is validated across all its segments once
    already_validated = set()

    # Check each command against the allowlist
    for cmd in commands:
        if cmd not in ALLOWED_COMMANDS:
            return {
                "decision": "block",
                "reason": f"Command '{cmd}' is not in the allowed commands list",
            }

        # Additional validation for sensitive commands
        if cmd not in COMMANDS_NEEDING_EXTRA_VALIDATION or cmd in already_validated:
            continue
        already_validated.add(cmd)

        validator = validators.get(cmd)
        if validator is None:
            continue

        # Validate every segment that uses this command, not just the first
        # one. Checking only the first let "pkill node && pkill bash" through,
        # because the second pkill was never looked at.
        matching_segments = [
            segment for segment in segments if cmd in extract_commands(segment)
        ]
        if not matching_segments:
            matching_segments = [command]  # Fallback to full command

        for segment in matching_segments:
            allowed, reason = validator(segment)
            if not allowed:
                return {"decision": "block", "reason": reason}

    return {}
Run with: python test_security.py """ import asyncio import sys from security import ( bash_security_hook, extract_commands, validate_chmod_command, validate_init_script, ) def test_hook(command: str, should_block: bool) -> bool: """Test a single command against the security hook.""" input_data = {"tool_name": "Bash", "tool_input": {"command": command}} result = asyncio.run(bash_security_hook(input_data)) was_blocked = result.get("decision") == "block" if was_blocked == should_block: status = "PASS" else: status = "FAIL" expected = "blocked" if should_block else "allowed" actual = "blocked" if was_blocked else "allowed" reason = result.get("reason", "") print(f" {status}: {command!r}") print(f" Expected: {expected}, Got: {actual}") if reason: print(f" Reason: {reason}") return False print(f" {status}: {command!r}") return True def test_extract_commands(): """Test the command extraction logic.""" print("\nTesting command extraction:\n") passed = 0 failed = 0 test_cases = [ ("ls -la", ["ls"]), ("npm install && npm run build", ["npm", "npm"]), ("cat file.txt | grep pattern", ["cat", "grep"]), ("/usr/bin/node script.js", ["node"]), ("VAR=value ls", ["ls"]), ("git status || git init", ["git", "git"]), ] for cmd, expected in test_cases: result = extract_commands(cmd) if result == expected: print(f" PASS: {cmd!r} -> {result}") passed += 1 else: print(f" FAIL: {cmd!r}") print(f" Expected: {expected}, Got: {result}") failed += 1 return passed, failed def test_validate_chmod(): """Test chmod command validation.""" print("\nTesting chmod validation:\n") passed = 0 failed = 0 # Test cases: (command, should_be_allowed, description) test_cases = [ # Allowed cases ("chmod +x init.sh", True, "basic +x"), ("chmod +x script.sh", True, "+x on any script"), ("chmod u+x init.sh", True, "user +x"), ("chmod a+x init.sh", True, "all +x"), ("chmod ug+x init.sh", True, "user+group +x"), ("chmod +x file1.sh file2.sh", True, "multiple files"), # Blocked cases ("chmod 777 init.sh", False, 
"numeric mode"), ("chmod 755 init.sh", False, "numeric mode 755"), ("chmod +w init.sh", False, "write permission"), ("chmod +r init.sh", False, "read permission"), ("chmod -x init.sh", False, "remove execute"), ("chmod -R +x dir/", False, "recursive flag"), ("chmod --recursive +x dir/", False, "long recursive flag"), ("chmod +x", False, "missing file"), ] for cmd, should_allow, description in test_cases: allowed, reason = validate_chmod_command(cmd) if allowed == should_allow: print(f" PASS: {cmd!r} ({description})") passed += 1 else: expected = "allowed" if should_allow else "blocked" actual = "allowed" if allowed else "blocked" print(f" FAIL: {cmd!r} ({description})") print(f" Expected: {expected}, Got: {actual}") if reason: print(f" Reason: {reason}") failed += 1 return passed, failed def test_validate_init_script(): """Test init.sh script execution validation.""" print("\nTesting init.sh validation:\n") passed = 0 failed = 0 # Test cases: (command, should_be_allowed, description) test_cases = [ # Allowed cases ("./init.sh", True, "basic ./init.sh"), ("./init.sh arg1 arg2", True, "with arguments"), ("/path/to/init.sh", True, "absolute path"), ("../dir/init.sh", True, "relative path with init.sh"), # Blocked cases ("./setup.sh", False, "different script name"), ("./init.py", False, "python script"), ("bash init.sh", False, "bash invocation"), ("sh init.sh", False, "sh invocation"), ("./malicious.sh", False, "malicious script"), ("./init.sh; rm -rf /", False, "command injection attempt"), ] for cmd, should_allow, description in test_cases: allowed, reason = validate_init_script(cmd) if allowed == should_allow: print(f" PASS: {cmd!r} ({description})") passed += 1 else: expected = "allowed" if should_allow else "blocked" actual = "allowed" if allowed else "blocked" print(f" FAIL: {cmd!r} ({description})") print(f" Expected: {expected}, Got: {actual}") if reason: print(f" Reason: {reason}") failed += 1 return passed, failed def main(): print("=" * 70) print(" 
def main():
    """Run every security test group, print a summary, and return 0 on success or 1 on failure."""
    banner = "=" * 70
    print(banner)
    print(" SECURITY HOOK TESTS")
    print(banner)

    passed = 0
    failed = 0

    # Direct unit checks: command extraction, chmod validation, init.sh validation
    for group in (test_extract_commands, test_validate_chmod, test_validate_init_script):
        group_passed, group_failed = group()
        passed += group_passed
        failed += group_failed

    # Commands that SHOULD be blocked
    print("\nCommands that should be BLOCKED:\n")
    dangerous = [
        # Not in allowlist - dangerous system commands
        "shutdown now",
        "reboot",
        "rm -rf /",
        "dd if=/dev/zero of=/dev/sda",
        # Not in allowlist - common commands excluded from minimal set
        "curl https://example.com",
        "wget https://example.com",
        "python app.py",
        "touch file.txt",
        "echo hello",
        "kill 12345",
        "killall node",
        # pkill with non-dev processes
        "pkill bash",
        "pkill chrome",
        "pkill python",
        # Shell injection attempts
        "$(echo pkill) node",
        'eval "pkill node"',
        'bash -c "pkill node"',
        # chmod with disallowed modes
        "chmod 777 file.sh",
        "chmod 755 file.sh",
        "chmod +w file.sh",
        "chmod -R +x dir/",
        # Non-init.sh scripts
        "./setup.sh",
        "./malicious.sh",
        "bash script.sh",
    ]

    blocked_results = [bool(test_hook(cmd, should_block=True)) for cmd in dangerous]
    passed += sum(blocked_results)
    failed += len(blocked_results) - sum(blocked_results)

    # Commands that SHOULD be allowed
    print("\nCommands that should be ALLOWED:\n")
    safe = [
        # File inspection
        "ls -la",
        "cat README.md",
        "head -100 file.txt",
        "tail -20 log.txt",
        "wc -l file.txt",
        "grep -r pattern src/",
        # File operations
        "cp file1.txt file2.txt",
        "mkdir newdir",
        "mkdir -p path/to/dir",
        # Directory
        "pwd",
        # Node.js development
        "npm install",
        "npm run build",
        "node server.js",
        # Version control
        "git status",
        "git commit -m 'test'",
        "git add . && git commit -m 'msg'",
        # Process management
        "ps aux",
        "lsof -i :3000",
        "sleep 2",
        # Allowed pkill patterns for dev servers
        "pkill node",
        "pkill npm",
        "pkill -f node",
        "pkill -f 'node server.js'",
        "pkill vite",
        # Chained commands
        "npm install && npm run build",
        "ls | grep test",
        # Full paths
        "/usr/local/bin/node app.js",
        # chmod +x (allowed)
        "chmod +x init.sh",
        "chmod +x script.sh",
        "chmod u+x init.sh",
        "chmod a+x init.sh",
        # init.sh execution (allowed)
        "./init.sh",
        "./init.sh --production",
        "/path/to/init.sh",
        # Combined chmod and init.sh
        "chmod +x init.sh && ./init.sh",
    ]

    allowed_results = [bool(test_hook(cmd, should_block=False)) for cmd in safe]
    passed += sum(allowed_results)
    failed += len(allowed_results) - sum(allowed_results)

    # Summary
    rule = "-" * 70
    print("\n" + rule)
    print(f" Results: {passed} passed, {failed} failed")
    print(rule)

    if failed != 0:
        print(f"\n {failed} TEST(S) FAILED")
        return 1

    print("\n ALL TESTS PASSED")
    return 0
&& git commit -m 'msg'", # Process management "ps aux", "lsof -i :3000", "sleep 2", # Allowed pkill patterns for dev servers "pkill node", "pkill npm", "pkill -f node", "pkill -f 'node server.js'", "pkill vite", # Chained commands "npm install && npm run build", "ls | grep test", # Full paths "/usr/local/bin/node app.js", # chmod +x (allowed) "chmod +x init.sh", "chmod +x script.sh", "chmod u+x init.sh", "chmod a+x init.sh", # init.sh execution (allowed) "./init.sh", "./init.sh --production", "/path/to/init.sh", # Combined chmod and init.sh "chmod +x init.sh && ./init.sh", ] for cmd in safe: if test_hook(cmd, should_block=False): passed += 1 else: failed += 1 # Summary print("\n" + "-" * 70) print(f" Results: {passed} passed, {failed} failed") print("-" * 70) if failed == 0: print("\n ALL TESTS PASSED") return 0 else: print(f"\n {failed} TEST(S) FAILED") return 1 if __name__ == "__main__": sys.exit(main()) ================================================ FILE: browser-use-demo/.dockerignore ================================================ **/__pycache__ **/*.pyc **/*.pyo **/*.pyd .Python *.egg-info/ .git/ .gitignore *.md .DS_Store tests/ *.log .vscode/ .idea/ *.swp *.swo *~ ================================================ FILE: browser-use-demo/.gitignore ================================================ # Python __pycache__/ *.py[cod] *$py.class *.so .Python env/ venv/ ENV/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # IDE .vscode/ .idea/ *.swp *.swo *~ # OS .DS_Store Thumbs.db # Logs *.log /tmp/ # Environment .env .anthropic/ # Test .pytest_cache/ .coverage htmlcov/ # Streamlit .streamlit/cache/ ================================================ FILE: browser-use-demo/CHANGELOG.md ================================================ # Modifications to Microsoft Playwright Source This file tracks all modifications made to files derived from or inspired by Microsoft Playwright source code. 
## Modified Files ### browser_use_demo/browser_tool_utils/browser_dom_script.js - **Date Modified**: 9/23/25 - **Original Source**: https://github.com/microsoft/playwright/blob/main/packages/injected/src/ariaSnapshot.ts - **Nature of Changes**: Adapted Playwright's accessibility tree generation for use with browser tools API. Implemented accessibility tree extraction with element reference tracking, visibility filtering, and YAML-formatted output. ### browser_use_demo/browser_tool_utils/browser_element_script.js - **Date Modified**: 9/23/25 - **Original Source**: Microsoft Playwright element interaction patterns - **Nature of Changes**: Implemented element finding and interaction logic inspired by Playwright's approach to reliable element targeting and coordinate calculation. ### browser_use_demo/tools/browser.py - **Date Modified**: 9/23/25 - **Original Source**: Microsoft Playwright click emulation implementation - **Nature of Changes**: Click emulation methods developed with reference to Playwright source code during debugging to ensure reliable mouse interactions. - **Date Modified**: 10/6/25 - **Nature of Changes**: - Fixed incorrect path to browser_tool_utils directory. Changed from `Path(__file__).parent / "browser_tool_utils"` to `Path(__file__).parent.parent / "browser_tool_utils"` to correctly locate JavaScript utility files. - Fixed missing `cdp_url` attribute initialization in `__init__` method to prevent AttributeError in cleanup method. - Fixed incorrect import path for browser_key_map. Changed from `.browser_tool_utils.browser_key_map` to `..browser_tool_utils.browser_key_map` to correctly import the KEY_MAP. - **Date Modified**: 10/14/25 - **Nature of Changes**: - Enhanced `_scroll` and `_scroll_to` methods to return screenshots after scrolling actions. Added 0.5s stabilization delay before taking screenshots to show the new viewport content. 
This provides visual feedback to the model after scroll actions, consistent with the navigate action behavior. - **Date Modified**: 12/19/25 - **Nature of Changes**: - Added `hover` action to move mouse cursor without clicking using Playwright's `mouse.move()` API. Useful for revealing tooltips, dropdown menus, or triggering hover states. - Added `execute_js` action to execute JavaScript code in page context using Playwright's `page.evaluate()` API. Returns the result of the last expression. - **Date Modified**: 1/18/26 - **Nature of Changes**: - Added clarifying comment in the `options` property explaining that this implementation uses fixed 1920x1080 dimensions with empirical coordinate correction, and directing users to the "Handle coordinate scaling" section in the computer use documentation for the recommended client-side downscaling approach. ================================================ FILE: browser-use-demo/Dockerfile ================================================ FROM docker.io/ubuntu:22.04 ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_PRIORITY=high # Install system dependencies RUN apt-get update && \ apt-get -y upgrade && \ apt-get -y install \ # UI Requirements xvfb \ xterm \ xdotool \ scrot \ imagemagick \ sudo \ mutter \ x11vnc \ # Python/pyenv reqs build-essential \ libssl-dev \ zlib1g-dev \ libbz2-dev \ libreadline-dev \ libsqlite3-dev \ curl \ git \ libncursesw5-dev \ xz-utils \ tk-dev \ libxml2-dev \ libxmlsec1-dev \ libffi-dev \ liblzma-dev \ # Network tools net-tools \ netcat \ # PPA req software-properties-common && \ # Browser and desktop apps sudo apt-get install -y --no-install-recommends \ x11-apps \ tint2 \ pcmanfm \ unzip \ # Playwright Chromium dependencies libnss3 \ libnspr4 \ libatk1.0-0 \ libatk-bridge2.0-0 \ libcups2 \ libatspi2.0-0 \ libxcomposite1 \ libxdamage1 \ libxrandr2 \ libgbm1 \ libxkbcommon0 \ libpango-1.0-0 \ libcairo2 \ libasound2 && \ apt-get clean # Install noVNC RUN git clone --branch v1.5.0 
https://github.com/novnc/noVNC.git /opt/noVNC && \ git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html # Setup user ENV USERNAME=browseruse ENV HOME=/home/$USERNAME RUN useradd -m -s /bin/bash -d $HOME $USERNAME RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers USER browseruse WORKDIR $HOME # Setup Python RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ cd ~/.pyenv && src/configure && make -C src && cd .. && \ echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \ echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \ echo 'eval "$(pyenv init -)"' >> ~/.bashrc ENV PYENV_ROOT="$HOME/.pyenv" ENV PATH="$PYENV_ROOT/bin:$PATH" ENV PYENV_VERSION=3.11.6 RUN eval "$(pyenv init -)" && \ pyenv install $PYENV_VERSION && \ pyenv global $PYENV_VERSION && \ pyenv rehash ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH" RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \ python -m pip config set global.disable-pip-version-check true # Copy requirements and install dependencies COPY --chown=$USERNAME:$USERNAME browser_use_demo/requirements.txt $HOME/browser_use_demo/requirements.txt RUN python -m pip install -r $HOME/browser_use_demo/requirements.txt # Install Playwright and Chromium RUN python -m playwright install chromium && \ python -m playwright install-deps chromium # Setup desktop environment & app COPY --chown=$USERNAME:$USERNAME image/ $HOME COPY --chown=$USERNAME:$USERNAME browser_use_demo/ $HOME/browser_use_demo/ # Display configuration - hardcoded values ENV DISPLAY_NUM=1 ENV HEIGHT=1080 ENV WIDTH=1920 ENTRYPOINT [ "./entrypoint.sh" ] ================================================ FILE: browser-use-demo/NOTICE ================================================ This software includes components from the following open source projects: Microsoft Playwright Source: 
https://github.com/microsoft/playwright License: Apache License 2.0 Copyright (c) Microsoft Corporation Modified files are marked with modification notices. See CHANGELOG.md for details. ================================================ FILE: browser-use-demo/README.md ================================================ # Browser Automation Demo with Claude A complete reference implementation for building browser automation with Claude using Playwright. This demo provides a containerized Streamlit interface showcasing how to give Claude the ability to navigate websites, interact with DOM elements, extract content, and fill forms. ## Overview This demo implements a custom browser tool that enables Claude to interact with web browsers. It provides: - **DOM access**: Read page structure with element references - **Navigation control**: Browse URLs and manage browser history - **Form manipulation**: Directly set form input values - **Text extraction**: Get all text content from pages - **Element targeting**: Interact with elements via ref or coordinate parameters - **Smart scrolling**: Scroll to specific elements or in specific directions - **Page search**: Find and highlight text on pages - **Visual capture**: Take screenshots and capture zoomed regions ### Advantages Over Coordinate-Based Automation - **Reliability**: Element-based targeting via the `ref` parameter works across different screen sizes and layouts, unlike pixel coordinates that break when windows resize - **Direct DOM manipulation**: Provides structured visibility into page elements and their properties, enabling precise interactions with dynamic content, hidden elements, and complex web applications - **Web-specific actions**: Built-in support for navigation, text extraction, and form completion ## Quick Start ### Prerequisites - Docker and Docker Compose installed on your system - Anthropic API key ### Setup 1. 
**Clone the repository**: ```bash git clone https://github.com/anthropics/claude-quickstarts.git cd claude-quickstarts/browser-use-demo ``` 2. **Configure environment**: ```bash cp .env.example .env # Edit .env file and add your ANTHROPIC_API_KEY ``` The display resolution is set to 1920x1080 (16:9) for optimal coordinate accuracy. - See `.env.example` for more options and coordinate scaling details ### Running with Docker Compose ```bash # For production use: docker-compose up --build # For development with file watching (auto-sync changes): docker-compose up --build --watch ``` ### Access the interfaces: - **Main UI**: http://localhost:8080 (Streamlit interface) - **NoVNC Browser View**: http://localhost:6080 (see the browser) - **VNC**: Connect any VNC client to localhost:5900 https://github.com/user-attachments/assets/4fb72078-6902-4b63-bcd1-5f2c4cd60582 ## Usage Examples Once the demo is running, try these prompts in the Streamlit interface: - "Navigate to news.ycombinator.com and tell me the top 3 stories" - "Go to google.com and search for 'Anthropic Claude'" - "Visit wikipedia.org and find information about artificial intelligence" - "Navigate to github.com and search for 'playwright'" Note that the current Playwright implementation hits CAPTCHAs when searching Google.com. To avoid this, we recommend that you specify the website in the prompt (e.g., navigate to Anthropic.com and search for x). ## Safety Considerations Browser automation poses unique risks that are distinct from standard API features or chat interfaces. These risks are heightened when using the tool to interact with the internet. To minimize risks, consider taking precautions such as: 1. Run the browser in an isolated virtual machine or container environment with minimal privileges to prevent direct system attacks or accidents. 2. Avoid giving the model access to sensitive data, such as account login information, to prevent information theft. 3.
Consider an allowlist of domains to reduce exposure to malicious content. 4. Ask a human to confirm decisions that may result in meaningful real-world consequences as well as any tasks requiring affirmative consent, such as accepting cookies, executing financial transactions, or agreeing to terms of service. In some circumstances, Claude will follow commands found in content even if it conflicts with the user's instructions. For example, instructions on webpages or contained in images may override user instructions or cause Claude to make mistakes. We suggest taking precautions to isolate Claude from sensitive data and actions to avoid risks related to prompt injection. Finally, please inform end users of relevant risks and obtain their consent prior to enabling browser automation in your own products. This demo runs a browser in a containerized environment. While isolated, please note: - **Don't enter personal credentials or sensitive information** - This is a demonstration tool - **Be cautious about the websites you visit** - Some sites may have anti-automation measures - **Not for production use** - This demo is for learning and development purposes only ## Implementation Reference This demo shows how to build browser automation with Claude using Playwright. All browser actions (navigate, click, type, scroll, form_input, etc.) are implemented as methods in [browser.py](browser_use_demo/tools/browser.py) using Playwright's async API. 
### Key Files - **[browser.py](browser_use_demo/tools/browser.py)** - Main tool with all browser actions - **[loop.py](browser_use_demo/loop.py)** - Sampling loop for API calls and response handling - **[streamlit.py](browser_use_demo/streamlit.py)** - Chat UI - **[browser_tool_utils/](browser_use_demo/browser_tool_utils/)** - JavaScript utilities for DOM extraction, element finding, and form manipulation ### Core Patterns **Element references:** JavaScript utilities generate `ref` identifiers for reliable element targeting across screen sizes (replacing brittle pixel coordinates). **Tool setup:** ```python browser_tool = BrowserTool() def to_params(self): return { "name": "browser", "description": BROWSER_TOOL_DESCRIPTION, "input_schema": BROWSER_TOOL_INPUT_SCHEMA, } ``` ### Coordinate Scaling The browser tools implementation includes automatic coordinate scaling to ensure accurate interactions: **How it works:** - The browser viewport is fixed at 1920x1080 (16:9 aspect ratio) - Claude processes screenshots at 1456x819 pixels for 16:9 aspect ratio (see [documentation](https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size)) - The browser tool automatically scales coordinates from Claude's processed resolution (1456x819) to the actual viewport (1920x1080) - This ensures clicks and interactions happen at the correct locations See `browser_use_demo/tools/coordinate_scaling.py` for the implementation. This demo uses a custom tool definition with an explicit input schema, giving you full control over the tool interface. The `BROWSER_TOOL_DESCRIPTION` and `BROWSER_TOOL_INPUT_SCHEMA` constants in [browser.py](browser_use_demo/tools/browser.py) provide a complete example you can use as a starting point for your own browser automation tools. ### Modifying & Using as a Template **To modify this demo:** 1. Edit `browser_use_demo/tools/browser.py` to add features or change behavior 2. 
Rebuild the Docker image (volume mount allows live Python code updates) **To use as a template for your own project:** 1. Copy [browser.py](browser_use_demo/tools/browser.py) and [browser_tool_utils/](browser_use_demo/browser_tool_utils/) 2. Adapt [loop.py](browser_use_demo/loop.py) for your API integration 3. Build your UI or use [streamlit.py](browser_use_demo/streamlit.py) as a starting point ## Architecture ``` ┌──────────────────────────────────┐ │ Docker Container │ │ │ │ ┌─────────────────────────────┐ │ │ │ Streamlit Interface │ │ ← User interacts here │ └──────────┬──────────────────┘ │ │ │ │ │ ┌──────────▼──────────────────┐ │ │ │ Claude API + Browser Tool │ │ ← Claude controls browser │ └──────────┬──────────────────┘ │ │ │ │ │ ┌──────────▼──────────────────┐ │ │ │ Playwright + Chromium │ │ ← Browser automation │ └──────────┬──────────────────┘ │ │ │ │ │ ┌──────────▼──────────────────┐ │ │ │ XVFB Virtual Display │ │ ← Virtual display │ └──────────┬──────────────────┘ │ │ │ │ │ ┌──────────▼──────────────────┐ │ │ │ VNC/NoVNC Server │ │ ← Visual access │ └─────────────────────────────┘ │ └──────────────────────────────────┘ ``` ## How Browser Automation Differs from Computer Use This browser automation demo is specifically optimized for web automation with DOM-aware features like element targeting, page reading, and form manipulation. While it shares many capabilities with the [computer use demo](../computer-use-demo), browser automation adds web-specific actions and the ability to target elements by reference instead of just coordinates. Computer use provides general desktop control for any application, while browser automation focuses exclusively on browser-based tasks. 
### Actions Unique to Browser Automation These web-specific actions are not available in computer use: - **navigate**: Navigate to URL or use "back"/"forward" for history (requires text) - **read_page**: Get DOM tree with element refs; use text="interactive" to filter - **get_page_text**: Extract all text content from the page - **find**: Search for text and highlight matches (requires text) - **form_input**: Set form element value directly (requires ref and value) - **scroll_to**: Scroll element into view (requires ref) - **execute_js**: Run JavaScript code in page context (requires text with JS code) ### Actions Shared with Computer Use These actions work similarly to their computer use counterparts. The key difference is that browser automation allows targeting by element reference (`ref`) as an alternative to coordinates: **Mouse Actions** (accept either `ref` or `coordinate`): - **left_click**, **right_click**, **middle_click**, **double_click**, **triple_click** - **hover**: Move cursor without clicking (for tooltips, dropdowns) - **left_click_drag**: Drag from start_coordinate to coordinate - **left_mouse_down**, **left_mouse_up**: Fine-grained mouse control **Keyboard Actions**: - **type**: Type text at cursor (requires text) - **key**: Press key or combination (requires text) - **hold_key**: Hold key for duration (requires text and duration) **Other**: - **screenshot**: Capture current viewport - **scroll**: Scroll in direction (requires scroll_direction, scroll_amount, coordinate) - **zoom**: Zoomed screenshot of region (requires region: x1, y1, x2, y2) - **wait**: Wait for duration in seconds (requires duration, 0-100) ### Computer Use Actions Not Included These desktop-level actions from computer use are not in this browser demo: - **cursor_position**: Get current (x, y) pixel coordinate of cursor This is less relevant for browser automation since the `ref` parameter provides reliable element-based targeting, replacing the need for cursor tracking. 
Note that `hover` provides similar functionality to `mouse_move` for triggering hover states. ## Troubleshooting **Browser not visible?** - Check that port 6080 is accessible - Try refreshing the NoVNC page - Ensure Docker has sufficient resources allocated **API errors?** - Verify your Anthropic API key is set correctly - Check you're using a compatible model (Claude 4.5 models: claude-sonnet-4-5-20250929, claude-opus-4-5-20251101, or claude-haiku-4-5-20251001) **Browser actions failing?** - Some websites may have anti-automation measures - Try simpler websites first to test functionality - Check the browser view to see what's happening ## Attribution This software includes components from Microsoft Playwright. See the [NOTICE](NOTICE) file for details. ## Credits Built with: - [Anthropic Claude API](https://www.anthropic.com) - [Playwright](https://playwright.dev) - [Streamlit](https://streamlit.io) - [NoVNC](https://novnc.com) ================================================ FILE: browser-use-demo/browser_use_demo/__init__.py ================================================ """Browser Use Demo - Web automation with Claude.""" __version__ = "0.1.0" ================================================ FILE: browser-use-demo/browser_use_demo/browser_tool_utils/__init__.py ================================================ # Browser tool utility files ================================================ FILE: browser-use-demo/browser_use_demo/browser_tool_utils/browser_dom_script.js ================================================ /* * Modifications Copyright (c) 2025 Anthropic, PBC * Modified from original Microsoft Playwright source * Original Microsoft Playwright source licensed under Apache License 2.0 * See CHANGELOG.md for details */ // Content script that defines the accessibility tree generation function in the MAIN context (function () { // Initialize global element map and ref counter if not already present if (!window.__claudeElementMap) { window.__claudeElementMap 
= {}; } if (!window.__claudeRefCounter) { window.__claudeRefCounter = 0; } // Define the accessibility tree generation function on the window (in content script context) window.__generateAccessibilityTree = function (filterType) { try { var result = []; function getRole(element) { var role = element.getAttribute("role"); if (role) return role; var tag = element.tagName.toLowerCase(); var type = element.getAttribute("type"); var roleMap = { a: "link", button: "button", input: type === "submit" || type === "button" ? "button" : type === "checkbox" ? "checkbox" : type === "radio" ? "radio" : type === "file" ? "button" : "textbox", select: "combobox", textarea: "textbox", h1: "heading", h2: "heading", h3: "heading", h4: "heading", h5: "heading", h6: "heading", img: "image", nav: "navigation", main: "main", header: "banner", footer: "contentinfo", section: "region", article: "article", aside: "complementary", form: "form", table: "table", ul: "list", ol: "list", li: "listitem", label: "label", }; return roleMap[tag] || "generic"; } function getCleanName(element) { var tag = element.tagName.toLowerCase(); // For selects, get the selected option text if (tag === "select") { var selectElement = element; var selectedOption = selectElement.querySelector("option[selected]") || selectElement.options[selectElement.selectedIndex]; if (selectedOption && selectedOption.textContent) { return selectedOption.textContent.trim(); } } // Priority order for getting meaningful names var ariaLabel = element.getAttribute("aria-label"); if (ariaLabel && ariaLabel.trim()) return ariaLabel.trim(); var placeholder = element.getAttribute("placeholder"); if (placeholder && placeholder.trim()) return placeholder.trim(); var title = element.getAttribute("title"); if (title && title.trim()) return title.trim(); var alt = element.getAttribute("alt"); if (alt && alt.trim()) return alt.trim(); // For form labels if (element.id) { var label = document.querySelector('label[for="' + element.id + '"]'); if 
(label && label.textContent && label.textContent.trim()) { return label.textContent.trim(); } } // For inputs with values if (tag === "input") { var inputElement = element; var type = element.getAttribute("type") || ""; var value = element.getAttribute("value"); if (type === "submit" && value && value.trim()) { return value.trim(); } if ( inputElement.value && inputElement.value.length < 50 && inputElement.value.trim() ) { return inputElement.value.trim(); } } // For buttons, links, and other interactive elements, get direct text if (["button", "a", "summary"].includes(tag)) { var directText = ""; for (var i = 0; i < element.childNodes.length; i++) { var node = element.childNodes[i]; if (node.nodeType === Node.TEXT_NODE) { directText += node.textContent; } } if (directText.trim()) return directText.trim(); } // For headings, get text content but limit it if (tag.match(/^h[1-6]$/)) { var headingText = element.textContent; if (headingText && headingText.trim()) { return headingText.trim().substring(0, 100); } } // For images without alt, try to get surrounding context if (tag === "img") { var src = element.getAttribute("src"); if (src) { var filename = src.split("/").pop()?.split("?")[0]; return "Image: " + filename; } } // For generic elements, get direct text content (not including child elements) // This helps capture important text in spans, divs, etc. 
var directTextContent = ""; for (var j = 0; j < element.childNodes.length; j++) { var childNode = element.childNodes[j]; if (childNode.nodeType === Node.TEXT_NODE) { directTextContent += childNode.textContent; } } if ( directTextContent && directTextContent.trim() && directTextContent.trim().length >= 3 ) { // Only return if it's meaningful text (at least 3 characters) var trimmedText = directTextContent.trim(); if (trimmedText.length > 50) { return trimmedText.substring(0, 50) + "..."; } return trimmedText; } return ""; } function isVisible(element) { var style = window.getComputedStyle(element); return ( style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0" && element.offsetWidth > 0 && element.offsetHeight > 0 ); } function isInteractive(element) { var tag = element.tagName.toLowerCase(); var interactiveTags = [ "a", "button", "input", "select", "textarea", "details", "summary", ]; return ( interactiveTags.includes(tag) || element.getAttribute("onclick") !== null || element.getAttribute("tabindex") !== null || element.getAttribute("role") === "button" || element.getAttribute("role") === "link" || element.getAttribute("contenteditable") === "true" ); } function isSemantic(element) { var tag = element.tagName.toLowerCase(); var semanticTags = [ "h1", "h2", "h3", "h4", "h5", "h6", "nav", "main", "header", "footer", "section", "article", "aside", ]; return ( semanticTags.includes(tag) || element.getAttribute("role") !== null ); } function shouldIncludeElement(element, options) { var tag = element.tagName.toLowerCase(); // Always skip these if ( ["script", "style", "meta", "link", "title", "noscript"].includes(tag) ) return false; if (element.getAttribute("aria-hidden") === "true") return false; // Always check visibility - this is now mandatory if (!isVisible(element)) return false; // Check viewport visibility for all elements (unless using 'all' filter for find tool) if (options.filter !== "all") { var rect = 
element.getBoundingClientRect(); var inViewport = rect.top < window.innerHeight && rect.bottom > 0 && rect.left < window.innerWidth && rect.right > 0; if (!inViewport) return false; } // Apply interactive filter if specified if (options.filter === "interactive") { return isInteractive(element); } // Default behavior when no filter is specified (all visible elements) // Always include interactive elements if (isInteractive(element)) return true; // Always include semantic elements (headings, nav, etc.) if (isSemantic(element)) return true; // Include elements with meaningful text content if (getCleanName(element).length > 0) return true; // For generic divs and spans, be more selective but still include text-containing ones var role = getRole(element); if (role === "generic" && (tag === "div" || tag === "span")) { var id = element.id || ""; var className = element.className || ""; var cleanName = getCleanName(element); // Include if it has meaningful text content (now that we extract text better) if (cleanName && cleanName.length >= 3) { return true; } // Only keep divs/spans that are clearly functional containers (not layout) var functionalKeywords = [ "search", "dropdown", "menu", "modal", "dialog", "popup", "toolbar", "sidebar", "content", "text", ]; var isFunctionalContainer = functionalKeywords.some( function (keyword) { return id.includes(keyword) || className.includes(keyword); }, ); if (isFunctionalContainer) { return true; } // Skip empty generic containers - they're just layout noise return false; } // Include other container elements that might have interactive children if (isContainerElement(element)) return true; return false; } function isContainerElement(element) { var role = element.getAttribute("role"); var tag = element.tagName.toLowerCase(); var className = element.className || ""; var id = element.id || ""; // These are containers that should be traversed deeper return ( role === "search" || role === "form" || role === "group" || role === 
"toolbar" || role === "navigation" || tag === "form" || tag === "fieldset" || tag === "nav" || // Generic functional containers id.includes("search") || className.includes("search") || id.includes("form") || className.includes("form") || id.includes("menu") || className.includes("menu") || id.includes("nav") || className.includes("nav") ); } function processElement(element, depth, options) { if (depth > 15) return; // Generous depth limit for very complex pages if (!element || !element.tagName) return; var shouldInclude = shouldIncludeElement(element, options); var actuallyInclude = shouldInclude || depth === 0; // Always include root (body) if (actuallyInclude) { var role = getRole(element); var name = getCleanName(element); var ref = null; // Check if this element already has a ref in the global map for (var existingRef in window.__claudeElementMap) { var weakRef = window.__claudeElementMap[existingRef]; var existingElement = weakRef.deref(); if (existingElement === element) { ref = existingRef; break; } } // If not found, create a new ref if (!ref) { ref = "ref_" + ++window.__claudeRefCounter; window.__claudeElementMap[ref] = new WeakRef(element); } var indent = " ".repeat(depth); var yaml = indent + "- " + role; if (name) { // Clean up the name - remove newlines, limit length name = name.replace(/\s+/g, " ").substring(0, 100); yaml += ' "' + name.replace(/"/g, '\\"') + '"'; } yaml += " [ref=" + ref + "]"; // Add useful attributes if (element.id) yaml += ' id="' + element.id + '"'; if (element.getAttribute("href")) yaml += ' href="' + element.getAttribute("href") + '"'; if (element.getAttribute("type")) yaml += ' type="' + element.getAttribute("type") + '"'; if (element.getAttribute("placeholder")) yaml += ' placeholder="' + element.getAttribute("placeholder") + '"'; result.push(yaml); } // Always traverse children - we need to go deep to find interactive elements if (element.children && depth < 15) { for (var i = 0; i < element.children.length; i++) { 
processElement( element.children[i], actuallyInclude ? depth + 1 : depth, options, ); } } } var options = { filter: filterType, }; if (document.body) { processElement(document.body, 0, options); } // Clean up stale references (elements that have been garbage collected) for (var ref in window.__claudeElementMap) { var weakRef = window.__claudeElementMap[ref]; if (!weakRef.deref()) { delete window.__claudeElementMap[ref]; } } // Filter out empty generic elements var filteredResult = result.filter(function (line) { return !/^\s*- generic \[ref=ref_\d+\]$/.test(line); }); return { pageContent: filteredResult.join("\n"), viewport: { width: window.innerWidth, height: window.innerHeight, }, }; } catch (error) { console.error("Error in accessibility tree generation:", error); throw new Error( "Error generating accessibility tree: " + (error.message || "Unknown error"), ); } }; })(); ================================================ FILE: browser-use-demo/browser_use_demo/browser_tool_utils/browser_element_script.js ================================================ /* * Modifications Copyright (c) 2025 Anthropic, PBC * Modified from original Microsoft Playwright source * Original Microsoft Playwright source licensed under Apache License 2.0 * See CHANGELOG.md for details */ // Script for interacting with elements by their reference IDs (function(elementRef) { try { // Get element from reference map let targetElement = null; if (window.__claudeElementMap && window.__claudeElementMap[elementRef]) { const weakRef = window.__claudeElementMap[elementRef]; targetElement = weakRef.deref() || null; if (!targetElement || !document.contains(targetElement)) { // Element has been removed from DOM delete window.__claudeElementMap[elementRef]; targetElement = null; } } if (!targetElement) { return { success: false, action: 'get_element', message: `No element found with reference: "${elementRef}". 
The element may have been removed from the page.` }; } // Scroll element into view if needed targetElement.scrollIntoView({ behavior: 'instant', block: 'center', inline: 'center' }); // Force a layout/paint to ensure the element is properly positioned after scroll targetElement.offsetHeight; // Get element coordinates const rect = targetElement.getBoundingClientRect(); const clickX = rect.left + rect.width / 2; const clickY = rect.top + rect.height / 2; // Build element info string const elementInfo = targetElement.tagName.toLowerCase() + (targetElement.id ? '#' + targetElement.id : '') + (targetElement.className ? '.' + targetElement.className.split(' ').filter(c => c).join('.') : ''); // Get additional element properties const elementType = targetElement.getAttribute('type') || ''; const elementRole = targetElement.getAttribute('role') || ''; const elementAriaLabel = targetElement.getAttribute('aria-label') || ''; const elementText = targetElement.textContent ? targetElement.textContent.substring(0, 100) : ''; return { success: true, coordinates: [clickX, clickY], elementInfo: elementInfo, elementRef: elementRef, rect: { left: rect.left, top: rect.top, right: rect.right, bottom: rect.bottom, width: rect.width, height: rect.height }, attributes: { type: elementType, role: elementRole, ariaLabel: elementAriaLabel, text: elementText }, isVisible: rect.width > 0 && rect.height > 0, isInteractable: !targetElement.disabled && targetElement.style.display !== 'none' && targetElement.style.visibility !== 'hidden' }; } catch (error) { return { success: false, action: 'get_element', message: 'Error finding element by reference: ' + (error.message || 'Unknown error') }; } }) ================================================ FILE: browser-use-demo/browser_use_demo/browser_tool_utils/browser_form_input_script.js ================================================ // Script for setting values in form elements by their reference IDs (function(elementRef, inputValue) { try { // Get 
element from reference map let element = null; if (window.__claudeElementMap && window.__claudeElementMap[elementRef]) { const weakRef = window.__claudeElementMap[elementRef]; element = weakRef.deref() || null; if (!element || !document.contains(element)) { // Element has been removed from DOM delete window.__claudeElementMap[elementRef]; element = null; } } if (!element) { return { success: false, action: 'form_input', message: `No element found with reference: "${elementRef}". The element may have been removed from the page.` }; } // Scroll element into view element.scrollIntoView({ behavior: 'smooth', block: 'center' }); // Handle different element types if (element instanceof HTMLSelectElement) { const previousValue = element.value; const options = Array.from(element.options); // Try to find option by value or text let optionFound = false; const valueStr = String(inputValue); for (let i = 0; i < options.length; i++) { if (options[i].value === valueStr || options[i].text === valueStr) { element.selectedIndex = i; optionFound = true; break; } } if (!optionFound) { return { success: false, action: 'form_input', message: `Option "${valueStr}" not found. 
Available options: ${options.map(o => `"${o.text}" (value: "${o.value}")`).join(', ')}` }; } // Focus and dispatch events element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: 'select', previous_value: previousValue, new_value: element.value, message: `Selected option "${valueStr}" in dropdown` }; } else if (element instanceof HTMLInputElement && element.type === 'checkbox') { const previousValue = element.checked; if (typeof inputValue !== 'boolean') { return { success: false, action: 'form_input', message: 'Checkbox requires a boolean value (true/false)' }; } element.checked = inputValue; element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: 'checkbox', previous_value: previousValue, new_value: element.checked, message: `Checkbox ${element.checked ? 'checked' : 'unchecked'}` }; } else if (element instanceof HTMLInputElement && element.type === 'radio') { const previousValue = element.checked; const radioGroup = element.name; // For radio buttons, we always set to true (can't uncheck a radio by clicking) element.checked = true; element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: 'radio', previous_value: previousValue, new_value: element.checked, message: `Radio button selected${radioGroup ? 
` in group "${radioGroup}"` : ''}` }; } else if (element instanceof HTMLInputElement && (element.type === 'date' || element.type === 'time' || element.type === 'datetime-local' || element.type === 'month' || element.type === 'week')) { const previousValue = element.value; element.value = String(inputValue); element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: element.type, previous_value: previousValue, new_value: element.value, message: `Set ${element.type} to "${element.value}"` }; } else if (element instanceof HTMLInputElement && element.type === 'range') { const previousValue = element.value; const numValue = Number(inputValue); if (isNaN(numValue)) { return { success: false, action: 'form_input', message: 'Range input requires a numeric value' }; } element.value = String(numValue); element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: 'range', previous_value: previousValue, new_value: element.value, message: `Set range to ${element.value} (min: ${element.min}, max: ${element.max})` }; } else if (element instanceof HTMLInputElement && element.type === 'number') { const previousValue = element.value; const numValue = Number(inputValue); if (isNaN(numValue) && inputValue !== '') { return { success: false, action: 'form_input', message: 'Number input requires a numeric value' }; } element.value = String(inputValue); element.focus(); element.dispatchEvent(new Event('change', { bubbles: true })); element.dispatchEvent(new Event('input', { bubbles: true })); return { success: true, action: 'form_input', ref: elementRef, element_type: 'number', previous_value: previousValue, new_value: element.value, message: `Set number input to 
class KeyInfo(TypedDict, total=False):
    """Fields describing one key for a browser keyboard event.

    Declared with ``total=False``, so the type itself requires no field.
    Every KEY_MAP entry sets ``key``, ``code`` and ``keyCode``; ``text`` and
    ``isKeypad`` appear only on the entries that need them.
    """

    key: str  # key value, e.g. "Enter", "ArrowUp" or the character itself
    code: str  # physical key identifier, e.g. "ControlLeft", "Digit1"
    keyCode: int  # numeric key code, e.g. 13 for Enter
    text: str  # character the key types; set only on character-producing keys
    isKeypad: bool  # True for keys on the numeric keypad


def _function_key(number: int) -> KeyInfo:
    """Return the entry for function key F<number> (F1 has keyCode 112)."""
    name = f"F{number}"
    return {"key": name, "code": name, "keyCode": 111 + number}


def _numpad_digit(digit: int) -> KeyInfo:
    """Return the entry for keypad digit <digit> (Numpad0 has keyCode 96)."""
    return {
        "key": str(digit),
        "code": f"Numpad{digit}",
        "keyCode": 96 + digit,
        "isKeypad": True,
    }


# Lookup table from a lowercase key name (or a literal character) to the
# fields of the keyboard event to dispatch. Several names are aliases of the
# same key ("ctrl"/"control", "esc"/"escape", "up"/"arrowup", ...).
KEY_MAP: dict[str, KeyInfo] = {
    # Modifier keys (for key combinations like ctrl+a, cmd+c)
    "ctrl": {"key": "Control", "code": "ControlLeft", "keyCode": 17},
    "control": {"key": "Control", "code": "ControlLeft", "keyCode": 17},
    "cmd": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "command": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "meta": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "alt": {"key": "Alt", "code": "AltLeft", "keyCode": 18},
    "option": {"key": "Alt", "code": "AltLeft", "keyCode": 18},
    "shift": {"key": "Shift", "code": "ShiftLeft", "keyCode": 16},
    # Enter keys
    "enter": {"key": "Enter", "code": "Enter", "keyCode": 13, "text": "\r"},
    "return": {"key": "Enter", "code": "Enter", "keyCode": 13, "text": "\r"},
    # The keypad Enter key has its own physical code, "NumpadEnter"; reporting
    # "Enter" made it indistinguishable from the main Enter key.
    "kp_enter": {
        "key": "Enter",
        "code": "NumpadEnter",
        "keyCode": 13,
        "text": "\r",
        "isKeypad": True,
    },
    # Navigation keys
    "tab": {"key": "Tab", "code": "Tab", "keyCode": 9},
    "delete": {"key": "Delete", "code": "Delete", "keyCode": 46},
    "backspace": {"key": "Backspace", "code": "Backspace", "keyCode": 8},
    "escape": {"key": "Escape", "code": "Escape", "keyCode": 27},
    "esc": {"key": "Escape", "code": "Escape", "keyCode": 27},
    "space": {"key": " ", "code": "Space", "keyCode": 32, "text": " "},
    " ": {"key": " ", "code": "Space", "keyCode": 32, "text": " "},
    # Arrow keys
    "arrowup": {"key": "ArrowUp", "code": "ArrowUp", "keyCode": 38},
    "arrowdown": {"key": "ArrowDown", "code": "ArrowDown", "keyCode": 40},
    "arrowleft": {"key": "ArrowLeft", "code": "ArrowLeft", "keyCode": 37},
    "arrowright": {"key": "ArrowRight", "code": "ArrowRight", "keyCode": 39},
    "up": {"key": "ArrowUp", "code": "ArrowUp", "keyCode": 38},
    "down": {"key": "ArrowDown", "code": "ArrowDown", "keyCode": 40},
    "left": {"key": "ArrowLeft", "code": "ArrowLeft", "keyCode": 37},
    "right": {"key": "ArrowRight", "code": "ArrowRight", "keyCode": 39},
    # Page navigation
    "home": {"key": "Home", "code": "Home", "keyCode": 36},
    "end": {"key": "End", "code": "End", "keyCode": 35},
    "pageup": {"key": "PageUp", "code": "PageUp", "keyCode": 33},
    "pagedown": {"key": "PageDown", "code": "PageDown", "keyCode": 34},
    # Function keys f1..f12 (keyCode 112..123)
    **{f"f{number}": _function_key(number) for number in range(1, 13)},
    # Special characters (unshifted)
    ";": {"key": ";", "code": "Semicolon", "keyCode": 186, "text": ";"},
    "=": {"key": "=", "code": "Equal", "keyCode": 187, "text": "="},
    ",": {"key": ",", "code": "Comma", "keyCode": 188, "text": ","},
    "-": {"key": "-", "code": "Minus", "keyCode": 189, "text": "-"},
    ".": {"key": ".", "code": "Period", "keyCode": 190, "text": "."},
    "/": {"key": "/", "code": "Slash", "keyCode": 191, "text": "/"},
    "`": {"key": "`", "code": "Backquote", "keyCode": 192, "text": "`"},
    "[": {"key": "[", "code": "BracketLeft", "keyCode": 219, "text": "["},
    "\\": {"key": "\\", "code": "Backslash", "keyCode": 220, "text": "\\"},
    "]": {"key": "]", "code": "BracketRight", "keyCode": 221, "text": "]"},
    "'": {"key": "'", "code": "Quote", "keyCode": 222, "text": "'"},
    # Special characters (shifted variants share the physical key's code/keyCode)
    "!": {"key": "!", "code": "Digit1", "keyCode": 49, "text": "!"},
    "@": {"key": "@", "code": "Digit2", "keyCode": 50, "text": "@"},
    "#": {"key": "#", "code": "Digit3", "keyCode": 51, "text": "#"},
    "$": {"key": "$", "code": "Digit4", "keyCode": 52, "text": "$"},
    "%": {"key": "%", "code": "Digit5", "keyCode": 53, "text": "%"},
    "^": {"key": "^", "code": "Digit6", "keyCode": 54, "text": "^"},
    "&": {"key": "&", "code": "Digit7", "keyCode": 55, "text": "&"},
    "*": {"key": "*", "code": "Digit8", "keyCode": 56, "text": "*"},
    "(": {"key": "(", "code": "Digit9", "keyCode": 57, "text": "("},
    ")": {"key": ")", "code": "Digit0", "keyCode": 48, "text": ")"},
    "_": {"key": "_", "code": "Minus", "keyCode": 189, "text": "_"},
    "+": {"key": "+", "code": "Equal", "keyCode": 187, "text": "+"},
    "{": {"key": "{", "code": "BracketLeft", "keyCode": 219, "text": "{"},
    "}": {"key": "}", "code": "BracketRight", "keyCode": 221, "text": "}"},
    "|": {"key": "|", "code": "Backslash", "keyCode": 220, "text": "|"},
    ":": {"key": ":", "code": "Semicolon", "keyCode": 186, "text": ":"},
    '"': {"key": '"', "code": "Quote", "keyCode": 222, "text": '"'},
    "<": {"key": "<", "code": "Comma", "keyCode": 188, "text": "<"},
    ">": {"key": ">", "code": "Period", "keyCode": 190, "text": ">"},
    "?": {"key": "?", "code": "Slash", "keyCode": 191, "text": "?"},
    "~": {"key": "~", "code": "Backquote", "keyCode": 192, "text": "~"},
    # Lock keys
    "capslock": {"key": "CapsLock", "code": "CapsLock", "keyCode": 20},
    "numlock": {"key": "NumLock", "code": "NumLock", "keyCode": 144},
    "scrolllock": {"key": "ScrollLock", "code": "ScrollLock", "keyCode": 145},
    # Pause / Insert / PrintScreen
    "pause": {"key": "Pause", "code": "Pause", "keyCode": 19},
    "insert": {"key": "Insert", "code": "Insert", "keyCode": 45},
    "printscreen": {"key": "PrintScreen", "code": "PrintScreen", "keyCode": 44},
    # Numpad digits numpad0..numpad9 (keyCode 96..105)
    **{f"numpad{digit}": _numpad_digit(digit) for digit in range(10)},
    # Numpad operators
    "numpadmultiply": {
        "key": "*",
        "code": "NumpadMultiply",
        "keyCode": 106,
        "isKeypad": True,
    },
    "numpadadd": {
        "key": "+",
        "code": "NumpadAdd",
        "keyCode": 107,
        "isKeypad": True,
    },
    "numpadsubtract": {
        "key": "-",
        "code": "NumpadSubtract",
        "keyCode": 109,
        "isKeypad": True,
    },
    "numpaddecimal": {
        "key": ".",
        "code": "NumpadDecimal",
        "keyCode": 110,
        "isKeypad": True,
    },
    "numpaddivide": {
        "key": "/",
        "code": "NumpadDivide",
        "keyCode": 111,
        "isKeypad": True,
    },
}
constants. These values are hardcoded and not configurable via environment variables. The standard resolution is 1920x1080 for consistent browser automation. """ # Display configuration DISPLAY_WIDTH = 1920 DISPLAY_HEIGHT = 1080 DISPLAY_NUM = 1 # Browser viewport configuration (matches display for consistency) BROWSER_WIDTH = 1920 BROWSER_HEIGHT = 1080 ================================================ FILE: browser-use-demo/browser_use_demo/loop.py ================================================ """ Sampling loop for browser automation with Claude """ import os from collections.abc import Callable from datetime import datetime from enum import StrEnum from typing import Optional import httpx from anthropic import ( Anthropic, AnthropicBedrock, AnthropicVertex, ) from anthropic.types.beta import ( BetaCacheControlEphemeralParam, BetaContentBlockParam, BetaMessageParam, BetaTextBlockParam, ) from .message_handler import MessageBuilder, ResponseProcessor from .tools import BrowserTool, ToolCollection, ToolResult PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31" class APIProvider(StrEnum): ANTHROPIC = "anthropic" BEDROCK = "bedrock" VERTEX = "vertex" # Browser-specific system prompt BROWSER_SYSTEM_PROMPT = f""" * You control a Chromium browser via Playwright automation. * The current date is {datetime.today().strftime("%A, %B %-d, %Y")}. You receive a screenshot at the start of each turn. Look at it to see the current page - if you're already where you need to be, don't re-navigate. After navigating to a new page, always call read_page to get element references (ref_1, ref_2, etc.) before interacting with the page. Use these refs with your interaction tools (click, type, hover, form_input, etc.). Refs are more reliable than coordinates. When you need to extract or read text content from a page, always use get_page_text - don't try to read text from screenshots. If DOM-based actions (refs) aren't working, fall back to screenshot + coordinate-based actions. 
async def sampling_loop(
    *,
    model: str,
    provider: APIProvider,
    system_prompt_suffix: str,
    messages: list[BetaMessageParam],
    output_callback: Callable[[BetaContentBlockParam], None],
    tool_output_callback: Callable[[ToolResult, str], None],
    api_response_callback: Callable[
        [httpx.Request | None, httpx.Response | object | None, Exception | None], None
    ],
    api_key: str,
    only_n_most_recent_images: int | None = None,
    max_tokens: int = 4096,
    browser_tool: Optional[BrowserTool] = None,
):
    """
    Sampling loop for browser automation.

    Each turn sends ``messages`` to the model, passes every returned content
    block to ``output_callback``, appends the assistant message to
    ``messages``, runs any requested tools and appends their results as one
    user message. The loop ends on the first response with no tool uses.

    Args:
        model: Model identifier passed to the API.
        provider: Which client to use (Anthropic, Vertex or Bedrock).
        system_prompt_suffix: Text appended to BROWSER_SYSTEM_PROMPT after a
            single space; nothing is appended when it is empty.
        messages: Conversation history. Mutated in place.
        output_callback: Called with each assistant content block.
        tool_output_callback: Called with each tool result and its tool-use id.
        api_response_callback: Called as ``(None, response, None)`` after a
            successful API call and as ``(None, None, exc)`` after a failed one.
        api_key: API key; only used for the Anthropic provider.
        only_n_most_recent_images: Accepted for interface compatibility. This
            function does not currently apply any image filtering.
        max_tokens: ``max_tokens`` value sent with every request.
        browser_tool: Optional persistent browser tool instance. If not
            provided, creates a new one.

    Returns:
        The ``messages`` list that was passed in, extended with this run.

    Raises:
        ValueError: If ``provider`` is not a supported provider.
        Exception: Any API error, re-raised after being reported through
            ``api_response_callback``.
    """
    # Reuse existing browser tool or create a new one with standard dimensions
    if browser_tool is None:
        browser_tool = BrowserTool()
    tool_collection = ToolCollection(browser_tool)

    # Build system prompt
    system = BetaTextBlockParam(
        type="text",
        text=f"{BROWSER_SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}",
    )

    # Client, betas and the system block depend only on the arguments, so they
    # are built once here instead of on every turn of the loop.
    betas = []
    enable_prompt_caching = False
    if provider == APIProvider.ANTHROPIC:
        client = Anthropic(api_key=api_key, max_retries=4)
        enable_prompt_caching = True
    elif provider == APIProvider.VERTEX:
        client = AnthropicVertex()
    elif provider == APIProvider.BEDROCK:
        client = AnthropicBedrock()
    else:
        raise ValueError(f"Unsupported provider: {provider}")

    if enable_prompt_caching:
        betas.append(PROMPT_CACHING_BETA_FLAG)
        # Add cache control to system prompt
        system = BetaTextBlockParam(
            type="text",
            text=system["text"],
            cache_control=BetaCacheControlEphemeralParam(type="ephemeral"),
        )

    # Both helpers are used here only through their methods, so one instance
    # of each serves the whole run.
    processor = ResponseProcessor()
    builder = MessageBuilder()

    while True:
        # Make API call
        # NOTE(review): create() is called without await, so it looks like a
        # blocking call inside this coroutine - confirm that is acceptable.
        try:
            api_kwargs = {
                "max_tokens": max_tokens,
                "messages": messages,
                "model": model,
                "system": [system],
                "tools": tool_collection.to_params(),
            }
            if betas:
                # Beta features (e.g. prompt caching) go through the beta API
                api_kwargs["betas"] = betas
                response = client.beta.messages.create(**api_kwargs)
            else:
                # Use regular messages API when no beta features are needed
                response = client.messages.create(**api_kwargs)
        except Exception as e:
            api_response_callback(None, None, e)
            raise

        api_response_callback(None, response, None)

        processed = processor.process_response(response)

        # Output all content blocks to callbacks
        for content_block in processed.assistant_content:
            output_callback(content_block)

        # Append the complete assistant message (preserves text + tools)
        builder.add_assistant_message(messages, processed.assistant_content)

        if not processed.tool_uses:
            # No tools used, conversation can end here
            return messages

        # Execute tools, then add all results as a single user message and
        # loop again so the model can react to them.
        tool_results = await processor.execute_tools(
            processed.tool_uses, tool_collection, tool_output_callback
        )
        builder.add_tool_results(messages, tool_results)
async def execute_tools(
    self,
    tool_uses: list[dict[str, Any]],
    tool_collection: ToolCollection,
    tool_output_callback: Optional[Callable[[ToolResult, str], None]] = None
) -> list[BetaToolResultBlockParam]:
    """
    Run every requested tool in order and return one result block per request.

    A failure while looking up or running a tool (or in the callback on the
    success path) does not abort the batch: it is recorded as an ``is_error``
    result for that request and reported to the callback as an error ToolResult.

    Args:
        tool_uses: Tool use blocks, each with "id", "name" and "input"
        tool_collection: Collection whose ``tool_map`` resolves tool names
        tool_output_callback: Optional callback receiving (result, tool id)

    Returns:
        Tool result blocks, in the same order as ``tool_uses``
    """
    collected = []

    for request in tool_uses:
        use_id, name, arguments = request["id"], request["name"], request["input"]

        try:
            handler = tool_collection.tool_map.get(name)
            if not handler:
                raise ValueError(f"Unknown tool: {name}")

            outcome = await handler(**arguments)

            if tool_output_callback:
                tool_output_callback(outcome, use_id)

            collected.append(self._build_tool_result(outcome, use_id))
        except Exception as exc:
            collected.append(
                BetaToolResultBlockParam(
                    type="tool_result",
                    tool_use_id=use_id,
                    is_error=True,
                    content=[{"type": "text", "text": str(exc)}]
                )
            )
            if tool_output_callback:
                tool_output_callback(ToolResult(error=str(exc)), use_id)

    return collected
class MessageBuilder:
    """
    Appends well-formed messages to a conversation and inspects existing ones.

    All methods work on the message list / message dict passed in; the class
    keeps no state of its own.
    """

    def add_assistant_message(
        self,
        messages: list[BetaMessageParam],
        content: list[BetaContentBlockParam]
    ) -> None:
        """
        Append an assistant message carrying all of ``content``.

        Args:
            messages: The message list to append to (mutated in place)
            content: The content blocks for the assistant message; when empty,
                nothing is appended
        """
        if not content:
            return
        messages.append({"role": "assistant", "content": content})

    def add_tool_results(
        self,
        messages: list[BetaMessageParam],
        tool_results: list[BetaToolResultBlockParam]
    ) -> None:
        """
        Append the tool results as a single user message.

        Args:
            messages: The message list to append to (mutated in place)
            tool_results: The tool result blocks to add; when empty, nothing
                is appended
        """
        if not tool_results:
            return
        messages.append({"role": "user", "content": tool_results})

    def ensure_message_integrity(
        self,
        messages: list[BetaMessageParam]
    ) -> bool:
        """
        Check that every message has a role and non-empty-list content.

        Args:
            messages: The message list to validate

        Returns:
            True for an empty list or when every message has a truthy "role"
            and a "content" key that is not an empty list; False otherwise
        """
        if not messages:
            return True

        def is_well_formed(entry) -> bool:
            if not entry.get("role"):
                return False
            if "content" not in entry:
                return False
            body = entry["content"]
            return not (isinstance(body, list) and len(body) == 0)

        return all(is_well_formed(entry) for entry in messages)

    def extract_text_from_message(
        self,
        message: BetaMessageParam
    ) -> Optional[str]:
        """
        Collect the text of an assistant message.

        Args:
            message: The message to extract text from

        Returns:
            None for non-assistant messages; string content as-is; otherwise
            the "text" of every text block joined with single spaces, or None
            when there is no text block
        """
        if message.get("role") != "assistant":
            return None

        content = message.get("content", [])
        if isinstance(content, str):
            return content

        fragments = [
            block.get("text", "")
            for block in content
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        if not fragments:
            return None
        return " ".join(fragments)
""" import base64 from typing import cast import streamlit as st from anthropic.types.beta import BetaContentBlockParam from browser_use_demo.tools import ToolResult from browser_use_demo.tools.coordinate_scaling import CoordinateScaler class Sender: """Message sender types.""" USER = "user" BOT = "assistant" TOOL = "tool" class MessageRenderer: """Handles rendering of messages in the Streamlit chat interface.""" def __init__(self, session_state): """Initialize the MessageRenderer with session state access. Args: session_state: Streamlit session state object for accessing configuration """ self.session_state = session_state def _scale_browser_coordinates(self, input_dict: dict) -> dict: """Apply coordinate scaling to browser tool inputs for display. This ensures the displayed coordinates match what the browser tool will actually use. Uses the CoordinateScaler for consistent scaling across the codebase. Args: input_dict: The tool input dictionary Returns: Modified input dict with scaled coordinates """ # Only process browser tool inputs with coordinates if not isinstance(input_dict, dict): return input_dict # Get browser tool dimensions if available browser_tool = getattr(self.session_state, 'browser_tool', None) if not browser_tool: return input_dict # Clone the input to avoid modifying the original import copy scaled_input = copy.deepcopy(input_dict) # Get viewport dimensions width = browser_tool.width height = browser_tool.height # Scale various coordinate fields using CoordinateScaler if 'coordinate' in scaled_input: scaled_input['coordinate'] = CoordinateScaler.scale_coordinate_list( scaled_input['coordinate'], width, height ) if 'start_coordinate' in scaled_input: scaled_input['start_coordinate'] = CoordinateScaler.scale_coordinate_list( scaled_input['start_coordinate'], width, height ) return scaled_input def render(self, sender: str, message: str | BetaContentBlockParam | ToolResult): """Render a message in the chat interface. 
Args: sender: The sender type (USER, BOT, or TOOL) message: The message content to render """ # Early return for empty messages or hidden screenshots without content if self._should_skip_message(message): return with st.chat_message(sender): self._render_message_content(message) def _should_skip_message(self, message) -> bool: """Check if message should be skipped from rendering. Args: message: The message to check Returns: True if the message should be skipped, False otherwise """ if not message: return True # Skip tool results that only have screenshots when screenshots are hidden is_tool_result = not isinstance(message, str | dict) if is_tool_result and self.session_state.hide_screenshots: return not hasattr(message, "error") and not hasattr(message, "output") return False def _render_message_content(self, message): """Render the actual message content based on its type. Args: message: The message content to render """ # Define rendering strategies for different message types renderers = { "tool_result": self._render_tool_result, "dict": self._render_dict_message, "string": lambda msg: st.markdown(msg), } # Determine message type and render accordingly if not isinstance(message, str | dict): # It's a ToolResult object renderers["tool_result"](cast(ToolResult, message)) elif isinstance(message, dict): renderers["dict"](message) else: renderers["string"](message) def _render_tool_result(self, tool_result: ToolResult): """Render a tool result with output, error, and optional image. 
Args: tool_result: The ToolResult object to render """ if tool_result.output: # Check if this is a text extraction result with special markers if "__PAGE_EXTRACTED__" in tool_result.output or "__TEXT_EXTRACTED__" in tool_result.output: # Extract just the summary for display lines = tool_result.output.split("\n") summary_lines = [] in_summary = False for line in lines: if "__PAGE_EXTRACTED__" in line or "__TEXT_EXTRACTED__" in line: in_summary = True continue if "__FULL_CONTENT__" in line: break if in_summary: summary_lines.append(line) # Display only the summary if summary_lines: st.markdown("\n".join(summary_lines)) else: # Regular tool output st.markdown(tool_result.output) if tool_result.error: st.error(tool_result.error) if tool_result.base64_image and not self.session_state.hide_screenshots: st.image(base64.b64decode(tool_result.base64_image)) def _render_dict_message(self, message: dict): """Render dictionary-based messages based on their type field. Args: message: Dictionary containing the message to render """ message_type = message.get("type", "") # Dispatch table for different message types type_handlers = { "text": lambda: st.write(message["text"]), "tool_use": lambda: self._render_tool_use(message), "tool_result": lambda: self._render_stored_tool_result(message), } # Execute the appropriate handler or fall back to generic display handler = type_handlers.get(message_type, lambda: st.write(message)) handler() def _render_tool_use(self, message: dict): """Render a tool use message with coordinate scaling for browser tools. Args: message: Dictionary containing tool use information """ tool_name = message.get('name', 'unknown') tool_input = message.get('input', {}) # Apply coordinate scaling for browser tool if tool_name == 'browser': tool_input = self._scale_browser_coordinates(tool_input) st.code(f"Tool Use: {tool_name}\nInput: {tool_input}") def _render_stored_tool_result(self, message: dict): """Render a tool result that was stored in session state. 
Args: message: Dictionary containing the tool_use_id reference """ tool_id = message.get("tool_use_id") if tool_id and tool_id in self.session_state.tools: self._render_tool_result(self.session_state.tools[tool_id]) def render_conversation_history(self, messages: list): """Render all messages in conversation history. This method processes a list of messages and renders each one according to its role and content type, eliminating deep nesting. Args: messages: List of message dictionaries from session state """ for message in messages: self._render_message_by_role(message) def _render_message_by_role(self, message: dict): """Route message rendering based on role. Args: message: Message dictionary containing role and content """ role_handlers = { "user": lambda m: self._render_user_content(m["content"]), "assistant": lambda m: self._render_assistant_content(m["content"]), } handler = role_handlers.get(message["role"]) if handler: handler(message) def _render_user_content(self, content): """Render user message content. Handles both single items and lists of content blocks, skipping image blocks in conversation history. Args: content: User message content (string, dict, or list) """ for item in self._normalize_content(content): # Skip image blocks in history if isinstance(item, dict) and item.get("type") == "image": continue # Extract text from dict blocks or use item directly if isinstance(item, dict): if item.get("type") == "text": text_content = item.get("text", "") self.render(Sender.USER, text_content) else: # For other dict types, cast as BetaContentBlockParam self.render(Sender.USER, cast(BetaContentBlockParam, item)) else: self.render(Sender.USER, item) def _render_assistant_content(self, content): """Render assistant message content. Handles both single items and lists of content blocks, properly routing tool results to the TOOL sender. 
Args: content: Assistant message content (string, dict, or list) """ for item in self._normalize_content(content): if isinstance(item, dict) and item.get("type") == "tool_result": # Handle tool results by fetching from session state tool_id = item.get("tool_use_id") if tool_id and tool_id in self.session_state.tools: self.render(Sender.TOOL, self.session_state.tools[tool_id]) elif isinstance(item, dict): # Cast dict items as BetaContentBlockParam self.render(Sender.BOT, cast(BetaContentBlockParam, item)) else: # String or other types self.render(Sender.BOT, item) def _normalize_content(self, content): """Convert content to list for uniform processing. This eliminates duplicate code for handling list vs non-list content. Args: content: Content that may be a single item or list Returns: List of content items for processing """ return content if isinstance(content, list) else [content] ================================================ FILE: browser-use-demo/browser_use_demo/requirements.txt ================================================ streamlit==1.41.0 anthropic[bedrock,vertex]>=0.39.0 jsonschema==4.22.0 boto3>=1.28.57 google-auth<3,>=2 playwright>=1.40.0 ================================================ FILE: browser-use-demo/browser_use_demo/streamlit.py ================================================ """ Browser Use Demo - Streamlit interface for browser automation with Claude """ import asyncio import base64 import io import json import os import time import traceback import zipfile from datetime import datetime from pathlib import PosixPath import streamlit as st from anthropic.types.beta import BetaContentBlockParam from anthropic import RateLimitError from browser_use_demo.loop import APIProvider, sampling_loop from browser_use_demo.message_renderer import MessageRenderer, Sender from browser_use_demo.tools import ToolResult PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { APIProvider.ANTHROPIC: "claude-sonnet-4-5-20250929", APIProvider.BEDROCK: 
"anthropic.claude-sonnet-4-5-20250929-v1:0", APIProvider.VERTEX: "claude-sonnet-4-5@20250929", } CONFIG_DIR = PosixPath("~/.anthropic").expanduser() API_KEY_FILE = CONFIG_DIR / "api_key" STREAMLIT_STYLE = """ """ # Claude 4.5 models for browser automation BROWSER_COMPATIBLE_MODELS = [ "claude-sonnet-4-5-20250929", "claude-opus-4-5-20251101", "claude-haiku-4-5-20251001", ] def setup_state(): """Initialize session state variables.""" # Import here to avoid circular imports when browser_tool lambda is evaluated from browser_use_demo.tools import BrowserTool # Define all defaults in one place - use lambdas for lazy evaluation of complex values defaults = { # UI State "messages": [], "system_prompt": "", "hide_screenshots": False, "rendered_message_count": 0, # Track rendered messages to avoid re-rendering "last_error": None, # Store last error message to display persistently # API Configuration "api_key": os.environ.get("ANTHROPIC_API_KEY", ""), "provider": APIProvider.ANTHROPIC, "max_tokens": 8192, "model": lambda: PROVIDER_TO_DEFAULT_MODEL_NAME[st.session_state.provider], # Runtime State "tools": {}, "event_loop": None, # Persistent event loop for async operations "chat_disabled": False, # Simple flag to disable chat input "active_messages": [], # Store messages for current interaction "active_response_container": None, # Container reference for streaming responses # Complex initialization - browser tool (inline lambda) "browser_tool": lambda: BrowserTool(), } # Apply all defaults - evaluate lambdas when needed for key, default_value in defaults.items(): if key not in st.session_state: # If it's a callable (lambda), call it to get the actual value if callable(default_value): st.session_state[key] = default_value() else: st.session_state[key] = default_value def _clean_text_extraction_markers(text: str) -> str: """Remove text extraction markers and return a summary.""" if "__PAGE_EXTRACTED__" not in text and "__TEXT_EXTRACTED__" not in text: return text lines = 
text.split("\n") summary = [] for line in lines: if "__FULL_CONTENT__" in line: break if "__PAGE_EXTRACTED__" not in line and "__TEXT_EXTRACTED__" not in line: summary.append(line) return "\n".join(summary) + "\n[Full content extracted but truncated for readability]" def create_transcript_zip(messages: list, include_images: bool = False) -> bytes: """Create a ZIP archive containing the transcript and optionally images. Args: messages: List of message dictionaries from session state include_images: Whether to include images as separate files Returns: Bytes of the ZIP archive """ # Create an in-memory ZIP file zip_buffer = io.BytesIO() with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: if include_images: # Extract images and create transcript with file references transcript_json, image_files = extract_images_from_messages(messages) # Add images to ZIP for idx, img_data in enumerate(image_files): filename = f"images/screenshot_{idx+1:04d}.png" try: img_bytes = base64.b64decode(img_data) zip_file.writestr(filename, img_bytes) except Exception as e: print(f"Error adding image to ZIP: {e}") # Add README readme_content = f"""Browser Use Demo - Conversation Transcript Generated: {datetime.now().isoformat()} This archive contains: - transcript.json: The conversation transcript - images/: {len(image_files)} screenshot images referenced in the transcript The transcript is in JSON format with images stored as separate PNG files. Image references in the transcript point to files in the images/ directory. """ zip_file.writestr("README.txt", readme_content) else: # Just create transcript without images transcript_json = format_transcript_for_download(messages, False) readme_content = f"""Browser Use Demo - Conversation Transcript Generated: {datetime.now().isoformat()} This archive contains: - transcript.json: The conversation transcript (text only) The transcript is in JSON format and includes all text messages from the conversation. 
class ImageExtractor:
    """Collect base64 images from message content and swap them for file references.

    Every base64 image seen is appended to ``image_files`` and replaced in the
    processed content by a dict pointing at ``images/screenshot_NNNN.png``,
    where ``NNNN`` is the 1-based position of that image in ``image_files``.
    """

    def __init__(self):
        # Base64 payloads, in the order they were encountered.
        self.image_files = []
        # Number of images extracted so far; used to build file names.
        self.image_counter = 0

    def extract_image(self, source: dict) -> dict:
        """Record a base64 image and return a file reference for it.

        Args:
            source: The ``source`` dict of an image block.

        Returns:
            ``{"type": "image", "file": ...}`` for base64 sources, otherwise
            ``{"type": "image", "note": "No image data"}``.
        """
        if source.get("type") == "base64":
            self.image_counter += 1
            self.image_files.append(source.get("data", ""))
            return {
                "type": "image",
                "file": f"images/screenshot_{self.image_counter:04d}.png"
            }
        return {"type": "image", "note": "No image data"}

    def process_image_content(self, item: dict) -> dict:
        """Process image content type."""
        # `or {}` also covers an explicit `"source": None`.
        source = item.get("source") or {}
        return self.extract_image(source)

    def process_text_content(self, item: dict) -> dict:
        """Process text content type, condensing extraction markers."""
        return {
            "type": "text",
            "text": _clean_text_extraction_markers(item.get("text", ""))
        }

    def process_tool_use_content(self, item: dict) -> dict:
        """Process tool use content type, keeping only name and input."""
        return {
            "type": "tool_use",
            "name": item.get("name", ""),
            "input": item.get("input", {})
        }

    def process_tool_result_content(self, item: dict) -> dict:
        """Process tool result content type.

        ``content`` may be a list of blocks, a plain string, or missing/None.
        A plain string is treated as a single text block so it is not lost
        (iterating it directly would walk it character by character and drop
        everything); ``None`` is treated as empty content.

        Args:
            item: A ``tool_result`` content block.

        Returns:
            A ``tool_result`` dict whose ``content`` is a list of processed
            blocks. Non-dict list entries are skipped; dict entries of an
            unrecognised type are passed through unchanged.
        """
        raw_content = item.get("content")
        if isinstance(raw_content, str):
            raw_content = [{"type": "text", "text": raw_content}]
        elif raw_content is None:
            raw_content = []

        tool_content = []
        for content_item in raw_content:
            if not isinstance(content_item, dict):
                continue
            content_type = content_item.get("type")
            if content_type == "image":
                tool_content.append(
                    self.extract_image(content_item.get("source") or {})
                )
            elif content_type == "text":
                tool_content.append(self.process_text_content(content_item))
            else:
                tool_content.append(content_item)

        return {
            "type": "tool_result",
            "tool_use_id": item.get("tool_use_id", ""),
            "content": tool_content
        }

    def process_default_content(self, item: dict) -> dict:
        """Default processor for unknown content types.

        Delegates to the module-level ``_format_content_item`` with images
        disabled.
        """
        return _format_content_item(item, False)
extract_images_from_messages(messages: list) -> tuple: """Extract images from messages and create transcript with file references. Returns: Tuple of (transcript_json, list_of_base64_image_data) """ extractor = ImageExtractor() # Content type processors processors = { "image": extractor.process_image_content, "text": extractor.process_text_content, "tool_use": extractor.process_tool_use_content, "tool_result": extractor.process_tool_result_content, } def process_content(content): """Process content using appropriate processors.""" if isinstance(content, str): return content elif isinstance(content, list): processed = [] for item in content: if isinstance(item, dict): content_type = item.get("type") processor = processors.get(content_type, extractor.process_default_content) processed.append(processor(item)) else: processed.append(str(item)) return processed else: return str(content) # Build transcript transcript = { "timestamp": datetime.now().isoformat(), "format_version": "2.0", "image_storage": "separate_files", "conversation": [] } # Process all messages for message in messages: cleaned_message = { "role": message.get("role"), "timestamp": datetime.now().isoformat(), "content": process_content(message.get("content", "")) } transcript["conversation"].append(cleaned_message) return json.dumps(transcript, indent=2, ensure_ascii=False), extractor.image_files def format_transcript_for_download(messages: list, include_images: bool = False) -> str: """Format conversation messages into a readable transcript. 
Args: messages: List of message dictionaries from session state include_images: Whether to include base64 image data in the transcript Returns: Formatted JSON string of the conversation """ transcript = { "timestamp": datetime.now().isoformat(), "format_version": "1.0", "includes_images": include_images, "conversation": [] } for message in messages: cleaned_message = { "role": message.get("role"), "timestamp": datetime.now().isoformat(), "content": _format_message_content(message.get("content", ""), include_images) } transcript["conversation"].append(cleaned_message) return json.dumps(transcript, indent=2, ensure_ascii=False) def _format_text_content(item: dict, include_images: bool = False) -> dict: """Format a text content block.""" return { "type": "text", "text": _clean_text_extraction_markers(item.get("text", "")) } def _format_tool_use_content(item: dict, include_images: bool = False) -> dict: """Format a tool use content block.""" return { "type": "tool_use", "name": item.get("name", ""), "input": item.get("input", {}) } def _format_tool_result_content(item: dict, include_images: bool = False) -> dict: """Format a tool result content block.""" tool_content = [] for content_item in item.get("content", []): if isinstance(content_item, dict): content_type = content_item.get("type") if content_type == "text": text = _clean_text_extraction_markers(content_item.get("text", "")) tool_content.append({"type": "text", "text": text}) elif content_type == "image": if include_images: source = content_item.get("source", {}) if source.get("type") == "base64": tool_content.append({ "type": "image", "media_type": source.get("media_type", "image/png"), "base64_data": source.get("data", "") }) else: tool_content.append({"type": "image", "note": "Screenshot taken"}) return { "type": "tool_result", "tool_use_id": item.get("tool_use_id", ""), "content": tool_content } def _format_image_content(item: dict, include_images: bool = False) -> dict: """Format an image content block.""" 
if include_images: source = item.get("source", {}) if source.get("type") == "base64": return { "type": "image", "media_type": source.get("media_type", "image/png"), "base64_data": source.get("data", "") } return {"type": "image", "note": "Image/Screenshot included"} def _format_default_content(item: dict, include_images: bool = False) -> dict: """Format unknown content types - fallback handler.""" return item # Strategy pattern: Map content types to their formatting functions CONTENT_FORMATTERS = { "text": _format_text_content, "tool_use": _format_tool_use_content, "tool_result": _format_tool_result_content, "image": _format_image_content, } def _format_content_item(item, include_images: bool = False): """Format a single content item using the appropriate formatter. Uses the Strategy pattern to dispatch to the correct formatter based on content type. """ if not isinstance(item, dict): return str(item) content_type = item.get("type") formatter = CONTENT_FORMATTERS.get(content_type, _format_default_content) return formatter(item, include_images) def _format_message_content(content, include_images: bool = False): """Format message content based on its type. This is the main entry point that handles different content structures. """ if isinstance(content, str): return content elif isinstance(content, list): return [_format_content_item(item, include_images) for item in content] else: return str(content) def authenticate(): """Handle API key authentication.""" if st.session_state.provider == APIProvider.ANTHROPIC: if not st.session_state.api_key: st.error("Please provide your Anthropic API key in the sidebar") st.stop() return True def get_or_create_event_loop(): """Get existing event loop or create a new one if needed. This function ensures we have a valid event loop for async operations, reusing existing loops when possible to avoid Playwright issues with asyncio.run(). Returns: The active asyncio event loop. 
async def run_agent(user_input: str):
    """Run the browser automation agent with user input.

    Appends the user's message to ``st.session_state.messages``, renders it in
    the active response container, then awaits ``sampling_loop`` with callbacks
    that stream assistant output, tool results and API errors into that same
    container. On success the stored history is replaced by the list returned
    from ``sampling_loop`` (when it is truthy). Every path re-enables the chat
    input and ends with ``st.rerun()``.

    Args:
        user_input: The text submitted through the chat input.

    Error handling:
        ``RateLimitError`` stores a fixed message in
        ``st.session_state.last_error`` (no traceback); any other ``Exception``
        stores the message together with the formatted traceback.

    NOTE(review): the trailing ``st.rerun()`` inside the ``try`` is presumably
    not caught by ``except Exception`` because Streamlit signals reruns
    outside the ``Exception`` hierarchy - confirm for the pinned version.
    """
    try:
        # Ensure chat is disabled while processing
        st.session_state.chat_disabled = True

        # Create message renderer
        renderer = MessageRenderer(st.session_state)

        # Add user message to history
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Display user message in active container
        with st.session_state.active_response_container:
            renderer.render(Sender.USER, user_input)

        # Clear active messages for new interaction
        st.session_state.active_messages = []

        # Prepare messages for API - preserve full conversation history
        # (shallow copy: the list is new, the message dicts are shared)
        api_messages = list(st.session_state.messages)

        # Setup callbacks for streaming responses
        def output_callback(content_block: BetaContentBlockParam):
            """Handle agent output - both text and tool use."""
            # Stream to active container in real-time
            with st.session_state.active_response_container:
                renderer.render(Sender.BOT, content_block)

            # Store for later persistence
            st.session_state.active_messages.append(("assistant", content_block))

        def tool_output_callback(result: ToolResult, tool_id: str):
            """Handle tool execution results."""
            # Keep the result addressable by its tool-use id
            st.session_state.tools[tool_id] = result

            # Stream to active container in real-time
            with st.session_state.active_response_container:
                renderer.render(Sender.TOOL, result)

            # Store for later persistence
            st.session_state.active_messages.append(("tool", result, tool_id))

        def api_response_callback(request, response, error):
            """Handle API responses."""
            # Only errors are surfaced; request/response are ignored here
            if error:
                with st.session_state.active_response_container:
                    st.error(f"API Error: {error}")

        # Run the agent with persistent browser tool
        updated_messages = await sampling_loop(
            model=st.session_state.model,
            provider=st.session_state.provider,
            system_prompt_suffix=st.session_state.system_prompt,
            messages=api_messages,
            output_callback=output_callback,
            tool_output_callback=tool_output_callback,
            api_response_callback=api_response_callback,
            api_key=st.session_state.api_key,
            max_tokens=st.session_state.max_tokens,
            browser_tool=st.session_state.browser_tool,  # Pass persistent browser instance
            only_n_most_recent_images=3,  # Keep only 3 most recent screenshots for context
        )

        # Update session state with the complete message history
        # (a falsy return leaves the stored history as it was)
        if updated_messages:
            st.session_state.messages = updated_messages

        # Re-enable chat input
        st.session_state.chat_disabled = False

        # Trigger a rerun to update the history display
        st.rerun()

    except RateLimitError:
        error_msg = "Rate limit exceeded. Please wait before sending another message."
        # Persist the error in session state so it can be shown after the rerun
        st.session_state.last_error = {"message": error_msg, "traceback": None}
        with st.session_state.active_response_container:
            st.error(error_msg)
        st.session_state.chat_disabled = False
        st.rerun()

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        error_traceback = traceback.format_exc()
        # Persist message and traceback in session state for display after the rerun
        st.session_state.last_error = {"message": error_msg, "traceback": error_traceback}
        with st.session_state.active_response_container:
            st.error(error_msg)
            st.code(error_traceback)
        st.session_state.chat_disabled = False
        st.rerun()
key="api_key", help="Get your API key from https://console.anthropic.com", ) # Max tokens st.number_input( "Max Output Tokens", min_value=1024, max_value=32768, value=st.session_state.max_tokens, step=1024, key="max_tokens", ) # System prompt st.text_area( "Additional System Prompt", value=st.session_state.system_prompt, key="system_prompt", help="Add custom instructions for the browser agent", ) # Hide screenshots st.checkbox( "Hide Screenshots", value=st.session_state.hide_screenshots, key="hide_screenshots", help="Hide screenshot outputs in the chat", ) # Conversation Management Section st.divider() st.subheader("💬 Conversation") # Download transcript options and button if st.session_state.messages: # Checkbox to include images include_images = st.checkbox( "Include images in transcript", value=False, help="Include screenshots as separate PNG files in a ZIP archive" ) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if include_images: # Generate ZIP with images zip_data = create_transcript_zip( st.session_state.messages, include_images=True ) # Show file size file_size_kb = len(zip_data) / 1024 if file_size_kb > 1024: size_str = f"{file_size_kb / 1024:.1f} MB" else: size_str = f"{file_size_kb:.1f} KB" st.download_button( label=f"📦 Download Transcript ZIP ({size_str})", data=zip_data, file_name=f"browser_demo_transcript_{timestamp}.zip", mime="application/zip", help=f"Download conversation with images as ZIP archive ({size_str})", type="primary", use_container_width=True, ) else: # Generate JSON only transcript_json = format_transcript_for_download( st.session_state.messages, include_images=False ) # Show file size file_size_kb = len(transcript_json.encode('utf-8')) / 1024 if file_size_kb > 1024: size_str = f"{file_size_kb / 1024:.1f} MB" else: size_str = f"{file_size_kb:.1f} KB" st.download_button( label=f"📄 Download Transcript JSON ({size_str})", data=transcript_json, file_name=f"browser_demo_transcript_{timestamp}.json", mime="application/json", 
help=f"Download conversation transcript as JSON ({size_str})", type="primary", use_container_width=True, ) else: st.info("No messages to download yet", icon="💬") # Clear conversation if st.button("🗑️ Clear Conversation", type="secondary", use_container_width=True): st.session_state.messages = [] st.session_state.tools = {} st.session_state.rendered_message_count = 0 st.session_state.active_messages = [] st.session_state.chat_disabled = False st.rerun() # Reset browser to blank page if st.button("Reset Browser", type="secondary"): async def reset_browser(): if st.session_state.browser_tool._page: await st.session_state.browser_tool._page.goto("about:blank") if st.session_state.event_loop is None or st.session_state.event_loop.is_closed(): st.session_state.event_loop = asyncio.new_event_loop() asyncio.set_event_loop(st.session_state.event_loop) st.session_state.event_loop.run_until_complete(reset_browser()) st.rerun() # Main chat interface st.title("🌐 Claude Browser Use Demo") st.markdown( "This demo showcases Claude's ability to interact with web browsers using " "Playwright automation. Ask Claude to navigate websites, fill forms, " "extract information, and more!" 
) # Authenticate if not authenticate(): return # Create container for conversation history history_container = st.container() # Display conversation history in the history container renderer = MessageRenderer(st.session_state) with history_container: renderer.render_conversation_history(st.session_state.messages) # Create container for active/streaming responses active_container = st.container() st.session_state.active_response_container = active_container # Simple callback to disable chat input on submit def disable_chat_callback(): st.session_state.chat_disabled = True # Show persistent error message if there is one if st.session_state.last_error: st.error(st.session_state.last_error["message"]) if st.session_state.last_error["traceback"]: with st.expander("Show full traceback"): st.code(st.session_state.last_error["traceback"]) if st.button("Clear Error"): st.session_state.last_error = None st.rerun() # Show status when chat is disabled if st.session_state.chat_disabled: st.info("🤖 Claude is currently processing your request. 
Please wait...") # Simple chat input with disabled state prompt = st.chat_input( "Ask Claude to browse the web...", disabled=st.session_state.chat_disabled, on_submit=disable_chat_callback ) if prompt: # Clear any previous error when starting a new request st.session_state.last_error = None # Process the prompt loop = get_or_create_event_loop() loop.run_until_complete(run_agent(prompt)) if __name__ == "__main__": main() ================================================ FILE: browser-use-demo/browser_use_demo/tools/__init__.py ================================================ from .base import BaseAnthropicTool, ToolError, ToolResult from .browser import BrowserTool from .collection import ToolCollection __all__ = [ "BaseAnthropicTool", "ToolError", "ToolResult", "BrowserTool", "ToolCollection", ] ================================================ FILE: browser-use-demo/browser_use_demo/tools/base.py ================================================ from abc import ABCMeta, abstractmethod from dataclasses import dataclass, fields, replace from typing import Any from anthropic.types.beta import BetaToolUnionParam class BaseAnthropicTool(metaclass=ABCMeta): """Abstract base class for Anthropic-defined tools.""" @abstractmethod def __call__(self, **kwargs) -> Any: """Executes the tool with the given arguments.""" ... 
@abstractmethod def to_params( self, ) -> BetaToolUnionParam: raise NotImplementedError @dataclass(kw_only=True, frozen=True) class ToolResult: """Represents the result of a tool execution.""" output: str | None = None error: str | None = None base64_image: str | None = None system: str | None = None def __bool__(self): return any(getattr(self, field.name) for field in fields(self)) def __add__(self, other: "ToolResult"): def combine_fields( field: str | None, other_field: str | None, concatenate: bool = True ): if field and other_field: if concatenate: return field + other_field raise ValueError("Cannot combine tool results") return field or other_field return ToolResult( output=combine_fields(self.output, other.output), error=combine_fields(self.error, other.error), base64_image=combine_fields(self.base64_image, other.base64_image, False), system=combine_fields(self.system, other.system), ) def replace(self, **kwargs): """Returns a new ToolResult with the given fields replaced.""" return replace(self, **kwargs) class CLIResult(ToolResult): """A ToolResult that can be rendered as a CLI output.""" class ToolFailure(ToolResult): """A ToolResult that represents a failure.""" class ToolError(Exception): """Raised when a tool encounters an error.""" def __init__(self, message): self.message = message ================================================ FILE: browser-use-demo/browser_use_demo/tools/browser.py ================================================ # Modifications Copyright (c) 2025 Anthropic, PBC # Modified from original Microsoft Playwright source # Original Microsoft Playwright source licensed under Apache License 2.0 # See CHANGELOG.md for details """Browser automation tool using Playwright for web interaction.""" import asyncio import base64 import json import os import sys from pathlib import Path from typing import Any, Literal, Optional, TypedDict, cast from uuid import uuid4 from anthropic.types.beta import BetaToolUnionParam from playwright.async_api 
import Browser, BrowserContext, Page from ..display_constants import BROWSER_HEIGHT, BROWSER_WIDTH, DISPLAY_NUM from .base import BaseAnthropicTool, ToolError, ToolResult from .coordinate_scaling import CoordinateScaler # Simple logging for debugging - removed, using print directly # Custom browser tool input schema BROWSER_TOOL_INPUT_SCHEMA: dict[str, Any] = { "properties": { "action": { "description": 'The action to perform. The available actions are:\n* `navigate`: Navigate to a URL or use "back"/"forward" for browser history navigation. Automatically includes a screenshot of the loaded page.\n* `screenshot`: Take a screenshot of the current browser viewport.\n* `left_click`: Click the left mouse button at the specified coordinate or element reference.\n* `right_click`: Click the right mouse button at the specified coordinate or element reference.\n* `middle_click`: Click the middle mouse button at the specified coordinate or element reference.\n* `double_click`: Double-click the left mouse button at the specified coordinate or element reference.\n* `triple_click`: Triple-click the left mouse button at the specified coordinate or element reference.\n* `hover`: Move the mouse cursor to the specified coordinate or element reference without clicking. 
Useful for revealing tooltips, dropdown menus, or triggering hover states.\n* `left_click_drag`: Click and drag from start_coordinate to coordinate.\n* `left_mouse_down`: Press and hold the left mouse button at the specified coordinate.\n* `left_mouse_up`: Release the left mouse button at the specified coordinate.\n* `scroll`: Scroll the page in a specified direction.\n* `scroll_to`: Scroll to bring an element into view.\n* `type`: Type text at the current cursor position.\n* `key`: Press a key or key combination (supports standard keys and modifiers).\n* `hold_key`: Hold down a key or key combination for a specified duration.\n* `read_page`: Get the DOM tree structure, optionally filtered for interactive elements.\n* `find`: Find text on the page and highlight matches.\n* `get_page_text`: Get all text content from the page.\n* `wait`: Wait for a specified duration in seconds.\n* `form_input`: Set the value of a form input element.\n* `zoom`: Take a zoomed screenshot of a specific region.\n* `execute_js`: Execute JavaScript code in the page context. Returns the result of the last expression.', "enum": [ "navigate", "screenshot", "left_click", "right_click", "middle_click", "double_click", "triple_click", "hover", "left_click_drag", "left_mouse_down", "left_mouse_up", "scroll", "scroll_to", "type", "key", "hold_key", "read_page", "find", "get_page_text", "wait", "form_input", "zoom", "execute_js", ], "type": "string", }, "text": { "description": 'Required for: `navigate` (URL or "back"/"forward"), `type` (text to type), `key` (key combination), `hold_key` (key to hold), `find` (text to search), `execute_js` (valid JavaScript code ONLY - no explanatory text, just the code). Optional for `read_page` (filter type: "interactive"), click actions (modifier keys to hold during click).', "type": "string", }, "ref": { "description": "Element reference string for targeting specific DOM elements. Required for `scroll_to` and `form_input`. 
Optional for click actions and `hover` as an alternative to coordinates.", "type": "string", }, "coordinate": { "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates. Required for mouse actions when `ref` is not provided: `left_click`, `right_click`, `middle_click`, `double_click`, `triple_click`, `hover`, `left_mouse_down`, `left_mouse_up`, `scroll`. Also serves as the end coordinate for `left_click_drag`.", "type": "array", "items": {"type": "integer"}, }, "start_coordinate": { "description": "(x, y): The starting x and y coordinates for drag operations. Required only for `left_click_drag`.", "type": "array", "items": {"type": "integer"}, }, "scroll_direction": { "description": "The direction to scroll. Required for `scroll` action.", "enum": ["up", "down", "left", "right"], "type": "string", }, "scroll_amount": { "description": "The number of scroll units (similar to mouse wheel clicks). Required for `scroll` action.", "type": "integer", }, "duration": { "description": "Duration in seconds. Required for `hold_key` and `wait` actions. For `wait`, must be between 0 and 100 seconds.", "type": "number", }, "value": { "description": "The value to set for a form input element. Required for `form_input` action. Can be string, number, or boolean depending on the input type.", "type": ["string", "number", "boolean"], }, "region": { "description": "(x1, y1, x2, y2): Defines a rectangular region for the `zoom` action. Coordinates specify top-left (x1, y1) and bottom-right (x2, y2) corners.", "type": "array", "items": {"type": "integer"}, }, }, "required": ["action"], "type": "object", } BROWSER_TOOL_DESCRIPTION = """A browser automation tool for web interaction. Use this tool to navigate websites, interact with elements, and extract content. 
Key actions: - navigate: Go to a URL (automatically includes a screenshot) - screenshot: Take a visual screenshot - read_page: Get DOM structure with element references - get_page_text: Extract all text content - left_click, right_click, double_click: Click elements - hover: Move cursor without clicking (for tooltips, dropdowns) - type: Enter text at cursor - scroll: Scroll the page - form_input: Fill form fields - execute_js: Run JavaScript in page context""" OUTPUT_DIR = Path("/tmp/outputs") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Directory containing browser tool utility files (JS scripts) BROWSER_TOOL_UTILS_DIR = Path(__file__).parent.parent / "browser_tool_utils" class BrowserOptions(TypedDict): display_width_px: int display_height_px: int Actions = Literal[ "navigate", "screenshot", "left_click", "right_click", "middle_click", "double_click", "triple_click", "hover", "left_click_drag", "left_mouse_down", "left_mouse_up", "scroll", "scroll_to", "type", "key", "hold_key", "read_page", "find", "get_page_text", "wait", "form_input", "zoom", "execute_js", ] class BrowserTool(BaseAnthropicTool): """ A browser automation tool using Playwright for web interaction. 
Key actions for extracting content: - read_page: Extract structured DOM tree with element references (USE THIS for analyzing page structure) - get_page_text: Extract all text content from the page (USE THIS for reading articles/posts) - screenshot: Take a visual screenshot (only for visual confirmation, not for reading content) Navigation actions: - navigate: Go to a URL - find: Search for elements on the page Interaction actions: - left_click, right_click, double_click: Click elements - type: Enter text - scroll: Scroll the page """ name: Literal["browser"] = "browser" # Instance-level browser connection (recreated per request) _browser: Optional[Browser] = None _context: Optional[BrowserContext] = None _page: Optional[Page] = None _playwright = None def __init__(self): """Initialize the browser tool with standard viewport dimensions.""" super().__init__() # Use constants for display configuration self.width = BROWSER_WIDTH self.height = BROWSER_HEIGHT self._initialized = False self._event_loop = None # Track which event loop we're initialized in self.cdp_url = None # Initialize CDP URL attribute for cleanup method @property def options(self) -> BrowserOptions: """Return browser display options.""" # Note: This implementation uses fixed 1920x1080 dimensions with empirical # coordinate correction. For the recommended approach using client-side # downscaling, see the "Handle coordinate scaling" section in the computer # use documentation. 
def to_params(self) -> BetaToolUnionParam:
    """Build the custom tool definition for this tool.

    Returns:
        A dict holding this tool's ``name`` together with the module-level
        ``BROWSER_TOOL_DESCRIPTION`` and ``BROWSER_TOOL_INPUT_SCHEMA``, cast to
        ``BetaToolUnionParam``.
    """
    tool_definition = {
        "name": self.name,
        "description": BROWSER_TOOL_DESCRIPTION,
        "input_schema": BROWSER_TOOL_INPUT_SCHEMA,
    }
    return cast(BetaToolUnionParam, tool_definition)
"--disable-component-extensions-with-background-pages", ]) print( f"[Browser] Launching browser with viewport {viewport_width}x{viewport_height}", file=sys.stderr, flush=True, ) self._browser = await self._playwright.chromium.launch( headless=False, args=launch_args, ) self._context = await self._browser.new_context( viewport={"width": viewport_width, "height": viewport_height}, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", ) self._page = await self._context.new_page() self._page.set_default_timeout(30000) print( f"[Browser] Browser initialized with viewport: {viewport_width}x{viewport_height}", file=sys.stderr, flush=True, ) print( f"[Browser] New browser instance created", file=sys.stderr, flush=True, ) self._initialized = True try: self._event_loop = asyncio.get_running_loop() except RuntimeError: self._event_loop = None async def _execute_js_from_file(self, filename: str, *args) -> Any: """Load and execute JavaScript from a file.""" if self._page is None: raise ToolError("Browser not initialized") script_path = BROWSER_TOOL_UTILS_DIR / filename if not script_path.exists(): raise ToolError(f"Script file not found: {filename}") script = script_path.read_text() # Special handling for browser_dom_script.js if filename == "browser_dom_script.js": # The DOM script defines window.__generateAccessibilityTree function # We need to inject it and then call it filter_type = args[0] if args else "" combined_expression = f""" (function() {{ {script} return window.__generateAccessibilityTree('{filter_type}'); }})() """ return await self._page.evaluate(combined_expression) else: # For other scripts, wrap as a function and call with arguments escaped_args = ", ".join(json.dumps(arg) for arg in args) js_expression = f"({script})({escaped_args})" return await self._page.evaluate(js_expression) async def _take_screenshot(self) -> ToolResult: """ Take a visual screenshot of the current page. 
async def _navigate(self, url: str) -> ToolResult:
    """Open ``url`` in the current page and return a screenshot of the result.

    A URL that does not start with ``http://``, ``https://``, ``file://`` or
    ``about:`` gets ``https://`` prepended before loading.

    Args:
        url: Address to load.

    Returns:
        The ``ToolResult`` produced by ``_take_screenshot`` after the load.

    Raises:
        ToolError: If the browser page is not initialized, or if loading or
            the follow-up screenshot fails (the original exception is chained).
    """
    page = self._page
    if page is None:
        raise ToolError("Browser not initialized")

    recognised_prefixes = ("http://", "https://", "file://", "about:")
    try:
        # Add protocol if missing
        has_scheme = url.startswith(recognised_prefixes)
        if not has_scheme:
            url = f"https://{url}"

        await page.goto(url, wait_until="domcontentloaded")
        # Wait for page to stabilize
        await asyncio.sleep(2)

        # Take screenshot after navigation
        screenshot = await self._take_screenshot()
        return screenshot
    except Exception as exc:
        raise ToolError(f"Failed to navigate to {url}: {str(exc)}") from exc
async def _click(
    self,
    action: str,
    coordinate: Optional[tuple[int, int]] = None,
    ref: Optional[str] = None,
    text: Optional[str] = None,
) -> ToolResult:
    """Click a target with the button and click count implied by ``action``.

    The target is the first truthy argument among ``coordinate`` (scaled with
    ``_scale_coordinates``), ``ref`` (resolved to coordinates through
    ``browser_element_script.js``) and ``text`` (a Playwright ``text=``
    selector).

    Raises:
        ToolError: If the page is not initialized, no target is given, or any
            step of the click fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    # action -> (mouse button, click count); any other action is a single left click
    click_variants = {
        "right_click": ("right", 1),
        "middle_click": ("middle", 1),
        "double_click": ("left", 2),
        "triple_click": ("left", 3),
    }

    try:
        button, click_count = click_variants.get(action, ("left", 1))

        if coordinate:
            raw_x, raw_y = coordinate
            x, y = self._scale_coordinates(raw_x, raw_y)

            # Out-of-viewport targets are only reported; the click still happens
            bounds = self._page.viewport_size
            if bounds:
                max_x = bounds['width']
                max_y = bounds['height']
                within_viewport = 0 <= x <= max_x and 0 <= y <= max_y
                if not within_viewport:
                    print(
                        f"[Click] WARNING: Coordinates ({x}, {y}) are outside viewport "
                        f"({max_x}x{max_y})",
                        file=sys.stderr,
                        flush=True,
                    )
                if x > max_x:
                    print(
                        f"[Click] X coordinate {x} exceeds viewport width {max_x}",
                        file=sys.stderr,
                        flush=True,
                    )
                if y > max_y:
                    print(
                        f"[Click] Y coordinate {y} exceeds viewport height {max_y}",
                        file=sys.stderr,
                        flush=True,
                    )

            # Focus the page, position the pointer, pause briefly, then click
            await self._page.bring_to_front()
            await self._page.mouse.move(x, y)
            await asyncio.sleep(0.01)
            await self._page.mouse.click(
                x, y, button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked at ({x}, {y})", error=None)

        if ref:
            # Let the element script locate the element and report where to click
            located = await self._execute_js_from_file(
                "browser_element_script.js", ref
            )
            if not located.get("success", False):
                raise ToolError(located.get("message", "Failed to find element"))

            click_x, click_y = located["coordinates"]
            await self._page.mouse.move(click_x, click_y)
            await asyncio.sleep(0.1)
            await self._page.mouse.click(
                click_x, click_y, button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked element with ref: {ref}", error=None)

        if text:
            # Click the element matched by Playwright's text selector
            await self._page.click(
                f"text={text}", button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked on text: {text}", error=None)

        raise ToolError(
            "Either coordinate, ref, or text is required for click action"
        )
    except Exception as e:
        raise ToolError(f"Failed to perform {action}: {str(e)}") from e
async def _scroll(
    self,
    coordinate: Optional[tuple[int, int]] = None,
    direction: Optional[str] = None,
    amount: Optional[int] = None,
) -> ToolResult:
    """Scroll the page and return a screenshot of the new viewport.

    Args:
        coordinate: Optional (x, y) position to scroll at. When given, it is
            scaled with ``_scale_coordinates`` and the pointer is moved there
            before a mouse-wheel event is sent; otherwise the window itself is
            scrolled with ``window.scrollBy``.
        direction: "up", "down", "left" or "right"; defaults to "down". Any
            other value results in a zero scroll delta.
        amount: Number of scroll units (100 per unit); a falsy value
            defaults to 3.

    Raises:
        ToolError: If the page is not initialized or scrolling fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        if not direction:
            direction = "down"
        if not amount:
            amount = 3  # Default scroll amount

        # Calculate scroll delta based on direction
        delta_x = 0
        delta_y = 0
        if direction == "up":
            delta_y = -amount * 100
        elif direction == "down":
            delta_y = amount * 100
        elif direction == "left":
            delta_x = -amount * 100
        elif direction == "right":
            delta_x = amount * 100

        if coordinate:
            x, y = coordinate
            # Previously the coordinate was unpacked but ignored. Scale it like
            # the other pointer actions and move there so the wheel event
            # targets the element under the requested position.
            x, y = self._scale_coordinates(x, y)
            await self._page.mouse.move(x, y)
            await self._page.mouse.wheel(delta_x, delta_y)
        else:
            # Scroll the main page
            await self._page.evaluate(f"window.scrollBy({delta_x}, {delta_y})")

        # Wait for content to stabilize after scroll
        await asyncio.sleep(0.5)

        # Take screenshot to show new viewport content
        screenshot_result = await self._take_screenshot()

        return ToolResult(
            output=f"Scrolled {direction} by {amount} units",
            error=None,
            base64_image=screenshot_result.base64_image
        )
    except Exception as e:
        raise ToolError(f"Failed to scroll: {str(e)}") from e
async def _mouse_up(self, x: int, y: int) -> ToolResult:
    """Move the pointer to the scaled (x, y) position and release the mouse button.

    Raises:
        ToolError: If the page is not initialized or the release fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        # Convert the incoming coordinates to viewport coordinates first
        target_x, target_y = self._scale_coordinates(x, y)

        pointer = self._page.mouse
        await pointer.move(target_x, target_y)
        await pointer.up()

        report = f"Mouse up at ({target_x}, {target_y})"
        return ToolResult(output=report, error=None)
    except Exception as e:
        raise ToolError(f"Failed to perform mouse up: {str(e)}") from e
async def _read_page(self, filter_type: str = "") -> ToolResult:
    """Return the page's DOM tree as text, prefixed with a short size summary.

    Runs ``browser_dom_script.js`` with ``filter_type`` and packs the result
    into one output string: a ``__PAGE_EXTRACTED__`` marker, a summary line
    with character and estimated token counts, a ``__FULL_CONTENT__`` marker,
    then the full content.

    Raises:
        ToolError: If the page is not initialized or extraction fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        tree = await self._execute_js_from_file("browser_dom_script.js", filter_type)

        # Prefer the script's pageContent field; otherwise serialize what came back
        if not isinstance(tree, dict):
            page_content = str(tree)
        elif "pageContent" in tree:
            page_content = tree["pageContent"]
        else:
            page_content = json.dumps(tree, indent=2)

        char_count = len(page_content)
        # Rough estimate at ~3.5 characters per token; not an exact count
        token_estimate = int(char_count / 3.5)

        summary = f"Extracted page DOM tree (~{token_estimate:,} tokens, {char_count:,} characters)"

        # Markers let the UI show the summary while the API gets everything
        combined = f"__PAGE_EXTRACTED__\n{summary}\n__FULL_CONTENT__\n{page_content}"
        return ToolResult(output=combined, error=None)
    except Exception as e:
        raise ToolError(f"Failed to read page: {str(e)}") from e
""" if self._page is None: raise ToolError("Browser not initialized") try: # Use the browser_text_script.js from reference implementation result = await self._execute_js_from_file("browser_text_script.js") # Format the output like the reference implementation if isinstance(result, dict): full_content = f"""Title: {result.get("title", "N/A")} URL: {result.get("url", "N/A")} Source element: <{result.get("source", "unknown")}> --- {result.get("text", "")}""" else: full_content = str(result) # Calculate content size for summary content_length = len(full_content) # Estimate token count # Note: For exact counts, use client.beta.messages.count_tokens API # This estimate uses ~3.5 chars/token which is typical for Claude with English text # Actual ratio varies by content type (code, languages, special characters) estimated_tokens = int(content_length / 3.5) # Create a summary for UI display title = result.get("title", "N/A") if isinstance(result, dict) else "N/A" url = result.get("url", "N/A") if isinstance(result, dict) else "N/A" summary = f"Extracted page text from: {title}\nURL: {url}\n(~{estimated_tokens:,} tokens, {content_length:,} characters)" # Return the full content for the API but with a marker for the UI return ToolResult( output=f"__TEXT_EXTRACTED__\n{summary}\n__FULL_CONTENT__\n{full_content}", error=None ) except Exception as e: raise ToolError(f"Failed to get page text: {str(e)}") from e async def _find(self, search_query: str) -> ToolResult: """Find elements on the page matching the search query using AI.""" if self._page is None: raise ToolError("Browser not initialized") try: # First get the DOM tree for analysis dom_tree = await self._execute_js_from_file("browser_dom_script.js", "all") if isinstance(dom_tree, dict) and "pageContent" in dom_tree: dom_tree_json = dom_tree["pageContent"] else: dom_tree_json = json.dumps(dom_tree, indent=2) # Try to use Anthropic API if available api_key = os.environ.get("ANTHROPIC_API_KEY") if api_key: try: from anthropic 
import AsyncAnthropic client = AsyncAnthropic(api_key=api_key) prompt = f"""You are helping find elements on a web page. The user wants to find: "{search_query}" Here is the accessibility tree of the page: {dom_tree_json} Find ALL elements that match the user's query. Return up to 20 most relevant matches, ordered by relevance. Return your findings in this exact format (one line per matching element): FOUND: SHOWING: --- ref_X | role | name | type | reason why this matches ref_Y | role | name | type | reason why this matches ... If there are more than 20 matches, add this line at the end: MORE: Use a more specific query to see additional results If no matching elements are found, return only: FOUND: 0 ERROR: explanation of why no elements were found""" response = await client.messages.create( model="claude-3-5-sonnet-20241022", max_tokens=800, temperature=1.0, messages=[{"role": "user", "content": prompt}], ) # Handle the response properly first_content = response.content[0] if hasattr(first_content, "text"): response_text = first_content.text.strip() else: # Handle other content types if needed response_text = str(first_content) lines = [ line.strip() for line in response_text.split("\n") if line.strip() ] total_found = 0 elements = [] has_more = False error_message = None for line in lines: if line.startswith("FOUND:"): try: total_found = int(line.split(":")[1].strip()) except (ValueError, IndexError): total_found = 0 elif line.startswith("SHOWING:"): pass elif line.startswith("ERROR:"): error_message = line[6:].strip() elif line.startswith("MORE:"): has_more = True elif line.startswith("ref_") and "|" in line: parts = [p.strip() for p in line.split("|")] if len(parts) >= 4: elements.append( { "ref": parts[0], "role": parts[1], "name": parts[2] if len(parts) > 2 else "", "type": parts[3] if len(parts) > 3 else "", "description": parts[4] if len(parts) > 4 else "", } ) if total_found == 0 or len(elements) == 0: return ToolResult( output=error_message or "No 
async def _form_input(self, ref: str, value: Any) -> ToolResult:
    """Set the form field identified by ``ref`` to ``value``.

    Delegates to ``browser_form_input_script.js``; a dict result whose
    ``success`` entry is missing or falsy is treated as a failure.

    Raises:
        ToolError: If the page is not initialized or the field cannot be filled.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        outcome = await self._execute_js_from_file(
            "browser_form_input_script.js", ref, value
        )

        script_reported_failure = isinstance(outcome, dict) and not outcome.get(
            "success", False
        )
        if script_reported_failure:
            reason = outcome.get("message", "Failed to fill form field")
            raise ToolError(reason)

        confirmation = f"Filled form field {ref} with value: {value}"
        return ToolResult(output=confirmation, error=None)
    except Exception as e:
        raise ToolError(f"Failed to fill form field: {str(e)}") from e
async def __call__(
    self,
    *,
    action: Actions,
    text: Optional[str] = None,
    ref: Optional[str] = None,
    coordinate: Optional[tuple[int, int]] = None,
    start_coordinate: Optional[tuple[int, int]] = None,
    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None,
    scroll_amount: Optional[int] = None,
    duration: Optional[float] = None,
    value: Optional[Any] = None,
    region: Optional[tuple[int, int, int, int]] = None,
    **kwargs,
) -> ToolResult:
    """
    Execute a browser action, starting the browser first if needed.

    Parameters:
    - action: The action to perform
    - text: Text input for type, key, hold_key, navigate, find, execute_js,
      read_page (filter) and text-based click actions
    - ref: Element reference for element-based actions
    - coordinate: (x, y) coordinates for mouse actions (end point for drags)
    - start_coordinate: Starting point for drag actions
    - scroll_direction: Direction for scroll action
    - scroll_amount: Amount to scroll
    - duration: Duration for wait or hold_key actions (a falsy value becomes 1.0)
    - value: Value for form_input action
    - region: (x1, y1, x2, y2) opposite corners of the area for the zoom
      action; the corners may be given in either order

    Raises:
        ToolError: If a required argument for the action is missing, the zoom
            region has no area, the action is unknown, or the action fails.
    """
    # Ensure browser is running for all actions
    await self._ensure_browser()

    if action == "navigate":
        if not text:
            raise ToolError("URL is required for navigate action")
        return await self._navigate(text)

    elif action == "screenshot":
        return await self._take_screenshot()

    elif action == "zoom":
        if not region:
            raise ToolError(
                "Region (x1, y1, x2, y2) is required for zoom action"
            )
        x1, y1, x2, y2 = region
        # Convert corner coordinates to x, y, width, height
        x = min(x1, x2)
        y = min(y1, y2)
        width = abs(x2 - x1)
        height = abs(y2 - y1)
        # An empty clip rectangle cannot be captured; fail with a clear message
        # instead of surfacing an opaque screenshot error.
        if width == 0 or height == 0:
            raise ToolError("Zoom region must have a non-zero width and height")
        return await self._zoom_screenshot(x, y, width, height)

    elif action in [
        "left_click",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
    ]:
        return await self._click(action, coordinate, ref, text)

    elif action == "hover":
        return await self._hover(coordinate, ref)

    elif action == "type":
        if not text:
            raise ToolError("Text is required for type action")
        return await self._type_text(text)

    elif action == "key":
        if not text:
            raise ToolError("Key is required for key action")
        return await self._press_key(text)

    elif action == "hold_key":
        if not text:
            raise ToolError("Key is required for hold_key action")
        if not duration:
            duration = 1.0
        return await self._press_key(text, hold=True, duration=duration)

    elif action == "scroll":
        return await self._scroll(coordinate, scroll_direction, scroll_amount)

    elif action == "scroll_to":
        if not ref:
            raise ToolError("Element reference is required for scroll_to action")
        return await self._scroll_to(ref)

    elif action == "left_click_drag":
        if not start_coordinate or not coordinate:
            raise ToolError(
                "Both start_coordinate and coordinate are required for drag action"
            )
        start_x, start_y = start_coordinate
        end_x, end_y = coordinate
        return await self._drag(start_x, start_y, end_x, end_y)

    elif action == "left_mouse_down":
        if not coordinate:
            raise ToolError("Coordinate is required for mouse_down action")
        x, y = coordinate
        return await self._mouse_down(x, y)

    elif action == "left_mouse_up":
        if not coordinate:
            raise ToolError("Coordinate is required for mouse_up action")
        x, y = coordinate
        return await self._mouse_up(x, y)

    elif action == "read_page":
        # Only the "interactive" filter (or no filter) is passed through
        filter_type = text if text in ["interactive", ""] else ""
        return await self._read_page(filter_type)

    elif action == "get_page_text":
        return await self._get_page_text()

    elif action == "find":
        if not text:
            raise ToolError("Text is required for find action")
        return await self._find(text)

    elif action == "form_input":
        if not ref:
            raise ToolError("Element reference is required for form_input action")
        if value is None:
            raise ToolError("Value is required for form_input action")
        return await self._form_input(ref, value)

    elif action == "wait":
        if not duration:
            duration = 1.0
        return await self._wait(duration)

    elif action == "execute_js":
        if not text:
            raise ToolError("JavaScript code is required for execute_js action")
        return await self._execute_js(text)

    else:
        raise ToolError(f"Unknown action: {action}")
False ================================================ FILE: browser-use-demo/browser_use_demo/tools/collection.py ================================================ from typing import Any from anthropic.types.beta import BetaToolUnionParam from .base import BaseAnthropicTool class ToolCollection: """Collection of tools for browser automation.""" def __init__(self, *tools: BaseAnthropicTool): self.tools = tools self.tool_map = {tool.name: tool for tool in tools} def to_params(self) -> list[BetaToolUnionParam]: """Convert all tools to API parameters.""" return [tool.to_params() for tool in self.tools] ================================================ FILE: browser-use-demo/browser_use_demo/tools/coordinate_scaling.py ================================================ """ Coordinate scaling utilities for browser tool. This module handles the scaling of coordinates from Claude's vision model resolution to the actual browser viewport resolution. """ class CoordinateScaler: """Handles coordinate scaling between Claude's vision and actual viewport.""" # Claude's image processing resolution for 16:9 aspect ratio # According to the official documentation: # https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size # When images exceed size thresholds, they are resized while preserving aspect ratio. 
# For 16:9 aspect ratio (like 1920x1080 screenshots), images are resized to these exact dimensions: CLAUDE_ACTUAL_WIDTH = 1456 # Width for 16:9 landscape (from documentation) CLAUDE_ACTUAL_HEIGHT = 819 # Height for 16:9 landscape (from documentation) # Documented maximum image sizes for different aspect ratios # Source: https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size DOCUMENTED_SIZES = { # aspect_ratio: (width, height) (1, 1): (1092, 1092), # Square (3, 4): (951, 1268), # Portrait (4, 3): (1268, 951), # Landscape (2, 3): (896, 1344), # Portrait (3, 2): (1344, 896), # Landscape (9, 16): (819, 1456), # Portrait (phone) (16, 9): (1456, 819), # Landscape (widescreen) (1, 2): (784, 1568), # Portrait (tall) (2, 1): (1568, 784), # Landscape (wide) } @classmethod def get_documented_size_for_aspect_ratio(cls, viewport_width: int, viewport_height: int) -> tuple[int, int]: """ Get the documented size for the given viewport's aspect ratio. Args: viewport_width: Actual viewport width viewport_height: Actual viewport height Returns: Tuple of (width, height) from documented sizes that matches the aspect ratio Raises: ValueError: If the aspect ratio doesn't match any documented sizes """ viewport_ratio = viewport_width / viewport_height # Tolerance for aspect ratio matching (to handle minor differences) ASPECT_RATIO_TOLERANCE = 0.02 # Try to find an exact or very close aspect ratio match for (ratio_w, ratio_h), (doc_width, doc_height) in cls.DOCUMENTED_SIZES.items(): doc_ratio = ratio_w / ratio_h if abs(viewport_ratio - doc_ratio) < ASPECT_RATIO_TOLERANCE: return (doc_width, doc_height) # If no match found, raise an error with helpful information supported_ratios = [f"{w}:{h} ({w/h:.3f})" for (w, h) in cls.DOCUMENTED_SIZES.keys()] raise ValueError( f"Viewport aspect ratio {viewport_ratio:.3f} ({viewport_width}x{viewport_height}) " f"does not match any documented aspect ratios. " f"Supported ratios: {', '.join(supported_ratios)}. 
" f"See https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size" ) @classmethod def get_scale_factors( cls, viewport_width: int, viewport_height: int, match_aspect_ratio: bool = False ) -> tuple[float, float]: """ Calculate scale factors for converting Claude coordinates to viewport coordinates. Args: viewport_width: Actual browser viewport width viewport_height: Actual browser viewport height match_aspect_ratio: If True, match the viewport's aspect ratio to documented sizes. If False (default), use 16:9 for browser scenarios. Returns: Tuple of (scale_x, scale_y) factors Raises: ValueError: If match_aspect_ratio is True and aspect ratio doesn't match documented sizes """ if match_aspect_ratio: # Match the documented size based on aspect ratio (will raise if no match) base_width, base_height = cls.get_documented_size_for_aspect_ratio(viewport_width, viewport_height) else: # Use default 16:9 dimensions (typical for browser screenshots) base_width = cls.CLAUDE_ACTUAL_WIDTH base_height = cls.CLAUDE_ACTUAL_HEIGHT # Calculate scale factors scale_x = viewport_width / base_width scale_y = viewport_height / base_height return scale_x, scale_y @classmethod def scale_coordinates( cls, x: int, y: int, viewport_width: int, viewport_height: int, apply_threshold: bool = True ) -> tuple[int, int]: """ Scale coordinates from Claude's vision to actual viewport. 
Args: x: X coordinate from Claude y: Y coordinate from Claude viewport_width: Actual browser viewport width viewport_height: Actual browser viewport height apply_threshold: Whether to check if coordinates need scaling Returns: Tuple of (scaled_x, scaled_y) """ scale_x, scale_y = cls.get_scale_factors(viewport_width, viewport_height) # If scaling factors are close to 1.0, no scaling needed if abs(scale_x - 1.0) < 0.05 and abs(scale_y - 1.0) < 0.05: return x, y if apply_threshold: # Check if coordinates appear to be in Claude's resolution # (with 20% margin for edge cases) max_expected_x = cls.CLAUDE_ACTUAL_WIDTH * 1.2 max_expected_y = cls.CLAUDE_ACTUAL_HEIGHT * 1.2 # If coordinates are larger than expected Claude resolution, # they might already be in viewport coordinates if x > max_expected_x or y > max_expected_y: return x, y # Apply scaling scaled_x = int(x * scale_x) scaled_y = int(y * scale_y) # Ensure we don't exceed viewport bounds scaled_x = min(scaled_x, viewport_width - 1) scaled_y = min(scaled_y, viewport_height - 1) return scaled_x, scaled_y @classmethod def scale_coordinate_list( cls, coords: list | tuple, viewport_width: int, viewport_height: int ) -> list: """ Scale a coordinate pair list/tuple. Args: coords: [x, y] coordinate pair viewport_width: Actual browser viewport width viewport_height: Actual browser viewport height Returns: Scaled [x, y] coordinate pair """ if not isinstance(coords, (list, tuple)) or len(coords) != 2: return list(coords) if isinstance(coords, tuple) else coords x, y = coords[0], coords[1] scaled_x, scaled_y = cls.scale_coordinates(x, y, viewport_width, viewport_height) return [scaled_x, scaled_y] ================================================ FILE: browser-use-demo/build.sh ================================================ #!/bin/bash set -e echo "🏗️ Building Browser Use Demo Docker image..." docker build . -t browser-use-demo:latest echo "✅ Build complete!" 
echo "" echo "To run the demo:" echo " docker run -e ANTHROPIC_API_KEY=\$ANTHROPIC_API_KEY \\" echo " -v \$(pwd)/browser_use_demo:/home/browseruse/browser_use_demo/ \\" echo " -p 5900:5900 -p 8501:8501 -p 6080:6080 -p 8080:8080 \\" echo " -it browser-use-demo:latest" echo "" echo "Then open:" echo " - http://localhost:8501 for the Streamlit interface" echo " - http://localhost:8080 to see the browser" ================================================ FILE: browser-use-demo/docker-compose.yml ================================================ services: browser-use: build: context: . image: browser-use-demo:latest environment: - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} volumes: - ./validate_env.py:/home/browseruse/validate_env.py - ${HOME}/.anthropic:/home/browseruse/.anthropic ports: - "${VNC_PORT:-5900}:5900" # VNC - "${STREAMLIT_PORT:-8501}:8501" # Streamlit - "${NOVNC_PORT:-6080}:6080" # noVNC - "${HTTP_PORT:-8080}:8080" # HTTP server stdin_open: true tty: true env_file: - .env develop: watch: - path: ./browser_use_demo action: sync+restart target: /home/browseruse/browser_use_demo - path: ./validate_env.py action: rebuild target: /home/browseruse/validate_env.py ================================================ FILE: browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop ================================================ [Desktop Entry] Name=Firefox Custom Comment=Open Firefox with custom URL Exec=firefox-esr -new-window Icon=firefox-esr Terminal=false Type=Application Categories=Network;WebBrowser; ================================================ FILE: browser-use-demo/image/.config/tint2/applications/gedit.desktop ================================================ [Desktop Entry] Name=Gedit Comment=Open gedit Exec=gedit Icon=text-editor-symbolic Terminal=false Type=Application Categories=TextEditor; ================================================ FILE: browser-use-demo/image/.config/tint2/applications/terminal.desktop 
================================================ [Desktop Entry] Name=Terminal Comment=Open Terminal Exec=xterm Icon=utilities-terminal Terminal=false Type=Application Categories=System;TerminalEmulator; ================================================ FILE: browser-use-demo/image/.config/tint2/tint2rc ================================================ #------------------------------------- # Panel panel_items = TL panel_size = 100% 60 panel_margin = 0 0 panel_padding = 2 0 2 panel_background_id = 1 wm_menu = 0 panel_dock = 0 panel_position = bottom center horizontal panel_layer = top panel_monitor = all panel_shrink = 0 autohide = 0 autohide_show_timeout = 0 autohide_hide_timeout = 0.5 autohide_height = 2 strut_policy = follow_size panel_window_name = tint2 disable_transparency = 1 mouse_effects = 1 font_shadow = 0 mouse_hover_icon_asb = 100 0 10 mouse_pressed_icon_asb = 100 0 0 scale_relative_to_dpi = 0 scale_relative_to_screen_height = 0 #------------------------------------- # Taskbar taskbar_mode = single_desktop taskbar_hide_if_empty = 0 taskbar_padding = 0 0 2 taskbar_background_id = 0 taskbar_active_background_id = 0 taskbar_name = 1 taskbar_hide_inactive_tasks = 0 taskbar_hide_different_monitor = 0 taskbar_hide_different_desktop = 0 taskbar_always_show_all_desktop_tasks = 0 taskbar_name_padding = 4 2 taskbar_name_background_id = 0 taskbar_name_active_background_id = 0 taskbar_name_font_color = #e3e3e3 100 taskbar_name_active_font_color = #ffffff 100 taskbar_distribute_size = 0 taskbar_sort_order = none task_align = left #------------------------------------- # Launcher launcher_padding = 4 8 4 launcher_background_id = 0 launcher_icon_background_id = 0 launcher_icon_size = 48 launcher_icon_asb = 100 0 0 launcher_icon_theme_override = 0 startup_notifications = 1 launcher_tooltip = 1 #------------------------------------- # Launcher icon launcher_item_app = /usr/share/applications/libreoffice-calc.desktop launcher_item_app = 
/home/browseruse/.config/tint2/applications/terminal.desktop launcher_item_app = /home/browseruse/.config/tint2/applications/firefox-custom.desktop launcher_item_app = /usr/share/applications/xpaint.desktop launcher_item_app = /usr/share/applications/xpdf.desktop launcher_item_app = /home/browseruse/.config/tint2/applications/gedit.desktop launcher_item_app = /usr/share/applications/galculator.desktop # Custom launchers above live in the container user's home (/home/browseruse) #------------------------------------- # Background definitions # ID 1 rounded = 0 border_width = 0 background_color = #000000 60 border_color = #000000 30 # ID 2 rounded = 4 border_width = 1 background_color = #777777 20 border_color = #777777 30 # ID 3 rounded = 4 border_width = 1 background_color = #777777 20 border_color = #ffffff 40 # ID 4 rounded = 4 border_width = 1 background_color = #aa4400 100 border_color = #aa7733 100 # ID 5 rounded = 4 border_width = 1 background_color = #aaaa00 100 border_color = #aaaa00 100 ================================================ FILE: browser-use-demo/image/entrypoint.sh ================================================ #!/bin/bash set -e # Validate environment configuration echo "Validating environment configuration..." python validate_env.py if [ $? -ne 0 ]; then echo "Environment validation failed. Exiting." exit 1 fi ./start_all.sh ./novnc_startup.sh python http_server.py > /tmp/server_logs.txt 2>&1 & STREAMLIT_SERVER_PORT=8501 python -m streamlit run browser_use_demo/streamlit.py > /tmp/streamlit_stdout.log & echo "✨ Browser Use Demo is ready!"
echo "➡️ Open http://localhost:8080 in your browser to begin" # Keep the container running tail -f /dev/null ================================================ FILE: browser-use-demo/image/http_server.py ================================================ import os import socket from http.server import HTTPServer, SimpleHTTPRequestHandler class HTTPServerV6(HTTPServer): address_family = socket.AF_INET6 def run_server(): os.chdir(os.path.dirname(__file__) + "/static_content") server_address = ("::", 8080) httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler) print("Starting HTTP server on port 8080...") # noqa: T201 httpd.serve_forever() if __name__ == "__main__": run_server() ================================================ FILE: browser-use-demo/image/index.html ================================================ Browser Use Demo
================================================ FILE: browser-use-demo/image/mutter_startup.sh ================================================ echo "starting mutter" XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log & # Wait for the mutter window to appear timeout=30 while [ $timeout -gt 0 ]; do if xdotool search --class "mutter" >/dev/null 2>&1; then break fi sleep 1 ((timeout--)) done if [ $timeout -eq 0 ]; then echo "mutter stderr output:" >&2 cat /tmp/mutter_stderr.log >&2 exit 1 fi rm /tmp/mutter_stderr.log ================================================ FILE: browser-use-demo/image/novnc_startup.sh ================================================ #!/bin/bash echo "starting noVNC" # Start noVNC with explicit websocket settings /opt/noVNC/utils/novnc_proxy \ --vnc localhost:5900 \ --listen 6080 \ --web /opt/noVNC \ > /tmp/novnc.log 2>&1 & # Wait for noVNC to start timeout=10 while [ $timeout -gt 0 ]; do if netstat -tuln | grep -q ":6080 "; then break fi sleep 1 ((timeout--)) done echo "noVNC started successfully" ================================================ FILE: browser-use-demo/image/start_all.sh ================================================ #!/bin/bash set -e export DISPLAY=:${DISPLAY_NUM} ./xvfb_startup.sh ./tint2_startup.sh # Keep taskbar for window management ./mutter_startup.sh # Keep window manager for controls ./x11vnc_startup.sh ================================================ FILE: browser-use-demo/image/static_content/index.html ================================================ Browser Use Demo
================================================ FILE: browser-use-demo/image/tint2_startup.sh ================================================ #!/bin/bash echo "starting tint2 on display :$DISPLAY_NUM ..." # Start tint2 and capture its stderr tint2 -c $HOME/.config/tint2/tint2rc 2>/tmp/tint2_stderr.log & # Wait for tint2 window properties to appear timeout=30 while [ $timeout -gt 0 ]; do if xdotool search --class "tint2" >/dev/null 2>&1; then break fi sleep 1 ((timeout--)) done if [ $timeout -eq 0 ]; then echo "tint2 stderr output:" >&2 cat /tmp/tint2_stderr.log >&2 exit 1 fi # Remove the temporary stderr log file rm /tmp/tint2_stderr.log ================================================ FILE: browser-use-demo/image/x11vnc_startup.sh ================================================ #!/bin/bash echo "starting vnc" (x11vnc -display $DISPLAY \ -forever \ -shared \ -wait 50 \ -rfbport 5900 \ -nopw \ 2>/tmp/x11vnc_stderr.log) & x11vnc_pid=$! # Wait for x11vnc to start timeout=10 while [ $timeout -gt 0 ]; do if netstat -tuln | grep -q ":5900 "; then break fi sleep 1 ((timeout--)) done if [ $timeout -eq 0 ]; then echo "x11vnc failed to start, stderr output:" >&2 cat /tmp/x11vnc_stderr.log >&2 exit 1 fi : > /tmp/x11vnc_stderr.log # Monitor x11vnc process in the background ( while true; do if ! kill -0 $x11vnc_pid 2>/dev/null; then echo "x11vnc process crashed, restarting..." 
>&2 if [ -f /tmp/x11vnc_stderr.log ]; then echo "x11vnc stderr output:" >&2 cat /tmp/x11vnc_stderr.log >&2 rm /tmp/x11vnc_stderr.log fi exec "$0" fi sleep 5 done ) & ================================================ FILE: browser-use-demo/image/xvfb_startup.sh ================================================ #!/bin/bash set -e # Exit on error DPI=96 RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24 # Function to check if Xvfb is already running check_xvfb_running() { if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then return 0 # Xvfb is already running else return 1 # Xvfb is not running fi } # Function to check if Xvfb is ready wait_for_xvfb() { local timeout=10 local start_time=$(date +%s) while ! xdpyinfo >/dev/null 2>&1; do if [ $(($(date +%s) - start_time)) -gt $timeout ]; then echo "Xvfb failed to start within $timeout seconds" >&2 return 1 fi sleep 0.1 done return 0 } # Check if Xvfb is already running if check_xvfb_running; then echo "Lock file found for display ${DISPLAY}, checking if display is accessible..." # Verify that the display is actually accessible if xdpyinfo >/dev/null 2>&1; then echo "Xvfb is already running and accessible on display ${DISPLAY}" exit 0 else echo "Display ${DISPLAY} is not accessible, cleaning up stale lock file..." rm -f /tmp/.X${DISPLAY_NUM}-lock # Also clean up the socket if it exists rm -f /tmp/.X11-unix/X${DISPLAY_NUM} fi fi # Start Xvfb Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix & XVFB_PID=$! 
# Wait for Xvfb to start if wait_for_xvfb; then echo "Xvfb started successfully on display ${DISPLAY}" echo "Xvfb PID: $XVFB_PID" else echo "Xvfb failed to start" kill $XVFB_PID exit 1 fi ================================================ FILE: browser-use-demo/pytest.ini ================================================ [pytest] # Test discovery patterns python_files = test_*.py python_classes = Test* python_functions = test_* # Test directories testpaths = tests # Markers for test categorization markers = integration: Integration tests that test multiple components slow: Tests that take longer than usual to run asyncio: Tests that use asyncio # Default command-line options addopts = -v --strict-markers --tb=short --disable-warnings -p pytest_asyncio # Asyncio configuration asyncio_mode = auto asyncio_default_fixture_loop_scope = function # Minimum pytest version minversion = 3.11 # Ignore specific warnings filterwarnings = ignore::DeprecationWarning ignore::PendingDeprecationWarning ================================================ FILE: browser-use-demo/setup.py ================================================ from setuptools import setup, find_packages setup( name="browser-use-demo", version="0.1.0", packages=find_packages(), python_requires=">=3.11", install_requires=[ "streamlit==1.41.0", "anthropic[bedrock,vertex]>=0.39.0", "jsonschema==4.22.0", "boto3>=1.28.57", "google-auth<3,>=2", "playwright>=1.40.0", ], extras_require={ "test": [ "pytest==8.3.3", "pytest-cov==4.1.0", "pytest-mock==3.11.1", "pytest-asyncio==0.23.6", ], "dev": [ "ruff==0.6.7", "pyright>=1.1.300", "pre-commit==3.8.0", ], }, ) ================================================ FILE: browser-use-demo/test-requirements.txt ================================================ pytest==8.3.3 pytest-cov==4.1.0 pytest-mock==3.11.1 pytest-asyncio==0.23.6 ================================================ FILE: browser-use-demo/tests/README.md ================================================ # Browser Use Demo - Test
Suite Comprehensive test suite for the refactored Browser Use Demo with extensive edge case coverage. ## Installation ```bash # Install test dependencies pip install -r test-requirements.txt # Or install with extras pip install -e ".[test]" ``` ## Running Tests ### Run all tests ```bash pytest tests/ ``` ### Run with coverage report ```bash pytest tests/ --cov=browser_use_demo --cov-report=html # Open htmlcov/index.html to view coverage report ``` ### Run specific test file ```bash pytest tests/test_message_renderer.py -v ``` ### Run specific test class or method ```bash pytest tests/test_message_renderer.py::TestMessageRenderer -v pytest tests/test_message_renderer.py::TestRenderMethod::test_render_string_message -v ``` ### Run tests by marker ```bash # Run only integration tests pytest -m integration # Run tests excluding integration pytest -m "not integration" # Run async tests pytest -m asyncio ``` ## Test Structure ``` tests/ ├── conftest.py # Shared fixtures and mocks ├── test_message_renderer.py # MessageRenderer class tests (~300 test cases) ├── test_streamlit_helpers.py # Helper function tests (~150 test cases) └── test_integration.py # End-to-end integration tests (~50 test cases) ``` ## Test Coverage The test suite covers: ### MessageRenderer (`test_message_renderer.py`) - Initialization with various state configurations - Rendering all message types (string, dict, ToolResult) - Conversation history rendering with complex structures - Edge cases: empty messages, None values, circular references - Error handling: malformed data, missing fields, exceptions - Unicode and special character handling - Performance with large messages ### Streamlit Helpers (`test_streamlit_helpers.py`) - `setup_state()` with fresh and partial initialization - Environment variable handling (present/missing/invalid) - Lambda evaluation in state initialization - `get_or_create_event_loop()` with various loop states - `authenticate()` with different providers and key states -
Concurrent access and thread safety - Error recovery scenarios ### Integration Tests (`test_integration.py`) - Complete message rendering pipeline - State initialization and persistence - Event loop management with async operations - Error propagation across components - Full user interaction workflow - Performance with large datasets (1000+ messages) - Deeply nested content structures ## Edge Cases Covered 1. **Boundary Conditions** - Empty strings, lists, dictionaries - Single item collections - Maximum size inputs (100k+ character messages) - Null/None values 2. **Type Mismatches** - Wrong types for expected fields - Missing required fields - Extra unexpected fields - Invalid message structures 3. **State Inconsistencies** - Tools referenced but not in session_state - Partially initialized state - Concurrent modifications - Corrupted state 4. **Error Conditions** - Import errors - Asyncio exceptions - Environment variable errors - Lambda evaluation failures - Base64 decode errors 5. **Performance Edge Cases** - Very large message histories (1000+ messages) - Deeply nested content (100+ levels) - Circular references - Unicode and special characters ## Mocking Strategy ### Streamlit Components All Streamlit components are mocked to enable testing without a running Streamlit server: - `st.session_state` - `st.chat_message` - `st.markdown`, `st.write`, `st.error`, `st.code`, `st.image` - `st.chat_input`, `st.stop` ### External Dependencies - `BrowserTool` - Mocked to avoid Playwright dependencies - `asyncio` event loops - Mocked for controlled testing - Environment variables - Mocked via `monkeypatch` ## Fixtures Key fixtures provided in `conftest.py`: - `mock_streamlit` - Complete Streamlit mocking setup - `mock_browser_tool` - BrowserTool mock - `sample_tool_result` - Various ToolResult configurations - `sample_messages` - Diverse message structures for testing - `edge_case_messages` - Messages designed to trigger edge cases - `mock_asyncio_loop` - Controlled 
event loop for testing - `mock_environment` - Environment variable setup - `clean_environment` - Remove environment variables ## Continuous Integration To run tests in CI: ```bash # Install dependencies pip install -e ".[test]" # Run tests with coverage pytest tests/ --cov=browser_use_demo --cov-report=xml --cov-report=term # Generate coverage badge coverage-badge -o coverage.svg ``` ## Contributing When adding new features or refactoring: 1. Add corresponding tests for new functionality 2. Ensure all edge cases are covered 3. Run the full test suite before committing 4. Maintain >95% code coverage 5. Update this README if test structure changes ================================================ FILE: browser-use-demo/tests/__init__.py ================================================ """Test suite for Browser Use Demo.""" ================================================ FILE: browser-use-demo/tests/conftest.py ================================================ """Shared fixtures and configuration for tests.""" import asyncio import sys from pathlib import Path from unittest.mock import MagicMock, Mock, patch import pytest # Add the parent directory to the path sys.path.insert(0, str(Path(__file__).parent.parent)) from browser_use_demo.tools import ToolResult @pytest.fixture def mock_streamlit(): """Mock Streamlit module and session_state.""" with patch("streamlit.session_state") as mock_state: # Initialize with default values mock_state.hide_screenshots = False mock_state.tools = {} mock_state.messages = [] mock_state.api_key = "test-key" mock_state.provider = MagicMock() mock_state.event_loop = None # Mock other streamlit components with patch("streamlit.chat_message") as mock_chat: mock_chat.return_value.__enter__ = Mock() mock_chat.return_value.__exit__ = Mock() with patch("streamlit.markdown") as mock_markdown: with patch("streamlit.write") as mock_write: with patch("streamlit.error") as mock_error: with patch("streamlit.code") as mock_code: with
patch("streamlit.image") as mock_image: yield { "session_state": mock_state, "chat_message": mock_chat, "markdown": mock_markdown, "write": mock_write, "error": mock_error, "code": mock_code, "image": mock_image, } @pytest.fixture def mock_browser_tool(): """Mock BrowserTool to avoid Playwright dependencies.""" with patch("browser_use_demo.tools.BrowserTool") as mock_tool: mock_instance = MagicMock() mock_tool.return_value = mock_instance yield mock_instance @pytest.fixture def sample_tool_result(): """Create sample ToolResult objects for testing.""" return { "success": ToolResult(output="Success message"), "error": ToolResult(error="Error message"), "with_image": ToolResult( output="With screenshot", base64_image="iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==", ), "empty": ToolResult(), "all_fields": ToolResult( output="Output text", error="Error text", base64_image="base64data", system="System message", ), } @pytest.fixture def sample_messages(): """Provide various message structures for testing edge cases.""" return [ # Normal messages {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}, # Complex content structures { "role": "user", "content": [ {"type": "text", "text": "Multiple items"}, {"type": "image", "source": "data:image/png;base64,abc123"}, ], }, # Assistant with tool results { "role": "assistant", "content": [ {"type": "text", "text": "Let me help with that"}, { "type": "tool_use", "name": "browser", "input": {"url": "example.com"}, }, {"type": "tool_result", "tool_use_id": "tool_123"}, ], }, # Edge cases {"role": "user", "content": ""}, # Empty content {"role": "assistant", "content": None}, # None content {"role": "unknown", "content": "Unknown role"}, # Unknown role {"role": "user"}, # Missing content field {"content": "No role"}, # Missing role field # Unicode and special characters {"role": "user", "content": "Hello 世界 🌍 \n\t\r"}, # Very long content {"role": 
"user", "content": "x" * 10000}, # Nested structures { "role": "assistant", "content": [ { "type": "text", "text": "Nested", "metadata": {"nested": {"deeply": {"value": 123}}}, } ], }, ] @pytest.fixture def edge_case_messages(): """Messages specifically designed to test edge cases and error conditions.""" return { "empty_list": [], "none": None, "malformed_dict": {"not": "valid", "message": "structure"}, "circular_ref": _create_circular_reference(), "missing_tool": { "role": "assistant", "content": [{"type": "tool_result", "tool_use_id": "nonexistent_tool"}], }, "invalid_types": [ {"role": 123, "content": "Invalid role type"}, {"role": "user", "content": {"invalid": "content structure"}}, ], "huge_message": { "role": "user", "content": [{"type": "text", "text": "x" * 1000000}], }, } def _create_circular_reference(): """Helper to create a message with circular reference.""" msg = {"role": "user", "content": []} msg["content"].append(msg) # Circular reference return msg @pytest.fixture def mock_asyncio_loop(): """Mock asyncio event loop for testing.""" loop = Mock(spec=asyncio.AbstractEventLoop) loop.is_closed.return_value = False loop.run_until_complete = Mock(side_effect=lambda coro: asyncio.run(coro)) return loop @pytest.fixture def mock_environment(monkeypatch): """Mock environment variables for testing.""" env_vars = { "ANTHROPIC_API_KEY": "test-api-key", } for key, value in env_vars.items(): monkeypatch.setenv(key, value) return env_vars @pytest.fixture def clean_environment(monkeypatch): """Remove environment variables for testing missing env scenarios.""" keys_to_remove = [ "ANTHROPIC_API_KEY", ] for key in keys_to_remove: monkeypatch.delenv(key, raising=False) return keys_to_remove @pytest.fixture def mock_provider(): """Mock APIProvider enum.""" with patch("browser_use_demo.loop.APIProvider") as mock: mock.ANTHROPIC = "anthropic" mock.BEDROCK = "bedrock" mock.VERTEX = "vertex" yield mock @pytest.fixture def mock_api_response_with_text_and_tools(): """Mock 
API response containing both text and tool uses.""" response = Mock() response.content = [ Mock(type="text", text="I'll help you with that task"), Mock( type="tool_use", id="tool_001", name="browser", input={"action": "screenshot"} ), Mock(type="text", text="Here's what I found"), Mock( type="tool_use", id="tool_002", name="browser", input={"action": "navigate", "url": "example.com"} ) ] return response @pytest.fixture def mock_tool_collection(): """Mock ToolCollection for testing.""" from unittest.mock import AsyncMock mock_browser = AsyncMock() mock_browser.return_value = ToolResult(output="Tool executed successfully") collection = Mock() collection.tool_map = {"browser": mock_browser} collection.to_params = Mock(return_value=[ { "name": "browser", "description": "Browser automation tool", "input_schema": {} } ]) return collection @pytest.fixture def sample_mixed_content_messages(): """Sample messages with mixed text and tool content.""" return [ { "role": "user", "content": "Can you help me navigate and take a screenshot?" 
}, { "role": "assistant", "content": [ {"type": "text", "text": "Sure, I'll navigate to the page first"}, { "type": "tool_use", "id": "tool_nav", "name": "browser", "input": {"action": "navigate", "url": "example.com"} }, {"type": "text", "text": "Now taking a screenshot"}, { "type": "tool_use", "id": "tool_shot", "name": "browser", "input": {"action": "screenshot"} } ] }, { "role": "user", "content": [ { "type": "tool_result", "tool_use_id": "tool_nav", "content": [{"type": "text", "text": "Navigation successful"}] }, { "type": "tool_result", "tool_use_id": "tool_shot", "content": [ {"type": "text", "text": "Screenshot taken"}, { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": "base64imagedata" } } ] } ] } ] ================================================ FILE: browser-use-demo/tests/test_integration.py ================================================ """Integration tests for the refactored Browser Use Demo.""" from unittest.mock import AsyncMock, MagicMock, Mock, patch import pytest from browser_use_demo.loop import APIProvider from browser_use_demo.message_renderer import MessageRenderer from browser_use_demo.streamlit import ( get_or_create_event_loop, setup_state, ) from browser_use_demo.tools import ToolResult @pytest.mark.integration class TestFullMessageRenderingPipeline: """Test complete message rendering pipeline.""" @patch("streamlit.session_state", new_callable=MagicMock) @patch("streamlit.chat_message") @patch("streamlit.markdown") @patch("streamlit.write") @patch("streamlit.error") def test_full_conversation_rendering( self, mock_error, mock_write, mock_markdown, mock_chat, mock_state ): """Test rendering a complete conversation with various message types.""" # Setup mock state mock_state.hide_screenshots = False mock_state.tools = { "tool_1": ToolResult(output="Tool output 1"), "tool_2": ToolResult(error="Tool error 2"), } # Create complex conversation messages = [ {"role": "user", "content": "Hello, can you help 
me?"}, { "role": "assistant", "content": [ {"type": "text", "text": "Sure, let me help you."}, { "type": "tool_use", "name": "browser", "input": {"url": "example.com"}, }, {"type": "tool_result", "tool_use_id": "tool_1"}, ], }, {"role": "user", "content": [{"type": "text", "text": "Thank you!"}]}, { "role": "assistant", "content": [ {"type": "text", "text": "You're welcome!"}, {"type": "tool_result", "tool_use_id": "tool_2"}, ], }, ] # Mock chat_message context manager mock_chat.return_value.__enter__ = Mock() mock_chat.return_value.__exit__ = Mock() # Render full conversation renderer = MessageRenderer(mock_state) renderer.render_conversation_history(messages) # Verify all message types were rendered assert mock_markdown.call_count >= 3 # Text messages assert mock_write.call_count >= 2 # Tool use and text blocks assert mock_error.call_count == 1 # Tool error @pytest.mark.integration class TestStateInitializationAndPersistence: """Test state initialization and persistence across operations.""" @patch("streamlit.session_state", new_callable=MagicMock) @patch("browser_use_demo.tools.BrowserTool") def test_complete_state_initialization(self, mock_browser_tool, mock_state): """Test complete state initialization flow.""" # Simulate fresh state initialized_keys = set() def setitem_side_effect(key, value): initialized_keys.add(key) setattr(mock_state, key, value) mock_state.__contains__.return_value = False mock_state.__setitem__.side_effect = setitem_side_effect # Initialize state setup_state() # Verify all required keys were initialized expected_keys = { "messages", "api_key", "provider", "model", "max_tokens", "system_prompt", "hide_screenshots", "tools", "browser_tool", "event_loop", "rendered_message_count", "is_agent_running", "active_messages", "active_response_container", } assert expected_keys.issubset(initialized_keys) @patch("streamlit.session_state", new_callable=MagicMock) def test_state_persistence_across_renders(self, mock_state): """Test that state 
persists across multiple render calls.""" # Initialize state mock_state.tools = {"tool_1": ToolResult(output="Persistent tool")} mock_state.messages = [{"role": "user", "content": "Initial message"}] # Create renderer and render renderer1 = MessageRenderer(mock_state) with patch("streamlit.chat_message"), patch("streamlit.markdown"): renderer1.render_conversation_history(mock_state.messages) # Add more messages mock_state.messages.append({"role": "assistant", "content": "Response"}) # Create new renderer instance and render again renderer2 = MessageRenderer(mock_state) with patch("streamlit.chat_message"), patch("streamlit.markdown") as mock_md: renderer2.render_conversation_history(mock_state.messages) # Should render both messages assert mock_md.call_count >= 2 @pytest.mark.integration class TestEventLoopManagementWithAsync: """Test event loop management with async operations.""" def test_async_agent_execution(self): """Test running async agent with event loop management.""" with patch("streamlit.session_state", new_callable=MagicMock) as mock_state: mock_state.event_loop = None with patch("asyncio.set_event_loop"): with patch("asyncio.new_event_loop") as mock_new_loop: mock_loop = MagicMock() mock_loop.is_closed.return_value = False mock_new_loop.return_value = mock_loop loop = get_or_create_event_loop() # Verify loop was created and set assert loop == mock_loop assert mock_state.event_loop == mock_loop # Test that the loop can handle async operations (mocked) async def mock_agent(input_text): return f"Processed: {input_text}" # Mock running the async function mock_loop.run_until_complete = MagicMock( return_value="Processed: Test input" ) result = mock_loop.run_until_complete(mock_agent("Test input")) assert result == "Processed: Test input" @patch("streamlit.session_state", new_callable=MagicMock) def test_concurrent_async_operations(self, mock_state): """Test handling concurrent async operations.""" mock_state.event_loop = None with 
patch("asyncio.set_event_loop"): with patch("asyncio.new_event_loop") as mock_new_loop: mock_loop = MagicMock() mock_loop.is_closed.return_value = False mock_new_loop.return_value = mock_loop loop = get_or_create_event_loop() # Simulate concurrent operations async def async_task(task_id): return f"Task {task_id} complete" # Mock gather to simulate concurrent execution expected_results = [f"Task {i} complete" for i in range(5)] with patch("asyncio.gather") as mock_gather: mock_gather.return_value = expected_results # Test that multiple tasks can be handled tasks = [async_task(i) for i in range(5)] results = mock_gather(*tasks) assert len(results) == 5 assert all("complete" in r for r in results) mock_gather.assert_called_once() @pytest.mark.integration class TestErrorPropagationAndHandling: """Test error propagation and handling across the system.""" @patch("streamlit.session_state", new_callable=MagicMock) @patch("streamlit.error") def test_rendering_error_propagation(self, mock_error, mock_state): """Test that rendering errors are properly propagated.""" mock_state.tools = {} # Create message that will cause error messages = [ { "role": "assistant", "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}], } ] renderer = MessageRenderer(mock_state) with patch("streamlit.chat_message"): # Should handle missing tool gracefully renderer.render_conversation_history(messages) # Error should not be called for missing tool (handled gracefully) mock_error.assert_not_called() @patch("streamlit.session_state", new_callable=MagicMock) def test_initialization_error_recovery(self, mock_state): """Test recovery from initialization errors.""" mock_state.__contains__.return_value = False # Set provider to valid enum value so lambda can access it mock_state.provider = APIProvider.ANTHROPIC # First call fails with patch("browser_use_demo.tools.BrowserTool") as mock_browser: mock_browser.side_effect = [Exception("Init failed"), MagicMock()] # First attempt should fail with 
pytest.raises(Exception, match="Init failed"): setup_state() # Second attempt should succeed setup_state() assert mock_browser.call_count == 2 @pytest.mark.integration class TestCompleteWorkflow: """Test complete workflow from initialization to rendering.""" @patch("streamlit.session_state", new_callable=MagicMock) @patch("streamlit.chat_input") @patch("streamlit.chat_message") @patch("streamlit.markdown") @patch("browser_use_demo.tools.BrowserTool") @patch("browser_use_demo.streamlit.run_agent", new_callable=AsyncMock) def test_complete_user_interaction_flow( self, mock_run_agent, mock_browser_tool, mock_markdown, mock_chat_message, mock_chat_input, mock_state, ): """Test complete flow from user input to message rendering.""" # Setup initial state mock_state.__contains__.return_value = False # Set provider to valid enum value so lambda can access it mock_state.provider = APIProvider.ANTHROPIC setup_state() # Simulate user input user_input = "Browse to example.com" mock_chat_input.return_value = user_input # Setup event loop with patch("asyncio.new_event_loop") as mock_new_loop: mock_loop = MagicMock() mock_loop.is_closed.return_value = False mock_loop.run_until_complete = MagicMock() mock_new_loop.return_value = mock_loop with patch("asyncio.set_event_loop"): loop = get_or_create_event_loop() # Simulate agent response mock_run_agent.return_value = None # Mock chat message context mock_chat_message.return_value.__enter__ = Mock() mock_chat_message.return_value.__exit__ = Mock() # Simulate the workflow # User provides input if user_input: loop.run_until_complete(mock_run_agent(user_input)) mock_loop.run_until_complete.assert_called_once() mock_run_agent.assert_called_once_with(user_input) @pytest.mark.integration class TestPerformanceAndScalability: """Test performance with large datasets and edge cases.""" @patch("streamlit.session_state", new_callable=MagicMock) @patch("streamlit.chat_message") @patch("streamlit.markdown") def test_large_conversation_history( 
self, mock_markdown, mock_chat_message, mock_state
    ):
        """Test rendering very large conversation history."""
        # Create large conversation (1000 messages)
        large_conversation = []
        for i in range(1000):
            # Even indices become user turns, odd indices assistant turns.
            role = "user" if i % 2 == 0 else "assistant"
            large_conversation.append({"role": role, "content": f"Message {i}"})

        mock_state.tools = {}
        # Explicitly stub the context-manager protocol on the object returned
        # by the patched streamlit.chat_message.
        mock_chat_message.return_value.__enter__ = Mock()
        mock_chat_message.return_value.__exit__ = Mock()

        renderer = MessageRenderer(mock_state)

        # Should handle large conversation without issues
        renderer.render_conversation_history(large_conversation)

        # Verify all messages were processed
        assert mock_markdown.call_count == 1000

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_deeply_nested_content_performance(self, mock_state):
        """Test performance with deeply nested content structures."""
        # Create deeply nested structure
        content = {"type": "text", "text": "Base"}
        for i in range(100):
            # Each pass wraps the previous structure one level deeper.
            content = {"type": "wrapper", "content": [content], "depth": i}

        messages = [{"role": "user", "content": [content]}]
        mock_state.tools = {}

        with patch("streamlit.chat_message"), patch("streamlit.write"):
            renderer = MessageRenderer(mock_state)
            # Should handle deep nesting without stack overflow
            # (smoke test: there is no assertion, it passes if nothing raises)
            renderer.render_conversation_history(messages)


================================================
FILE: browser-use-demo/tests/test_message_renderer.py
================================================
"""Tests for MessageRenderer class with comprehensive edge case coverage."""

from unittest.mock import MagicMock, Mock, patch

import pytest
from browser_use_demo.message_renderer import MessageRenderer, Sender
from browser_use_demo.tools import ToolResult


class TestMessageRenderer:
    """Test suite for MessageRenderer class."""

    def test_initialization(self, mock_streamlit):
        """Test MessageRenderer initialization."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        assert renderer.session_state == mock_streamlit["session_state"]

    def test_initialization_with_none_state(self):
        """Test initialization with None session state."""
        renderer = MessageRenderer(None)
        assert renderer.session_state is None

    def test_initialization_with_empty_state(self):
        """Test initialization with empty session state."""
        empty_state = MagicMock()
        renderer = MessageRenderer(empty_state)
        assert renderer.session_state == empty_state


class TestRenderMethod:
    """Test the main render method with various inputs."""

    def test_render_string_message(self, mock_streamlit):
        """Test rendering a simple string message."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.USER, "Hello world")

        mock_streamlit["chat_message"].assert_called_with(Sender.USER)
        mock_streamlit["markdown"].assert_called_with("Hello world")

    def test_render_empty_string(self, mock_streamlit):
        """Test rendering an empty string (should skip)."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.USER, "")

        mock_streamlit["chat_message"].assert_not_called()

    def test_render_none_message(self, mock_streamlit):
        """Test rendering None message (should skip)."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.BOT, None)

        mock_streamlit["chat_message"].assert_not_called()

    def test_render_tool_result_with_output(self, mock_streamlit, sample_tool_result):
        """Test rendering ToolResult with output."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.TOOL, sample_tool_result["success"])

        mock_streamlit["markdown"].assert_called_with("Success message")

    def test_render_tool_result_with_error(self, mock_streamlit, sample_tool_result):
        """Test rendering ToolResult with error."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.TOOL, sample_tool_result["error"])

        mock_streamlit["error"].assert_called_with("Error message")

    def test_render_tool_result_with_image(self, mock_streamlit, sample_tool_result):
        """Test rendering
ToolResult with image."""
        mock_streamlit["session_state"].hide_screenshots = False
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.TOOL, sample_tool_result["with_image"])

        mock_streamlit["markdown"].assert_called_with("With screenshot")
        # Image should be decoded and displayed
        assert mock_streamlit["image"].called

    def test_render_tool_result_with_hidden_screenshots(
        self, mock_streamlit, sample_tool_result
    ):
        """Test that images are hidden when hide_screenshots is True."""
        mock_streamlit["session_state"].hide_screenshots = True
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render(Sender.TOOL, sample_tool_result["with_image"])

        # Should render text but not image
        mock_streamlit["markdown"].assert_called_with("With screenshot")
        mock_streamlit["image"].assert_not_called()

    def test_render_dict_message_text_type(self, mock_streamlit):
        """Test rendering dictionary message with text type."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        message = {"type": "text", "text": "Hello from dict"}
        renderer.render(Sender.USER, message)

        mock_streamlit["write"].assert_called_with("Hello from dict")

    def test_render_dict_message_tool_use_type(self, mock_streamlit):
        """Test rendering dictionary message with tool_use type."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        message = {
            "type": "tool_use",
            "name": "browser_tool",
            "input": {"url": "example.com"},
        }
        renderer.render(Sender.BOT, message)

        # The expected text embeds the dict's repr, hence the single quotes.
        expected_code = "Tool Use: browser_tool\nInput: {'url': 'example.com'}"
        mock_streamlit["code"].assert_called_with(expected_code)

    def test_render_dict_message_unknown_type(self, mock_streamlit):
        """Test rendering dictionary message with unknown type."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        message = {"type": "unknown", "data": "some data"}
        renderer.render(Sender.BOT, message)

        # Should fall back to generic write
        mock_streamlit["write"].assert_called_with(message)

    def test_render_very_long_message(self, mock_streamlit):
        """Test rendering extremely long messages."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        long_message = "x" * 100000  # 100k characters
        renderer.render(Sender.USER, long_message)

        mock_streamlit["markdown"].assert_called_with(long_message)

    def test_render_unicode_special_chars(self, mock_streamlit):
        """Test rendering messages with unicode and special characters."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        special_message = "Hello 世界 🌍 \n\t\r ñáéíóú"
        renderer.render(Sender.USER, special_message)

        mock_streamlit["markdown"].assert_called_with(special_message)


class TestConversationHistory:
    """Test render_conversation_history method with various scenarios."""

    def test_render_empty_history(self, mock_streamlit):
        """Test rendering empty conversation history."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        renderer.render_conversation_history([])

        # No rendering should occur
        mock_streamlit["chat_message"].assert_not_called()

    def test_render_single_message(self, mock_streamlit):
        """Test rendering single message in history."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [{"role": "user", "content": "Hello"}]
        renderer.render_conversation_history(messages)

        mock_streamlit["markdown"].assert_called_with("Hello")

    def test_render_multiple_messages(self, mock_streamlit, sample_messages):
        """Test rendering multiple messages with different roles."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        # Only the first two fixture messages are rendered.
        renderer.render_conversation_history(sample_messages[:2])

        # Should render both messages
        assert mock_streamlit["markdown"].call_count >= 2

    def test_render_unknown_role(self, mock_streamlit):
        """Test handling messages with unknown roles."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [{"role": "unknown_role", "content": "Test"}]
        renderer.render_conversation_history(messages)

        # Should not crash, but won't render
        mock_streamlit["markdown"].assert_not_called()

    def test_render_missing_content_field(self, mock_streamlit):
        """Test handling messages missing content field."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [{"role": "user"}]  # Missing content

        # Should not crash - will get KeyError but handler should manage it gracefully
        # NOTE(review): this test passes whether or not KeyError is raised and
        # asserts nothing afterwards, so it cannot detect a behavior change in
        # either direction. Consider pinning the intended behavior.
        try:
            renderer.render_conversation_history(messages)
        except KeyError:
            pass  # Expected when content field is missing

    def test_render_none_content(self, mock_streamlit):
        """Test handling messages with None content."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [{"role": "user", "content": None}]
        renderer.render_conversation_history(messages)

        # Should handle gracefully without rendering
        mock_streamlit["markdown"].assert_not_called()

    def test_render_list_content(self, mock_streamlit):
        """Test rendering messages with list content."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "First"},
                    {"type": "text", "text": "Second"},
                ],
            }
        ]
        renderer.render_conversation_history(messages)

        # Should render both text blocks
        # (matched loosely against the string form of each recorded call)
        calls = mock_streamlit["markdown"].call_args_list
        assert any("First" in str(call) for call in calls)
        assert any("Second" in str(call) for call in calls)

    def test_skip_image_blocks_in_history(self, mock_streamlit):
        """Test that image blocks are skipped in conversation history."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Text message"},
                    {"type": "image", "source": "data:image/png;base64,abc"},
                ],
            }
        ]
        renderer.render_conversation_history(messages)

        # Should only render text, not image
        mock_streamlit["markdown"].assert_called_with("Text message")
        mock_streamlit["image"].assert_not_called()

    def test_tool_result_in_assistant_message(self, mock_streamlit, sample_tool_result):
        """Test rendering tool results from assistant messages."""
        # The tool_result block below refers to this entry by its tool_use_id.
        mock_streamlit["session_state"].tools = {
            "tool_123": sample_tool_result["success"]
        }
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [
            {
                "role": "assistant",
                "content": [{"type": "tool_result", "tool_use_id": "tool_123"}],
            }
        ]
        renderer.render_conversation_history(messages)

        # Should render the tool result from session state
        mock_streamlit["markdown"].assert_called_with("Success message")

    def test_missing_tool_in_session_state(self, mock_streamlit):
        """Test handling tool_use_id that doesn't exist in session state."""
        renderer = MessageRenderer(mock_streamlit["session_state"])
        messages = [
            {
                "role": "assistant",
                "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}],
            }
        ]
        renderer.render_conversation_history(messages)

        # Should handle gracefully without crashing
        mock_streamlit["markdown"].assert_not_called()


class TestEdgeCases:
    """Test edge cases and error conditions."""

    def test_circular_reference_handling(self, mock_streamlit):
        """Test handling circular references in messages."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Create circular reference
        content = []
        # The block's "ref" key points back at the list that contains it.
        content.append({"type": "text", "text": "Normal", "ref": content})
        messages = [{"role": "user", "content": content}]

        # Should not crash or infinite loop
        # (smoke test: there is no assertion, it passes if the call returns)
        renderer.render_conversation_history(messages)

    def test_malformed_tool_result(self, mock_streamlit):
        """Test handling malformed ToolResult objects."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Create a mock that doesn't have expected attributes
        malformed = Mock(spec=[])  # No attributes
        renderer.render(Sender.TOOL, malformed)

        # Should handle gracefully
        mock_streamlit["markdown"].assert_not_called()

    def test_exception_in_rendering(self, mock_streamlit):
        """Test that exceptions during rendering are propagated."""
        # Setup the chat_message context manager properly
        mock_chat_cm = MagicMock()
        mock_chat_cm.__enter__ = Mock(return_value=None)
        # __exit__ returns a falsy value, so the context manager does not
        # suppress exceptions raised inside the `with` body.
        mock_chat_cm.__exit__ = Mock(return_value=None)
        mock_streamlit["chat_message"].return_value = mock_chat_cm

        # Set markdown to raise an exception
        mock_streamlit["markdown"].side_effect = Exception("Render error")

        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Should let the exception propagate for markdown rendering
        with pytest.raises(Exception, match="Render error"):
            renderer.render(Sender.USER, "Test message")

    def test_normalize_content_with_various_inputs(self, mock_streamlit):
        """Test _normalize_content with various input types."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # String input
        assert renderer._normalize_content("test") == ["test"]

        # List input
        assert renderer._normalize_content([1, 2, 3]) == [1, 2, 3]

        # None input
        assert renderer._normalize_content(None) == [None]

        # Dict input
        assert renderer._normalize_content({"key": "value"}) == [{"key": "value"}]

    def test_deeply_nested_content(self, mock_streamlit):
        """Test handling deeply nested content structures."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Create deeply nested structure
        nested = {"type": "text", "text": "Deep"}
        for _ in range(100):
            nested = {"type": "wrapper", "content": nested}

        messages = [{"role": "user", "content": [nested]}]
        renderer.render_conversation_history(messages)
        # Should handle without stack overflow
        # (smoke test: there is no assertion, it passes if nothing raises)

    def test_concurrent_modification(self, mock_streamlit):
        """Test behavior when session state is modified during rendering."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Setup mock properly for context manager
        mock_cm = MagicMock()
        mock_cm.__enter__ = Mock(return_value=None)
        mock_cm.__exit__ = Mock(return_value=None)

        # Simulate modification during rendering
        def modify_state(*args, **kwargs):
            # Runs each time chat_message is called: replaces session_state.tools
            # and hands back the context manager.
            mock_streamlit["session_state"].tools = {}
            return mock_cm

        mock_streamlit["chat_message"].side_effect = modify_state

        # Should complete rendering despite modifications
        renderer.render(Sender.USER, "Test")

    def test_invalid_sender_type(self, mock_streamlit):
        """Test handling
invalid sender types."""
        renderer = MessageRenderer(mock_streamlit["session_state"])

        # Use invalid sender
        renderer.render("invalid_sender", "Message")

        # Should still render with the provided sender
        mock_streamlit["chat_message"].assert_called_with("invalid_sender")

    def test_base64_decode_error(self, mock_streamlit):
        """Test handling invalid base64 image data."""
        # Setup the chat_message context manager properly
        mock_chat_cm = MagicMock()
        mock_chat_cm.__enter__ = Mock(return_value=None)
        mock_chat_cm.__exit__ = Mock(return_value=None)
        mock_streamlit["chat_message"].return_value = mock_chat_cm

        # Setup session state to not hide screenshots
        mock_streamlit["session_state"].hide_screenshots = False

        renderer = MessageRenderer(mock_streamlit["session_state"])
        tool_result = ToolResult(
            output="With bad image", base64_image="invalid_base64_!@#$"
        )

        # The decoder itself is patched to raise, so the failure does not
        # depend on the content of the base64 string above.
        with patch("base64.b64decode") as mock_decode:
            mock_decode.side_effect = Exception("Invalid base64")

            # Should handle the error gracefully - the exception should propagate
            with pytest.raises(Exception, match="Invalid base64"):
                renderer.render(Sender.TOOL, tool_result)


================================================
FILE: browser-use-demo/tests/test_sampling_loop.py
================================================
"""
Comprehensive tests for the sampling loop and message handling.
"""

import asyncio
from unittest.mock import AsyncMock, Mock, patch

import pytest
from browser_use_demo.loop import APIProvider, sampling_loop
from browser_use_demo.message_handler import (
    MessageBuilder,
    ResponseProcessor,
)
from browser_use_demo.tools import ToolResult


class TestResponseProcessor:
    """Test the ResponseProcessor class."""

    def test_process_response_text_only(self):
        """Test processing a response with only text content."""
        mock_response = Mock()
        mock_response.content = [
            Mock(type="text", text="This is a text response")
        ]

        processor = ResponseProcessor()
        result = processor.process_response(mock_response)

        assert result.has_text is True
        assert result.has_tools is False
        assert len(result.assistant_content) == 1
        assert result.assistant_content[0]["type"] == "text"
        assert result.assistant_content[0]["text"] == "This is a text response"
        assert len(result.tool_uses) == 0

    def test_process_response_tool_only(self):
        """Test processing a response with only tool use."""
        mock_response = Mock()
        # NOTE(review): `name` is a reserved Mock constructor argument that sets
        # the mock's repr name; it does not create a `.name` attribute equal to
        # "browser" (accessing `.name` yields a child Mock). The assertions below
        # do not inspect the name - confirm this is intended.
        mock_tool_use = Mock(
            type="tool_use",
            id="tool_123",
            name="browser",
            input={"action": "screenshot"}
        )
        mock_response.content = [mock_tool_use]

        processor = ResponseProcessor()
        result = processor.process_response(mock_response)

        assert result.has_text is False
        assert result.has_tools is True
        assert len(result.assistant_content) == 1
        assert result.assistant_content[0]["type"] == "tool_use"
        assert len(result.tool_uses) == 1

    def test_process_response_mixed_content(self):
        """Test processing a response with both text and tool use."""
        mock_response = Mock()
        mock_response.content = [
            Mock(type="text", text="Let me take a screenshot"),
            Mock(
                type="tool_use",
                id="tool_456",
                name="browser",
                input={"action": "screenshot"}
            )
        ]

        processor = ResponseProcessor()
        result = processor.process_response(mock_response)

        assert result.has_text is True
        assert result.has_tools is True
        # Block order from the response is expected to be preserved.
        assert len(result.assistant_content) == 2
        assert result.assistant_content[0]["type"] == "text"
        assert result.assistant_content[0]["text"] == "Let me take a screenshot"
        assert result.assistant_content[1]["type"] == "tool_use"
        assert len(result.tool_uses) == 1

    def test_execute_tools_success(self):
        """Test successful tool execution."""
        # The coroutine is driven with asyncio.run so the test itself stays
        # a plain synchronous function.
        async def run_test():
            mock_tool = AsyncMock(return_value=ToolResult(output="Tool executed"))
            mock_collection = Mock()
            mock_collection.tool_map = {"browser": mock_tool}

            tool_uses = [
                {
                    "type": "tool_use",
                    "id": "tool_789",
                    "name": "browser",
                    "input": {"action": "screenshot"}
                }
            ]

            processor = ResponseProcessor()
            results = await processor.execute_tools(tool_uses, mock_collection)

            assert len(results) == 1
            assert results[0]["type"] == "tool_result"
            assert results[0]["tool_use_id"] == "tool_789"
            assert any(
                block.get("text") == "Tool executed"
                for block in results[0]["content"]
            )

        asyncio.run(run_test())

    def test_execute_tools_with_error(self):
        """Test tool execution with error."""
        async def run_test():
            # The tool raises when awaited; the processor is expected to turn
            # that into an error tool_result instead of propagating it.
            mock_tool = AsyncMock(side_effect=Exception("Tool failed"))
            mock_collection = Mock()
            mock_collection.tool_map = {"browser": mock_tool}

            tool_uses = [
                {
                    "type": "tool_use",
                    "id": "tool_error",
                    "name": "browser",
                    "input": {"action": "invalid"}
                }
            ]

            processor = ResponseProcessor()
            results = await processor.execute_tools(tool_uses, mock_collection)

            assert len(results) == 1
            assert results[0]["type"] == "tool_result"
            assert results[0]["is_error"] is True
            assert any(
                "Tool failed" in block.get("text", "")
                for block in results[0]["content"]
            )

        asyncio.run(run_test())

    def test_build_tool_result_with_image(self):
        """Test building tool result with base64 image."""
        result = ToolResult(base64_image="base64_data_here")
        processor = ResponseProcessor()
        tool_result = processor._build_tool_result(result, "tool_img")

        assert tool_result["type"] == "tool_result"
        assert tool_result["tool_use_id"] == "tool_img"
        assert any(
            block.get("type") == "image"
            for block in tool_result["content"]
        )

    def test_build_tool_result_with_text_extraction_markers(self):
        """Test handling of
text extraction markers in tool results."""
        result = ToolResult(
            output="__PAGE_EXTRACTED__\nSome content\n__FULL_CONTENT__\nThe actual content"
        )
        processor = ResponseProcessor()
        tool_result = processor._build_tool_result(result, "tool_extract")

        # Only the text following the __FULL_CONTENT__ marker is expected
        # to appear as a text block.
        assert any(
            block.get("text") == "The actual content"
            for block in tool_result["content"]
        )


class TestMessageBuilder:
    """Test the MessageBuilder class."""

    def test_add_assistant_message(self):
        """Test adding an assistant message."""
        messages = []
        content = [
            {"type": "text", "text": "Hello"},
            {"type": "tool_use", "id": "123", "name": "test", "input": {}}
        ]

        builder = MessageBuilder()
        # The builder appends to the list passed in.
        builder.add_assistant_message(messages, content)

        assert len(messages) == 1
        assert messages[0]["role"] == "assistant"
        assert messages[0]["content"] == content

    def test_add_assistant_message_empty_content(self):
        """Test that empty content is not added."""
        messages = []
        content = []

        builder = MessageBuilder()
        builder.add_assistant_message(messages, content)

        assert len(messages) == 0

    def test_add_tool_results(self):
        """Test adding tool results."""
        messages = []
        tool_results = [
            {
                "type": "tool_result",
                "tool_use_id": "123",
                "content": [{"type": "text", "text": "Result"}]
            }
        ]

        builder = MessageBuilder()
        builder.add_tool_results(messages, tool_results)

        assert len(messages) == 1
        # Tool results are expected to be appended as a "user" message.
        assert messages[0]["role"] == "user"
        assert messages[0]["content"] == tool_results

    def test_add_tool_results_empty(self):
        """Test that empty tool results are not added."""
        messages = []
        tool_results = []

        builder = MessageBuilder()
        builder.add_tool_results(messages, tool_results)

        assert len(messages) == 0

    def test_ensure_message_integrity_valid(self):
        """Test message integrity validation with valid messages."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": [{"type": "text", "text": "Hi"}]},
        ]

        builder = MessageBuilder()
        assert builder.ensure_message_integrity(messages) is True

    def test_ensure_message_integrity_missing_role(self):
        """Test
message integrity with missing role."""
        messages = [
            {"content": "Hello"},
        ]

        builder = MessageBuilder()
        assert builder.ensure_message_integrity(messages) is False

    def test_ensure_message_integrity_missing_content(self):
        """Test message integrity with missing content."""
        messages = [
            {"role": "user"},
        ]

        builder = MessageBuilder()
        assert builder.ensure_message_integrity(messages) is False

    def test_ensure_message_integrity_empty_list_content(self):
        """Test message integrity with empty content list."""
        messages = [
            {"role": "user", "content": []},
        ]

        builder = MessageBuilder()
        assert builder.ensure_message_integrity(messages) is False

    def test_extract_text_from_message(self):
        """Test extracting text from assistant message."""
        message = {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "First part"},
                {"type": "tool_use", "id": "123", "name": "test", "input": {}},
                {"type": "text", "text": "Second part"}
            ]
        }

        builder = MessageBuilder()
        text = builder.extract_text_from_message(message)

        # Text blocks are expected to be joined with a single space and the
        # tool_use block between them ignored.
        assert text == "First part Second part"

    def test_extract_text_from_message_no_text(self):
        """Test extracting text when there's no text content."""
        message = {
            "role": "assistant",
            "content": [
                {"type": "tool_use", "id": "123", "name": "test", "input": {}}
            ]
        }

        builder = MessageBuilder()
        text = builder.extract_text_from_message(message)

        assert text is None

    def test_extract_text_from_user_message(self):
        """Test that text extraction returns None for non-assistant messages."""
        message = {
            "role": "user",
            "content": "User message"
        }

        builder = MessageBuilder()
        text = builder.extract_text_from_message(message)

        assert text is None


@pytest.mark.integration
class TestSamplingLoopIntegration:
    """Integration tests for the sampling loop."""

    @patch("browser_use_demo.loop.Anthropic")
    def test_sampling_loop_preserves_text_with_tools(self, mock_anthropic):
        """Test that text is preserved when tools are used."""
        # The coroutine is driven with asyncio.run so the test itself stays
        # a plain synchronous function.
        async def run_test():
            mock_client = Mock()
            mock_anthropic.return_value = mock_client

            mock_response = Mock()
            # NOTE(review): Mock(name=...) sets the mock's repr name rather than
            # a `.name` attribute, so the tool_use block's `.name` is a child
            # Mock, not the string "browser" - confirm this is intended.
            mock_response.content = [
                Mock(type="text", text="I'll help you with that"),
                Mock(
                    type="tool_use",
                    id="tool_001",
                    name="browser",
                    input={"action": "screenshot"}
                )
            ]

            mock_client.beta.messages.create = Mock(return_value=mock_response)

            mock_browser = AsyncMock()
            mock_browser.return_value = ToolResult(output="Screenshot taken")

            messages = [{"role": "user", "content": "Take a screenshot"}]
            # Collect whatever the loop reports through its callbacks.
            output_messages = []
            tool_outputs = {}

            def output_callback(content):
                output_messages.append(content)

            def tool_output_callback(result, tool_id):
                tool_outputs[tool_id] = result

            updated_messages = await sampling_loop(
                model="claude-sonnet-4-5",
                provider=APIProvider.ANTHROPIC,
                system_prompt_suffix="",
                messages=messages,
                output_callback=output_callback,
                tool_output_callback=tool_output_callback,
                api_response_callback=lambda *args: None,
                api_key="test_key",
                browser_tool=mock_browser
            )

            # call_args[1] is the keyword-argument dict of the last create() call.
            api_call_args = mock_client.beta.messages.create.call_args[1]
            assert api_call_args["tool_choice"] == {"type": "auto"}

            assert len(output_messages) >= 2
            assert any(
                msg.get("type") == "text" and "help you" in msg.get("text", "")
                for msg in output_messages
            )
            assert any(msg.get("type") == "tool_use" for msg in output_messages)

            assistant_msgs = [m for m in updated_messages if m["role"] == "assistant"]
            assert len(assistant_msgs) > 0
            last_assistant = assistant_msgs[-1]

            assert isinstance(last_assistant["content"], list)
            has_text = any(
                block.get("type") == "text"
                for block in last_assistant["content"]
            )
            has_tool = any(
                block.get("type") == "tool_use"
                for block in last_assistant["content"]
            )
            assert has_text and has_tool, "Assistant message should contain both text and tool use"

        asyncio.run(run_test())

    @patch("browser_use_demo.loop.Anthropic")
    def test_sampling_loop_text_only_response(self, mock_anthropic):
        """Test handling of text-only responses."""
        async def run_test():
            mock_client = Mock()
            mock_anthropic.return_value = mock_client

            mock_response = Mock()
            mock_response.content = [
Mock(type="text", text="This is just a text response")
            ]

            mock_client.beta.messages.create = Mock(return_value=mock_response)

            messages = [{"role": "user", "content": "Hello"}]

            # No browser_tool is passed here; the response contains no tool use.
            updated_messages = await sampling_loop(
                model="claude-sonnet-4-5",
                provider=APIProvider.ANTHROPIC,
                system_prompt_suffix="",
                messages=messages,
                output_callback=lambda x: None,
                tool_output_callback=lambda r, i: None,
                api_response_callback=lambda *args: None,
                api_key="test_key"
            )

            # Original user message plus one assistant reply.
            assert len(updated_messages) == 2
            assert updated_messages[-1]["role"] == "assistant"
            assert any(
                block.get("text") == "This is just a text response"
                for block in updated_messages[-1]["content"]
            )

        asyncio.run(run_test())

    @patch("browser_use_demo.loop.Anthropic")
    def test_sampling_loop_multiple_tools_with_text(self, mock_anthropic):
        """Test handling of multiple tool uses with text."""
        async def run_test():
            mock_client = Mock()
            mock_anthropic.return_value = mock_client

            mock_response = Mock()
            # NOTE(review): Mock(name=...) sets the mock's repr name rather than
            # a `.name` attribute, so each tool_use block's `.name` is a child
            # Mock, not the string "browser" - confirm this is intended.
            mock_response.content = [
                Mock(type="text", text="I'll perform multiple actions"),
                Mock(
                    type="tool_use",
                    id="tool_001",
                    name="browser",
                    input={"action": "screenshot"}
                ),
                Mock(type="text", text="Now navigating"),
                Mock(
                    type="tool_use",
                    id="tool_002",
                    name="browser",
                    input={"action": "navigate", "url": "example.com"}
                )
            ]

            mock_client.beta.messages.create = Mock(return_value=mock_response)

            mock_browser = AsyncMock()
            mock_browser.return_value = ToolResult(output="Action completed")

            messages = [{"role": "user", "content": "Do multiple things"}]

            updated_messages = await sampling_loop(
                model="claude-sonnet-4-5",
                provider=APIProvider.ANTHROPIC,
                system_prompt_suffix="",
                messages=messages,
                output_callback=lambda x: None,
                tool_output_callback=lambda r, i: None,
                api_response_callback=lambda *args: None,
                api_key="test_key",
                browser_tool=mock_browser
            )

            assistant_msgs = [m for m in updated_messages if m["role"] == "assistant"]
            last_assistant = assistant_msgs[-1]

            # Split the last assistant message's blocks by type and count them.
            text_blocks = [
                block for block in last_assistant["content"]
                if block.get("type") == "text"
            ]
            tool_blocks = [
                block for block in last_assistant["content"]
                if block.get("type") == "tool_use"
            ]

            assert len(text_blocks) == 2
            assert len(tool_blocks) == 2

        asyncio.run(run_test())

    @patch("browser_use_demo.loop.Anthropic")
    def test_tool_choice_parameter_set(self, mock_anthropic):
        """Test that tool_choice is explicitly set to auto."""
        async def run_test():
            mock_client = Mock()
            mock_anthropic.return_value = mock_client

            mock_response = Mock()
            mock_response.content = [Mock(type="text", text="Response")]
            mock_client.beta.messages.create = Mock(return_value=mock_response)

            await sampling_loop(
                model="claude-sonnet-4-5",
                provider=APIProvider.ANTHROPIC,
                system_prompt_suffix="",
                messages=[{"role": "user", "content": "Test"}],
                output_callback=lambda x: None,
                tool_output_callback=lambda r, i: None,
                api_response_callback=lambda *args: None,
                api_key="test_key"
            )

            # call_args[1] is the keyword-argument dict of the last create() call.
            call_args = mock_client.beta.messages.create.call_args[1]
            assert "tool_choice" in call_args
            assert call_args["tool_choice"] == {"type": "auto"}

        asyncio.run(run_test())


================================================
FILE: browser-use-demo/tests/test_streamlit_helpers.py
================================================
"""Tests for Streamlit helper functions with edge case coverage."""

import asyncio
from unittest.mock import MagicMock, Mock, patch

import pytest
from browser_use_demo.loop import APIProvider
from browser_use_demo.streamlit import (
    authenticate,
    get_or_create_event_loop,
    setup_state,
)


class TestSetupState:
    """Test suite for setup_state function."""

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_setup_state_fresh_initialization(self, mock_state, mock_environment):
        """Test setup_state with completely empty session state."""
        # Simulate empty session state
        # (every `key in session_state` check reports False)
        mock_state.__contains__.return_value = False
        # Set provider to valid enum value so lambda can access it
        mock_state.provider = APIProvider.ANTHROPIC

        with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
            setup_state()
# Check all defaults were set
            # call_args_list[0][0] is the positional-argument tuple (key, value)
            # of the first recorded __setitem__ call.
            assert "messages" in mock_state.__setitem__.call_args_list[0][0]
            assert "api_key" in str(mock_state.__setitem__.call_args_list)
            assert "event_loop" in str(mock_state.__setitem__.call_args_list)

            # Browser tool should be created
            mock_browser.assert_called_once()

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_setup_state_partial_initialization(self, mock_state):
        """Test setup_state when some keys already exist."""
        # Simulate partial state
        existing_keys = ["messages", "api_key"]

        def contains_side_effect(key):
            # Backs `key in session_state` for the mocked state.
            return key in existing_keys

        mock_state.__contains__.side_effect = contains_side_effect
        # Set provider to valid enum value so lambda can access it
        mock_state.provider = APIProvider.ANTHROPIC

        with patch("browser_use_demo.tools.BrowserTool"):
            setup_state()

        # Only missing keys should be set
        # (call[0][0] is the key passed to each recorded __setitem__ call)
        set_keys = [call[0][0] for call in mock_state.__setitem__.call_args_list]
        assert "messages" not in set_keys
        assert "api_key" not in set_keys

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_setup_state_missing_env_variables(self, mock_state, clean_environment):
        """Test setup_state when environment variables are missing."""
        mock_state.__contains__.return_value = False
        # Set provider to valid enum value so lambda can access it
        mock_state.provider = APIProvider.ANTHROPIC

        with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
            setup_state()

            # BrowserTool no longer takes dimensions as arguments
            mock_browser.assert_called_with()

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_setup_state_lambda_evaluation(self, mock_state, mock_provider):
        """Test that lambda functions are evaluated correctly."""
        mock_state.__contains__.return_value = False
        mock_state.provider = mock_provider.ANTHROPIC

        # NOTE(review): unlike the sibling tests, BrowserTool is not patched
        # here - confirm setup_state() is safe to run without that patch.
        setup_state()

        # Model should be set based on provider
        model_calls = [
            call
            for call in mock_state.__setitem__.call_args_list
            if call[0][0] == "model"
        ]
        assert len(model_calls) > 0

    @patch("streamlit.session_state", new_callable=MagicMock)
    def test_setup_state_browser_tool_error(self, mock_state):
        """Test setup_state when BrowserTool initialization fails."""
        mock_state.__contains__.return_value = False
        # Set provider to valid enum value so lambda can access it
        mock_state.provider = APIProvider.ANTHROPIC

        with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
            mock_browser.side_effect = Exception("Browser init failed")

            # Should raise the exception
            with pytest.raises(Exception, match="Browser init failed"):
                setup_state()

    # Test removed - BrowserTool no longer reads dimensions from environment


class TestGetOrCreateEventLoop:
    """Test suite for get_or_create_event_loop function."""

    # patch decorators are applied bottom-up, so the mock parameters of each
    # test are listed in the reverse order of its decorators.
    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("asyncio.new_event_loop")
    @patch("asyncio.set_event_loop")
    def test_create_new_loop_when_none(self, mock_set_loop, mock_new_loop, mock_state):
        """Test creating new event loop when none exists."""
        mock_state.event_loop = None
        new_loop = Mock()
        mock_new_loop.return_value = new_loop

        result = get_or_create_event_loop()

        mock_new_loop.assert_called_once()
        mock_set_loop.assert_called_once_with(new_loop)
        # The new loop must be both stored on session state and returned.
        assert mock_state.event_loop == new_loop
        assert result == new_loop

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("asyncio.new_event_loop")
    @patch("asyncio.set_event_loop")
    def test_create_new_loop_when_closed(
        self, mock_set_loop, mock_new_loop, mock_state
    ):
        """Test creating new event loop when existing is closed."""
        closed_loop = Mock()
        closed_loop.is_closed.return_value = True
        mock_state.event_loop = closed_loop

        new_loop = Mock()
        mock_new_loop.return_value = new_loop

        result = get_or_create_event_loop()

        mock_new_loop.assert_called_once()
        mock_set_loop.assert_called_once_with(new_loop)
        assert mock_state.event_loop == new_loop
        assert result == new_loop

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("asyncio.new_event_loop")
    @patch("asyncio.set_event_loop")
    def test_reuse_existing_open_loop(self, mock_set_loop, mock_new_loop, mock_state):
        """Test reusing existing open event loop."""
        existing_loop = Mock()
        existing_loop.is_closed.return_value = False
        mock_state.event_loop = existing_loop

        result = get_or_create_event_loop()

        mock_new_loop.assert_not_called()
        # The existing loop is still expected to be (re)installed as current.
        mock_set_loop.assert_called_once_with(existing_loop)
        assert result == existing_loop

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("asyncio.new_event_loop")
    def test_event_loop_creation_error(self, mock_new_loop, mock_state):
        """Test handling error during event loop creation."""
        mock_state.event_loop = None
        mock_new_loop.side_effect = RuntimeError("Cannot create loop")

        with pytest.raises(RuntimeError, match="Cannot create loop"):
            get_or_create_event_loop()

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("asyncio.set_event_loop")
    def test_set_event_loop_error(self, mock_set_loop, mock_state):
        """Test handling error when setting event loop."""
        # NOTE(review): asyncio.new_event_loop is not patched in this test, so a
        # real event loop may be created and left unclosed - confirm.
        mock_state.event_loop = None
        mock_set_loop.side_effect = RuntimeError("Cannot set loop")

        with pytest.raises(RuntimeError, match="Cannot set loop"):
            get_or_create_event_loop()


class TestAuthenticate:
    """Test suite for authenticate function."""

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("streamlit.error")
    @patch("streamlit.stop")
    def test_authenticate_with_valid_key(
        self, mock_stop, mock_error, mock_state, mock_provider
    ):
        """Test authenticate with valid API key."""
        mock_state.provider = mock_provider.ANTHROPIC
        mock_state.api_key = "valid-key"

        result = authenticate()

        assert result is True
        # A valid key must neither show an error nor stop the script.
        mock_error.assert_not_called()
        mock_stop.assert_not_called()

    @patch("streamlit.session_state", new_callable=MagicMock)
    @patch("streamlit.error")
    @patch("streamlit.stop")
    def test_authenticate_with_missing_key(
        self, mock_stop, mock_error, mock_state, mock_provider
    ):
        """Test authenticate with missing API key."""
        mock_state.provider = mock_provider.ANTHROPIC
        mock_state.api_key = ""
authenticate() mock_error.assert_called_once_with( "Please provide your Anthropic API key in the sidebar" ) mock_stop.assert_called_once() # Function doesn't return after stop() in real scenario @patch("streamlit.session_state", new_callable=MagicMock) @patch("streamlit.error") @patch("streamlit.stop") def test_authenticate_with_none_key( self, mock_stop, mock_error, mock_state, mock_provider ): """Test authenticate with None API key.""" mock_state.provider = mock_provider.ANTHROPIC mock_state.api_key = None authenticate() mock_error.assert_called_once() mock_stop.assert_called_once() @patch("streamlit.session_state", new_callable=MagicMock) def test_authenticate_non_anthropic_provider(self, mock_state, mock_provider): """Test authenticate with non-Anthropic provider.""" mock_state.provider = mock_provider.BEDROCK mock_state.api_key = "" # Empty key should be OK for non-Anthropic result = authenticate() assert result is True class TestEdgeCasesAndErrors: """Test edge cases and error conditions for helper functions.""" @patch("streamlit.session_state", new_callable=MagicMock) def test_setup_state_with_corrupted_state(self, mock_state): """Test setup_state with corrupted session state.""" # Simulate corrupted state that raises on access mock_state.__contains__.side_effect = Exception("State corrupted") with pytest.raises(Exception, match="State corrupted"): setup_state() @patch("streamlit.session_state", new_callable=MagicMock) def test_concurrent_setup_state_calls(self, mock_state): """Test concurrent calls to setup_state.""" import threading mock_state.__contains__.return_value = False # Set provider to valid enum value so lambda can access it mock_state.provider = APIProvider.ANTHROPIC errors = [] def run_setup(): try: with patch("browser_use_demo.tools.BrowserTool"): setup_state() except Exception as e: errors.append(e) threads = [threading.Thread(target=run_setup) for _ in range(5)] for t in threads: t.start() for t in threads: t.join() # Should handle 
concurrent access without crashes assert len(errors) == 0 @patch("streamlit.session_state", new_callable=MagicMock) @patch("asyncio.get_event_loop") def test_get_or_create_with_running_loop(self, mock_get_loop, mock_state): """Test get_or_create_event_loop when another loop is running.""" mock_state.event_loop = None running_loop = Mock(spec=asyncio.AbstractEventLoop) running_loop.is_running.return_value = True mock_get_loop.return_value = running_loop # Should create new loop despite running loop exists with patch("asyncio.new_event_loop") as mock_new: new_loop = Mock(spec=asyncio.AbstractEventLoop) new_loop.is_closed.return_value = False mock_new.return_value = new_loop with patch("asyncio.set_event_loop"): result = get_or_create_event_loop() assert result == new_loop @patch("streamlit.session_state", new_callable=MagicMock) def test_setup_state_with_readonly_state(self, mock_state): """Test setup_state when session state is read-only.""" mock_state.__contains__.return_value = False mock_state.__setitem__.side_effect = AttributeError("Read-only state") with pytest.raises(AttributeError, match="Read-only state"): setup_state() # Test removed - BrowserTool no longer reads dimensions from environment ================================================ FILE: browser-use-demo/validate_env.py ================================================ #!/usr/bin/env python3 """Validate environment configuration before startup.""" import os import sys from pathlib import Path # Import constants for display information try: from browser_use_demo.display_constants import BROWSER_WIDTH, BROWSER_HEIGHT, DISPLAY_WIDTH, DISPLAY_HEIGHT except ImportError: # Fallback if running outside the package DISPLAY_WIDTH = 1920 DISPLAY_HEIGHT = 1080 BROWSER_WIDTH = 1920 BROWSER_HEIGHT = 1080 def validate_env(): """Validate required environment variables are set.""" # Check API key api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: print("\n" + "=" * 60) print("ERROR: Missing required 
configuration!") print("=" * 60) print("\nThe Browser Use Demo requires proper configuration to run.") print("\n🔧 RECOMMENDED: Use docker-compose with a .env file:") print(" 1. Copy the example environment file:") print(" cp .env.example .env") print(" 2. Edit .env and add your Anthropic API key") print(" 3. Run with docker-compose:") print(" docker-compose up --build") print("=" * 60) sys.exit(1) if api_key == "your_anthropic_api_key_here" or len(api_key) < 10: print("\n" + "=" * 60) print("ERROR: Invalid API key!") print("=" * 60) print(" ANTHROPIC_API_KEY: Must be a valid API key") print("\nTo fix this, please edit your .env file with a valid API key") print("=" * 60) sys.exit(1) print("\n✓ Environment validation passed") print(f" Display: {DISPLAY_WIDTH}x{DISPLAY_HEIGHT}") print(f" Browser: {BROWSER_WIDTH}x{BROWSER_HEIGHT}") if __name__ == "__main__": validate_env() ================================================ FILE: computer-use-demo/.gitignore ================================================ .venv .ruff_cache __pycache__ .pytest_cache ================================================ FILE: computer-use-demo/.zed/settings.json ================================================ { "preferred_line_length": 88, "languages": { "Python": { "language_servers": ["pyright", "ruff"] } }, "telemetry": { "diagnostics": false, "metrics": false } } ================================================ FILE: computer-use-demo/CONTRIBUTING.md ================================================ # Contributing Guidelines Thank you for your interest in contributing to the Computer Use Demo quickstart! This document outlines the process and guidelines for contributing to this project. ## Code of Conduct Please be respectful and constructive in all interactions. We aim to maintain a welcoming and inclusive environment for all contributors. 
## Contribution Policy - Bugfixes and updates to our documentation that address correctness issues are always welcome - Feature additions, refactors, and documentation updates beyond the scope of correctness (major changes) are accepted at the sole determination of the maintainers. We require each major change to be submitted in a separate Pull Request. We will assess new features under the following criteria: - Adherence to coding standards - Ease of use as a reference implementation - User experience - Applicability to a wide audience of developers - Minimization of third-party dependencies - Does not promote a product or service Please open a GitHub issue if you need clarification on this policy or want to discuss a new feature addition. ## Development Setup 1. Create and activate a Python virtual environment: ```bash python -m venv .venv source .venv/bin/activate # On Unix # or .venv\Scripts\activate # On Windows ``` 2. Install development dependencies: ```bash pip install -r dev-requirements.txt ``` 3. Install pre-commit hooks: ```bash pre-commit install ``` ## Development Process 1. Fork the repository and create a branch for your changes 2. Make your changes following our coding standards 3.
Submit a pull request with a clear description of the changes ## Coding Standards - Use clear, descriptive variable and function names - Follow PEP 8 style guidelines for Python code - Keep functions focused and single-purpose - Avoid inline comments - code should be self-documenting - Use type hints for all Python functions - Use dataclasses for structured data (see `tools/base.py` for examples) - All tools must inherit from `BaseAnthropicTool` and implement required methods - Use abstract base classes (ABC) for defining interfaces - Handle errors using `ToolError` and `ToolFailure` classes ## Code Quality Tools We use several tools to maintain code quality: - **Ruff**: For linting and formatting - Run `ruff check .` for linting - Run `ruff format .` for formatting - See `ruff.toml` for enabled rules - **Pyright**: For type checking - Configuration in `pyproject.toml` - **Pre-commit**: For automated checks before commits ## Testing - Add tests for new functionality in the `tests/` directory - Follow existing test patterns (see `tests/tools/` for examples) - Use pytest fixtures where appropriate - Run tests with: ```bash pytest ``` - Tests must pass in async mode (configured in pyproject.toml) ## Commit Guidelines - All commits MUST be signed (use `git commit -S`) - Write clear, descriptive commit messages - Use present tense ("Add feature" not "Added feature") - Reference issue numbers when applicable ## Pull Request Process 1. Update documentation as needed 2. Add tests for new functionality 3. Ensure all checks pass: - All tests pass - Ruff linting passes - Type checking passes - Pre-commit hooks pass 4. Request review from maintainers 5. Address review feedback ## Tool Development When creating new tools: 1. Inherit from `BaseAnthropicTool` 2. Implement `__call__` and `to_params` methods 3. Use appropriate result types (`ToolResult`, `CLIResult`, or `ToolFailure`) 4. Add comprehensive tests 5. 
Document parameters and return types ## Documentation - Keep README.md up to date - Document new features and changes - Use clear, concise language - Include docstrings for all public classes and methods - Use concise, single-line docstrings for simple functions - For complex functions, include: - A brief description - Args/parameters if not obvious - Return value if not obvious - Any important notes about behavior ## Questions? If you have questions, please open an issue for discussion. ================================================ FILE: computer-use-demo/Dockerfile ================================================ FROM docker.io/ubuntu:22.04 ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_PRIORITY=high RUN apt-get update && \ apt-get -y upgrade && \ apt-get -y install \ # UI Requirements xvfb \ xterm \ xdotool \ scrot \ imagemagick \ sudo \ mutter \ x11vnc \ # Python/pyenv reqs build-essential \ libssl-dev \ zlib1g-dev \ libbz2-dev \ libreadline-dev \ libsqlite3-dev \ curl \ git \ libncursesw5-dev \ xz-utils \ tk-dev \ libxml2-dev \ libxmlsec1-dev \ libffi-dev \ liblzma-dev \ # Network tools net-tools \ netcat \ # PPA req software-properties-common && \ # Userland apps sudo add-apt-repository ppa:mozillateam/ppa && \ sudo apt-get install -y --no-install-recommends \ libreoffice \ firefox-esr \ x11-apps \ xpdf \ gedit \ xpaint \ tint2 \ galculator \ pcmanfm \ unzip && \ apt-get clean # Install noVNC RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \ git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html # setup user ENV USERNAME=computeruse ENV HOME=/home/$USERNAME RUN useradd -m -s /bin/bash -d $HOME $USERNAME RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers USER computeruse WORKDIR $HOME # setup python RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ cd ~/.pyenv && src/configure && make -C src && cd .. 
&& \ echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \ echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \ echo 'eval "$(pyenv init -)"' >> ~/.bashrc ENV PYENV_ROOT="$HOME/.pyenv" ENV PATH="$PYENV_ROOT/bin:$PATH" ENV PYENV_VERSION_MAJOR=3 ENV PYENV_VERSION_MINOR=11 ENV PYENV_VERSION_PATCH=6 ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH RUN eval "$(pyenv init -)" && \ pyenv install $PYENV_VERSION && \ pyenv global $PYENV_VERSION && \ pyenv rehash ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH" RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \ python -m pip config set global.disable-pip-version-check true # only reinstall if requirements.txt changes COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt # setup desktop env & app COPY --chown=$USERNAME:$USERNAME image/ $HOME COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/ ARG DISPLAY_NUM=1 ARG HEIGHT=768 ARG WIDTH=1024 ENV DISPLAY_NUM=$DISPLAY_NUM ENV HEIGHT=$HEIGHT ENV WIDTH=$WIDTH ENTRYPOINT [ "./entrypoint.sh" ] ================================================ FILE: computer-use-demo/LICENSE ================================================ Copyright 2024 Anthropic, PBC. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: computer-use-demo/README.md ================================================ # Anthropic Computer Use Demo > [!NOTE] > Now featuring support for the new Claude 4 models! The latest Claude Opus 4.5 (claude-opus-4-5-20251101) is now available, along with Claude Sonnet 4.5 (claude-sonnet-4-5-20250929), Claude Sonnet 4 (claude-sonnet-4-20250514), Claude Opus 4 (claude-opus-4-20250514), and Claude Haiku 4.5 (claude-haiku-4-5-20251001). These models bring next-generation capabilities with the updated str_replace_based_edit_tool that replaces the previous str_replace_editor tool. The undo_edit command has been removed in this latest version for a more streamlined experience. > [!CAUTION] > Computer use is a beta feature. Please be aware that computer use poses unique risks that are distinct from standard API features or chat interfaces. These risks are heightened when using computer use to interact with the internet. To minimize risks, consider taking precautions such as: > > 1. Use a dedicated virtual machine or container with minimal privileges to prevent direct system attacks or accidents. > 2. Avoid giving the model access to sensitive data, such as account login information, to prevent information theft. > 3. Limit internet access to an allowlist of domains to reduce exposure to malicious content. > 4. 
Ask a human to confirm decisions that may result in meaningful real-world consequences as well as any tasks requiring affirmative consent, such as accepting cookies, executing financial transactions, or agreeing to terms of service. > > In some circumstances, Claude will follow commands found in content even if it conflicts with the user's instructions. For example, instructions on webpages or contained in images may override user instructions or cause Claude to make mistakes. We suggest taking precautions to isolate Claude from sensitive data and actions to avoid risks related to prompt injection. > > Finally, please inform end users of relevant risks and obtain their consent prior to enabling computer use in your own products. This repository helps you get started with computer use on Claude, with reference implementations of: - Build files to create a Docker container with all necessary dependencies - A computer use agent loop using the Claude API, Bedrock, or Vertex to access Claude Opus 4.5, Claude Sonnet 4.5, Claude Sonnet 4, Claude Opus 4, Claude Haiku 4.5, Claude 3.7 Sonnet, and Claude 3.5 Sonnet models - Anthropic-defined computer use tools - A streamlit app for interacting with the agent loop Please use [this form](https://forms.gle/BT1hpBrqDPDUrCqo7) to provide feedback on the quality of the model responses, the API itself, or the quality of the documentation - we cannot wait to hear from you! > [!IMPORTANT] > The Beta API used in this reference implementation is subject to change. Please refer to the [API release notes](https://docs.claude.com/en/release-notes/api) for the most up-to-date information. > [!IMPORTANT] > The components are weakly separated: the agent loop runs in the container being controlled by Claude, can only be used by one session at a time, and must be restarted or reset between sessions if necessary. 
## Quickstart: running the Docker container ### Claude API > [!TIP] > You can find your API key in the [Claude Console](https://console.anthropic.com/). ```bash export ANTHROPIC_API_KEY=%your_api_key% docker run \ -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \ -v $HOME/.anthropic:/home/computeruse/.anthropic \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface. ### Bedrock > [!TIP] > To use the new Claude 3.7 Sonnet on Bedrock, you first need to [request model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html). You'll need to pass in AWS credentials with appropriate permissions to use Claude on Bedrock. You have a few options for authenticating with Bedrock. See the [boto3 documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) for more details and options. #### Option 1: (suggested) Use the host's AWS credentials file and AWS profile ```bash export AWS_PROFILE=%your_aws_profile% docker run \ -e API_PROVIDER=bedrock \ -e AWS_PROFILE=$AWS_PROFILE \ -e AWS_REGION=us-west-2 \ -v $HOME/.aws:/home/computeruse/.aws \ -v $HOME/.anthropic:/home/computeruse/.anthropic \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface.
#### Option 2: Use an access key and secret ```bash export AWS_ACCESS_KEY_ID=%your_aws_access_key% export AWS_SECRET_ACCESS_KEY=%your_aws_secret_access_key% export AWS_SESSION_TOKEN=%your_aws_session_token% docker run \ -e API_PROVIDER=bedrock \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -e AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \ -e AWS_REGION=us-west-2 \ -v $HOME/.anthropic:/home/computeruse/.anthropic \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface. ### Vertex You'll need to pass in Google Cloud credentials with appropriate permissions to use Claude on Vertex. ```bash docker build . -t computer-use-demo gcloud auth application-default login export VERTEX_REGION=%your_vertex_region% export VERTEX_PROJECT_ID=%your_vertex_project_id% docker run \ -e API_PROVIDER=vertex \ -e CLOUD_ML_REGION=$VERTEX_REGION \ -e ANTHROPIC_VERTEX_PROJECT_ID=$VERTEX_PROJECT_ID \ -v $HOME/.config/gcloud/application_default_credentials.json:/home/computeruse/.config/gcloud/application_default_credentials.json \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -it computer-use-demo ``` Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface. This example shows how to use the Google Cloud Application Default Credentials to authenticate with Vertex. You can also set `GOOGLE_APPLICATION_CREDENTIALS` to use an arbitrary credential file, see the [Google Cloud Authentication documentation](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) for more details. 
### Accessing the demo app Once the container is running, open your browser to [http://localhost:8080](http://localhost:8080) to access the combined interface that includes both the agent chat and desktop view. The container stores settings like the API key and custom system prompt in `~/.anthropic/`. Mount this directory to persist these settings between container runs. Alternative access points: - Streamlit interface only: [http://localhost:8501](http://localhost:8501) - Desktop view only: [http://localhost:6080/vnc.html](http://localhost:6080/vnc.html) - Direct VNC connection: `vnc://localhost:5900` (for VNC clients) ## Screen size Environment variables `WIDTH` and `HEIGHT` can be used to set the screen size. For example: ```bash docker run \ -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \ -v $HOME/.anthropic:/home/computeruse/.anthropic \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -e WIDTH=1920 \ -e HEIGHT=1080 \ -it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` We do not recommend sending screenshots in resolutions above [XGA/WXGA](https://en.wikipedia.org/wiki/Display_resolution_standards#XGA) to avoid issues related to [image resizing](https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size). Relying on the image resizing behavior in the API will result in lower model accuracy and slower performance than implementing scaling in your tools directly. The `computer` tool implementation in this project demonstrates how to scale both images and coordinates from higher resolutions to the suggested resolutions. When implementing computer use yourself, we recommend using XGA resolution (1024x768): - For higher resolutions: Scale the image down to XGA and let the model interact with this scaled version, then map the coordinates back to the original resolution proportionally. - For lower resolutions or smaller devices (e.g. mobile devices): Add black padding around the display area until it reaches 1024x768. 
## Development ```bash ./setup.sh # configure venv, install development dependencies, and install pre-commit hooks docker build . -t computer-use-demo:local # manually build the docker image (optional) export ANTHROPIC_API_KEY=%your_api_key% docker run \ -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \ -v $(pwd)/computer_use_demo:/home/computeruse/computer_use_demo/ `# mount local python module for development` \ -v $HOME/.anthropic:/home/computeruse/.anthropic \ -p 5900:5900 \ -p 8501:8501 \ -p 6080:6080 \ -p 8080:8080 \ -it computer-use-demo:local # can also use ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` The docker run command above mounts the repo inside the docker image, such that you can edit files from the host. Streamlit is already configured with auto reloading. ================================================ FILE: computer-use-demo/computer_use_demo/__init__.py ================================================ ================================================ FILE: computer-use-demo/computer_use_demo/loop.py ================================================ """ Agentic sampling loop that calls the Claude API and local implementation of anthropic-defined computer use tools. 
""" import platform from collections.abc import Callable from datetime import datetime from enum import StrEnum from typing import Any, cast import httpx from anthropic import ( Anthropic, AnthropicBedrock, AnthropicVertex, APIError, APIResponseValidationError, APIStatusError, ) from anthropic.types.beta import ( BetaCacheControlEphemeralParam, BetaContentBlockParam, BetaImageBlockParam, BetaMessage, BetaMessageParam, BetaTextBlock, BetaTextBlockParam, BetaToolResultBlockParam, BetaToolUseBlockParam, ) from .tools import ( TOOL_GROUPS_BY_VERSION, ToolCollection, ToolResult, ToolVersion, ) PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31" class APIProvider(StrEnum): ANTHROPIC = "anthropic" BEDROCK = "bedrock" VERTEX = "vertex" # This system prompt is optimized for the Docker environment in this repository and # specific tool combinations enabled. # We encourage modifying this system prompt to ensure the model has context for the # environment it is running in, and to provide any additional information that may be # helpful for the task at hand. SYSTEM_PROMPT = f""" * You are utilising an Ubuntu virtual machine using {platform.machine()} architecture with internet access. * You can feel free to install Ubuntu applications with your bash tool. Use curl instead of wget. * To open firefox, please just click on the firefox icon. Note, firefox-esr is what is installed on your system. * Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did. * When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_based_edit_tool or `grep -n -B -A ` to confirm output. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. 
Either that, or make sure you scroll down to see everything before deciding something isn't available. * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is {datetime.today().strftime("%A, %B %-d, %Y")}. * When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there. * If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your str_replace_based_edit_tool. """ async def sampling_loop( *, model: str, provider: APIProvider, system_prompt_suffix: str, messages: list[BetaMessageParam], output_callback: Callable[[BetaContentBlockParam], None], tool_output_callback: Callable[[ToolResult, str], None], api_response_callback: Callable[ [httpx.Request, httpx.Response | object | None, Exception | None], None ], api_key: str, only_n_most_recent_images: int | None = None, max_tokens: int = 4096, tool_version: ToolVersion, thinking_budget: int | None = None, token_efficient_tools_beta: bool = False, ): """ Agentic sampling loop for the assistant/tool interaction of computer use. 
""" tool_group = TOOL_GROUPS_BY_VERSION[tool_version] tool_collection = ToolCollection(*(ToolCls() for ToolCls in tool_group.tools)) system = BetaTextBlockParam( type="text", text=f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}", ) while True: enable_prompt_caching = False betas = [tool_group.beta_flag] if tool_group.beta_flag else [] if token_efficient_tools_beta: betas.append("token-efficient-tools-2025-02-19") image_truncation_threshold = only_n_most_recent_images or 0 if provider == APIProvider.ANTHROPIC: client = Anthropic(api_key=api_key, max_retries=4) enable_prompt_caching = True elif provider == APIProvider.VERTEX: client = AnthropicVertex() elif provider == APIProvider.BEDROCK: client = AnthropicBedrock() if enable_prompt_caching: betas.append(PROMPT_CACHING_BETA_FLAG) _inject_prompt_caching(messages) # Because cached reads are 10% of the price, we don't think it's # ever sensible to break the cache by truncating images only_n_most_recent_images = 0 # Use type ignore to bypass TypedDict check until SDK types are updated system["cache_control"] = {"type": "ephemeral"} # type: ignore if only_n_most_recent_images: _maybe_filter_to_n_most_recent_images( messages, only_n_most_recent_images, min_removal_threshold=image_truncation_threshold, ) extra_body = {} if thinking_budget: # Ensure we only send the required fields for thinking extra_body = { "thinking": {"type": "enabled", "budget_tokens": thinking_budget} } # Call the API # we use raw_response to provide debug information to streamlit. Your # implementation may be able call the SDK directly with: # `response = client.messages.create(...)` instead. 
try: raw_response = client.beta.messages.with_raw_response.create( max_tokens=max_tokens, messages=messages, model=model, system=[system], tools=tool_collection.to_params(), betas=betas, extra_body=extra_body, ) except (APIStatusError, APIResponseValidationError) as e: api_response_callback(e.request, e.response, e) return messages except APIError as e: api_response_callback(e.request, e.body, e) return messages api_response_callback( raw_response.http_response.request, raw_response.http_response, None ) response = raw_response.parse() response_params = _response_to_params(response) messages.append( { "role": "assistant", "content": response_params, } ) tool_result_content: list[BetaToolResultBlockParam] = [] for content_block in response_params: output_callback(content_block) if ( isinstance(content_block, dict) and content_block.get("type") == "tool_use" ): # Type narrowing for tool use blocks tool_use_block = cast(BetaToolUseBlockParam, content_block) result = await tool_collection.run( name=tool_use_block["name"], tool_input=cast(dict[str, Any], tool_use_block.get("input", {})), ) tool_result_content.append( _make_api_tool_result(result, tool_use_block["id"]) ) tool_output_callback(result, tool_use_block["id"]) if not tool_result_content: return messages messages.append({"content": tool_result_content, "role": "user"}) def _maybe_filter_to_n_most_recent_images( messages: list[BetaMessageParam], images_to_keep: int, min_removal_threshold: int, ): """ With the assumption that images are screenshots that are of diminishing value as the conversation progresses, remove all but the final `images_to_keep` tool_result images in place, with a chunk of min_removal_threshold to reduce the amount we break the implicit prompt cache. 
""" if images_to_keep is None: return messages tool_result_blocks = cast( list[BetaToolResultBlockParam], [ item for message in messages for item in ( message["content"] if isinstance(message["content"], list) else [] ) if isinstance(item, dict) and item.get("type") == "tool_result" ], ) total_images = sum( 1 for tool_result in tool_result_blocks for content in tool_result.get("content", []) if isinstance(content, dict) and content.get("type") == "image" ) images_to_remove = total_images - images_to_keep # for better cache behavior, we want to remove in chunks images_to_remove -= images_to_remove % min_removal_threshold for tool_result in tool_result_blocks: if isinstance(tool_result.get("content"), list): new_content = [] for content in tool_result.get("content", []): if isinstance(content, dict) and content.get("type") == "image": if images_to_remove > 0: images_to_remove -= 1 continue new_content.append(content) tool_result["content"] = new_content def _response_to_params( response: BetaMessage, ) -> list[BetaContentBlockParam]: res: list[BetaContentBlockParam] = [] for block in response.content: if isinstance(block, BetaTextBlock): if block.text: res.append(BetaTextBlockParam(type="text", text=block.text)) elif getattr(block, "type", None) == "thinking": # Handle thinking blocks - include signature field thinking_block = { "type": "thinking", "thinking": getattr(block, "thinking", None), } if hasattr(block, "signature"): thinking_block["signature"] = getattr(block, "signature", None) res.append(cast(BetaContentBlockParam, thinking_block)) else: # Handle tool use blocks normally res.append(cast(BetaToolUseBlockParam, block.model_dump())) return res def _inject_prompt_caching( messages: list[BetaMessageParam], ): """ Set cache breakpoints for the 3 most recent turns one cache breakpoint is left for tools/system prompt, to be shared across sessions """ breakpoints_remaining = 3 for message in reversed(messages): if message["role"] == "user" and isinstance( content 
:= message["content"], list ): if breakpoints_remaining: breakpoints_remaining -= 1 # Use type ignore to bypass TypedDict check until SDK types are updated content[-1]["cache_control"] = BetaCacheControlEphemeralParam( # type: ignore {"type": "ephemeral"} ) else: if isinstance(content[-1], dict) and "cache_control" in content[-1]: del content[-1]["cache_control"] # type: ignore # we'll only every have one extra turn per loop break def _make_api_tool_result( result: ToolResult, tool_use_id: str ) -> BetaToolResultBlockParam: """Convert an agent ToolResult to an API ToolResultBlockParam.""" tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = [] is_error = False if result.error: is_error = True tool_result_content = _maybe_prepend_system_tool_result(result, result.error) else: if result.output: tool_result_content.append( { "type": "text", "text": _maybe_prepend_system_tool_result(result, result.output), } ) if result.base64_image: tool_result_content.append( { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": result.base64_image, }, } ) return { "type": "tool_result", "content": tool_result_content, "tool_use_id": tool_use_id, "is_error": is_error, } def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str): if result.system: result_text = f"{result.system}\n{result_text}" return result_text ================================================ FILE: computer-use-demo/computer_use_demo/requirements.txt ================================================ streamlit==1.41.0 anthropic[bedrock,vertex]>=0.39.0 jsonschema==4.22.0 boto3>=1.28.57 google-auth<3,>=2 ================================================ FILE: computer-use-demo/computer_use_demo/streamlit.py ================================================ """ Entrypoint for streamlit, see https://docs.streamlit.io/ """ import asyncio import base64 import os import subprocess import traceback from contextlib import contextmanager from dataclasses import 
@dataclass(kw_only=True, frozen=True)
class ModelConfig:
    """
    Immutable, keyword-only bundle of per-model defaults.

    Instances are looked up by model name and copied into the Streamlit session
    state whenever the selected model changes.
    """

    # Tool group version pre-selected for the model (unless the user has
    # already chosen one through the "Tool Versions" radio).
    tool_version: ToolVersion
    # Stored in session state and used as the upper bound of the
    # "Thinking Budget" input.
    max_output_tokens: int
    # Initial value of the "Max Output Tokens" input; half of it seeds the
    # default thinking budget.
    default_output_tokens: int
    # Copied to session state as `has_thinking`.
    # NOTE(review): presumably marks models that support extended thinking — confirm.
    has_thinking: bool = False
def setup_state():
    """
    Populate every session-state key the app reads, without overwriting
    values that already exist.

    Keys are seeded in a fixed order because later defaults depend on earlier
    ones: `provider_radio` mirrors `provider`, and the model defaults are
    derived from `provider`.
    """
    state = st.session_state

    def _seed(key, make_default):
        # The factory only runs for a missing key, so file/env lookups are not
        # repeated on every Streamlit rerun.
        if key not in state:
            state[key] = make_default()

    _seed("messages", list)
    # Try to load API key from file first, then environment
    _seed(
        "api_key",
        lambda: load_from_storage("api_key") or os.getenv("ANTHROPIC_API_KEY", ""),
    )
    _seed(
        "provider",
        lambda: os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC,
    )
    _seed("provider_radio", lambda: state.provider)

    # _reset_model() writes several keys at once (model plus its derived config).
    if "model" not in state:
        _reset_model()

    _seed("auth_validated", lambda: False)
    _seed("responses", dict)
    _seed("tools", dict)
    _seed("only_n_most_recent_images", lambda: 3)
    _seed("custom_system_prompt", lambda: load_from_storage("system_prompt") or "")
    _seed("hide_images", lambda: False)
    _seed("token_efficient_tools_beta", lambda: False)
    _seed("in_sampling_loop", lambda: False)
MODEL_TO_MODEL_CONF.get(st.session_state.model, CLAUDE_4) # Default fallback ) # If we're in radio selection mode, use the selected tool version if hasattr(st.session_state, "tool_versions"): st.session_state.tool_version = st.session_state.tool_versions else: st.session_state.tool_version = model_conf.tool_version st.session_state.has_thinking = model_conf.has_thinking st.session_state.output_tokens = model_conf.default_output_tokens st.session_state.max_output_tokens = model_conf.max_output_tokens st.session_state.thinking_budget = int(model_conf.default_output_tokens / 2) async def main(): """Render loop for streamlit""" setup_state() st.markdown(STREAMLIT_STYLE, unsafe_allow_html=True) st.title("Claude Computer Use Demo") if not os.getenv("HIDE_WARNING", False): st.warning(WARNING_TEXT) with st.sidebar: def _reset_api_provider(): if st.session_state.provider_radio != st.session_state.provider: _reset_model() st.session_state.provider = st.session_state.provider_radio st.session_state.auth_validated = False provider_options = [option.value for option in APIProvider] st.radio( "API Provider", options=provider_options, key="provider_radio", format_func=lambda x: x.title(), on_change=_reset_api_provider, ) st.text_input("Model", key="model", on_change=_reset_model_conf) if st.session_state.provider == APIProvider.ANTHROPIC: st.text_input( "Claude API Key", type="password", key="api_key", on_change=lambda: save_to_storage("api_key", st.session_state.api_key), ) st.number_input( "Only send N most recent images", min_value=0, key="only_n_most_recent_images", help="To decrease the total tokens sent, remove older screenshots from the conversation", ) st.text_area( "Custom System Prompt Suffix", key="custom_system_prompt", help="Additional instructions to append to the system prompt. 
see computer_use_demo/loop.py for the base system prompt.", on_change=lambda: save_to_storage( "system_prompt", st.session_state.custom_system_prompt ), ) st.checkbox("Hide screenshots", key="hide_images") st.checkbox( "Enable token-efficient tools beta", key="token_efficient_tools_beta" ) versions = get_args(ToolVersion) st.radio( "Tool Versions", key="tool_versions", options=versions, index=versions.index(st.session_state.tool_version), on_change=lambda: setattr( st.session_state, "tool_version", st.session_state.get("tool_versions", st.session_state.tool_version), ), ) st.number_input("Max Output Tokens", key="output_tokens", step=1) st.checkbox("Thinking Enabled", key="thinking", value=False) st.number_input( "Thinking Budget", key="thinking_budget", max_value=st.session_state.max_output_tokens, step=1, disabled=not st.session_state.thinking, ) if st.button("Reset", type="primary"): with st.spinner("Resetting..."): st.session_state.clear() setup_state() subprocess.run("pkill Xvfb; pkill tint2", shell=True) # noqa: ASYNC221 await asyncio.sleep(1) subprocess.run("./start_all.sh", shell=True) # noqa: ASYNC221 if not st.session_state.auth_validated: if auth_error := validate_auth( st.session_state.provider, st.session_state.api_key ): st.warning(f"Please resolve the following auth issue:\n\n{auth_error}") return else: st.session_state.auth_validated = True chat, http_logs = st.tabs(["Chat", "HTTP Exchange Logs"]) new_message = st.chat_input( "Type a message to send to Claude to control the computer..." 
) with chat: # render past chats for message in st.session_state.messages: if isinstance(message["content"], str): _render_message(message["role"], message["content"]) elif isinstance(message["content"], list): for block in message["content"]: # the tool result we send back to the Claude API isn't sufficient to render all details, # so we store the tool use responses if isinstance(block, dict) and block["type"] == "tool_result": _render_message( Sender.TOOL, st.session_state.tools[block["tool_use_id"]] ) else: _render_message( message["role"], cast(BetaContentBlockParam | ToolResult, block), ) # render past http exchanges for identity, (request, response) in st.session_state.responses.items(): _render_api_response(request, response, identity, http_logs) # render past chats if new_message: st.session_state.messages.append( { "role": Sender.USER, "content": [ *maybe_add_interruption_blocks(), BetaTextBlockParam(type="text", text=new_message), ], } ) _render_message(Sender.USER, new_message) try: most_recent_message = st.session_state["messages"][-1] except IndexError: return if most_recent_message["role"] is not Sender.USER: # we don't have a user message to respond to, exit early return with track_sampling_loop(): # run the agent sampling loop with the newest message st.session_state.messages = await sampling_loop( system_prompt_suffix=st.session_state.custom_system_prompt, model=st.session_state.model, provider=st.session_state.provider, messages=st.session_state.messages, output_callback=partial(_render_message, Sender.BOT), tool_output_callback=partial( _tool_output_callback, tool_state=st.session_state.tools ), api_response_callback=partial( _api_response_callback, tab=http_logs, response_state=st.session_state.responses, ), api_key=st.session_state.api_key, only_n_most_recent_images=st.session_state.only_n_most_recent_images, tool_version=st.session_state.tool_versions, max_tokens=st.session_state.output_tokens, thinking_budget=st.session_state.thinking_budget 
def maybe_add_interruption_blocks():
    """
    Return the content blocks needed to "heal" the conversation after an
    interrupted sampling loop, or an empty list if no loop was in progress.

    If `in_sampling_loop` is still set when this is called, the previous loop is
    assumed to have been interrupted. Every `tool_use` block in the last stored
    message then gets an error `tool_result` (also recorded in
    `st.session_state.tools` for rendering), followed by a text block telling
    the model that the user interrupted.
    """
    state = st.session_state
    if not state.in_sampling_loop:
        return []

    unanswered_tool_use_ids = [
        block["id"]
        for block in state.messages[-1]["content"]
        if block["type"] == "tool_use"
    ]

    healing_blocks = []
    for tool_use_id in unanswered_tool_use_ids:
        state.tools[tool_use_id] = ToolResult(error=INTERRUPT_TOOL_ERROR)
        healing_blocks.append(
            BetaToolResultBlockParam(
                tool_use_id=tool_use_id,
                type="tool_result",
                content=INTERRUPT_TOOL_ERROR,
                is_error=True,
            )
        )

    return [*healing_blocks, BetaTextBlockParam(type="text", text=INTERRUPT_TEXT)]


@contextmanager
def track_sampling_loop():
    """
    Mark the session as being inside the sampling loop for the duration of
    the `with` body.

    There is intentionally no try/finally here: if the body does not finish
    normally, the flag stays True, which is what
    `maybe_add_interruption_blocks` reads as "the previous loop was
    interrupted".
    """
    st.session_state.in_sampling_loop = True
    yield
    st.session_state.in_sampling_loop = False


def validate_auth(provider: APIProvider, api_key: str | None):
    """
    Check that credentials for `provider` are available.

    Returns a user-facing error string describing what is missing, or None
    when nothing is wrong. Provider SDK modules are imported lazily so that
    only the selected provider's dependencies are touched.
    """
    if provider == APIProvider.ANTHROPIC and not api_key:
        return "Enter your Claude API key in the sidebar to continue."

    if provider == APIProvider.BEDROCK:
        import boto3

        credentials = boto3.Session().get_credentials()
        if not credentials:
            return "You must have AWS credentials set up to use the Bedrock API."

    if provider == APIProvider.VERTEX:
        import google.auth
        from google.auth.exceptions import DefaultCredentialsError

        region = os.environ.get("CLOUD_ML_REGION")
        if not region:
            return "Set the CLOUD_ML_REGION environment variable to use the Vertex API."
        try:
            google.auth.default(
                scopes=["https://www.googleapis.com/auth/cloud-platform"],
            )
        except DefaultCredentialsError:
            return "Your google cloud credentials are not set up correctly."

    return None
def load_from_storage(filename: str) -> str | None:
    """
    Load data from a file in the storage directory.

    Args:
        filename: name of the file, relative to `CONFIG_DIR`.

    Returns:
        The file's text with surrounding whitespace stripped, or None when the
        file is missing, empty, or cannot be read (the error is shown via
        `st.write` instead of being raised).
    """
    try:
        file_path = CONFIG_DIR / filename
        if file_path.exists():
            data = file_path.read_text().strip()
            if data:
                return data
    except Exception as e:
        # best-effort: a broken storage file must not take the app down
        st.write(f"Debug: Error loading {filename}: {e}")
    return None


def save_to_storage(filename: str, data: str) -> None:
    """
    Save data to a file in the storage directory, readable by the owner only.

    The file is created with mode 0o600 up front, so secrets such as the API
    key are never on disk with default (umask-derived) permissions, not even
    briefly. Errors are reported via `st.write` instead of being raised.

    Args:
        filename: name of the file, relative to `CONFIG_DIR`.
        data: text to write; replaces any previous content.
    """
    try:
        CONFIG_DIR.mkdir(parents=True, exist_ok=True)
        file_path = CONFIG_DIR / filename
        # Create/truncate with owner-only permissions from the start.
        fd = os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as handle:
            handle.write(data)
        # os.open() only applies the mode to newly created files; tighten
        # pre-existing ones as well.
        file_path.chmod(0o600)
    except Exception as e:
        # best-effort: failing to persist settings must not take the app down
        st.write(f"Debug: Error saving {filename}: {e}")
st.json(response.text) else: st.write(response) def _render_error(error: Exception): if isinstance(error, RateLimitError): body = "You have been rate limited." if retry_after := error.response.headers.get("retry-after"): body += f" **Retry after {str(timedelta(seconds=int(retry_after)))} (HH:MM:SS).** See our API [documentation](https://docs.claude.com/en/api/rate-limits) for more details." body += f"\n\n{error.message}" else: body = str(error) body += "\n\n**Traceback:**" lines = "\n".join(traceback.format_exception(error)) body += f"\n\n```{lines}```" save_to_storage(f"error_{datetime.now().timestamp()}.md", body) st.error(f"**{error.__class__.__name__}**\n\n{body}", icon=":material/error:") def _render_message( sender: Sender, message: str | BetaContentBlockParam | ToolResult, ): """Convert input from the user or output from the agent to a streamlit message.""" # streamlit's hotreloading breaks isinstance checks, so we need to check for class names is_tool_result = not isinstance(message, str | dict) if not message or ( is_tool_result and st.session_state.hide_images and not hasattr(message, "error") and not hasattr(message, "output") ): return with st.chat_message(sender): if is_tool_result: message = cast(ToolResult, message) if message.output: if message.__class__.__name__ == "CLIResult": st.code(message.output) else: st.markdown(message.output) if message.error: st.error(message.error) if message.base64_image and not st.session_state.hide_images: st.image(base64.b64decode(message.base64_image)) elif isinstance(message, dict): if message["type"] == "text": st.write(message["text"]) elif message["type"] == "thinking": thinking_content = message.get("thinking", "") st.markdown(f"[Thinking]\n\n{thinking_content}") elif message["type"] == "tool_use": st.code(f"Tool Use: {message['name']}\nInput: {message['input']}") else: # only expected return types are text and tool_use raise Exception(f"Unexpected response type {message['type']}") else: st.markdown(message) if 
from .base import CLIResult, ToolResult
from .bash import BashTool20241022, BashTool20250124
from .collection import ToolCollection
from .computer import ComputerTool20241022, ComputerTool20250124
from .edit import EditTool20250728
from .groups import TOOL_GROUPS_BY_VERSION, ToolVersion

# Public API of the package. Python only honours the lowercase `__all__`
# name, and its entries must be strings (not the objects themselves).
__all__ = [
    "BashTool20241022",
    "BashTool20250124",
    "CLIResult",
    "ComputerTool20241022",
    "ComputerTool20250124",
    "EditTool20250728",
    "TOOL_GROUPS_BY_VERSION",
    "ToolCollection",
    "ToolResult",
    "ToolVersion",
]
@abstractmethod def to_params( self, ) -> BetaToolUnionParam: raise NotImplementedError @dataclass(kw_only=True, frozen=True) class ToolResult: """Represents the result of a tool execution.""" output: str | None = None error: str | None = None base64_image: str | None = None system: str | None = None def __bool__(self): return any(getattr(self, field.name) for field in fields(self)) def __add__(self, other: "ToolResult"): def combine_fields( field: str | None, other_field: str | None, concatenate: bool = True ): if field and other_field: if concatenate: return field + other_field raise ValueError("Cannot combine tool results") return field or other_field return ToolResult( output=combine_fields(self.output, other.output), error=combine_fields(self.error, other.error), base64_image=combine_fields(self.base64_image, other.base64_image, False), system=combine_fields(self.system, other.system), ) def replace(self, **kwargs): """Returns a new ToolResult with the given fields replaced.""" return replace(self, **kwargs) class CLIResult(ToolResult): """A ToolResult that can be rendered as a CLI output.""" class ToolFailure(ToolResult): """A ToolResult that represents a failure.""" class ToolError(Exception): """Raised when a tool encounters an error.""" def __init__(self, message): self.message = message ================================================ FILE: computer-use-demo/computer_use_demo/tools/bash.py ================================================ import asyncio import os from typing import Any, Literal from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult class _BashSession: """A session of a bash shell.""" _started: bool _process: asyncio.subprocess.Process command: str = "/bin/bash" _output_delay: float = 0.2 # seconds _timeout: float = 120.0 # seconds _sentinel: str = "<>" def __init__(self): self._started = False self._timed_out = False async def start(self): if self._started: return self._process = await asyncio.create_subprocess_shell( 
self.command, preexec_fn=os.setsid, shell=True, bufsize=0, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) self._started = True def stop(self): """Terminate the bash shell.""" if not self._started: raise ToolError("Session has not started.") if self._process.returncode is not None: return self._process.terminate() async def run(self, command: str): """Execute a command in the bash shell.""" if not self._started: raise ToolError("Session has not started.") if self._process.returncode is not None: return ToolResult( system="tool must be restarted", error=f"bash has exited with returncode {self._process.returncode}", ) if self._timed_out: raise ToolError( f"timed out: bash has not returned in {self._timeout} seconds and must be restarted", ) # we know these are not None because we created the process with PIPEs assert self._process.stdin assert self._process.stdout assert self._process.stderr # send command to the process self._process.stdin.write( command.encode() + f"; echo '{self._sentinel}'\n".encode() ) await self._process.stdin.drain() # read output from the process, until the sentinel is found try: async with asyncio.timeout(self._timeout): while True: await asyncio.sleep(self._output_delay) # if we read directly from stdout/stderr, it will wait forever for # EOF. use the StreamReader buffer directly instead. 
class BashTool20250124(BaseAnthropicTool):
    """
    Tool that lets the agent run commands in a persistent bash session.
    The tool parameters are defined by Anthropic and are not editable.
    """

    _session: _BashSession | None

    api_type: Literal["bash_20250124"] = "bash_20250124"
    name: Literal["bash"] = "bash"

    def __init__(self):
        self._session = None
        super().__init__()

    def to_params(self) -> Any:
        """Return the tool definition sent to the API (type and name only)."""
        return dict(type=self.api_type, name=self.name)

    async def __call__(
        self, command: str | None = None, restart: bool = False, **kwargs
    ):
        """
        Run `command` in the session, or restart the session.

        With `restart` set, any existing session is stopped, a fresh one is
        started and a confirmation result is returned (`command` is ignored).
        Otherwise a session is started on first use and `command` is run in
        it; a missing command raises ToolError.
        """
        if restart and self._session:
            self._session.stop()

        if restart or self._session is None:
            self._session = _BashSession()
            await self._session.start()

        if restart:
            return ToolResult(system="tool has been restarted.")

        if command is None:
            raise ToolError("no command provided.")

        return await self._session.run(command)


class BashTool20241022(BashTool20250124):
    """Bash tool variant registered under the 20241022 class name."""

    # NOTE(review): this repeats the parent's "bash_20250124" although the class
    # name (and the pyright override suppression) suggest a distinct 20241022
    # identifier was intended — confirm against the API tool versions.
    api_type: Literal["bash_20250124"] = "bash_20250124"  # pyright: ignore[reportIncompatibleVariableOverride]
class ToolCollection:
    """Holds a set of Anthropic-defined tools and dispatches calls to them by name."""

    def __init__(self, *tools: BaseAnthropicTool):
        self.tools = tools
        # Index tools by the "name" they advertise in their API params; if two
        # tools share a name, the later one wins.
        registry = {}
        for tool in tools:
            params = cast(dict[str, Any], tool.to_params())
            registry[params["name"]] = tool
        self.tool_map = registry

    def to_params(
        self,
    ) -> list[BetaToolUnionParam]:
        """Return the API parameter dicts of all tools, in registration order."""
        return list(map(lambda tool: tool.to_params(), self.tools))

    async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """
        Invoke the tool registered as `name` with `tool_input` as keyword arguments.

        An unknown name or a ToolError raised by the tool is reported as a
        ToolFailure result rather than an exception.
        """
        selected = self.tool_map.get(name)
        if selected:
            try:
                return await selected(**tool_input)
            except ToolError as exc:
                return ToolFailure(error=exc.message)
        return ToolFailure(error=f"Tool {name} is invalid")
class BaseComputerTool:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    The tool parameters are defined by Anthropic and are not editable.

    Screen size comes from the WIDTH / HEIGHT environment variables and the X
    display from DISPLAY_NUM. Input is driven through `xdotool` shell commands.
    """

    name: Literal["computer"] = "computer"
    width: int
    height: int
    display_num: int | None

    # seconds to wait after an action before the follow-up screenshot
    _screenshot_delay = 2.0
    # when True, coordinates and screenshots are scaled to a MAX_SCALING_TARGETS size
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Display options advertised to the API, expressed in scaled pixels."""
        width, height = self.scale_coordinates(
            ScalingSource.COMPUTER, self.width, self.height
        )
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        self.xdotool = f"{self._display_prefix}xdotool"

    async def __call__(
        self,
        *,
        action: Action_20241022,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        start_coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """
        Perform one computer action and return its ToolResult.

        Args:
            action: the action to perform.
            text: key combination (`key`) or text to type (`type`); rejected
                for every other action.
            coordinate: target position for `mouse_move` / `left_click_drag`;
                rejected for every other action handled here.
            start_coordinate: drag origin, required for `left_click_drag`.

        Raises:
            ToolError: on a missing, superfluous or malformed argument, or an
                unknown action.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")

            if action == "left_click_drag":
                if start_coordinate is None:
                    raise ToolError(f"start_coordinate is required for {action}")
                start_x, start_y = self.validate_and_get_coordinates(start_coordinate)
                end_x, end_y = self.validate_and_get_coordinates(coordinate)
                command_parts = [
                    self.xdotool,
                    f"mousemove --sync {start_x} {start_y} mousedown 1 mousemove --sync {end_x} {end_y} mouseup 1",
                ]
                return await self.shell(" ".join(command_parts))
            elif action == "mouse_move":
                x, y = self.validate_and_get_coordinates(coordinate)
                command_parts = [self.xdotool, f"mousemove --sync {x} {y}"]
                return await self.shell(" ".join(command_parts))

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError takes a positional message; passing `output=`
                # raised a TypeError that bypassed ToolError handling.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                command_parts = [self.xdotool, f"key -- {text}"]
                return await self.shell(" ".join(command_parts))
            elif action == "type":
                # type in chunks, and only screenshot once at the end
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    command_parts = [
                        self.xdotool,
                        f"type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}",
                    ]
                    results.append(
                        await self.shell(" ".join(command_parts), take_screenshot=False)
                    )
                screenshot_base64 = (await self.screenshot()).base64_image
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                command_parts = [self.xdotool, "getmouselocation --shell"]
                result = await self.shell(
                    " ".join(command_parts),
                    take_screenshot=False,
                )
                output = result.output or ""
                # parse the X=/Y= lines of the --shell output and report the
                # position in scaled coordinates
                x, y = self.scale_coordinates(
                    ScalingSource.COMPUTER,
                    int(output.split("X=")[1].split("\n")[0]),
                    int(output.split("Y=")[1].split("\n")[0]),
                )
                return result.replace(output=f"X={x},Y={y}")
            else:
                command_parts = [self.xdotool, f"click {CLICK_BUTTONS[action]}"]
                return await self.shell(" ".join(command_parts))

        raise ToolError(f"Invalid action: {action}")

    def validate_and_get_coordinates(self, coordinate: tuple[int, int] | None = None):
        """
        Validate an API-supplied coordinate pair and convert it to screen pixels.

        Accepts a list (as decoded from JSON) or a tuple of two non-negative ints.

        Raises:
            ToolError: if the value is not a two-element sequence of
                non-negative ints (or is out of bounds, see `scale_coordinates`).
        """
        if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
            raise ToolError(f"{coordinate} must be a tuple of length 2")
        if not all(isinstance(i, int) and i >= 0 for i in coordinate):
            raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

        return self.scale_coordinates(ScalingSource.API, coordinate[0], coordinate[1])

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image."""
        output_dir = Path(OUTPUT_DIR)
        await asyncio.to_thread(output_dir.mkdir, parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        # Try gnome-screenshot first
        if shutil.which("gnome-screenshot"):
            screenshot_cmd = f"{self._display_prefix}gnome-screenshot -f {path} -p"
        else:
            # Fall back to scrot if gnome-screenshot isn't available
            screenshot_cmd = f"{self._display_prefix}scrot -p {path}"

        result = await self.shell(screenshot_cmd, take_screenshot=False)
        if self._scaling_enabled:
            # resize the capture in place to the scaled resolution
            x, y = self.scale_coordinates(
                ScalingSource.COMPUTER, self.width, self.height
            )
            await self.shell(
                f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False
            )

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        _, stdout, stderr = await run(command)
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            await asyncio.sleep(self._screenshot_delay)
            base64_image = (await self.screenshot()).base64_image

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """
        Scale coordinates to a target maximum resolution.

        `ScalingSource.API` values are scaled up to real screen pixels;
        `ScalingSource.COMPUTER` values are scaled down to the target size.
        Coordinates are returned unchanged when scaling is disabled or no
        smaller target with a matching aspect ratio exists.

        Raises:
            ToolError: if an API coordinate exceeds the screen width/height.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
return cast( BetaToolUnionParam, {"name": self.name, "type": self.api_type, **self.options}, ) async def __call__( self, *, action: Action_20250124, text: str | None = None, coordinate: tuple[int, int] | None = None, start_coordinate: tuple[int, int] | None = None, scroll_direction: ScrollDirection | None = None, scroll_amount: int | None = None, duration: int | float | None = None, key: str | None = None, **kwargs, ): if action in ("left_mouse_down", "left_mouse_up"): if coordinate is not None: raise ToolError(f"coordinate is not accepted for {action=}.") command_parts = [ self.xdotool, f"{'mousedown' if action == 'left_mouse_down' else 'mouseup'} 1", ] return await self.shell(" ".join(command_parts)) if action == "scroll": if scroll_direction is None or scroll_direction not in get_args( ScrollDirection ): raise ToolError( f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'" ) if not isinstance(scroll_amount, int) or scroll_amount < 0: raise ToolError(f"{scroll_amount=} must be a non-negative int") mouse_move_part = "" if coordinate is not None: x, y = self.validate_and_get_coordinates(coordinate) mouse_move_part = f"mousemove --sync {x} {y}" scroll_button = { "up": 4, "down": 5, "left": 6, "right": 7, }[scroll_direction] command_parts = [self.xdotool, mouse_move_part] if text: command_parts.append(f"keydown {text}") command_parts.append(f"click --repeat {scroll_amount} {scroll_button}") if text: command_parts.append(f"keyup {text}") return await self.shell(" ".join(command_parts)) if action in ("hold_key", "wait"): if duration is None or not isinstance(duration, (int, float)): raise ToolError(f"{duration=} must be a number") if duration < 0: raise ToolError(f"{duration=} must be non-negative") if duration > 100: raise ToolError(f"{duration=} is too long.") if action == "hold_key": if text is None: raise ToolError(f"text is required for {action}") escaped_keys = shlex.quote(text) command_parts = [ self.xdotool, f"keydown {escaped_keys}", f"sleep 
{duration}", f"keyup {escaped_keys}", ] return await self.shell(" ".join(command_parts)) if action == "wait": await asyncio.sleep(duration) return await self.screenshot() if action in ( "left_click", "right_click", "double_click", "triple_click", "middle_click", ): if text is not None: raise ToolError(f"text is not accepted for {action}") mouse_move_part = "" if coordinate is not None: x, y = self.validate_and_get_coordinates(coordinate) mouse_move_part = f"mousemove --sync {x} {y}" command_parts = [self.xdotool, mouse_move_part] if key: command_parts.append(f"keydown {key}") command_parts.append(f"click {CLICK_BUTTONS[action]}") if key: command_parts.append(f"keyup {key}") return await self.shell(" ".join(command_parts)) return await super().__call__( action=action, text=text, coordinate=coordinate, start_coordinate=start_coordinate, key=key, **kwargs, ) class ComputerTool20251124(ComputerTool20250124): api_type: Literal["computer_20251124"] = "computer_20251124" # pyright: ignore[reportIncompatibleVariableOverride] @property def options(self) -> ComputerToolOptions: # pyright: ignore[reportIncompatibleMethodOverride] return {**super().options, "enable_zoom": True} # pyright: ignore[reportReturnType] async def __call__( self, *, action: Action_20251124, text: str | None = None, coordinate: tuple[int, int] | None = None, scroll_direction: ScrollDirection | None = None, scroll_amount: int | None = None, duration: int | float | None = None, key: str | None = None, region: tuple[int, int, int, int] | None = None, **kwargs, ): if action == "zoom": if ( region is None or not isinstance(region, (list, tuple)) or len(region) != 4 ): raise ToolError( f"{region=} must be a tuple of 4 coordinates (x0, y0, x1, y1)" ) if not all(isinstance(c, int) and c >= 0 for c in region): raise ToolError(f"{region=} must contain non-negative integers") x0, y0, x1, y1 = region # Scale coordinates from API space to screen space x0, y0 = self.scale_coordinates(ScalingSource.API, x0, y0) x1, 
y1 = self.scale_coordinates(ScalingSource.API, x1, y1) # Take a screenshot and crop to the specified region screenshot_result = await self.screenshot() if not screenshot_result.base64_image: raise ToolError("Failed to take screenshot for zoom") # Crop the image using ImageMagick convert output_dir = Path(OUTPUT_DIR) temp_path = output_dir / f"screenshot_{uuid4().hex}.png" cropped_path = output_dir / f"zoomed_{uuid4().hex}.png" # Write the screenshot to a temp file temp_path.write_bytes(base64.b64decode(screenshot_result.base64_image)) # Crop using ImageMagick: convert input -crop WxH+X+Y output width = x1 - x0 height = y1 - y0 crop_cmd = f"convert {temp_path} -crop {width}x{height}+{x0}+{y0} +repage {cropped_path}" await run(crop_cmd) if cropped_path.exists(): cropped_base64 = base64.b64encode(cropped_path.read_bytes()).decode() temp_path.unlink(missing_ok=True) cropped_path.unlink(missing_ok=True) return ToolResult(base64_image=cropped_base64) raise ToolError("Failed to crop screenshot for zoom") return await super().__call__( action=action, text=text, coordinate=coordinate, scroll_direction=scroll_direction, scroll_amount=scroll_amount, duration=duration, key=key, **kwargs, ) ================================================ FILE: computer-use-demo/computer_use_demo/tools/edit.py ================================================ import asyncio from collections import defaultdict from pathlib import Path from typing import Any, Literal, get_args from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult from .run import maybe_truncate, run Command = Literal[ "view", "create", "str_replace", "insert", ] SNIPPET_LINES: int = 4 class EditTool20250728(BaseAnthropicTool): """ A filesystem editor tool that allows the agent to view, create, and edit files. The tool parameters are defined by Anthropic and are not editable. 
This implements the text_editor_20250728 tool version, which: - Uses `insert_text` (not `new_str`) for the insert command - Uses `str_replace_based_edit_tool` as the tool name - Does not support `undo_edit` """ api_type: Literal["text_editor_20250728"] = "text_editor_20250728" name: Literal["str_replace_based_edit_tool"] = "str_replace_based_edit_tool" _file_history: dict[Path, list[str]] def __init__(self): self._file_history = defaultdict(list) super().__init__() def to_params(self) -> Any: return { "name": self.name, "type": self.api_type, } async def __call__( self, *, command: Command, path: str, file_text: str | None = None, view_range: list[int] | None = None, old_str: str | None = None, new_str: str | None = None, insert_line: int | None = None, insert_text: str | None = None, **kwargs, ): _path = Path(path) self.validate_path(command, _path) if command == "view": return await self.view(_path, view_range) elif command == "create": if file_text is None: raise ToolError("Parameter `file_text` is required for command: create") self.write_file(_path, file_text) self._file_history[_path].append(file_text) return ToolResult(output=f"File created successfully at: {_path}") elif command == "str_replace": if old_str is None: raise ToolError( "Parameter `old_str` is required for command: str_replace" ) return self.str_replace(_path, old_str, new_str) elif command == "insert": if insert_line is None: raise ToolError( "Parameter `insert_line` is required for command: insert" ) if insert_text is None: raise ToolError( "Parameter `insert_text` is required for command: insert" ) return self.insert(_path, insert_line, insert_text) raise ToolError( f"Unrecognized command {command}. The allowed commands for the {self.name} tool are: {', '.join(get_args(Command))}" ) def validate_path(self, command: str, path: Path): """ Check that the path/command combination is valid. 
""" # Check if its an absolute path if not path.is_absolute(): suggested_path = Path("") / path raise ToolError( f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?" ) # Check if path exists if not path.exists() and command != "create": raise ToolError( f"The path {path} does not exist. Please provide a valid path." ) if path.exists() and command == "create": raise ToolError( f"File already exists at: {path}. Cannot overwrite files using command `create`." ) # Check if the path points to a directory if path.is_dir(): if command != "view": raise ToolError( f"The path {path} is a directory and only the `view` command can be used on directories" ) async def view(self, path: Path, view_range: list[int] | None = None): """Implement the view command""" if await asyncio.to_thread(path.is_dir): if view_range: raise ToolError( "The `view_range` parameter is not allowed when `path` points to a directory." ) _, stdout, stderr = await run( rf"find {path} -maxdepth 2 -not -path '*/\.*'" ) if not stderr: stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n" return CLIResult(output=stdout, error=stderr) file_content = self.read_file(path) init_line = 1 if view_range: if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range): raise ToolError( "Invalid `view_range`. It should be a list of two integers." ) file_lines = file_content.split("\n") n_lines_file = len(file_lines) init_line, final_line = view_range if init_line < 1 or init_line > n_lines_file: raise ToolError( f"Invalid `view_range`: {view_range}. Its first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}" ) if final_line > n_lines_file: raise ToolError( f"Invalid `view_range`: {view_range}. 
Its second element `{final_line}` should be smaller than the number of lines in the file: `{n_lines_file}`" ) if final_line != -1 and final_line < init_line: raise ToolError( f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be larger or equal than its first `{init_line}`" ) if final_line == -1: file_content = "\n".join(file_lines[init_line - 1 :]) else: file_content = "\n".join(file_lines[init_line - 1 : final_line]) return CLIResult( output=self._make_output(file_content, str(path), init_line=init_line) ) def str_replace(self, path: Path, old_str: str, new_str: str | None): """Implement the str_replace command, which replaces old_str with new_str in the file content""" # Read the file content file_content = self.read_file(path).expandtabs() old_str = old_str.expandtabs() new_str = new_str.expandtabs() if new_str is not None else "" # Check if old_str is unique in the file occurrences = file_content.count(old_str) if occurrences == 0: raise ToolError( f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}." ) elif occurrences > 1: file_content_lines = file_content.split("\n") lines = [ idx + 1 for idx, line in enumerate(file_content_lines) if old_str in line ] raise ToolError( f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. 
Please ensure it is unique" ) # Replace old_str with new_str new_file_content = file_content.replace(old_str, new_str) # Write the new content to the file self.write_file(path, new_file_content) # Save the content to history self._file_history[path].append(file_content) # Create a snippet of the edited section replacement_line = file_content.split(old_str)[0].count("\n") start_line = max(0, replacement_line - SNIPPET_LINES) end_line = replacement_line + SNIPPET_LINES + new_str.count("\n") snippet = "\n".join(new_file_content.split("\n")[start_line : end_line + 1]) # Prepare the success message success_msg = f"The file {path} has been edited. " success_msg += self._make_output( snippet, f"a snippet of {path}", start_line + 1 ) success_msg += "Review the changes and make sure they are as expected. Edit the file again if necessary." return CLIResult(output=success_msg) def insert(self, path: Path, insert_line: int, new_str: str): """Implement the insert command, which inserts new_str at the specified line in the file content.""" file_text = self.read_file(path).expandtabs() new_str = new_str.expandtabs() file_text_lines = file_text.split("\n") n_lines_file = len(file_text_lines) if insert_line < 0 or insert_line > n_lines_file: raise ToolError( f"Invalid `insert_line` parameter: {insert_line}. It should be within the range of lines of the file: {[0, n_lines_file]}" ) new_str_lines = new_str.split("\n") new_file_text_lines = ( file_text_lines[:insert_line] + new_str_lines + file_text_lines[insert_line:] ) snippet_lines = ( file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line] + new_str_lines + file_text_lines[insert_line : insert_line + SNIPPET_LINES] ) new_file_text = "\n".join(new_file_text_lines) snippet = "\n".join(snippet_lines) self.write_file(path, new_file_text) self._file_history[path].append(file_text) success_msg = f"The file {path} has been edited. 
" success_msg += self._make_output( snippet, "a snippet of the edited file", max(1, insert_line - SNIPPET_LINES + 1), ) success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary." return CLIResult(output=success_msg) def read_file(self, path: Path): """Read the content of a file from a given path; raise a ToolError if an error occurs.""" try: return path.read_text() except Exception as e: raise ToolError(f"Ran into {e} while trying to read {path}") from None def write_file(self, path: Path, file: str): """Write the content of a file to a given path; raise a ToolError if an error occurs.""" try: path.write_text(file) except Exception as e: raise ToolError(f"Ran into {e} while trying to write to {path}") from None def _make_output( self, file_content: str, file_descriptor: str, init_line: int = 1, expand_tabs: bool = True, ): """Generate output for the CLI based on the content of a file.""" file_content = maybe_truncate(file_content) if expand_tabs: file_content = file_content.expandtabs() file_content = "\n".join( [ f"{i + init_line:6}\t{line}" for i, line in enumerate(file_content.split("\n")) ] ) return ( f"Here's the result of running `cat -n` on {file_descriptor}:\n" + file_content + "\n" ) ================================================ FILE: computer-use-demo/computer_use_demo/tools/groups.py ================================================ from dataclasses import dataclass from typing import Literal from .base import BaseAnthropicTool from .bash import BashTool20241022, BashTool20250124 from .computer import ComputerTool20241022, ComputerTool20250124, ComputerTool20251124 from .edit import EditTool20250728 ToolVersion = Literal[ "computer_use_20250124", "computer_use_20241022", "computer_use_20250429", "computer_use_20251124", ] BetaFlag = Literal[ "computer-use-2024-10-22", "computer-use-2025-01-24", "computer-use-2025-04-29", "computer-use-2025-11-24", ] 
@dataclass(frozen=True, kw_only=True) class ToolGroup: version: ToolVersion tools: list[type[BaseAnthropicTool]] beta_flag: BetaFlag | None = None TOOL_GROUPS: list[ToolGroup] = [ ToolGroup( version="computer_use_20241022", tools=[ComputerTool20241022, EditTool20250728, BashTool20241022], beta_flag="computer-use-2024-10-22", ), ToolGroup( version="computer_use_20250124", tools=[ComputerTool20250124, EditTool20250728, BashTool20250124], beta_flag="computer-use-2025-01-24", ), ToolGroup( version="computer_use_20250429", tools=[ComputerTool20250124, EditTool20250728, BashTool20250124], beta_flag="computer-use-2025-01-24", ), ToolGroup( version="computer_use_20251124", tools=[ComputerTool20251124, EditTool20250728, BashTool20250124], beta_flag="computer-use-2025-11-24", ), ] TOOL_GROUPS_BY_VERSION = {tool_group.version: tool_group for tool_group in TOOL_GROUPS} ================================================ FILE: computer-use-demo/computer_use_demo/tools/run.py ================================================ """Utility to run shell commands asynchronously with a timeout.""" import asyncio TRUNCATED_MESSAGE: str = "To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for." 
MAX_RESPONSE_LEN: int = 16000 def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): """Truncate content and append a notice if content exceeds the specified length.""" return ( content if not truncate_after or len(content) <= truncate_after else content[:truncate_after] + TRUNCATED_MESSAGE ) async def run( cmd: str, timeout: float | None = 120.0, # seconds truncate_after: int | None = MAX_RESPONSE_LEN, ): """Run a shell command asynchronously with a timeout.""" process = await asyncio.create_subprocess_shell( cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) try: stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) return ( process.returncode or 0, maybe_truncate(stdout.decode(), truncate_after=truncate_after), maybe_truncate(stderr.decode(), truncate_after=truncate_after), ) except asyncio.TimeoutError as exc: try: process.kill() except ProcessLookupError: pass raise TimeoutError( f"Command '{cmd}' timed out after {timeout} seconds" ) from exc ================================================ FILE: computer-use-demo/dev-requirements.txt ================================================ -r computer_use_demo/requirements.txt ruff==0.6.7 pre-commit==3.8.0 pytest==8.3.3 pytest-asyncio==0.23.6 ================================================ FILE: computer-use-demo/image/.config/tint2/applications/firefox-custom.desktop ================================================ [Desktop Entry] Name=Firefox Custom Comment=Open Firefox with custom URL Exec=firefox-esr -new-window Icon=firefox-esr Terminal=false Type=Application Categories=Network;WebBrowser; ================================================ FILE: computer-use-demo/image/.config/tint2/applications/gedit.desktop ================================================ [Desktop Entry] Name=Gedit Comment=Open gedit Exec=gedit Icon=text-editor-symbolic Terminal=false Type=Application Categories=TextEditor; ================================================ 
FILE: computer-use-demo/image/.config/tint2/applications/terminal.desktop ================================================ [Desktop Entry] Name=Terminal Comment=Open Terminal Exec=xterm Icon=utilities-terminal Terminal=false Type=Application Categories=System;TerminalEmulator; ================================================ FILE: computer-use-demo/image/.config/tint2/tint2rc ================================================ #------------------------------------- # Panel panel_items = TL panel_size = 100% 60 panel_margin = 0 0 panel_padding = 2 0 2 panel_background_id = 1 wm_menu = 0 panel_dock = 0 panel_position = bottom center horizontal panel_layer = top panel_monitor = all panel_shrink = 0 autohide = 0 autohide_show_timeout = 0 autohide_hide_timeout = 0.5 autohide_height = 2 strut_policy = follow_size panel_window_name = tint2 disable_transparency = 1 mouse_effects = 1 font_shadow = 0 mouse_hover_icon_asb = 100 0 10 mouse_pressed_icon_asb = 100 0 0 scale_relative_to_dpi = 0 scale_relative_to_screen_height = 0 #------------------------------------- # Taskbar taskbar_mode = single_desktop taskbar_hide_if_empty = 0 taskbar_padding = 0 0 2 taskbar_background_id = 0 taskbar_active_background_id = 0 taskbar_name = 1 taskbar_hide_inactive_tasks = 0 taskbar_hide_different_monitor = 0 taskbar_hide_different_desktop = 0 taskbar_always_show_all_desktop_tasks = 0 taskbar_name_padding = 4 2 taskbar_name_background_id = 0 taskbar_name_active_background_id = 0 taskbar_name_font_color = #e3e3e3 100 taskbar_name_active_font_color = #ffffff 100 taskbar_distribute_size = 0 taskbar_sort_order = none task_align = left #------------------------------------- # Launcher launcher_padding = 4 8 4 launcher_background_id = 0 launcher_icon_background_id = 0 launcher_icon_size = 48 launcher_icon_asb = 100 0 0 launcher_icon_theme_override = 0 startup_notifications = 1 launcher_tooltip = 1 #------------------------------------- # Launcher icon launcher_item_app = 
/usr/share/applications/libreoffice-calc.desktop launcher_item_app = /home/computeruse/.config/tint2/applications/terminal.desktop launcher_item_app = /home/computeruse/.config/tint2/applications/firefox-custom.desktop launcher_item_app = /usr/share/applications/xpaint.desktop launcher_item_app = /usr/share/applications/xpdf.desktop launcher_item_app = /home/computeruse/.config/tint2/applications/gedit.desktop launcher_item_app = /usr/share/applications/galculator.desktop #------------------------------------- # Background definitions # ID 1 rounded = 0 border_width = 0 background_color = #000000 60 border_color = #000000 30 # ID 2 rounded = 4 border_width = 1 background_color = #777777 20 border_color = #777777 30 # ID 3 rounded = 4 border_width = 1 background_color = #777777 20 border_color = #ffffff 40 # ID 4 rounded = 4 border_width = 1 background_color = #aa4400 100 border_color = #aa7733 100 # ID 5 rounded = 4 border_width = 1 background_color = #aaaa00 100 border_color = #aaaa00 100 ================================================ FILE: computer-use-demo/image/.streamlit/config.toml ================================================ [server] fileWatcherType = "auto" runOnSave = true [browser] gatherUsageStats = false ================================================ FILE: computer-use-demo/image/entrypoint.sh ================================================ #!/bin/bash set -e ./start_all.sh ./novnc_startup.sh python http_server.py > /tmp/server_logs.txt 2>&1 & STREAMLIT_SERVER_PORT=8501 python -m streamlit run computer_use_demo/streamlit.py > /tmp/streamlit_stdout.log & echo "✨ Computer Use Demo is ready!" 
echo "➡️ Open http://localhost:8080 in your browser to begin" # Keep the container running tail -f /dev/null ================================================ FILE: computer-use-demo/image/http_server.py ================================================ import os import socket from http.server import HTTPServer, SimpleHTTPRequestHandler class HTTPServerV6(HTTPServer): address_family = socket.AF_INET6 def run_server(): os.chdir(os.path.dirname(__file__) + "/static_content") server_address = ("::", 8080) httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler) print("Starting HTTP server on port 8080...") # noqa: T201 httpd.serve_forever() if __name__ == "__main__": run_server() ================================================ FILE: computer-use-demo/image/index.html ================================================ Computer Use Demo
================================================
FILE: computer-use-demo/image/mutter_startup.sh
================================================
#!/bin/bash
# Start the mutter window manager and wait until its window is visible to xdotool.
echo "starting mutter"
XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log &

# Wait for mutter window properties to appear
timeout=30
while [ $timeout -gt 0 ]; do
    if xdotool search --class "mutter" >/dev/null 2>&1; then
        break
    fi
    sleep 1
    ((timeout--))
done

# The counter only reaches 0 when every poll failed.
if [ $timeout -eq 0 ]; then
    echo "mutter stderr output:" >&2
    cat /tmp/mutter_stderr.log >&2
    exit 1
fi

rm /tmp/mutter_stderr.log


================================================
FILE: computer-use-demo/image/novnc_startup.sh
================================================
#!/bin/bash
echo "starting noVNC"

# Start noVNC with explicit websocket settings
/opt/noVNC/utils/novnc_proxy \
    --vnc localhost:5900 \
    --listen 6080 \
    --web /opt/noVNC \
    > /tmp/novnc.log 2>&1 &

# Wait for noVNC to start
timeout=10
while [ $timeout -gt 0 ]; do
    if netstat -tuln | grep -q ":6080 "; then
        break
    fi
    sleep 1
    ((timeout--))
done

if [ $timeout -eq 0 ]; then
    # Port never showed up: report it instead of claiming success. The script
    # still exits 0, as before, so callers running under `set -e` keep going.
    echo "noVNC did not start listening on port 6080 in time, log output:" >&2
    cat /tmp/novnc.log >&2
else
    echo "noVNC started successfully"
fi


================================================
FILE: computer-use-demo/image/start_all.sh
================================================
#!/bin/bash

set -e

# Point X clients at the display and start the desktop components in order.
export DISPLAY=:${DISPLAY_NUM}
./xvfb_startup.sh
./tint2_startup.sh
./mutter_startup.sh
./x11vnc_startup.sh


================================================
FILE: computer-use-demo/image/static_content/index.html
================================================
Computer Use Demo
================================================
FILE: computer-use-demo/image/tint2_startup.sh
================================================
#!/bin/bash
# Start the tint2 panel and wait (up to 30 one-second polls) for its window to appear.
echo "starting tint2 on display :$DISPLAY_NUM ..."

# Start tint2 and capture its stderr
tint2 -c $HOME/.config/tint2/tint2rc 2>/tmp/tint2_stderr.log &

# Wait for tint2 window properties to appear
timeout=30
while [ $timeout -gt 0 ]; do
    if xdotool search --class "tint2" >/dev/null 2>&1; then
        break
    fi
    sleep 1
    ((timeout--))
done

# The counter only reaches 0 when every poll failed: dump stderr and fail.
if [ $timeout -eq 0 ]; then
    echo "tint2 stderr output:" >&2
    cat /tmp/tint2_stderr.log >&2
    exit 1
fi

# Remove the temporary stderr log file
rm /tmp/tint2_stderr.log


================================================
FILE: computer-use-demo/image/x11vnc_startup.sh
================================================
#!/bin/bash
# Start x11vnc on port 5900, wait for the port to open, then watch the process
# in the background and re-run this script if it dies.
echo "starting vnc"

(x11vnc -display $DISPLAY \
    -forever \
    -shared \
    -wait 50 \
    -rfbport 5900 \
    -nopw \
    2>/tmp/x11vnc_stderr.log) &

# PID of the background job started above.
x11vnc_pid=$!

# Wait for x11vnc to start
timeout=10
while [ $timeout -gt 0 ]; do
    if netstat -tuln | grep -q ":5900 "; then
        break
    fi
    sleep 1
    ((timeout--))
done

if [ $timeout -eq 0 ]; then
    echo "x11vnc failed to start, stderr output:" >&2
    cat /tmp/x11vnc_stderr.log >&2
    exit 1
fi

# Truncate the stderr log now that startup succeeded.
: > /tmp/x11vnc_stderr.log

# Monitor x11vnc process in the background
(
    while true; do
        # `kill -0` sends no signal; it only tests whether the PID still exists.
        if ! kill -0 $x11vnc_pid 2>/dev/null; then
            echo "x11vnc process crashed, restarting..." >&2
            if [ -f /tmp/x11vnc_stderr.log ]; then
                echo "x11vnc stderr output:" >&2
                cat /tmp/x11vnc_stderr.log >&2
                rm /tmp/x11vnc_stderr.log
            fi
            # Replace this monitor subshell with a fresh run of the script.
            exec "$0"
        fi
        sleep 5
    done
) &


================================================
FILE: computer-use-demo/image/xvfb_startup.sh
================================================
#!/bin/bash
# Start an Xvfb virtual display sized from WIDTH/HEIGHT unless one is already running.

set -e  # Exit on error

DPI=96
RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24

# Function to check if Xvfb is already running
# (tests for the X lock file of display DISPLAY_NUM; returns 0 when it exists)
check_xvfb_running() {
    if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then
        return 0  # Xvfb is already running
    else
        return 1  # Xvfb is not running
    fi
}

# Function to check if Xvfb is ready
# (polls xdpyinfo every 0.1s; returns 1 once more than $timeout seconds have elapsed)
wait_for_xvfb() {
    local timeout=10
    local start_time=$(date +%s)
    while ! xdpyinfo >/dev/null 2>&1; do
        if [ $(($(date +%s) - start_time)) -gt $timeout ]; then
            echo "Xvfb failed to start within $timeout seconds" >&2
            return 1
        fi
        sleep 0.1
    done
    return 0
}

# Check if Xvfb is already running
if check_xvfb_running; then
    echo "Xvfb is already running on display ${DISPLAY}"
    exit 0
fi

# Start Xvfb
Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix &
XVFB_PID=$!

# Wait for Xvfb to start
if wait_for_xvfb; then
    echo "Xvfb started successfully on display ${DISPLAY}"
    echo "Xvfb PID: $XVFB_PID"
else
    echo "Xvfb failed to start"
    kill $XVFB_PID
    exit 1
fi


================================================
FILE: computer-use-demo/pyproject.toml
================================================
[tool.pyright]
venvPath = "."
venv = ".venv"
useLibraryCodeForTypes = false

[tool.pytest.ini_options]
pythonpath = "."
asyncio_mode = "auto"


================================================
FILE: computer-use-demo/ruff.toml
================================================
extend-exclude = [".venv"]

[format]
docstring-code-format = true

[lint]
select = [
    "A",
    "ASYNC",
    "B",
    "E",
    "F",
    "I",
    "PIE",
    "RUF200",
    "T20",
    "UP",
    "W",
]

ignore = ["E501", "ASYNC230"]

[lint.isort]
combine-as-imports = true


================================================
FILE: computer-use-demo/setup.sh
================================================
#!/bin/bash
# Create a virtualenv and install the development dependencies.

# Second dot-separated field of `python3 --version`, i.e. the minor version.
PYTHON_MINOR_VERSION=$(python3 --version | awk -F. '{print $2}')

if [ "$PYTHON_MINOR_VERSION" -gt 12 ]; then
    echo "Python version 3.$PYTHON_MINOR_VERSION detected. Python 3.12 or lower is required for setup to complete."
    echo "If you have multiple versions of Python installed, you can set the correct one by adjusting setup.sh to use a specific version, for example:"
    echo "'python3 -m venv .venv' -> 'python3.12 -m venv .venv'"
    exit 1
fi

if ! command -v cargo &> /dev/null; then
    echo "Cargo (the package manager for Rust) is not present. This is required for one of this module's dependencies."
    echo "See https://www.rust-lang.org/tools/install for installation instructions."
    exit 1
fi

python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install -r dev-requirements.txt
pre-commit install


================================================
FILE: computer-use-demo/tests/conftest.py
================================================
import os
from unittest import mock

import pytest


@pytest.fixture(autouse=True)
def mock_screen_dimensions():
    """Set HEIGHT, WIDTH and DISPLAY_NUM in the environment for every test."""
    with mock.patch.dict(
        os.environ, {"HEIGHT": "768", "WIDTH": "1024", "DISPLAY_NUM": "1"}
    ):
        yield


================================================
FILE: computer-use-demo/tests/loop_test.py
================================================
from unittest import mock

from anthropic.types import TextBlock, ToolUseBlock
from anthropic.types.beta import BetaMessage, BetaMessageParam, BetaTextBlockParam

from computer_use_demo.loop import APIProvider, sampling_loop


async def test_loop():
    """Run sampling_loop against a mocked client: one tool-use turn, then a final text turn."""
    client = mock.Mock()
    client.beta.messages.with_raw_response.create.return_value = mock.Mock()
    # First response requests a tool call; the second ends the conversation.
    client.beta.messages.with_raw_response.create.return_value.parse.side_effect = [
        mock.Mock(
            spec=BetaMessage,
            content=[
                TextBlock(type="text", text="Hello"),
                ToolUseBlock(
                    type="tool_use", id="1", name="computer", input={"action": "test"}
                ),
            ],
        ),
        mock.Mock(spec=BetaMessage, content=[TextBlock(type="text", text="Done!")]),
    ]

    tool_collection = mock.AsyncMock()
    tool_collection.run.return_value = mock.Mock(
        output="Tool output", error=None, base64_image=None
    )

    output_callback = mock.Mock()
    tool_output_callback = mock.Mock()
    api_response_callback = mock.Mock()

    with (
        mock.patch("computer_use_demo.loop.Anthropic", return_value=client),
        mock.patch(
            "computer_use_demo.loop.ToolCollection", return_value=tool_collection
        ),
    ):
        messages: list[BetaMessageParam] = [{"role": "user", "content": "Test message"}]
        result = await sampling_loop(
            model="test-model",
            provider=APIProvider.ANTHROPIC,
            system_prompt_suffix="",
            messages=messages,
            output_callback=output_callback,
            tool_output_callback=tool_output_callback,
            api_response_callback=api_response_callback,
            api_key="test-key",
            tool_version="computer_use_20250124",
        )

        # Expected transcript: user, assistant (tool use), user (tool result), assistant.
        assert len(result) == 4
        assert result[0] == {"role": "user", "content": "Test message"}
        assert result[1]["role"] == "assistant"
        assert result[2]["role"] == "user"
        assert result[3]["role"] == "assistant"

        assert client.beta.messages.with_raw_response.create.call_count == 2
        tool_collection.run.assert_called_once_with(
            name="computer", tool_input={"action": "test"}
        )
        output_callback.assert_called_with(
            BetaTextBlockParam(text="Done!", type="text", citations=None)
        )
        assert output_callback.call_count == 3
        assert tool_output_callback.call_count == 1
        assert api_response_callback.call_count == 2


================================================
FILE: computer-use-demo/tests/streamlit_test.py
================================================
from unittest import mock

import pytest
from anthropic.types import TextBlockParam
from streamlit.testing.v1 import AppTest

from computer_use_demo.streamlit import Sender


@pytest.fixture
def streamlit_app():
    """Build an AppTest harness around the Streamlit entry point."""
    return AppTest.from_file("computer_use_demo/streamlit.py")


def test_streamlit(streamlit_app: AppTest):
    """Submitting a chat message calls sampling_loop with that message as the only user turn."""
    streamlit_app.run()
    # presumably text_input[1] is the API-key field - verify against streamlit.py
    streamlit_app.text_input[1].set_value("sk-ant-0000000000000").run()
    with mock.patch("computer_use_demo.loop.sampling_loop") as patch:
        streamlit_app.chat_input[0].set_value("Hello").run()
        assert patch.called
        assert patch.call_args.kwargs["messages"] == [
            {
                "role": Sender.USER,
                "content": [TextBlockParam(text="Hello", type="text")],
            }
        ]
        assert not streamlit_app.exception


================================================
FILE: computer-use-demo/tests/tools/bash_test.py
================================================
import pytest
from computer_use_demo.tools.bash import BashTool20241022, BashTool20250124, ToolError


@pytest.fixture(params=[BashTool20241022, BashTool20250124])
def bash_tool(request):
    """Yield a fresh instance of each bash tool version."""
    return request.param()


@pytest.mark.asyncio
async def test_bash_tool_restart(bash_tool):
    """Restarting reports success and the tool remains usable afterwards."""
    result = await bash_tool(restart=True)
    assert result.system == "tool has been restarted."

    # Verify the tool can be used after restart
    result = await bash_tool(command="echo 'Hello after restart'")
    assert "Hello after restart" in result.output


@pytest.mark.asyncio
async def test_bash_tool_run_command(bash_tool):
    """A simple echo returns its text on stdout and an empty error string."""
    result = await bash_tool(command="echo 'Hello, World!'")
    assert result.output.strip() == "Hello, World!"
    assert result.error == ""


@pytest.mark.asyncio
async def test_bash_tool_no_command(bash_tool):
    """Calling the tool without a command raises ToolError."""
    with pytest.raises(ToolError, match="no command provided."):
        await bash_tool()


@pytest.mark.asyncio
async def test_bash_tool_session_creation(bash_tool):
    """The first command creates the underlying session."""
    result = await bash_tool(command="echo 'Session created'")
    assert bash_tool._session is not None
    assert "Session created" in result.output


@pytest.mark.asyncio
async def test_bash_tool_session_reuse(bash_tool):
    """Two consecutive commands both return their own output."""
    result1 = await bash_tool(command="echo 'First command'")
    result2 = await bash_tool(command="echo 'Second command'")

    assert "First command" in result1.output
    assert "Second command" in result2.output


@pytest.mark.asyncio
async def test_bash_tool_session_error(bash_tool):
    """An unknown command surfaces the shell's 'command not found' on the error stream."""
    result = await bash_tool(command="invalid_command_that_does_not_exist")
    assert "command not found" in result.error


@pytest.mark.asyncio
async def test_bash_tool_non_zero_exit(bash_tool):
    """A non-zero exit status alone produces neither output nor error text."""
    result = await bash_tool(command="bash -c 'exit 1'")
    assert result.error.strip() == ""
    assert result.output.strip() == ""


@pytest.mark.asyncio
async def test_bash_tool_timeout(bash_tool):
    """A command that outlives the session timeout raises ToolError."""
    await bash_tool(command="echo 'Hello, World!'")
    bash_tool._session._timeout = 0.1  # Set a very short timeout for testing
    with pytest.raises(
        ToolError,
        match="timed out: bash has not returned in 0.1 seconds and must be restarted",
    ):
        await bash_tool(command="sleep 1")


================================================
FILE: computer-use-demo/tests/tools/computer_test.py
================================================
from unittest.mock import AsyncMock, patch

import pytest
from
computer_use_demo.tools.computer import ( ComputerTool20241022, ComputerTool20250124, ScalingSource, ToolError, ToolResult, ) @pytest.fixture(params=[ComputerTool20241022, ComputerTool20250124]) def computer_tool(request): return request.param() @pytest.mark.asyncio async def test_computer_tool_mouse_move(computer_tool): with patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell: mock_shell.return_value = ToolResult(output="Mouse moved") result = await computer_tool(action="mouse_move", coordinate=[100, 200]) mock_shell.assert_called_once_with( f"{computer_tool.xdotool} mousemove --sync 100 200" ) assert result.output == "Mouse moved" @pytest.mark.asyncio async def test_computer_tool_type(computer_tool): with ( patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell, patch.object( computer_tool, "screenshot", new_callable=AsyncMock ) as mock_screenshot, ): mock_shell.return_value = ToolResult(output="Text typed") mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot") result = await computer_tool(action="type", text="Hello, World!") assert mock_shell.call_count == 1 assert "type --delay 12 -- 'Hello, World!'" in mock_shell.call_args[0][0] assert result.output == "Text typed" assert result.base64_image == "base64_screenshot" @pytest.mark.asyncio async def test_computer_tool_screenshot(computer_tool): with patch.object( computer_tool, "screenshot", new_callable=AsyncMock ) as mock_screenshot: mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot") result = await computer_tool(action="screenshot") mock_screenshot.assert_called_once() assert result.base64_image == "base64_screenshot" @pytest.mark.asyncio async def test_computer_tool_scaling(computer_tool): computer_tool._scaling_enabled = True computer_tool.width = 1920 computer_tool.height = 1080 # Test scaling from API to computer x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768) assert x == 1920 assert y == 1080 # Test 
scaling from computer to API x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1080) assert x == 1366 assert y == 768 # Test no scaling when disabled computer_tool._scaling_enabled = False x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768) assert x == 1366 assert y == 768 @pytest.mark.asyncio async def test_computer_tool_scaling_with_different_aspect_ratio(computer_tool): computer_tool._scaling_enabled = True computer_tool.width = 1920 computer_tool.height = 1200 # 16:10 aspect ratio # Test scaling from API to computer x, y = computer_tool.scale_coordinates(ScalingSource.API, 1280, 800) assert x == 1920 assert y == 1200 # Test scaling from computer to API x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1200) assert x == 1280 assert y == 800 @pytest.mark.asyncio async def test_computer_tool_no_scaling_for_unsupported_resolution(computer_tool): computer_tool._scaling_enabled = True computer_tool.width = 4096 computer_tool.height = 2160 # Test no scaling for unsupported resolution x, y = computer_tool.scale_coordinates(ScalingSource.API, 4096, 2160) assert x == 4096 assert y == 2160 x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 4096, 2160) assert x == 4096 assert y == 2160 @pytest.mark.asyncio async def test_computer_tool_scaling_out_of_bounds(computer_tool): computer_tool._scaling_enabled = True computer_tool.width = 1920 computer_tool.height = 1080 # Test scaling from API with out of bounds coordinates with pytest.raises(ToolError, match="Coordinates .*, .* are out of bounds"): x, y = computer_tool.scale_coordinates(ScalingSource.API, 2000, 1500) @pytest.mark.asyncio async def test_computer_tool_invalid_action(computer_tool): with pytest.raises(ToolError, match="Invalid action: invalid_action"): await computer_tool(action="invalid_action") @pytest.mark.asyncio async def test_computer_tool_missing_coordinate(computer_tool): with pytest.raises(ToolError, match="coordinate is required for 
mouse_move"): await computer_tool(action="mouse_move") @pytest.mark.asyncio async def test_computer_tool_missing_text(computer_tool): with pytest.raises(ToolError, match="text is required for type"): await computer_tool(action="type") ================================================ FILE: computer-use-demo/tests/tools/edit_test.py ================================================ from pathlib import Path from unittest.mock import patch import pytest from computer_use_demo.tools.base import CLIResult, ToolError, ToolResult from computer_use_demo.tools.edit import EditTool20250728 @pytest.fixture def edit_tool(): return EditTool20250728() @pytest.mark.asyncio async def test_view_command(edit_tool): # Test viewing a file that exists with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, ): mock_read_text.return_value = "File content" result = await edit_tool(command="view", path="/test/file.txt") assert isinstance(result, CLIResult) assert result.output assert "File content" in result.output # Test viewing a directory with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=True), patch("computer_use_demo.tools.edit.run") as mock_run, ): mock_run.return_value = (None, "file1.txt\nfile2.txt", None) result = await edit_tool(command="view", path="/test/dir") assert isinstance(result, CLIResult) assert result.output assert "file1.txt" in result.output assert "file2.txt" in result.output # Test viewing a file with a specific range with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, ): mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4" result = await edit_tool( command="view", path="/test/file.txt", view_range=[2, 3] ) assert isinstance(result, CLIResult) assert result.output assert "\n 2\tLine 2\n 3\tLine 3\n" in 
result.output # Test viewing a file with an invalid range with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, ): mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4" with pytest.raises(ToolError, match="Invalid `view_range`"): await edit_tool(command="view", path="/test/file.txt", view_range=[3, 2]) # Test viewing a non-existent file with patch("pathlib.Path.exists", return_value=False): with pytest.raises(ToolError, match="does not exist"): await edit_tool(command="view", path="/nonexistent/file.txt") # Test viewing a directory with a view_range with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=True), ): with pytest.raises(ToolError, match="view_range` parameter is not allowed"): await edit_tool(command="view", path="/test/dir", view_range=[1, 2]) @pytest.mark.asyncio async def test_create_command(edit_tool): # Test creating a new file with content with ( patch("pathlib.Path.exists", return_value=False), patch("pathlib.Path.write_text") as mock_write_text, ): result = await edit_tool( command="create", path="/test/newfile.txt", file_text="New file content" ) assert isinstance(result, ToolResult) assert result.output assert "File created successfully" in result.output mock_write_text.assert_called_once_with("New file content") # Test attempting to create a file without content with patch("pathlib.Path.exists", return_value=False): with pytest.raises(ToolError, match="Parameter `file_text` is required"): await edit_tool(command="create", path="/test/newfile.txt") # Test attempting to create a file that already exists with patch("pathlib.Path.exists", return_value=True): with pytest.raises(ToolError, match="File already exists"): await edit_tool( command="create", path="/test/existingfile.txt", file_text="Content" ) @pytest.mark.asyncio async def test_str_replace_command(edit_tool): # Test replacing a 
unique string in a file with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text") as mock_write_text, ): mock_read_text.return_value = "Original content" result = await edit_tool( command="str_replace", path="/test/file.txt", old_str="Original", new_str="New", ) assert isinstance(result, CLIResult) assert result.output assert "has been edited" in result.output mock_write_text.assert_called_once_with("New content") # Test attempting to replace a non-existent string with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, ): mock_read_text.return_value = "Original content" with pytest.raises(ToolError, match="did not appear verbatim"): await edit_tool( command="str_replace", path="/test/file.txt", old_str="Nonexistent", new_str="New", ) # Test attempting to replace a string that appears multiple times with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, ): mock_read_text.return_value = "Test test test" with pytest.raises(ToolError, match="Multiple occurrences"): await edit_tool( command="str_replace", path="/test/file.txt", old_str="test", new_str="example", ) edit_tool._file_history.clear() # Verify that the file history is updated after replacement with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text"), ): mock_read_text.return_value = "Original content" await edit_tool( command="str_replace", path="/test/file.txt", old_str="Original", new_str="New", ) assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"] @pytest.mark.asyncio async def test_insert_command(edit_tool): # Test 
inserting a string at a valid line number with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text") as mock_write_text, ): mock_read_text.return_value = "Line 1\nLine 2\nLine 3" result = await edit_tool( command="insert", path="/test/file.txt", insert_line=2, insert_text="New Line", ) assert isinstance(result, CLIResult) assert result.output assert "has been edited" in result.output mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Line\nLine 3") # Test inserting a string at the beginning of the file (line 0) with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text") as mock_write_text, ): mock_read_text.return_value = "Line 1\nLine 2" result = await edit_tool( command="insert", path="/test/file.txt", insert_line=0, insert_text="New First Line", ) assert isinstance(result, CLIResult) assert result.output assert "has been edited" in result.output mock_write_text.assert_called_once_with("New First Line\nLine 1\nLine 2") # Test inserting a string at the end of the file with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text") as mock_write_text, ): mock_read_text.return_value = "Line 1\nLine 2" result = await edit_tool( command="insert", path="/test/file.txt", insert_line=2, insert_text="New Last Line", ) assert isinstance(result, CLIResult) assert result.output assert "has been edited" in result.output mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Last Line") # Test attempting to insert at an invalid line number with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as 
mock_read_text, ): mock_read_text.return_value = "Line 1\nLine 2" with pytest.raises(ToolError, match="Invalid `insert_line` parameter"): await edit_tool( command="insert", path="/test/file.txt", insert_line=5, insert_text="Invalid Line", ) # Verify that the file history is updated after insertion edit_tool._file_history.clear() with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), patch("pathlib.Path.read_text") as mock_read_text, patch("pathlib.Path.write_text"), ): mock_read_text.return_value = "Original content" await edit_tool( command="insert", path="/test/file.txt", insert_line=1, insert_text="New Line", ) assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"] @pytest.mark.asyncio async def test_validate_path(edit_tool): # Test with valid absolute paths with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=False), ): edit_tool.validate_path("view", Path("/valid/path.txt")) # Test with relative paths (should raise an error) with pytest.raises(ToolError, match="not an absolute path"): edit_tool.validate_path("view", Path("relative/path.txt")) # Test with non-existent paths for non-create commands (should raise an error) with patch("pathlib.Path.exists", return_value=False): with pytest.raises(ToolError, match="does not exist"): edit_tool.validate_path("view", Path("/nonexistent/file.txt")) # Test with existing paths for create command (should raise an error) with patch("pathlib.Path.exists", return_value=True): with pytest.raises(ToolError, match="File already exists"): edit_tool.validate_path("create", Path("/existing/file.txt")) # Test with directory paths for non-view commands (should raise an error) with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=True), ): with pytest.raises(ToolError, match="is a directory"): edit_tool.validate_path("str_replace", Path("/directory/path")) # Test with 
directory path for view command (should not raise an error) with ( patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_dir", return_value=True), ): edit_tool.validate_path("view", Path("/directory/path")) ================================================ FILE: customer-support-agent/.eslintrc.json ================================================ { "extends": "next/core-web-vitals" } ================================================ FILE: customer-support-agent/.gitignore ================================================ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # dependencies /node_modules /.pnp .pnp.js .yarn/install-state.gz # testing /coverage # next.js /.next/ /out/ # production /build # misc .DS_Store *.pem # debug npm-debug.log* yarn-debug.log* yarn-error.log* # local env files .env*.local # vercel .vercel # typescript *.tsbuildinfo next-env.d.ts ================================================ FILE: customer-support-agent/README.md ================================================ # Claude Customer Support Agent An advanced, fully customizable customer support chat interface powered by Claude and leveraging Amazon Bedrock Knowledge Bases for knowledge retrieval. ![preview](tutorial/preview.png) ## Key Features - AI-powered chat using Anthropic's Claude model - Amazon Bedrock integration for contextual knowledge retrieval - Real-time thinking & debug information display - Knowledge base source visualization - User mood detection & appropriate agent redirection - Highly customizable UI with shadcn/ui components ## Getting Started 1. Clone this repository 2. Install dependencies: `npm install` 3. Set up your environment variables (see Configuration section) 4. Run the development server: `npm run dev` 5. 
Open [http://localhost:3000](http://localhost:3000) in your browser ## ⚙️ Configuration Create a `.env.local` file in the root directory with the following variables: ``` ANTHROPIC_API_KEY=your_anthropic_api_key BAWS_ACCESS_KEY_ID=your_aws_access_key BAWS_SECRET_ACCESS_KEY=your_aws_secret_key ``` Note: We are adding a 'B' in front of the AWS environment variables for a reason that will be discussed later in the deployment section. ## How to Get Your Keys ### Claude API Key 1. Visit [console.anthropic.com](https://console.anthropic.com/dashboard) 2. Sign up or log in to your account 3. Click on "Get API keys" 4. Copy the key and paste it into your `.env.local` file ### AWS Access Key and Secret Key Follow these steps to obtain your AWS credentials: 1. Log in to the AWS Management Console 2. Navigate to the IAM (Identity and Access Management) dashboard 3. In the left sidebar, click on "Users" 4. Click "Create user" and follow the prompts to create a new user ![Add User](tutorial/create-user.png) 5. On the "Set permissions" page, select the "Attach policies directly" option ![Attach Policy](tutorial/attach.png) 6. On the same page, select the "AmazonBedrockFullAccess" policy ![Attach Policy](tutorial/bedrock.png) 7. Review and create the user 8. On the user's Summary page, click "Create access key". 9. Then select "Application running on an AWS compute service". Add a description if desired, then click "Create". 10. You will now see the Access Key ID and Secret Access Key displayed. Note that these keys are only visible once during creation, so be sure to save them securely. ![Access Keys](tutorial/access-keys.png) 11. Copy these keys and paste them into your `.env.local` file Note: Make sure to keep your keys secure and never share them publicly. ## Amazon Bedrock RAG Integration This project utilizes Amazon Bedrock for Retrieval-Augmented Generation (RAG). To set up: 1. Ensure you have an AWS account with Bedrock access. 2.
Create a Bedrock knowledge base in your desired AWS region. 3. Index your documents/sources in the knowledge base. For more info on that, check the "How to Create Your Own Knowledge Base" section. 4. In `ChatArea.tsx`, update the `knowledgeBases` array with your knowledge base IDs and names: ```typescript const knowledgeBases: KnowledgeBase[] = [ { id: "your-knowledge-base-id", name: "Your KB Name" }, // Add more knowledge bases as needed ]; ``` The application will use these knowledge bases for context retrieval during conversations. ### How to Create Your Own Knowledge Base To create your own knowledge base: 1. Go to your AWS Console and select Amazon Bedrock. 2. In the left side menu, click on "Knowledge base" under "More". 3. Click on "Create knowledge base". ![Create Knowledge Base](tutorial/create-knowledge-base.png) 4. Give your knowledge base a name. You can leave "Create a new service role". 5. Choose a source for your knowledge base. In this example, we'll use Amazon S3 storage service. ![Choose Source](tutorial/choose-source.png) Note: If you're using the S3 storage service, you'll need to create a bucket first where you will upload your files. Alternatively, you can also upload your files after the creation of a knowledge base. 6. Click "Next". 7. Choose a location for your knowledge base. This can be S3 buckets, folders, or even single documents. 8. Click "Next". 9. Select your preferred embedding model. In this case, we'll use Titan Text Embeddings 2. 10. Select "Quick create a new vector store". 11. Confirm and create your knowledge base. 12. Once you have done this, get your knowledge base ID from the knowledge base overview. ## Switching Models This project supports multiple Claude models. To switch between models: 1. 
In `ChatArea.tsx`, the `models` array defines available models: ```typescript const models: Model[] = [ { id: "claude-3-haiku-20240307", name: "Claude 3 Haiku" }, { id: "claude-3-5-sonnet-20240620", name: "Claude 3.5 Sonnet" }, // Add more models as needed ]; ``` 2. The `selectedModel` state variable controls the currently selected model: ```typescript const [selectedModel, setSelectedModel] = useState("claude-3-haiku-20240307"); ``` 3. To implement model switching in the UI, a dropdown component is used that updates the `selectedModel`. ## Customization This project leverages shadcn/ui components, offering a high degree of customization: * Modify the UI components in the `components/ui` directory * Adjust the theme in `app/globals.css` * Customize the layout and functionality in individual component files * Modify the theme colors and styles by editing the `styles/themes.js` file: ```javascript // styles/themes.js export const themes = { neutral: { light: { // Light mode colors for neutral theme }, dark: { // Dark mode colors for neutral theme } }, // Add more themes here }; ``` You can add new themes or modify existing ones by adjusting the color values in this file. ## Deploy with AWS Amplify To deploy this application using AWS Amplify, follow these steps: 1. Go to your AWS Console and select Amplify. 2. Click on "Create new app" (image link to be added later). 3. Select GitHub (or your preferred provider) as the source. 4. Choose this repository. 5. Edit the YAML file to contain: ```yaml version: 1 frontend: phases: preBuild: commands: - npm ci --cache .npm --prefer-offline build: commands: - npm run build # Next.js build runs first - echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env - echo "KNOWLEDGE_BASE_ID=$KNOWLEDGE_BASE_ID" >> .env - echo "BAWS_ACCESS_KEY_ID=$BAWS_ACCESS_KEY_ID" >> .env - echo "BAWS_SECRET_ACCESS_KEY=$BAWS_SECRET_ACCESS_KEY" >> .env artifacts: baseDirectory: .next files: - "**/*" cache: paths: - .next/cache/**/* - .npm/**/* ``` 6. 
Choose to create a new service role or use an existing one. Refer to the "Service Role" section for more information. 7. Click on "Advanced settings" and add your environment variables: ``` ANTHROPIC_API_KEY=your_anthropic_api_key BAWS_ACCESS_KEY_ID=your_aws_access_key BAWS_SECRET_ACCESS_KEY=your_aws_secret_key ``` We add a 'B' in front of the AWS keys because Amplify does not allow environment variable names that start with "AWS". 8. Click "Save and deploy" to start the deployment process. Your application will now be deployed using AWS Amplify. ### Service Role Once your application is deployed, if you chose to create a new service role: 1. Go to your deployments page 2. Select the deployment you just created 3. Click on "App settings" 4. Copy the Service role ARN 5. Go to the IAM console and find this role 6. Attach the "AmazonBedrockFullAccess" policy to the role This ensures that your Amplify app has the necessary permissions to interact with Amazon Bedrock. ## Customized Deployment and Development This project now supports flexible deployment and development configurations, allowing you to include or exclude specific components (left sidebar, right sidebar) based on your needs.
### Configuration The inclusion of sidebars is controlled by a `config.ts` file, which uses environment variables to set the configuration: ```typescript type Config = { includeLeftSidebar: boolean; includeRightSidebar: boolean; }; const config: Config = { includeLeftSidebar: process.env.NEXT_PUBLIC_INCLUDE_LEFT_SIDEBAR === "true", includeRightSidebar: process.env.NEXT_PUBLIC_INCLUDE_RIGHT_SIDEBAR === "true", }; export default config; ``` This configuration uses two environment variables: - `NEXT_PUBLIC_INCLUDE_LEFT_SIDEBAR`: Set to "true" to include the left sidebar - `NEXT_PUBLIC_INCLUDE_RIGHT_SIDEBAR`: Set to "true" to include the right sidebar ## NPM Scripts The `package.json` includes several new scripts for different configurations: ```bash npm run dev: Runs the full app with both sidebars (default) npm run build: Builds the full app with both sidebars (default) npm run dev:full: Same as npm run dev npm run dev:left: Runs the app with only the left sidebar npm run dev:right: Runs the app with only the right sidebar npm run dev:chat: Runs the app with just the chat area (no sidebars) npm run build:full: Same as npm run build npm run build:left: Builds the app with only the left sidebar npm run build:right: Builds the app with only the right sidebar npm run build:chat: Builds the app with just the chat area (no sidebars) ``` ### Usage To use a specific configuration: - For development: Run the desired script (e.g., `npm run dev:left`) - For production: Build with the desired script (e.g., `npm run build:right`) These scripts set the appropriate environment variables before running or building the application, allowing you to easily switch between different configurations. This flexibility allows you to tailor the application's layout to your specific needs, whether for testing, development, or production deployment. ## Appendix This project is a prototype and is provided on an "as-is" basis.
It is not intended for production use and may contain bugs, errors, or inconsistencies. By using this prototype, you acknowledge and agree that: - The software is provided in a pre-release, beta, or trial form. - It may not be suitable for production or mission-critical environments. - The developers are not responsible for any issues, data loss, or damages resulting from its use. - No warranties or guarantees of any kind are provided, either expressed or implied. - Support for this prototype may be limited or unavailable. - Use of this prototype is at your own risk. We encourage you to report any issues or provide feedback to help improve future versions. ================================================ FILE: customer-support-agent/amplify.yml ================================================ version: 1 frontend: phases: preBuild: commands: - npm ci --cache .npm --prefer-offline build: commands: - npm run build # Next.js build runs first - echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env - echo "KNOWLEDGE_BASE_ID=$KNOWLEDGE_BASE_ID" >> .env - echo "BAWS_ACCESS_KEY_ID=$BAWS_ACCESS_KEY_ID" >> .env - echo "BAWS_SECRET_ACCESS_KEY=$BAWS_SECRET_ACCESS_KEY" >> .env artifacts: baseDirectory: .next files: - "**/*" cache: paths: - .next/cache/**/* - .npm/**/* ================================================ FILE: customer-support-agent/app/api/chat/route.ts ================================================ import Anthropic from "@anthropic-ai/sdk"; import { z } from "zod"; import { retrieveContext, RAGSource } from "@/app/lib/utils"; import crypto from "crypto"; import customerSupportCategories from "@/app/lib/customer_support_categories.json"; const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY, }); // Debug message helper function // Input: message string and optional data object // Output: JSON string with message, sanitized data, and timestamp const debugMessage = (msg: string, data: any = {}) => { console.log(msg, data); const timestamp = new 
Date().toISOString().replace(/[^\x20-\x7E]/g, "");
  const safeData = JSON.parse(JSON.stringify(data));
  return JSON.stringify({ msg, data: safeData, timestamp });
};

// Define the schema for the AI response using Zod
// This ensures type safety and validation for the AI's output
const responseSchema = z.object({
  response: z.string(),
  thinking: z.string(),
  user_mood: z.enum([
    "positive",
    "neutral",
    "negative",
    "curious",
    "frustrated",
    "confused",
  ]),
  suggested_questions: z.array(z.string()),
  debug: z.object({
    context_used: z.boolean(),
  }),
  matched_categories: z.array(z.string()).optional(),
  redirect_to_agent: z
    .object({
      should_redirect: z.boolean(),
      reason: z.string().optional(),
    })
    .optional(),
});

// Schema for the JSON body this route accepts.
// Validating up front turns a malformed request into a 400 instead of an
// unhandled exception when the latest message is read.
const requestSchema = z.object({
  messages: z
    .array(
      z.object({
        role: z.enum(["user", "assistant"]),
        content: z.string(),
      }),
    )
    .min(1),
  model: z.string().min(1),
  knowledgeBaseId: z.string().optional(),
});

/**
 * Makes a string safe to use as an HTTP header value.
 *
 * Keeps printable ASCII (0x20-0x7E) only. Non-ASCII characters, control
 * characters (including CR/LF) and DEL are removed because they are not
 * accepted in header values.
 *
 * @param value - Raw string destined for a response header.
 * @returns The string with every disallowed character removed.
 */
function sanitizeHeaderValue(value: string): string {
  return value.replace(/[^\x20-\x7E]/g, "");
}

/**
 * Logs how long has elapsed since `start` for a labelled step.
 *
 * @param label - Name of the step being measured.
 * @param start - A `performance.now()` reading taken when timing began.
 */
const logTimestamp = (label: string, start: number) => {
  const timestamp = new Date().toISOString();
  const time = ((performance.now() - start) / 1000).toFixed(2);
  console.log(`⏱️ [${timestamp}] ${label}: ${time}s`);
};

/**
 * Parses the model's JSON output, first escaping raw newlines that appear
 * inside string values (a raw newline inside a JSON string is invalid).
 *
 * @param jsonString - Text expected to contain a single JSON object.
 * @returns The parsed value; callers must validate its shape.
 * @throws Error with message "Invalid JSON response from AI" when parsing fails.
 */
function sanitizeAndParseJSON(jsonString: string): unknown {
  // Replace newlines within string values
  const sanitized = jsonString.replace(/(?<=:\s*")(.|\n)*?(?=")/g, (match) =>
    match.replace(/\n/g, "\\n"),
  );

  try {
    return JSON.parse(sanitized);
  } catch (parseError) {
    console.error("Error parsing JSON response:", parseError);
    throw new Error("Invalid JSON response from AI");
  }
}

/**
 * Builds the 400 response for a request body that cannot be used.
 * The body mirrors the shape of the route's 500 error payload.
 *
 * @param reason - Short explanation of what was wrong with the request.
 */
function badRequestResponse(reason: string): Response {
  return new Response(
    JSON.stringify({
      response: "Sorry, your request could not be understood.",
      thinking: reason,
      user_mood: "neutral",
      debug: { context_used: false },
    }),
    { status: 400, headers: { "Content-Type": "application/json" } },
  );
}

/**
 * Main POST request handler.
 *
 * Validates the request, retrieves RAG context for the latest message, asks
 * Claude for a JSON-formatted answer, validates that answer against
 * `responseSchema`, and returns it. RAG sources and debug information are
 * attached as the `x-rag-sources` and `X-Debug-Data` response headers.
 *
 * @param req - Request whose JSON body has `messages`, `model` and an optional `knowledgeBaseId`.
 * @returns 200 with the validated answer, 400 for an unusable body, 500 when generation fails.
 */
export async function POST(req: Request) {
  const apiStart = performance.now();
  const measureTime = (label: string) => logTimestamp(label, apiStart);

  // Extract and validate data from the request body
  let rawBody: unknown;
  try {
    rawBody = await req.json();
  } catch (error) {
    console.error("❌ Request body is not valid JSON:", error);
    return badRequestResponse("Request body was not valid JSON.");
  }

  const parsedBody = requestSchema.safeParse(rawBody);
  if (!parsedBody.success) {
    console.error("❌ Invalid request body:", parsedBody.error.issues);
    return badRequestResponse(
      "Request body must contain a non-empty messages array and a model.",
    );
  }

  const { messages, model, knowledgeBaseId } = parsedBody.data;
  const latestMessage = messages[messages.length - 1].content;

  console.log("📝 Latest Query:", latestMessage);
  measureTime("User Input Received");

  // Prepare debug data
  // Only report whether a key is configured: this value is sent to the
  // client in a response header, so no part of the secret belongs in it.
  const MAX_DEBUG_LENGTH = 1000;
  const debugData = sanitizeHeaderValue(
    debugMessage("🚀 API route called", {
      messagesReceived: messages.length,
      latestMessageLength: latestMessage.length,
      anthropicKeyConfigured: Boolean(process.env.ANTHROPIC_API_KEY),
    }),
  ).slice(0, MAX_DEBUG_LENGTH);

  // Initialize variables for RAG retrieval
  let retrievedContext = "";
  let isRagWorking = false;
  let ragSources: RAGSource[] = [];

  // Attempt to retrieve context from RAG
  try {
    console.log("🔍 Initiating RAG retrieval for query:", latestMessage);
    measureTime("RAG Start");
    const result = await retrieveContext(latestMessage, knowledgeBaseId ?? "");
    retrievedContext = result.context;
    isRagWorking = result.isRagWorking;
    ragSources = result.ragSources || [];

    if (!result.isRagWorking) {
      console.warn("🚨 RAG Retrieval failed but did not throw!");
    }

    measureTime("RAG Complete");
    console.log("🔍 RAG Retrieved:", isRagWorking ? "YES" : "NO");
    console.log(
      "✅ RAG retrieval completed successfully. Context:",
      retrievedContext.slice(0, 100) + "...",
    );
  } catch (error) {
    // A RAG failure is not fatal: continue without retrieved context.
    console.error("💀 RAG Error:", error);
    console.error("❌ RAG retrieval failed for query:", latestMessage);
    retrievedContext = "";
    isRagWorking = false;
    ragSources = [];
  }

  measureTime("RAG Total Duration");

  // Prepare categories context for the system prompt
  const USE_CATEGORIES = true;
  const categoryListString = customerSupportCategories.categories
    .map((c) => c.id)
    .join(", ");

  const categoriesContext = USE_CATEGORIES
    ? ` To help with our internal classification of inquiries, we would like you to categorize inquiries in addition to answering them. We have provided you with ${customerSupportCategories.categories.length} customer support categories. Check if your response fits into any category and include the category IDs in your "matched_categories" array. The available categories are: ${categoryListString} If multiple categories match, include multiple category IDs. If no categories match, return an empty array. `
    : "";

  // Change the system prompt company for your use case
  // The example "matched_categories" values below use IDs that exist in
  // customer_support_categories.json so the examples agree with the list above.
  const systemPrompt = `You are acting as an Anthropic customer support assistant chatbot inside a chat window on a website.
You are chatting with a human user who is asking for help about Anthropic's products and services. When responding to the user, aim to provide concise and helpful responses while maintaining a polite and professional tone. To help you answer the user's question, we have retrieved the following information for you. It may or may not be relevant (we are using a RAG pipeline to retrieve this information): ${isRagWorking ? `${retrievedContext}` : "No information found for this query."} Please provide responses that only use the information you have been given. If no information is available or if the information is not relevant for answering the question, you can redirect the user to a human agent for further assistance. ${categoriesContext} If the question is unrelated to Anthropic's products and services, you should redirect the user to a human agent. You are the first point of contact for the user and should try to resolve their issue or provide relevant information. If you are unable to help the user or if the user explicitly asks to talk to a human, you can redirect them to a human agent for further assistance. To display your responses correctly, you must format your entire response as a valid JSON object with the following structure: { "thinking": "Brief explanation of your reasoning for how you should address the user's query", "response": "Your concise response to the user", "user_mood": "positive|neutral|negative|curious|frustrated|confused", "suggested_questions": ["Question 1?", "Question 2?", "Question 3?"], "debug": { "context_used": true|false }, ${USE_CATEGORIES ? '"matched_categories": ["category_id1", "category_id2"],' : ""} "redirect_to_agent": { "should_redirect": boolean, "reason": "Reason for redirection (optional, include only if should_redirect is true)" } } Here are a few examples of how your response should look like: Example of a response without redirection to a human agent: { "thinking": "Providing relevant information from the knowledge base", "response": "Here's the information you requested...", "user_mood": "curious", "suggested_questions": ["How do I update my account?", "What are the payment options?"], "debug": { "context_used": true }, "matched_categories": ["account", "billing"], "redirect_to_agent": { "should_redirect": false } } Example of a response with redirection to a human agent: { "thinking": "User request requires human intervention", "response": "I understand this is a complex issue. Let me connect you with a human agent who can assist you better.", "user_mood": "frustrated", "suggested_questions": [], "debug": { "context_used": false }, "matched_categories": ["technical"], "redirect_to_agent": { "should_redirect": true, "reason": "Complex technical issue requiring human expertise" } } `;

  try {
    console.log(`🚀 Query Processing`);
    measureTime("Claude Generation Start");

    const anthropicMessages: Anthropic.MessageParam[] = messages.map(
      ({ role, content }) => ({ role, content }),
    );

    // Prefill the assistant turn with "{" so the reply continues a JSON object.
    anthropicMessages.push({
      role: "assistant",
      content: "{",
    });

    const response = await anthropic.messages.create({
      model,
      max_tokens: 1000,
      messages: anthropicMessages,
      system: systemPrompt,
      temperature: 0.3,
    });

    measureTime("Claude Generation Complete");
    console.log("✅ Message generation completed");

    // Extract text content from the response.
    // Re-attach the prefilled "{" and concatenate blocks without a separator
    // so no extra characters are introduced into the JSON text.
    const textContent =
      "{" +
      response.content
        .filter((block): block is Anthropic.TextBlock => block.type === "text")
        .map((block) => block.text)
        .join("");

    // Parse the JSON response and validate its shape; either step throwing
    // is handled by the catch below.
    const validatedResponse = responseSchema.parse(
      sanitizeAndParseJSON(textContent),
    );

    const responseWithId = {
      id: crypto.randomUUID(),
      ...validatedResponse,
    };

    // Check if redirection to a human agent is needed
    if (responseWithId.redirect_to_agent?.should_redirect) {
      console.log("🚨 AGENT REDIRECT TRIGGERED!");
      console.log("Reason:", responseWithId.redirect_to_agent.reason);
    }

    // Prepare the response object
    const apiResponse = new Response(JSON.stringify(responseWithId), {
      status: 200,
      headers: {
        "Content-Type": "application/json",
      },
    });

    // Add RAG sources to the response headers if available
    if (ragSources.length > 0) {
      apiResponse.headers.set(
        "x-rag-sources",
        sanitizeHeaderValue(JSON.stringify(ragSources)),
      );
    }

    // Add debug data to the response headers (already sanitized above)
    apiResponse.headers.set("X-Debug-Data", debugData);

    measureTime("API Complete");

    return apiResponse;
  } catch (error) {
    // Handle errors in AI response generation
    console.error("💥 Error in message generation:", error);
    const errorResponse = {
      response: "Sorry, there was an issue processing your request.
Please try again later.", thinking: "Error occurred during message generation.", user_mood: "neutral", debug: { context_used: false }, }; return new Response(JSON.stringify(errorResponse), { status: 500, headers: { "Content-Type": "application/json" }, }); } } ================================================ FILE: customer-support-agent/app/globals.css ================================================ @tailwind base; @tailwind components; @tailwind utilities; @layer base { :root { --background: 0 0% 100%; --foreground: 0 0% 3.9%; --card: 0 0% 100%; --card-foreground: 0 0% 3.9%; --popover: 0 0% 100%; --popover-foreground: 0 0% 3.9%; --primary: 0 0% 9%; --primary-foreground: 0 0% 98%; --secondary: 0 0% 96.1%; --secondary-foreground: 0 0% 9%; --muted: 0 0% 96.1%; --muted-foreground: 0 0% 45.1%; --accent: 0 0% 96.1%; --accent-foreground: 0 0% 9%; --destructive: 0 84.2% 60.2%; --destructive-foreground: 0 0% 98%; --border: 0 0% 89.8%; --input: 0 0% 89.8%; --ring: 0 0% 3.9%; --radius: 0.75rem; --chart-1: 12 76% 61%; --chart-2: 173 58% 39%; --chart-3: 197 37% 24%; --chart-4: 43 74% 66%; --chart-5: 27 87% 67%; } .dark { --background: 0 0% 3.9%; --foreground: 0 0% 98%; --card: 0 0% 3.9%; --card-foreground: 0 0% 98%; --popover: 0 0% 3.9%; --popover-foreground: 0 0% 98%; --primary: 0 0% 98%; --primary-foreground: 0 0% 9%; --secondary: 0 0% 14.9%; --secondary-foreground: 0 0% 98%; --muted: 0 0% 14.9%; --muted-foreground: 0 0% 63.9%; --accent: 0 0% 14.9%; --accent-foreground: 0 0% 98%; --destructive: 0 62.8% 30.6%; --destructive-foreground: 0 0% 98%; --border: 0 0% 14.9%; --input: 0 0% 14.9%; --ring: 0 0% 83.1%; --chart-1: 220 70% 50%; --chart-2: 160 60% 45%; --chart-3: 30 80% 55%; --chart-4: 280 65% 60%; --chart-5: 340 75% 55%; } } @layer base { * { @apply border-border; } body { @apply bg-background text-foreground; } } ================================================ FILE: customer-support-agent/app/layout.tsx ================================================ import React from 
"react"; import type { Metadata } from "next"; import { Inter } from "next/font/google"; import "./globals.css"; import { ThemeProvider } from "@/components/theme-provider"; const inter = Inter({ subsets: ["latin"] }); export const metadata: Metadata = { title: "AI Chat Assistant", description: "Chat with an AI assistant powered by Anthropic", }; export default function RootLayout({ children, }: Readonly<{ children: React.ReactNode; }>) { return ( {children} ); } ================================================ FILE: customer-support-agent/app/lib/customer_support_categories.json ================================================ { "categories": [ { "id": "account", "name": "Account", "keywords": [ "ban", "appeal", "deletion", "logging in", "phone verification", "roles", "permissions" ] }, { "id": "billing", "name": "Billing", "keywords": [ "AR", "billing details", "invoice", "receipt", "payment method", "pricing", "refund request", "subscription management", "taxes" ] }, { "id": "feature", "name": "Feature", "keywords": [ "functionality", "tools", "capabilities", "options", "modules", "features" ] }, { "id": "internal", "name": "Internal", "keywords": [ "auto-reply", "forward", "ignore", "spam", "other", "sales reroute" ] }, { "id": "legal", "name": "Legal", "keywords": ["data privacy", "policy", "terms of service", "usage policy"] }, { "id": "other", "name": "Other", "keywords": ["job inquiry", "3rd party company"] }, { "id": "technical", "name": "Technical", "keywords": [ "guides", "issue", "security", "downtime", "performance", "interface", "system" ] }, { "id": "usage", "name": "Usage", "keywords": [ "errors", "capabilities", "quality", "techniques", "limits", "availability" ] } ] } ================================================ FILE: customer-support-agent/app/lib/utils.ts ================================================ import { BedrockAgentRuntimeClient, RetrieveCommand, RetrieveCommandInput, } from "@aws-sdk/client-bedrock-agent-runtime"; import { type 
ClassValue, clsx } from "clsx";
import { twMerge } from "tailwind-merge";

console.log("🔑 Have AWS AccessKey?", !!process.env.BAWS_ACCESS_KEY_ID);
console.log("🔑 Have AWS Secret?", !!process.env.BAWS_SECRET_ACCESS_KEY);

const bedrockClient = new BedrockAgentRuntimeClient({
  region: "us-east-1", // Make sure this matches your Bedrock region
  credentials: {
    accessKeyId: process.env.BAWS_ACCESS_KEY_ID!,
    secretAccessKey: process.env.BAWS_SECRET_ACCESS_KEY!,
  },
});

/** Merges class name inputs with clsx and resolves conflicts with tailwind-merge. */
export function cn(...inputs: ClassValue[]) {
  return twMerge(clsx(inputs));
}

/** A single retrieved knowledge-base chunk as returned by `retrieveContext`. */
export interface RAGSource {
  id: string;
  fileName: string;
  snippet: string;
  score: number;
}

/**
 * Queries a Bedrock knowledge base and returns the retrieved text.
 *
 * This function never throws: a missing knowledge base ID or any retrieval
 * error is logged and reported through `isRagWorking: false`.
 *
 * @param query - Text to search the knowledge base with.
 * @param knowledgeBaseId - ID of the knowledge base to query.
 * @param n - Number of results to request (default 3).
 * @returns `context` (text of all usable results joined by blank lines),
 *   `isRagWorking` (true when the retrieval call completed), and `ragSources`.
 */
export async function retrieveContext(
  query: string,
  knowledgeBaseId: string,
  n: number = 3,
): Promise<{
  context: string;
  isRagWorking: boolean;
  ragSources: RAGSource[];
}> {
  if (!knowledgeBaseId) {
    console.error("knowledgeBaseId is not provided");
    return {
      context: "",
      isRagWorking: false,
      ragSources: [],
    };
  }

  try {
    const input: RetrieveCommandInput = {
      knowledgeBaseId: knowledgeBaseId,
      retrievalQuery: { text: query },
      retrievalConfiguration: {
        vectorSearchConfiguration: { numberOfResults: n },
      },
    };

    const response = await bedrockClient.send(new RetrieveCommand(input));

    // Parse results: keep only entries that actually carry text, once, and
    // build both the source list and the context from that same list.
    const usableResults = (response?.retrievalResults || []).filter(
      (res: any) => res.content && res.content.text,
    );

    // NOTE(review): only the first source is surfaced even though every
    // usable chunk feeds `context` — presumably to keep the caller's
    // response header small; confirm before changing.
    const ragSources: RAGSource[] = usableResults
      .slice(0, 1)
      .map((result: any, index: number) => {
        const uri = result?.location?.s3Location?.uri || "";
        const fileName = uri.split("/").pop() || `Source-${index}.txt`;

        return {
          id:
            result.metadata?.["x-amz-bedrock-kb-chunk-id"] || `chunk-${index}`,
          // Strip the extension only when it is the suffix of the name.
          fileName: fileName.replace(/_/g, " ").replace(/\.txt$/, ""),
          snippet: result.content?.text || "",
          score: result.score || 0,
        };
      });

    console.log("🔍 Parsed RAG Sources:", ragSources); // Debug log

    const context = usableResults
      .map((res: any) => res.content.text)
      .join("\n\n");

    return {
      context,
      isRagWorking: true,
      ragSources,
    };
  } catch (error) {
    console.error("RAG Error:", error);
    return { context: "", isRagWorking: false, ragSources: [] };
  }
}
================================================ FILE: customer-support-agent/app/page.tsx ================================================ import React from "react"; import dynamic from "next/dynamic"; import TopNavBar from "@/components/TopNavBar"; import ChatArea from "@/components/ChatArea"; import config from "@/config"; const LeftSidebar = dynamic(() => import("@/components/LeftSidebar"), { ssr: false, }); const RightSidebar = dynamic(() => import("@/components/RightSidebar"), { ssr: false, }); export default function Home() { return (
{config.includeLeftSidebar && } {config.includeRightSidebar && }
); } ================================================ FILE: customer-support-agent/components/ChatArea.tsx ================================================ "use client"; import { useEffect, useRef, useState } from "react"; import config from "@/config"; import { Button } from "@/components/ui/button"; import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar"; import ReactMarkdown from "react-markdown"; import rehypeHighlight from "rehype-highlight"; import rehypeRaw from "rehype-raw"; import { HandHelping, WandSparkles, LifeBuoyIcon, BookOpenText, ChevronDown, Send, } from "lucide-react"; import "highlight.js/styles/atom-one-dark.css"; import { Card, CardContent, CardFooter } from "@/components/ui/card"; import { Textarea } from "@/components/ui/textarea"; import Image from "next/image"; import { DropdownMenu, DropdownMenuContent, DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; const TypedText = ({ text = "", delay = 5 }) => { const [displayedText, setDisplayedText] = useState(""); useEffect(() => { if (!text) return; const timer = setTimeout(() => { setDisplayedText(text.substring(0, displayedText.length + 1)); }, delay); return () => clearTimeout(timer); }, [text, displayedText, delay]); return <>{displayedText}; }; type ThinkingContent = { id: string; content: string; user_mood: string; debug: any; matched_categories?: string[]; }; interface ConversationHeaderProps { selectedModel: string; setSelectedModel: (modelId: string) => void; models: Model[]; showAvatar: boolean; } const UISelector = ({ redirectToAgent, }: { redirectToAgent: { should_redirect: boolean; reason: string }; }) => { if (redirectToAgent.should_redirect) { return ( ); } return null; }; const SuggestedQuestions = ({ questions, onQuestionClick, isLoading, }: { questions: string[]; onQuestionClick: (question: string) => void; isLoading: boolean; }) => { if (!questions || questions.length === 0) return null; return (
{questions.map((question, index) => ( ))}
); }; const MessageContent = ({ content, role, }: { content: string; role: string; }) => { const [thinking, setThinking] = useState(true); const [parsed, setParsed] = useState<{ response?: string; thinking?: string; user_mood?: string; suggested_questions?: string[]; redirect_to_agent?: { should_redirect: boolean; reason: string }; debug?: { context_used: boolean; }; }>({}); const [error, setError] = useState(false); useEffect(() => { if (!content || role !== "assistant") return; const timer = setTimeout(() => { setError(true); setThinking(false); }, 30000); try { const result = JSON.parse(content); console.log("🔍 Parsed Result:", result); if ( result.response && result.response.length > 0 && result.response !== "..." ) { setParsed(result); setThinking(false); clearTimeout(timer); } } catch (error) { console.error("Error parsing JSON:", error); setError(true); setThinking(false); } return () => clearTimeout(timer); }, [content, role]); if (thinking && role === "assistant") { return (
Thinking...
); } if (error && !parsed.response) { return
Something went wrong. Please try again.
; } return ( <> {parsed.response || content} {parsed.redirect_to_agent && ( )} ); }; // Define a type for the model type Model = { id: string; name: string; }; interface Message { id: string; role: string; content: string; } // Define the props interface for ConversationHeader interface ConversationHeaderProps { selectedModel: string; setSelectedModel: (modelId: string) => void; models: Model[]; showAvatar: boolean; selectedKnowledgeBase: string; setSelectedKnowledgeBase: (knowledgeBaseId: string) => void; knowledgeBases: KnowledgeBase[]; } type KnowledgeBase = { id: string; name: string; }; const ConversationHeader: React.FC = ({ selectedModel, setSelectedModel, models, showAvatar, selectedKnowledgeBase, setSelectedKnowledgeBase, knowledgeBases, }) => (
{showAvatar && ( <> AI

AI Agent

Customer support

)}
{models.map((model) => ( setSelectedModel(model.id)} > {model.name} ))} {knowledgeBases.map((kb) => ( setSelectedKnowledgeBase(kb.id)} > {kb.name} ))}
); function ChatArea() { const [messages, setMessages] = useState([]); const [input, setInput] = useState(""); const [isLoading, setIsLoading] = useState(false); const [showHeader, setShowHeader] = useState(false); const [selectedModel, setSelectedModel] = useState("claude-haiku-4-5-20251001"); const [showAvatar, setShowAvatar] = useState(false); const messagesEndRef = useRef(null); const [selectedKnowledgeBase, setSelectedKnowledgeBase] = useState( "your-knowledge-base-id", ); const knowledgeBases: KnowledgeBase[] = [ { id: "your-knowledge-base-id", name: "Your KB Name" }, // Add more knowledge bases as needed ]; const models: Model[] = [ { id: "claude-3-haiku-20240307", name: "Claude 3 Haiku" }, { id: "claude-haiku-4-5-20251001", name: "Claude 4.5 Haiku" }, { id: "claude-3-5-sonnet-20240620", name: "Claude 3.5 Sonnet" }, ]; const scrollToBottom = () => { messagesEndRef.current?.scrollIntoView({ behavior: "smooth" }); }; useEffect(() => { console.log("🔍 Messages changed! Count:", messages.length); const scrollToNewestMessage = () => { if (messagesEndRef.current) { console.log("📜 Scrolling to newest message..."); const behavior = messages.length <= 2 ? "auto" : "smooth"; messagesEndRef.current.scrollIntoView({ behavior, block: "end" }); } else { console.log("❌ No scroll anchor found!"); } }; if (messages.length > 0) { setTimeout(scrollToNewestMessage, 100); } }, [messages]); useEffect(() => { if (!config.includeLeftSidebar) { // If LeftSidebar is not included, we need to handle the 'updateSidebar' event differently const handleUpdateSidebar = (event: CustomEvent) => { console.log("LeftSidebar not included. 
Event data:", event.detail); // You might want to handle this data differently when LeftSidebar is not present }; window.addEventListener( "updateSidebar" as any, handleUpdateSidebar as EventListener, ); return () => window.removeEventListener( "updateSidebar" as any, handleUpdateSidebar as EventListener, ); } }, []); useEffect(() => { if (!config.includeRightSidebar) { // If RightSidebar is not included, we need to handle the 'updateRagSources' event differently const handleUpdateRagSources = (event: CustomEvent) => { console.log("RightSidebar not included. RAG sources:", event.detail); // You might want to handle this data differently when RightSidebar is not present }; window.addEventListener( "updateRagSources" as any, handleUpdateRagSources as EventListener, ); return () => window.removeEventListener( "updateRagSources" as any, handleUpdateRagSources as EventListener, ); } }, []); const decodeDebugData = (response: Response) => { const debugData = response.headers.get("X-Debug-Data"); if (debugData) { try { const parsed = JSON.parse(debugData); console.log("🔍 Server Debug:", parsed.msg, parsed.data); } catch (e) { console.error("Debug decode failed:", e); } } }; const logDuration = (label: string, duration: number) => { console.log(`⏱️ ${label}: ${duration.toFixed(2)}ms`); }; const handleSubmit = async ( event: React.FormEvent | string, ) => { if (typeof event !== "string") { event.preventDefault(); } if (!showHeader) setShowHeader(true); if (!showAvatar) setShowAvatar(true); setIsLoading(true); const clientStart = performance.now(); console.log("🔄 Starting request: " + new Date().toISOString()); const userMessage = { id: crypto.randomUUID(), role: "user", content: typeof event === "string" ? 
event : input, }; const placeholderMessage = { id: crypto.randomUUID(), role: "assistant", content: JSON.stringify({ response: "", thinking: "AI is processing...", user_mood: "neutral", debug: { context_used: false, }, }), }; setMessages((prevMessages) => [ ...prevMessages, userMessage, placeholderMessage, ]); setInput(""); const placeholderDisplayed = performance.now(); logDuration("Perceived Latency", placeholderDisplayed - clientStart); try { console.log("➡️ Sending message to API:", userMessage.content); const startTime = performance.now(); const response = await fetch("/api/chat", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ messages: [...messages, userMessage], model: selectedModel, knowledgeBaseId: selectedKnowledgeBase, }), }); const responseReceived = performance.now(); logDuration("Full Round Trip", responseReceived - startTime); logDuration("Network Duration", responseReceived - startTime); decodeDebugData(response); if (!response.ok) { throw new Error(`API request failed with status ${response.status}`); } const data = await response.json(); const endTime = performance.now(); logDuration("JSON Parse Duration", endTime - responseReceived); logDuration("Total API Duration", endTime - startTime); console.log("⬅️ Received response from API:", data); const suggestedQuestionsHeader = response.headers.get( "x-suggested-questions", ); if (suggestedQuestionsHeader) { data.suggested_questions = JSON.parse(suggestedQuestionsHeader); } const ragHeader = response.headers.get("x-rag-sources"); if (ragHeader) { const ragProcessed = performance.now(); logDuration( "🔍 RAG Processing Duration", ragProcessed - responseReceived, ); const sources = JSON.parse(ragHeader); window.dispatchEvent( new CustomEvent("updateRagSources", { detail: { sources, query: userMessage.content, debug: data.debug, }, }), ); } const readyToRender = performance.now(); logDuration("Response Processing", readyToRender - responseReceived); 
setMessages((prevMessages) => { const newMessages = [...prevMessages]; const lastIndex = newMessages.length - 1; newMessages[lastIndex] = { id: crypto.randomUUID(), role: "assistant", content: JSON.stringify(data), }; return newMessages; }); const sidebarEvent = new CustomEvent("updateSidebar", { detail: { id: data.id, content: data.thinking?.trim(), user_mood: data.user_mood, debug: data.debug, matched_categories: data.matched_categories, }, }); window.dispatchEvent(sidebarEvent); if (data.redirect_to_agent && data.redirect_to_agent.should_redirect) { window.dispatchEvent( new CustomEvent("agentRedirectRequested", { detail: data.redirect_to_agent, }), ); } } catch (error) { console.error("Error fetching chat response:", error); console.error("Failed to process message:", userMessage.content); } finally { setIsLoading(false); const clientEnd = performance.now(); logDuration("Total Client Operation", clientEnd - clientStart); } }; const handleKeyDown = (e: React.KeyboardEvent) => { if (e.key === "Enter" && !e.shiftKey) { e.preventDefault(); if (input.trim() !== "") { handleSubmit(e as any); } } }; const handleInputChange = (event: React.ChangeEvent) => { const textarea = event.target; setInput(textarea.value); textarea.style.height = "auto"; textarea.style.height = `${Math.min(textarea.scrollHeight, 300)}px`; }; const handleSuggestedQuestionClick = (question: string) => { handleSubmit(question); }; useEffect(() => { const handleToolExecution = (event: Event) => { const customEvent = event as CustomEvent<{ ui: { type: string; props: any }; }>; console.log("Tool execution event received:", customEvent.detail); }; window.addEventListener("toolExecution", handleToolExecution); return () => window.removeEventListener("toolExecution", handleToolExecution); }, []); return (
{messages.length === 0 ? (

Here's how I can help

Need guidance? I'll help navigate tasks using internal resources.

I'm a whiz at finding information! I can dig through your knowledge base.

I'm always learning! The more you share, the better I can assist you.

) : (
{messages.map((message, index) => (
{message.role === "assistant" && ( AI )}
{message.role === "assistant" && ( )}
))}
)}